Compare commits

..

2 Commits

Author SHA1 Message Date
Robert McRackan
f55a3ca008 Search engine bug fix and unit tests 2021-04-02 11:27:16 -04:00
Robert McRackan
726b36de4d * bug fix: when user creates a tag which is also a reserved bool word (eg: israted), searching for this tag breaks the search
* add unit tests for search engine
2021-04-01 15:45:19 -04:00
6 changed files with 182 additions and 18 deletions

View File

@@ -86,6 +86,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "0 Libation Tests", "0 Libat
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InternalUtilities.Tests", "_Tests\InternalUtilities.Tests\InternalUtilities.Tests.csproj", "{8447C956-B03E-4F59-9DD4-877793B849D9}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LibationSearchEngine.Tests", "_Tests\LibationSearchEngine.Tests\LibationSearchEngine.Tests.csproj", "{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -208,6 +210,10 @@ Global
{8447C956-B03E-4F59-9DD4-877793B849D9}.Debug|Any CPU.Build.0 = Debug|Any CPU
{8447C956-B03E-4F59-9DD4-877793B849D9}.Release|Any CPU.ActiveCfg = Release|Any CPU
{8447C956-B03E-4F59-9DD4-877793B849D9}.Release|Any CPU.Build.0 = Release|Any CPU
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -242,6 +248,7 @@ Global
{E7EFD64D-6630-4426-B09C-B6862A92E3FD} = {F0CBB7A7-D3FB-41FF-8F47-CF3F6A592249}
{F3B04A3A-20C8-4582-A54A-715AF6A5D859} = {8679CAC8-9164-4007-BDD2-F004810EDA14}
{8447C956-B03E-4F59-9DD4-877793B849D9} = {67E66E82-5532-4440-AFB3-9FB1DF9DEF53}
{C5B21768-C7C9-4FCB-AC1E-187B223D5A98} = {67E66E82-5532-4440-AFB3-9FB1DF9DEF53}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {615E00ED-BAEF-4E8E-A92A-9B82D87942A9}

View File

@@ -13,7 +13,7 @@
<!-- <PublishSingleFile>true</PublishSingleFile> -->
<RuntimeIdentifier>win-x64</RuntimeIdentifier>
<Version>4.2.1.1</Version>
<Version>4.2.3.1</Version>
</PropertyGroup>
<ItemGroup>

View File

@@ -42,5 +42,58 @@ namespace LibationSearchEngine
// positive look behind: beginning space { [ :
// positive look ahead: end space ] }
public static Regex NumbersRegex { get; } = new Regex(@"(?<=^|\s|\{|\[|:)(\d+\.?\d*)(?=$|\s|\]|\})", RegexOptions.Compiled);
/// <summary>
/// Parameterized regex pattern which matches a stand-alone bool search keyword.
/// The <c>{0}</c> placeholder is filled in with the keyword via string.Format (see GetBoolRegex).
///
/// Proper bools are single keywords which are turned into keyword:True.
/// If bordered by colons or inside brackets, they are not stand-alone bool keywords.
/// The negative lookbehind and lookahead patterns prevent bugs where a bool keyword is also a user-defined tag:
///   [israted]
///   parseTag => tags:israted
///   replaceBools => tags:israted:True
/// or
///   [israted]
///   replaceBools => israted:True
///   parseTag => [israted:True]
/// Also don't want to apply :True where the value already exists:
///   israted:false => israted:false:True
///
/// Despite using parens, lookahead and lookbehind are zero-length assertions which do not capture.
/// Therefore the bool search keyword is still $1 since it's the first and only capture.
///
/// The '#' comments and line breaks inside the pattern are only legal because the regex is built with
/// RegexOptions.IgnorePatternWhitespace (see GetBoolRegex).
/// </summary>
private static string boolPattern_parameterized { get; }
= @"
### IMPORTANT: 'ignore whitespace' is only partially honored in character sets
### - new lines are ok
### - ANY leading whitespace is treated like actual matching spaces :(
### can't begin with colon. incorrect syntax
### can't begin with open bracket: this signals the start of a tag
(?<! # begin negative lookbehind
[:\[] # char set: colon and open bracket, escaped
\s* # optional space
) # end negative lookbehind
\b # word boundary
({0}) # captured bool search keyword. this is the $1 reference used in regex.Replace
\b # word boundary
### can't end with colon. this signals that the bool's value already exists
### can't begin with close bracket: this signals the end of a tag
(?! # begin negative lookahead
\s* # optional space
[:\]] # char set: colon and close bracket, escaped
) # end negative lookahead
";
/// <summary>Cache of compiled bool-keyword regexes, keyed by the keyword they were built for.</summary>
private static Dictionary<string, Regex> boolRegexDic { get; } = new Dictionary<string, Regex>();

/// <summary>
/// Returns the regex which matches <paramref name="boolSearch"/> as a stand-alone bool keyword.
/// The regex is built from <see cref="boolPattern_parameterized"/> on first request and cached for later calls.
/// </summary>
/// <param name="boolSearch">Bool search keyword to substitute into the pattern.</param>
/// <returns>The cached regex for this keyword; capture group $1 is the matched keyword.</returns>
public static Regex GetBoolRegex(string boolSearch)
{
    if (!boolRegexDic.TryGetValue(boolSearch, out var cachedRegex))
    {
        // IgnorePatternWhitespace is required: the pattern template contains line breaks and '#' comments
        const RegexOptions options
            = RegexOptions.IgnorePatternWhitespace
            | RegexOptions.IgnoreCase
            | RegexOptions.Compiled;

        var pattern = string.Format(boolPattern_parameterized, boolSearch);
        cachedRegex = new Regex(pattern, options);
        boolRegexDic.Add(boolSearch, cachedRegex);
    }

    return cachedRegex;
}
}
}

View File

@@ -347,32 +347,33 @@ namespace LibationSearchEngine
/// <summary>
/// Formats the raw search string with <see cref="FormatSearchQuery"/>, runs the formatted query
/// through generalSearch, hands the results to displayResults, and returns them.
/// The original string, the formatted string, and the hit count are logged at debug level.
/// </summary>
/// <param name="searchString">Search text as entered; formatted before searching.</param>
/// <returns>The result set produced by generalSearch for the formatted query.</returns>
public SearchResultSet Search(string searchString)
{
    Serilog.Log.Logger.Debug("original search string: {@DebugInfo}", new { searchString });

    // keep the logged property name 'searchString' so the structured log output is unchanged
    var formattedQuery = FormatSearchQuery(searchString);
    Serilog.Log.Logger.Debug("formatted search string: {@DebugInfo}", new { searchString = formattedQuery });

    var searchResults = generalSearch(formattedQuery);
    var hitCount = searchResults.Docs.Count();
    Serilog.Log.Logger.Debug("Hit(s): {@DebugInfo}", new { count = hitCount });

    displayResults(searchResults);
    return searchResults;
}
public static string FormatSearchQuery(string searchString)
{
if (string.IsNullOrWhiteSpace(searchString))
searchString = ALL_QUERY;
#region apply formatting
searchString = parseTag(searchString);
return ALL_QUERY;
searchString = replaceBools(searchString);
searchString = parseTag(searchString);
// in ranges " TO " must be uppercase
searchString = searchString.Replace(" to ", " TO ");
searchString = padNumbers(searchString);
searchString = lowerFieldNames(searchString);
#endregion
Serilog.Log.Logger.Debug("formatted search string: {@DebugInfo}", new { searchString });
var results = generalSearch(searchString);
Serilog.Log.Logger.Debug("Hit(s): {@DebugInfo}", new { count = results.Docs.Count() });
displayResults(results);
return results;
return searchString;
}
#region format query string
@@ -395,9 +396,10 @@ namespace LibationSearchEngine
private static string replaceBools(string searchString)
{
// negative look-ahead for optional spaces then colon. don't want to double-up. eg:"israted:false" => "israted:false:True"
foreach (var boolSearch in boolIndexRules.Keys)
searchString = Regex.Replace(searchString, $@"\b({boolSearch})\b(?!\s*:)", @"$1:True", RegexOptions.IgnoreCase);
searchString =
LuceneRegex.GetBoolRegex(boolSearch)
.Replace(searchString, @"$1:True");
return searchString;
}
@@ -434,7 +436,7 @@ namespace LibationSearchEngine
return searchString;
}
#endregion
#endregion
private SearchResultSet generalSearch(string searchString)
{

View File

@@ -0,0 +1,24 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net5.0</TargetFramework>
<IsPackable>false</IsPackable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.9.4" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.3" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.3" />
<PackageReference Include="coverlet.collector" Version="3.0.3">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\Dinah.Core\_Tests\TestCommon\TestCommon.csproj" />
<ProjectReference Include="..\..\LibationSearchEngine\LibationSearchEngine.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,78 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using Dinah.Core;
using FluentAssertions;
using FluentAssertions.Common;
using LibationSearchEngine;
using Microsoft.VisualStudio.TestPlatform.Common.Filtering;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using Moq;
using Moq.Protected;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using TestCommon;
namespace SearchEngineTests
{
    /// <summary>
    /// Data-driven tests for <see cref="SearchEngine.FormatSearchQuery"/>: each row pairs a raw
    /// search string with the exact formatted query string it must produce.
    /// </summary>
    [TestClass]
    public class FormatSearchQuery
    {
        [TestMethod]

        // null, empty, whitespace -- *:*
        [DataRow(null, "*:*")]
        [DataRow("", "*:*")]
        [DataRow(" ", "*:*")]

        // tag surrounded by spaces, optionally negated. surrounding spaces are kept
        [DataRow("[foo]", "tags:foo")]
        [DataRow(" [foo]", " tags:foo")]
        [DataRow("[foo] ", "tags:foo ")]
        [DataRow(" [foo] ", " tags:foo ")]
        [DataRow("-[foo]", "-tags:foo")]
        [DataRow(" -[foo]", " -tags:foo")]
        [DataRow("-[foo] ", "-tags:foo ")]
        [DataRow(" -[foo] ", " -tags:foo ")]

        // tag value keeps its original casing
        [DataRow("[FoO]", "tags:FoO")]

        // bool keyword surrounded by spaces, optionally negated. Append :True
        // (the bare "israted" row appears once only; it was previously listed twice)
        [DataRow("israted", "israted:True")]
        [DataRow(" israted", " israted:True")]
        [DataRow("israted ", "israted:True ")]
        [DataRow(" israted ", " israted:True ")]
        [DataRow("-israted", "-israted:True")]
        [DataRow(" -israted", " -israted:True")]
        [DataRow("-israted ", "-israted:True ")]
        [DataRow(" -israted ", " -israted:True ")]

        // bool keyword with [:bool]. Do not add :True
        [DataRow("israted:True", "israted:True")]
        [DataRow("isRated:false", "israted:false")]

        // tag which happens to be a bool keyword >> parse as tag
        [DataRow("[israted]", "tags:israted")]

        // numbers with "to". TO all caps, numbers [8.2] format
        [DataRow("1 to 10", "00000001.00 TO 00000010.00")]
        [DataRow("19990101 to 20001231", "19990101.00 TO 20001231.00")]

        // field to lowercase
        [DataRow("Author:Doyle", "author:Doyle")]

        // bool field to lowercase
        [DataRow("IsRated", "israted:True")]
        [DataRow("-isRATED", "-israted:True")]
        public void FormattingTest(string input, string expected)
            => SearchEngine.FormatSearchQuery(input).Should().Be(expected);
    }
}