-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d3ea15d
commit d77b543
Showing
11 changed files
with
892 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,6 @@ | ||
# SuffixTreeSharp | ||
[![NuGet Status](http://nugetstatus.com/SuffixTreeSharp.png)](http://nugetstatus.com/packages/SuffixTreeSharp) | ||
|
||
Generalized Suffix Tree in pure C# | ||
|
||
Targeting .NET Standard 1.6 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<PropertyGroup> | ||
<TargetFramework>netcoreapp3.1</TargetFramework> | ||
|
||
<IsPackable>false</IsPackable> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.9.4" /> | ||
<PackageReference Include="MSTest.TestAdapter" Version="2.2.3" /> | ||
<PackageReference Include="MSTest.TestFramework" Version="2.2.3" /> | ||
<PackageReference Include="coverlet.collector" Version="3.0.2" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<ProjectReference Include="..\SuffixTreeSharp\SuffixTreeSharp.csproj" /> | ||
</ItemGroup> | ||
|
||
</Project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,249 @@ | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using Microsoft.VisualStudio.TestTools.UnitTesting; | ||
|
||
namespace SuffixTreeSharp.Test | ||
{ | ||
/// <summary>
/// Tests for <c>GeneralizedSuffixTree</c>: every substring of every word that was
/// put into the tree must be found under the index it was stored with, and strings
/// that are not substrings of any stored word must yield an empty result.
/// </summary>
[TestClass]
public class SuffixTreeTest
{
    /// <summary>
    /// Asserts that <paramref name="collection"/> is non-null and contains no elements.
    /// </summary>
    /// <param name="collection">The collection expected to be empty.</param>
    public static void AssertEmpty<T>(ICollection<T> collection)
    {
        Assert.IsNotNull(collection, "Expected empty collection but got null.");
        // AreEqual reports the actual count on failure, unlike IsTrue(Count == 0).
        Assert.AreEqual(0, collection.Count, "Expected empty collection.");
    }

    /// <summary>
    /// Asserts that searching <paramref name="tree"/> for each substring of
    /// <paramref name="word"/> returns a result containing <paramref name="index"/>.
    /// </summary>
    private static void AssertContainsAllSubstrings(GeneralizedSuffixTree tree, string word, int index)
    {
        foreach (var s in word.GetSubstrings())
        {
            var result = tree.Search(s);
            Assert.IsNotNull(result, $"result null for string {s} after adding {word}");
            Assert.IsTrue(result.Contains(index), $"substring {s} not found after adding {word}");
        }
    }

    /// <summary>
    /// Builds a tree from <paramref name="words"/> and checks it in three phases:
    /// (1) after each Put, the word just added is fully searchable under its index;
    /// (2) once all words are in, every word is still fully searchable;
    /// (3) each word is put a second time under a new index (offset by the number
    /// of words) and must be searchable under that new index as well.
    /// </summary>
    /// <returns>The populated tree, so callers can run extra assertions on it.</returns>
    private static GeneralizedSuffixTree AssertAdditionIsStable(string[] words)
    {
        var tree = new GeneralizedSuffixTree();

        // Phase 1: incremental insertion.
        for (var i = 0; i < words.Length; ++i)
        {
            tree.Put(words[i], i);
            AssertContainsAllSubstrings(tree, words[i], i);
        }

        // Phase 2: verify post-addition (later insertions must not break earlier ones).
        for (var i = 0; i < words.Length; ++i)
        {
            AssertContainsAllSubstrings(tree, words[i], i);
        }

        // Phase 3: add again, to see if it's stable.
        for (var i = 0; i < words.Length; ++i)
        {
            tree.Put(words[i], i + words.Length);
            AssertContainsAllSubstrings(tree, words[i], i + words.Length);
        }

        return tree;
    }

    [TestMethod]
    public void TestBasicTreeGeneration()
    {
        var input = new GeneralizedSuffixTree();

        var word = "cacao";
        input.Put(word, 0);

        /* Test that every substring is contained within the tree */
        AssertContainsAllSubstrings(input, word, 0);

        AssertEmpty(input.Search("caco"));
        AssertEmpty(input.Search("cacaoo"));
        AssertEmpty(input.Search("ccacao"));

        input = new GeneralizedSuffixTree();
        word = "bookkeeper";
        input.Put(word, 0);
        AssertContainsAllSubstrings(input, word, 0);

        AssertEmpty(input.Search("books"));
        AssertEmpty(input.Search("boke"));
        AssertEmpty(input.Search("ookepr"));
    }

    [TestMethod]
    public void TestWeirdword()
    {
        var input = new GeneralizedSuffixTree();

        var word = "cacacato";
        input.Put(word, 0);

        /* Test that every substring is contained within the tree */
        AssertContainsAllSubstrings(input, word, 0);
    }

    [TestMethod]
    public void TestDouble()
    {
        // Test whether the tree can handle repetitions
        var input = new GeneralizedSuffixTree();
        var word = "cacao";
        input.Put(word, 0);
        input.Put(word, 1);

        AssertContainsAllSubstrings(input, word, 0);
        AssertContainsAllSubstrings(input, word, 1);
    }

    [TestMethod]
    public void TestBananaAddition()
    {
        AssertAdditionIsStable(new[] { "banana", "bano", "ba" });
    }

    [TestMethod]
    public void TestAddition()
    {
        var words = new[] { "cacaor", "caricato", "cacato", "cacata", "caricata", "cacao", "banana" };
        var input = AssertAdditionIsStable(words);

        // NOTE(review): a result-count check (computeCount / TestResultsCount over the
        // tree's nodes) was left commented out here - presumably not ported; confirm
        // before relying on per-node result counts.

        AssertEmpty(input.Search("aoca"));
    }

    [TestMethod]
    public void TestSampleAddition()
    {
        var words = new[]
        {
            "libertypike",
            "franklintn",
            "carothersjohnhenryhouse",
            "carothersezealhouse",
            "acrossthetauntonriverfromdightonindightonrockstatepark",
            "dightonma",
            "dightonrock",
            "6mineoflowgaponlowgapfork",
            "lowgapky",
            "lemasterjohnjandellenhouse",
            "lemasterhouse",
            "70wilburblvd",
            "poughkeepsieny",
            "freerhouse",
            "701laurelst",
            "conwaysc",
            "hollidayjwjrhouse",
            "mainandappletonsts",
            "menomoneefallswi",
            "mainstreethistoricdistrict",
            "addressrestricted",
            "brownsmillsnj",
            "hanoverfurnace",
            "hanoverbogironfurnace",
            "sofsavannahatfergusonaveandbethesdard",
            "savannahga",
            "bethesdahomeforboys",
            "bethesda"
        };
        var input = AssertAdditionIsStable(words);

        // NOTE(review): the same disabled result-count check as in TestAddition
        // applied here - presumably not ported; confirm.

        AssertEmpty(input.Search("aoca"));
    }

    /* Testing a Test method :) */
    [TestMethod]
    public void TestGetSubstrings()
    {
        var exp = new HashSet<string> { "w", "r", "d", "wr", "rd", "wrd" };
        var ret = "wrd".GetSubstrings();
        Assert.IsTrue(ret.SetEquals(exp));
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
|
||
namespace SuffixTreeSharp.Test | ||
{ | ||
/// <summary>
/// String helpers used by the suffix tree tests.
/// </summary>
public static class Utils
{
    /// <summary>
    /// Normalizes an input string: lower-cases it and drops every character that is
    /// not in <c>a-z</c> or <c>0-9</c>.
    /// </summary>
    /// <remarks>
    /// <see cref="string"/> already declares an instance method named
    /// <c>Normalize()</c> (Unicode normalization). Instance methods take precedence
    /// over extension methods, so <c>text.Normalize()</c> does not reach this helper;
    /// invoke it as <c>Utils.Normalize(text)</c>.
    /// </remarks>
    /// <param name="input">The input string to normalize.</param>
    /// <returns>
    /// <paramref name="input"/> in lower case, without any non-alphanumeric character.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static string Normalize(this string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        var output = new StringBuilder(input.Length);

        // Invariant lower-casing: the culture-sensitive ToLower() maps 'I' to the
        // dotless 'ı' under e.g. tr-TR, which the ASCII filter below would then drop.
        foreach (var c in input.ToLowerInvariant())
        {
            if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
            {
                output.Append(c);
            }
        }

        return output.ToString();
    }

    /// <summary>
    /// Computes the set of all non-empty substrings contained within
    /// <paramref name="str"/>.
    /// It is fairly inefficient, but it is used just in tests.
    /// </summary>
    /// <param name="str">The string to compute substrings of.</param>
    /// <returns>The set of all possible substrings of <paramref name="str"/>;
    /// empty when <paramref name="str"/> is empty.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public static HashSet<string> GetSubstrings(this string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException(nameof(str));
        }

        var ret = new HashSet<string>();

        // Enumerate by length, then by start offset; the set removes duplicates
        // such as the repeated "a" in "banana".
        for (var len = 1; len <= str.Length; ++len)
        {
            for (var start = 0; start + len <= str.Length; ++start)
            {
                ret.Add(str.Substring(start, len));
            }
        }

        return ret;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
|
||
Microsoft Visual Studio Solution File, Format Version 12.00 | ||
# Visual Studio Version 16 | ||
VisualStudioVersion = 16.0.31911.196 | ||
MinimumVisualStudioVersion = 10.0.40219.1 | ||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SuffixTreeSharp", "SuffixTreeSharp\SuffixTreeSharp.csproj", "{C0986C3D-E80F-4753-B0AD-F185EB838A1D}" | ||
EndProject | ||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SuffixTreeSharp.Test", "SuffixTreeSharp.Test\SuffixTreeSharp.Test.csproj", "{902F9192-EED9-44B6-8B39-222B725545E3}" | ||
EndProject | ||
Global | ||
GlobalSection(SolutionConfigurationPlatforms) = preSolution | ||
Debug|Any CPU = Debug|Any CPU | ||
Release|Any CPU = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(ProjectConfigurationPlatforms) = postSolution | ||
{C0986C3D-E80F-4753-B0AD-F185EB838A1D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{C0986C3D-E80F-4753-B0AD-F185EB838A1D}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{C0986C3D-E80F-4753-B0AD-F185EB838A1D}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{C0986C3D-E80F-4753-B0AD-F185EB838A1D}.Release|Any CPU.Build.0 = Release|Any CPU | ||
{902F9192-EED9-44B6-8B39-222B725545E3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{902F9192-EED9-44B6-8B39-222B725545E3}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{902F9192-EED9-44B6-8B39-222B725545E3}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{902F9192-EED9-44B6-8B39-222B725545E3}.Release|Any CPU.Build.0 = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(SolutionProperties) = preSolution | ||
HideSolutionNode = FALSE | ||
EndGlobalSection | ||
GlobalSection(ExtensibilityGlobals) = postSolution | ||
SolutionGuid = {3879536C-DBC5-462A-82F3-AB8CED5E0F58} | ||
EndGlobalSection | ||
EndGlobal |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
|
||
namespace SuffixTreeSharp | ||
{ | ||
/// <summary>
/// An <see cref="ISearchTree"/> that delegates a search to every tree in
/// <see cref="SearchTrees"/> and returns the union of their results.
/// </summary>
public class CombinedSearchTrees : ISearchTree
{
    /// <summary>
    /// The trees that are queried by <see cref="Search"/>. Add trees to this list
    /// to include them in the combined search.
    /// </summary>
    public readonly List<ISearchTree> SearchTrees = new List<ISearchTree>();

    /// <summary>
    /// Searches every tree in <see cref="SearchTrees"/> for <paramref name="word"/>.
    /// </summary>
    /// <param name="word">The string to look up in each tree.</param>
    /// <returns>
    /// A newly allocated set holding every value returned by any of the trees;
    /// empty when there are no trees or no tree reports a match.
    /// </returns>
    public ISet<int> Search(string word)
    {
        // Accumulate into a set owned by this call. Merging in place into a set
        // handed back by a child tree would modify that set and alias it to the
        // caller, which is unsafe if the child keeps a reference to it.
        var combined = new HashSet<int>();

        foreach (var searchTree in SearchTrees)
        {
            var partial = searchTree.Search(word);

            // NOTE(review): a null result is treated as "no matches" here;
            // confirm whether ISearchTree implementations may return null.
            if (partial != null)
            {
                combined.UnionWith(partial);
            }
        }

        return combined;
    }
}
} |
Oops, something went wrong.