Strongly-typed product token with parse validation
drmathias committed Aug 27, 2023
1 parent 20cd6c2 commit 65bbc5a
Showing 12 changed files with 596 additions and 244 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -23,7 +23,7 @@ Table of Contents


Parse _robots.txt_ and _sitemaps_ using dotnet.
Supports the proposed [RFC9309](https://datatracker.ietf.org/doc/html/rfc9309#name-the-allow-and-disallow-line) standard, as well as the following common, non-standard directives:
Supports the proposed [RFC9309](https://datatracker.ietf.org/doc/html/rfc9309) standard, as well as the following common, non-standard directives:

- Sitemap
- Host
@@ -145,8 +145,8 @@ var sitemap = await robotsTxt.LoadSitemapAsync(modifiedSince);

```csharp
var robotsTxt = await robotWebClient.LoadRobotsTxtAsync();
// if rules for the specific User-Agent are not present, it falls back to the wildcard *
var anyRulesDefined = robotsTxt.TryGetRules("SomeBotUserAgent", out var rules);
// if rules for the specific robot are not present, it falls back to the wildcard *
var anyRulesDefined = robotsTxt.TryGetRules(ProductToken.Parse("SomeBot"), out var rules);
// even if no wildcard rules exist, an empty rule-checker is returned
var isAllowed = rules.IsAllowed("/some/path");
```
@@ -163,8 +163,9 @@ var hasHostDirective = robotsTxt.TryGetHost(out var host);

```csharp
var robotsTxt = await robotWebClient.LoadRobotsTxtAsync();
// if rules for the specific robot are not present, it falls back to the wildcard *
// if no Crawl-delay directive exists, crawl delay will be 0
var hasCrawlDelayDirective = robotsTxt.TryGetCrawlDelay(out var crawlDelay);
var hasCrawlDelayDirective = robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay);
```

# Contributing
8 changes: 4 additions & 4 deletions src/Robots.Txt.Parser/Http/RobotWebClient.cs
@@ -58,7 +58,7 @@ public async Task<IRobotsTxt> LoadRobotsTxtAsync(CancellationToken cancellationT
If a server status code indicates that the robots.txt file is unavailable to the crawler,
then the crawler MAY access any resources on the server.
*/
return new RobotsTxt(this, new Dictionary<string, HashSet<UrlRule>>(), new Dictionary<string, int>(), null, new HashSet<Uri>());
return new RobotsTxt(this, new Dictionary<ProductToken, HashSet<UrlRule>>(), new Dictionary<ProductToken, int>(), null, new HashSet<Uri>());
}

if (statusCodeNumber >= 500)
@@ -68,11 +68,11 @@ then the crawler MAY access any resources on the server.
crawler MUST assume complete disallow. For example, in the context of HTTP, server errors are identified by status codes in
the 500-599 range.
*/
var userAgentRules = new Dictionary<string, HashSet<UrlRule>>
var userAgentRules = new Dictionary<ProductToken, HashSet<UrlRule>>
{
{ "*", new HashSet<UrlRule> { new (RuleType.Disallow, "/") } }
{ ProductToken.Wildcard, new HashSet<UrlRule> { new (RuleType.Disallow, "/") } }
};
return new RobotsTxt(this, userAgentRules, new Dictionary<string, int>(), null, new HashSet<Uri>());
return new RobotsTxt(this, userAgentRules, new Dictionary<ProductToken, int>(), null, new HashSet<Uri>());
}

var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
75 changes: 75 additions & 0 deletions src/Robots.Txt.Parser/ProductToken.cs
@@ -0,0 +1,75 @@
using System;
using System.Text.RegularExpressions;

namespace Robots.Txt.Parser;

/// <summary>
/// Crawler name, used as the User-agent value within a robots.txt file
/// </summary>
public partial class ProductToken : IEquatable<string>, IEquatable<ProductToken>
{
public static readonly ProductToken Wildcard = new("*");
private static readonly Regex ValidationPattern = ProductTokenValidationRegex();

private readonly string _value;

private ProductToken(string value) => _value = value;

public static ProductToken Parse(string value)
{
if (value != Wildcard._value && !ValidationPattern.IsMatch(value))
{
throw new ArgumentOutOfRangeException(
nameof(value),
"Must contain only uppercase and lowercase letters (\"a-z\" and \"A-Z\"), underscores (\"_\"), and hyphens (\"-\")");
}

return new ProductToken(value);
}

public static bool TryParse(string value, out ProductToken productToken)
{
productToken = Wildcard;
if (value != Wildcard._value && !ValidationPattern.IsMatch(value)) return false;
productToken = new ProductToken(value);
return true;
}

/*
Crawlers set their own name, which is called a product token, to find relevant groups.
The product token MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-").
*/
[GeneratedRegex("^[a-zA-Z-_]+$", RegexOptions.Compiled)]
private static partial Regex ProductTokenValidationRegex();

/*
Crawlers MUST use case-insensitive matching to find the group that matches the product token and then obey the rules of the group.
*/

/// <summary>
/// Assesses product token equality. Product tokens are case-insensitive.
/// </summary>
/// <param name="obj">Comparison value</param>
/// <returns>True if the product token is equal; otherwise false</returns>
public override bool Equals(object? obj) =>
obj is string otherString
? Equals(otherString)
: obj is ProductToken otherToken && Equals(otherToken);

/// <summary>
/// Assesses product token equality. Product tokens are case-insensitive.
/// </summary>
/// <param name="other">Comparison value</param>
/// <returns>True if the product token is equal; otherwise false</returns>
public bool Equals(string? other) => other is not null && _value.Equals(other, StringComparison.InvariantCultureIgnoreCase);

/// <summary>
/// Assesses product token equality. Product tokens are case-insensitive.
/// </summary>
/// <param name="other">Comparison value</param>
/// <returns>True if the product token is equal; otherwise false</returns>
public bool Equals(ProductToken? other) =>
other is not null && _value.Equals(other._value, StringComparison.InvariantCultureIgnoreCase);

public override int GetHashCode() => _value.ToUpperInvariant().GetHashCode();
}
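
Taken together, the new `ProductToken` type gives a strongly-typed handle on the User-agent value. A minimal sketch of how it behaves, based on the code above (the token names are illustrative, not part of the commit):

```csharp
// Illustrative token names only.
var token = ProductToken.Parse("SomeBot");                 // letters, "_" and "-" are allowed

// A full User-agent string fails validation: "/" and digits are not permitted
var ok = ProductToken.TryParse("SomeBot/1.0", out _);      // false
// ProductToken.Parse("SomeBot/1.0");                      // would throw ArgumentOutOfRangeException

// Matching is case-insensitive, as RFC 9309 requires
var sameBot = token.Equals(ProductToken.Parse("SOMEBOT")); // true

// "*" is accepted and equals the predefined wildcard
var isWildcard = ProductToken.Parse("*").Equals(ProductToken.Wildcard); // true
```

Because `Equals` and `GetHashCode` normalize to uppercase, the type can serve directly as a dictionary key, which is why the parser dictionaries below no longer need `StringComparer.OrdinalIgnoreCase`.
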
22 changes: 11 additions & 11 deletions src/Robots.Txt.Parser/RobotsTxt.cs
@@ -22,7 +22,7 @@ public interface IRobotsTxt
/// <param name="userAgent">User-Agent header to retrieve rules for</param>
/// <param name="crawlDelay">The crawl delay in seconds</param>
/// <returns>True if a crawl delay directive exists; otherwise false</returns>
bool TryGetCrawlDelay(string userAgent, out int crawlDelay);
bool TryGetCrawlDelay(ProductToken userAgent, out int crawlDelay);

/// <summary>
/// Retrieves the website host
@@ -37,22 +37,22 @@ public interface IRobotsTxt
/// <param name="userAgent">User-Agent header to retrieve rules for</param>
/// <param name="ruleChecker">A rule checker for the User-Agent</param>
/// <returns>True if any rules are found; otherwise false</returns>
bool TryGetRules(string userAgent, out IRobotRuleChecker ruleChecker);
bool TryGetRules(ProductToken userAgent, out IRobotRuleChecker ruleChecker);
}

public class RobotsTxt : IRobotsTxt
{
private readonly IRobotClient _client;

private readonly IReadOnlyDictionary<string, HashSet<UrlRule>> _userAgentRules;
private readonly IReadOnlyDictionary<string, int> _userAgentCrawlDirectives;
private readonly HashSet<string> _userAgents;
private readonly IReadOnlyDictionary<ProductToken, HashSet<UrlRule>> _userAgentRules;
private readonly IReadOnlyDictionary<ProductToken, int> _userAgentCrawlDirectives;
private readonly HashSet<ProductToken> _userAgents;
private readonly string? _host;
private readonly HashSet<Uri> _sitemapUrls;

internal RobotsTxt(IRobotClient client,
IReadOnlyDictionary<string, HashSet<UrlRule>> userAgentRules,
IReadOnlyDictionary<string, int> userAgentCrawlDirectives,
IReadOnlyDictionary<ProductToken, HashSet<UrlRule>> userAgentRules,
IReadOnlyDictionary<ProductToken, int> userAgentCrawlDirectives,
string? host,
HashSet<Uri> sitemapUrls)
{
@@ -71,13 +71,13 @@ internal RobotsTxt(IRobotClient client,
: await _client.LoadSitemapsAsync(new[] { new Uri(_client.BaseAddress, "/sitemap.xml") }, modifiedSince, cancellationToken);

/// <inheritdoc />
public bool TryGetCrawlDelay(string userAgent, out int crawlDelay)
public bool TryGetCrawlDelay(ProductToken userAgent, out int crawlDelay)
{
var userAgentMatch = _userAgentCrawlDirectives.TryGetValue(userAgent, out crawlDelay);
if (!userAgentMatch)
{
if (_userAgents.Contains(userAgent)) return false;
return _userAgentCrawlDirectives.TryGetValue("*", out crawlDelay);
return _userAgentCrawlDirectives.TryGetValue(ProductToken.Wildcard, out crawlDelay);
}

return true;
@@ -91,9 +91,9 @@ public bool TryGetHost(out string host)
}

/// <inheritdoc />
public bool TryGetRules(string userAgent, out IRobotRuleChecker ruleChecker)
public bool TryGetRules(ProductToken userAgent, out IRobotRuleChecker ruleChecker)
{
if (!_userAgentRules.TryGetValue(userAgent, out var rules) && !_userAgentRules.TryGetValue("*", out rules))
if (!_userAgentRules.TryGetValue(userAgent, out var rules) && !_userAgentRules.TryGetValue(ProductToken.Wildcard, out rules))
{
ruleChecker = new RobotRuleChecker(new HashSet<UrlRule>());
return false;
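
For context, a short sketch of how the retyped lookups behave, assuming `robotsTxt` was loaded as in the README examples above (the "SomeBot" token is illustrative):

```csharp
var someBot = ProductToken.Parse("SomeBot");

// Falls back to the wildcard (*) group when no group matches SomeBot;
// returns false only if no wildcard group exists either.
var hasRules = robotsTxt.TryGetRules(someBot, out var rules);
var allowed = rules.IsAllowed("/some/path");

// Crawl-delay falls back to the wildcard group only when the robot
// has no group of its own; crawlDelay stays 0 when the directive is absent.
var hasDelay = robotsTxt.TryGetCrawlDelay(someBot, out var crawlDelay);
```
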
15 changes: 9 additions & 6 deletions src/Robots.Txt.Parser/RobotsTxtParser.cs
@@ -49,9 +49,9 @@ public async Task<IRobotsTxt> ReadFromStreamAsync(Stream stream, CancellationTok
/*
Crawlers MUST use case-insensitive matching to find the group that matches the product token
*/
var currentUserAgents = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var userAgentRules = new Dictionary<string, HashSet<UrlRule>>(StringComparer.OrdinalIgnoreCase);
var userAgentCrawlDirectives = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
var currentUserAgents = new HashSet<ProductToken>();
var userAgentRules = new Dictionary<ProductToken, HashSet<UrlRule>>();
var userAgentCrawlDirectives = new Dictionary<ProductToken, int>();

/*
The file MUST be UTF-8 encoded
@@ -67,9 +67,12 @@ The file MUST be UTF-8 encoded
{
if (!previousLineWasUserAgent) currentUserAgents.Clear();
var currentUserAgent = GetValueOfDirective(line, UserAgentDirective);
currentUserAgents.Add(currentUserAgent);
userAgentRules.TryAdd(currentUserAgent, new HashSet<UrlRule>());
previousLineWasUserAgent = true;
if (ProductToken.TryParse(currentUserAgent, out var productToken))
{
currentUserAgents.Add(productToken);
userAgentRules.TryAdd(productToken, new HashSet<UrlRule>());
previousLineWasUserAgent = true;
}
continue;
}

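
The practical effect of the `TryParse` guard above is that User-agent lines whose value is not a valid product token no longer start a group. A rough sketch of which values pass, under the validation rules shown earlier (the example values are illustrative):

```csharp
// Values as they might appear after "User-agent:" in a robots.txt file.
ProductToken.TryParse("ExampleBot", out _);     // true  -> a group is created for this robot
ProductToken.TryParse("*", out _);              // true  -> the wildcard group
ProductToken.TryParse("ExampleBot/2.1", out _); // false -> no group is created for this line
ProductToken.TryParse("Example Bot", out _);    // false -> spaces are not permitted
```
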
Expand Up @@ -43,7 +43,7 @@ public async Task LoadRobotsTxtAsync_5XXResponse_AssumeDisallowAll(int statusCod

// Assert
robotsTxt.Should().NotBe(null);
var hasRules = robotsTxt.TryGetRules("SomeBot", out var rules);
var hasRules = robotsTxt.TryGetRules(ProductToken.Parse("SomeBot"), out var rules);
hasRules.Should().Be(true);
rules.IsAllowed("/").Should().Be(false);
}
@@ -67,7 +67,7 @@ public async Task LoadRobotsTxtAsync_4XXResponse_AssumeAllowAll(int statusCode)

// Assert
robotsTxt.Should().NotBe(null);
var hasRules = robotsTxt.TryGetRules("SomeBot", out var rules);
var hasRules = robotsTxt.TryGetRules(ProductToken.Parse("SomeBot"), out var rules);
hasRules.Should().Be(false);
rules.IsAllowed("/").Should().Be(true);
}
