Skip to content

Commit

Permalink
Byte limit for parsing XML sitemaps
Browse files Browse the repository at this point in the history
  • Loading branch information
drmathias committed Aug 28, 2023
1 parent ab58b84 commit 981423b
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 54 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ There is also the possibility to extend this library to support protocols other
# Features

| Name | Supported | Priority |
|------|-----------|---------|
|------|-----------|----------|
| HTTP/HTTPS | ✔️ | |
| FTP/FTPS | ❌ | 0.1 |
| Wildcard (`*`) User-agent | ✔️ | |
Expand Down
2 changes: 1 addition & 1 deletion src/Robots.Txt.Parser/Http/RobotWebClient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ MediaTypeNames.Text.Xml or MediaTypeNames.Application.Xml or _
continue;
}

if (parsedSitemap is SitemapRoot sitemapRoot)
if (parsedSitemap is SitemapIndex sitemapRoot)
{
var sitemaps = await (this as IRobotWebClient).LoadSitemapsAsync(sitemapRoot.SitemapUris, modifiedSince, cancellationToken);
if (sitemaps is not null) sitemap = sitemaps.Combine(sitemaps);
Expand Down
4 changes: 2 additions & 2 deletions src/Robots.Txt.Parser/ISitemap.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ internal Sitemap Combine(Sitemap other)
}
}

internal class SitemapRoot : Sitemap
internal class SitemapIndex : Sitemap
{
public SitemapRoot(HashSet<Uri> sitemapUris) : base(new HashSet<UrlSetItem>())
public SitemapIndex(HashSet<Uri> sitemapUris) : base(new HashSet<UrlSetItem>())
{
SitemapUris = sitemapUris;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,47 @@

namespace Robots.Txt.Parser;

/// <summary>
/// Entry read from a sitemap: a location together with an optional last-modified timestamp
/// </summary>
public record SitemapItem
{
    /// <summary>
    /// URL location
    /// </summary>
    public Uri Location { get; }

    /// <summary>
    /// Date and time that the contents of the URL was last modified
    /// </summary>
    public DateTime? LastModified { get; }

    // Internal constructor: instances can only be created from within this assembly.
    // The parameters share their names with the properties, hence the explicit `this.`.
    internal SitemapItem(Uri Location, DateTime? LastModified)
        => (this.Location, this.LastModified) = (Location, LastModified);
}

/// <summary>
/// Url item described in a sitemap
/// </summary>
/// <param name="Location">URL location</param>
/// <param name="LastModified">Date that the contents of the URL was last modified</param>
/// <param name="ChangeFrequency">Hint for how often the URL is expected to change</param>
/// <param name="Priority">Hint for the priority that should be assigned to the URL</param>
public record UrlSetItem(Uri Location, DateTime? LastModified, ChangeFrequency? ChangeFrequency, decimal? Priority);
public record UrlSetItem : SitemapItem
{
/// <summary>
/// Creates a URL set item. <paramref name="location"/> and <paramref name="lastModified"/> are
/// forwarded to the <see cref="SitemapItem"/> base constructor; the remaining values are stored
/// on this record.
/// </summary>
/// <param name="location">URL location</param>
/// <param name="lastModified">Date and time that the contents of the URL was last modified</param>
/// <param name="changeFrequency">Hint for how often the URL is expected to change</param>
/// <param name="priority">Hint for the priority that should be assigned to the URL</param>
internal UrlSetItem(Uri location, DateTime? lastModified, ChangeFrequency? changeFrequency, decimal? priority)
: base(location, lastModified)
{
ChangeFrequency = changeFrequency;
Priority = priority;
}

internal record SitemapItem(Uri Location, DateTime? LastModified);
/// <summary>
/// Hint for how often the URL is expected to change
/// </summary>
public ChangeFrequency? ChangeFrequency { get; }

/// <summary>
/// Hint for the priority that should be assigned to the URL
/// </summary>
public decimal? Priority { get; }
}

/// <summary>
/// Change frequency values used in the sitemap specification
Expand Down
110 changes: 70 additions & 40 deletions src/Robots.Txt.Parser/SitemapParser.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using System.Xml;
using System.Xml.Linq;

namespace Robots.Txt.Parser;
Expand All @@ -12,6 +13,8 @@ namespace Robots.Txt.Parser;
/// </summary>
public class SitemapParser
{
private const int ByteCount50MiB = 52_428_800;

private static readonly XNamespace sitemapNamespace = "http://www.sitemaps.org/schemas/sitemap/0.9";

/// <summary>
Expand All @@ -26,58 +29,85 @@ public static async Task<Sitemap> ReadFromStreamAsync(Stream stream, DateTime? m
{
try
{
var document = await XDocument.LoadAsync(stream, LoadOptions.None, cancellationToken);
var urlSetElement = document.Element(sitemapNamespace + "urlset");
if (urlSetElement is not null) return ReadUrlSet(urlSetElement, modifiedSince);
using var reader = XmlReader.Create(stream, new XmlReaderSettings { Async = true });
await reader.MoveToContentAsync();

var sitemapIndexElement = document.Element(sitemapNamespace + "sitemapindex");
if (sitemapIndexElement is not null) return ReadSitemapIndex(sitemapIndexElement, modifiedSince);
return reader switch
{
XmlReader when reader.NamespaceURI == sitemapNamespace && reader.Name == "urlset"
=> await ParseUrlSet(stream, reader, modifiedSince, cancellationToken),
XmlReader when reader.NamespaceURI == sitemapNamespace && reader.Name == "sitemapindex"
=> await ParseSitemapIndex(stream, reader, modifiedSince, cancellationToken),
_ => throw new SitemapException("Unable to find root sitemap element")
};
}
catch (Exception e) when (e is not SitemapException)
{
throw new SitemapException("Unable to parse sitemap", e);
}

throw new SitemapException("Unable to find root sitemap element");
}

private static SitemapRoot ReadSitemapIndex(XElement sitemapIndexElement, DateTime? modifiedSince)
/// <summary>
/// Reads a sitemap index node by node from <paramref name="reader"/> and collects the location of
/// every <c>sitemap</c> element in the sitemap namespace into a <see cref="SitemapIndex"/>.
/// </summary>
/// <param name="stream">Stream the reader was created over; only its <c>Position</c> is read, to enforce the byte limit</param>
/// <param name="reader">XML reader that the caller has positioned on the root element</param>
/// <param name="modifiedSince">When set, entries whose <c>lastmod</c> is earlier than this value are skipped; entries without <c>lastmod</c> are kept</param>
/// <param name="cancellationToken">Checked on every loop iteration and passed to <c>XNode.ReadFromAsync</c></param>
/// <returns>Sitemap index holding the distinct locations that passed the filter</returns>
/// <exception cref="SitemapException">Thrown when <c>stream.Position</c> exceeds <c>ByteCount50MiB</c> after an entry has been read</exception>
// NOTE(review): when cancellation is requested the loop condition simply ends the loop, so the
// entries collected so far are returned instead of an exception being raised here - confirm
// that returning a partial result is intended.
private static async Task<SitemapIndex> ParseSitemapIndex(Stream stream, XmlReader reader, DateTime? modifiedSince, CancellationToken cancellationToken)
{
var sitemapElements = sitemapIndexElement.Elements(sitemapNamespace + "sitemap");
var sitemaps = sitemapElements
.Select(sitemapElement =>
// Step off the root element onto the first node inside it.
await reader.ReadAsync();

// A set, so a location listed more than once is only stored once.
var uris = new HashSet<Uri>();
while (!reader.EOF && reader.ReadState is ReadState.Interactive && !cancellationToken.IsCancellationRequested)
{
// Anything that is not a <sitemap> element in the sitemap namespace is skipped one node at a time.
if (reader.NodeType is not XmlNodeType.Element || reader.Name != "sitemap" || reader.NamespaceURI != sitemapNamespace)
{
var location = new Uri(sitemapElement.Element(sitemapNamespace + "loc")!.Value);
var lastModifiedString = sitemapElement.Element(sitemapNamespace + "lastmod")?.Value;
DateTime? lastModified = lastModifiedString is not null ? DateTime.Parse(lastModifiedString) : null;
return new SitemapItem(location, lastModified);
})
.Where(sitemap => modifiedSince is null || sitemap.LastModified is null || sitemap.LastModified >= modifiedSince)
.Select(sitemap => sitemap.Location)
.ToHashSet();
return new SitemapRoot(sitemaps);
await reader.ReadAsync();
continue;
}

// Materialise only this one entry as an XElement. No further ReadAsync is issued before the
// next iteration, so the reader is presumably left just past the element - confirm against
// the XNode.ReadFromAsync documentation.
var node = (XElement)await XNode.ReadFromAsync(reader, cancellationToken);

// Byte limit is enforced after each entry, based on how far the underlying stream has been consumed.
// NOTE(review): Stream.Position is only supported on seekable streams - confirm that callers
// never pass a non-seekable (e.g. network) stream.
if (stream.Position > ByteCount50MiB) throw new SitemapException("Reached parsing limit");

var lastModifiedString = node.Element(sitemapNamespace + "lastmod")?.Value;
// NOTE(review): DateTime.Parse without a format provider presumably uses the current culture - confirm.
DateTime? lastModified = lastModifiedString is not null ? DateTime.Parse(lastModifiedString) : null;

// Filter before touching <loc>, so entries that are too old are dropped without parsing their location.
if (modifiedSince is not null && lastModified is not null && lastModified < modifiedSince) continue;

// The null-forgiving operator assumes every entry has a <loc> child; a missing one fails at runtime.
var location = new Uri(node.Element(sitemapNamespace + "loc")!.Value);

uris.Add(location);
}
return new SitemapIndex(uris);
}

private static Sitemap ReadUrlSet(XElement urlSetElement, DateTime? modifiedSince)
/// <summary>
/// Reads a URL set node by node from <paramref name="reader"/> and collects every <c>url</c>
/// element in the sitemap namespace as a <see cref="UrlSetItem"/>.
/// </summary>
/// <param name="stream">Stream the reader was created over; only its <c>Position</c> is read, to enforce the byte limit</param>
/// <param name="reader">XML reader that the caller has positioned on the root element</param>
/// <param name="modifiedSince">When set, entries whose <c>lastmod</c> is earlier than this value are skipped; entries without <c>lastmod</c> are kept</param>
/// <param name="cancellationToken">Checked on every loop iteration and passed to <c>XNode.ReadFromAsync</c></param>
/// <returns>Sitemap holding the items that passed the filter</returns>
/// <exception cref="SitemapException">Thrown when <c>stream.Position</c> exceeds <c>ByteCount50MiB</c> after an entry has been read</exception>
// NOTE(review): when cancellation is requested the loop condition simply ends the loop, so the
// items collected so far are returned instead of an exception being raised here - confirm
// that returning a partial result is intended.
private static async Task<Sitemap> ParseUrlSet(Stream stream, XmlReader reader, DateTime? modifiedSince, CancellationToken cancellationToken)
{
var urlElements = urlSetElement.Elements(sitemapNamespace + "url");
var urlSet = urlElements
.Select(urlElement =>
// Step off the root element onto the first node inside it.
await reader.ReadAsync();

var items = new HashSet<UrlSetItem>();
while (!reader.EOF && reader.ReadState is ReadState.Interactive && !cancellationToken.IsCancellationRequested)
{
// Anything that is not a <url> element in the sitemap namespace is skipped one node at a time.
if (reader.NodeType is not XmlNodeType.Element || reader.Name != "url" || reader.NamespaceURI != sitemapNamespace)
{
var location = new Uri(urlElement.Element(sitemapNamespace + "loc")!.Value);
var lastModifiedString = urlElement.Element(sitemapNamespace + "lastmod")?.Value;
var changeFrequencyString = urlElement.Element(sitemapNamespace + "changefreq")?.Value;
var priorityString = urlElement.Element(sitemapNamespace + "priority")?.Value;
DateTime? lastModified = lastModifiedString is not null ? DateTime.Parse(lastModifiedString) : null;
ChangeFrequency? changeFrequency = changeFrequencyString is not null
? Enum.Parse<ChangeFrequency>(changeFrequencyString, ignoreCase: true)
: null;
decimal? priority = priorityString is not null ? decimal.Parse(priorityString) : null;
return new UrlSetItem(location, lastModified, changeFrequency, priority);
})
.Where(url => modifiedSince is null || url.LastModified is null || url.LastModified >= modifiedSince)
.ToHashSet();

return new Sitemap(urlSet);
await reader.ReadAsync();
continue;
}

// Materialise only this one entry as an XElement. No further ReadAsync is issued before the
// next iteration, so the reader is presumably left just past the element - confirm against
// the XNode.ReadFromAsync documentation.
var node = (XElement)await XNode.ReadFromAsync(reader, cancellationToken);

// Byte limit is enforced after each entry, based on how far the underlying stream has been consumed.
// NOTE(review): Stream.Position is only supported on seekable streams - confirm that callers
// never pass a non-seekable (e.g. network) stream.
if (stream.Position > ByteCount50MiB) throw new SitemapException("Reached parsing limit");

var lastModifiedString = node.Element(sitemapNamespace + "lastmod")?.Value;
// NOTE(review): DateTime.Parse without a format provider presumably uses the current culture - confirm.
DateTime? lastModified = lastModifiedString is not null ? DateTime.Parse(lastModifiedString) : null;

// Filter before reading the remaining children, so entries that are too old are dropped early.
if (modifiedSince is not null && lastModified is not null && lastModified < modifiedSince) continue;

// The null-forgiving operator assumes every entry has a <loc> child; a missing one fails at runtime.
var location = new Uri(node.Element(sitemapNamespace + "loc")!.Value);
var changeFrequencyString = node.Element(sitemapNamespace + "changefreq")?.Value;
var priorityString = node.Element(sitemapNamespace + "priority")?.Value;
// Enum names are matched case-insensitively; an unknown value makes Enum.Parse throw.
ChangeFrequency? changeFrequency = changeFrequencyString is not null
? Enum.Parse<ChangeFrequency>(changeFrequencyString, ignoreCase: true)
: null;
// NOTE(review): decimal.Parse without a format provider presumably uses the current culture,
// which could misread "0.5" where the decimal separator is a comma - confirm.
decimal? priority = priorityString is not null ? decimal.Parse(priorityString) : null;

items.Add(new UrlSetItem(location, lastModified, changeFrequency, priority));
}
return new Sitemap(items);
}
}
22 changes: 18 additions & 4 deletions tests/Robots.Txt.Parser.Tests.Unit/SitemapParserTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,20 @@ namespace Robots.Txt.Parser.Tests.Unit;

public class SitemapParserTests
{
/// <summary>
/// A stream without any content has no root element and must be rejected with a <see cref="SitemapException"/>.
/// </summary>
[Fact]
public async Task ReadFromStreamAsync_EmptyFile_ThrowSitemapException()
{
    // Arrange: a zero-length UTF-8 payload
    var emptyPayload = Encoding.UTF8.GetBytes(@"");
    var stream = new MemoryStream(emptyPayload);

    // Act: defer the call so that the assertion can observe the failure
    var readEmptyFile = () => SitemapParser.ReadFromStreamAsync(stream);

    // Assert
    await readEmptyFile.Should().ThrowExactlyAsync<SitemapException>();
}

[Fact]
public async Task ReadFromStreamAsync_ImproperXmlFormat_ThrowSitemapException()
{
Expand Down Expand Up @@ -176,7 +190,7 @@ public async Task ReadFromStreamAsync_SitemapIndexNoModifiedDateFilter_ParseCorr
var sitemap = await SitemapParser.ReadFromStreamAsync(stream);

// Assert
var sitemapRoot = sitemap.Should().BeOfType<SitemapRoot>().Subject;
var sitemapRoot = sitemap.Should().BeOfType<SitemapIndex>().Subject;
sitemap.UrlSet.Should().BeEmpty();
sitemapRoot.SitemapUris.Should().BeEquivalentTo(new[]
{
Expand Down Expand Up @@ -207,7 +221,7 @@ public async Task ReadFromStreamAsync_SitemapIndexEarlierModifiedDateFilter_Pars
var sitemap = await SitemapParser.ReadFromStreamAsync(stream, new DateTime(2023, 08, 22));

// Assert
var sitemapRoot = sitemap.Should().BeOfType<SitemapRoot>().Subject;
var sitemapRoot = sitemap.Should().BeOfType<SitemapIndex>().Subject;
sitemap.UrlSet.Should().BeEmpty();
sitemapRoot.SitemapUris.Should().BeEquivalentTo(new[]
{
Expand Down Expand Up @@ -238,7 +252,7 @@ public async Task ReadFromStreamAsync_SitemapIndexSameModifiedDateFilter_ParseCo
var sitemap = await SitemapParser.ReadFromStreamAsync(stream, new DateTime(2023, 08, 23));

// Assert
var sitemapRoot = sitemap.Should().BeOfType<SitemapRoot>().Subject;
var sitemapRoot = sitemap.Should().BeOfType<SitemapIndex>().Subject;
sitemap.UrlSet.Should().BeEmpty();
sitemapRoot.SitemapUris.Should().BeEquivalentTo(new[]
{
Expand Down Expand Up @@ -269,7 +283,7 @@ public async Task ReadFromStreamAsync_SitemapIndexExceedsModifiedDateFilter_Pars
var sitemap = await SitemapParser.ReadFromStreamAsync(stream, new DateTime(2023, 08, 24));

// Assert
var sitemapRoot = sitemap.Should().BeOfType<SitemapRoot>().Subject;
var sitemapRoot = sitemap.Should().BeOfType<SitemapIndex>().Subject;
sitemap.UrlSet.Should().BeEmpty();
sitemapRoot.SitemapUris.Should().BeEquivalentTo(new[] { new Uri("https://www.github.com/people.xml") });
}
Expand Down

0 comments on commit 981423b

Please sign in to comment.