Skip to content

Commit

Permalink
Merge pull request hmol#3 from ed-graham/feature/follow-redirects
Browse files Browse the repository at this point in the history
Follow specified redirects when flag set
  • Loading branch information
ed-graham authored Sep 16, 2018
2 parents 6882365 + 0e99fe7 commit 6248312
Show file tree
Hide file tree
Showing 10 changed files with 58 additions and 9 deletions.
2 changes: 2 additions & 0 deletions LinkCrawler/LinkCrawler/App.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
<appSettings>
<add key="OnlyReportBrokenLinksToOutput" value="false"/>
<add key="CheckImages" value="true"/>
<add key="FollowRedirects" value="false"/>
<add key="BaseUrl" value="https://github.com"/>
<add key="SuccessHttpStatusCodes" value="1xx,2xx,3xx"/>
<add key="InterestingHttpStatusCodes" value="*"/>
<add key="RedirectHttpStatusCodes" value="3xx"/>
<!--explanation of regex below: http://regexr.com/3cqt9 -->
<add key="ValidUrlRegex" value="(^http[s]?:\/{2})|(^www)|(^\/{1,2})"/>
<add key="PrintSummary" value="true"/>
Expand Down
16 changes: 15 additions & 1 deletion LinkCrawler/LinkCrawler/LinkCrawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public class LinkCrawler
{
public string BaseUrl { get; set; }
public bool CheckImages { get; set; }
public bool FollowRedirects { get; set; }
public RestRequest RestRequest { get; set; }
public IEnumerable<IOutput> Outputs { get; set; }
public IValidUrlParser ValidUrlParser { get; set; }
Expand All @@ -30,6 +31,7 @@ public LinkCrawler(IEnumerable<IOutput> outputs, IValidUrlParser validUrlParser,
Outputs = outputs;
ValidUrlParser = validUrlParser;
CheckImages = settings.CheckImages;
FollowRedirects = settings.FollowRedirects;
UrlList = new List<LinkModel>();
RestRequest = new RestRequest(Method.GET).SetHeader("Accept", "*/*");
OnlyReportBrokenLinksToOutput = settings.OnlyReportBrokenLinksToOutput;
Expand All @@ -47,7 +49,7 @@ public void Start()
public void SendRequest(string crawlUrl, string referrerUrl = "")
{
var requestModel = new RequestModel(crawlUrl, referrerUrl, BaseUrl);
var restClient = new RestClient(new Uri(crawlUrl)) { FollowRedirects = false };
var restClient = new RestClient(new Uri(crawlUrl)) { FollowRedirects = false }; // we don't want RestSharp following the redirects, otherwise we won't see them

restClient.ExecuteAsync(RestRequest, response =>
{
Expand All @@ -63,6 +65,18 @@ public void ProcessResponse(IResponseModel responseModel)
{
    // Report every response to the configured outputs before deciding what to follow.
    WriteOutput(responseModel);

    // Follow redirects only when the flag is set AND the response names a target.
    // Location is null when the response carries no Location header, so it must be
    // checked before it is dereferenced; otherwise a redirect-class response without
    // that header would throw a NullReferenceException here.
    var location = responseModel.Location;
    if (FollowRedirects && responseModel.IsRedirect && !string.IsNullOrWhiteSpace(location))
    {
        // A Location that starts with "/" is treated as host-relative: prefix it with
        // the scheme-and-host part of the URL that was requested. Anything else is
        // used as-is.
        var redirectUrl = location.StartsWith("/", StringComparison.Ordinal)
            ? responseModel.RequestedUrl.GetUrlBase() + location
            : location;
        SendRequest(redirectUrl, responseModel.RequestedUrl);
    }

    // follow internal links in response
    if (responseModel.ShouldCrawl)
    {
        CrawlForLinksInResponse(responseModel);
    }
}
Expand Down
2 changes: 2 additions & 0 deletions LinkCrawler/LinkCrawler/Models/IResponseModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ public interface IResponseModel
string Markup { get; }
string RequestedUrl { get; }
string ReferrerUrl { get; }
string Location { get; }
HttpStatusCode StatusCode { get; }
int StatusCodeNumber { get; }
bool IsSuccess { get; }
bool IsInteresting { get; }
bool IsRedirect{ get; }
bool ShouldCrawl { get; }
string ToString();
}
Expand Down
6 changes: 5 additions & 1 deletion LinkCrawler/LinkCrawler/Models/ResponseModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using System;
using System.Net;
using LinkCrawler.Utils.Settings;
using System.Collections.Generic;

namespace LinkCrawler.Models
{
Expand All @@ -17,6 +18,7 @@ public class ResponseModel : IResponseModel
public int StatusCodeNumber { get { return (int)StatusCode; } }
public bool IsSuccess { get; }
public bool IsInteresting { get; }
public bool IsRedirect { get; }
public bool ShouldCrawl { get; }
public string ErrorMessage { get; }

Expand All @@ -27,11 +29,13 @@ public ResponseModel(IRestResponse restResponse, RequestModel requestModel, ISet
RequestedUrl = requestModel.Url;
Location = restResponse.GetHeaderByName("Location"); // returns null if no Location header present in the response
ErrorMessage = restResponse.ErrorMessage;
IsSuccess = settings.IsSuccess(StatusCode);
IsInteresting = settings.IsInteresting(StatusCode);
IsRedirect = settings.IsRedirect(StatusCode);

IsSuccess = settings.IsSuccess(StatusCode);
if (!IsSuccess)
return;

Markup = restResponse.Content;
ShouldCrawl = IsSuccess && requestModel.IsInternalUrl && restResponse.IsHtmlDocument();
}
Expand Down
9 changes: 2 additions & 7 deletions LinkCrawler/LinkCrawler/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using System;
using LinkCrawler.Utils.Parsers;
using LinkCrawler.Utils.Settings;
using LinkCrawler.Utils.Extensions;

namespace LinkCrawler
{
Expand All @@ -21,13 +22,7 @@ static void Main(string[] args)
if (result)
{
// make sure the base URL is just a domain
int prefixLength = parsed.IndexOf("//") + 2;
if (parsed.Substring(prefixLength).IndexOf("/") > 0)
{
parsed = parsed.Substring(0, parsed.Substring(prefixLength).IndexOf("/") + prefixLength);
}
linkCrawler.BaseUrl = parsed;
validUrlParser.BaseUrl = parsed;
linkCrawler.BaseUrl = validUrlParser.BaseUrl = parsed.GetUrlBase();
linkCrawler.ValidUrlParser = validUrlParser;
}
}
Expand Down
10 changes: 10 additions & 0 deletions LinkCrawler/LinkCrawler/Utils/Extensions/StringExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,15 @@ public static string TrimEnd(this string input, string suffixToRemove)
}
return input;
}

/// <summary>
/// Returns the leading part of <paramref name="url"/> up to (but not including) the
/// first "/" that follows the "//" scheme separator, e.g.
/// "https://github.com/a/b" becomes "https://github.com".
/// </summary>
/// <param name="url">The URL to shorten.</param>
/// <returns>
/// The URL without its path, or <paramref name="url"/> unchanged when it is null,
/// empty, or contains no "/" after the prefix.
/// </returns>
public static string GetUrlBase(this string url)
{
    // Nothing to strip; also avoids an out-of-range Substring on the empty string.
    if (string.IsNullOrEmpty(url))
    {
        return url;
    }

    // Index just past the "//" separator. When no "//" is present IndexOf yields -1,
    // so the search below starts at index 1 (same as the previous implementation).
    int prefixLength = url.IndexOf("//", System.StringComparison.Ordinal) + 2;

    // First "/" at or after the prefix; -1 when there is none.
    int pathStart = url.IndexOf('/', prefixLength);

    // Only cut when the slash is strictly after the prefix, i.e. there is a
    // non-empty host part in front of it.
    return pathStart > prefixLength ? url.Substring(0, pathStart) : url;
}
}
}
2 changes: 2 additions & 0 deletions LinkCrawler/LinkCrawler/Utils/Settings/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ public static class AppSettings
public const string ValidUrlRegex = "ValidUrlRegex";
public const string OnlyReportBrokenLinksToOutput = "OnlyReportBrokenLinksToOutput";
public const string CheckImages = "CheckImages";
public const string FollowRedirects = "FollowRedirects";
public const string SlackWebHookUrl = "Slack.WebHook.Url";
public const string SlackWebHookBotName = "Slack.WebHook.Bot.Name";
public const string SlackWebHookBotIconEmoji = "Slack.WebHook.Bot.IconEmoji";
Expand All @@ -17,6 +18,7 @@ public static class AppSettings
public const string CsvDelimiter = "Csv.Delimiter";
public const string SuccessHttpStatusCodes = "SuccessHttpStatusCodes";
public const string InterestingHttpStatusCodes = "InterestingHttpStatusCodes";
public const string RedirectHttpStatusCodes = "RedirectHttpStatusCodes";
public const string OutputProviders = "outputProviders";
public const string PrintSummary = "PrintSummary";
}
Expand Down
4 changes: 4 additions & 0 deletions LinkCrawler/LinkCrawler/Utils/Settings/ISettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ public interface ISettings

bool CheckImages { get; }

bool FollowRedirects { get; }

bool OnlyReportBrokenLinksToOutput { get; }

string SlackWebHookUrl { get; }
Expand All @@ -30,6 +32,8 @@ public interface ISettings

bool IsInteresting(HttpStatusCode statusCode);

bool IsRedirect(HttpStatusCode statusCode);

bool PrintSummary { get; }
}
}
7 changes: 7 additions & 0 deletions LinkCrawler/LinkCrawler/Utils/Settings/MockSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ public class MockSettings : ISettings {

public bool CheckImages => true;

public bool FollowRedirects => true;

public string CsvDelimiter => ";";

public string CsvFilePath => @"C:\tmp\output.csv";
Expand Down Expand Up @@ -38,6 +40,11 @@ public bool IsSuccess(HttpStatusCode statusCode) {
}

/// <summary>
/// Mock implementation: matches the status code against the fixed pattern "*".
/// </summary>
/// <param name="statusCode">HTTP status code of the response.</param>
/// <returns>The result of matching <paramref name="statusCode"/> against "*".</returns>
public bool IsInteresting(HttpStatusCode statusCode) => statusCode.IsMatch("*");

/// <summary>
/// Mock implementation: matches the status code against the fixed pattern "3xx".
/// </summary>
/// <param name="statusCode">HTTP status code of the response.</param>
/// <returns>The result of matching <paramref name="statusCode"/> against "3xx".</returns>
public bool IsRedirect(HttpStatusCode statusCode) => statusCode.IsMatch("3xx");
Expand Down
9 changes: 9 additions & 0 deletions LinkCrawler/LinkCrawler/Utils/Settings/Settings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ public class Settings : ISettings
public bool CheckImages =>
ConfigurationManager.AppSettings[Constants.AppSettings.CheckImages].ToBool();

public bool FollowRedirects =>
ConfigurationManager.AppSettings[Constants.AppSettings.FollowRedirects].ToBool();

public bool OnlyReportBrokenLinksToOutput =>
ConfigurationManager.AppSettings[Constants.AppSettings.OnlyReportBrokenLinksToOutput].ToBool();

Expand Down Expand Up @@ -53,5 +56,11 @@ public bool IsInteresting(HttpStatusCode statusCode)
var configuredCodes = ConfigurationManager.AppSettings[Constants.AppSettings.InterestingHttpStatusCodes] ?? "*";
return statusCode.IsMatch(configuredCodes);
}

/// <summary>
/// Matches the status code against the pattern stored under the
/// <c>RedirectHttpStatusCodes</c> app setting, falling back to "3xx" when the
/// setting is absent.
/// </summary>
/// <param name="statusCode">HTTP status code of the response.</param>
/// <returns>The result of matching <paramref name="statusCode"/> against the configured pattern.</returns>
public bool IsRedirect(HttpStatusCode statusCode) =>
    statusCode.IsMatch(
        ConfigurationManager.AppSettings[Constants.AppSettings.RedirectHttpStatusCodes] ?? "3xx");
}
}

0 comments on commit 6248312

Please sign in to comment.