From af022bb1473e27ef17f587bf8a5956dfec2f3f15 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sun, 22 Oct 2023 19:38:49 -0700 Subject: [PATCH] NUTCH-3015 Add more CI steps to GitHub master-build.yml --- .github/workflows/dependency-check.yml | 36 +++++++++++ .github/workflows/master-build.yml | 60 ++++++++++++++----- .gitignore | 1 + build.xml | 34 ++++++++--- .../dependency-check-suppressions.xml | 5 -- src/java/overview.html | 16 +++++ .../creativecommons/conf/crawl-urlfilter.txt | 15 +++++ .../creativecommons/conf/nutch-site.xml | 16 +++++ src/plugin/creativecommons/data/anchor.html | 16 +++++ src/plugin/creativecommons/data/rdf.html | 16 +++++ src/plugin/creativecommons/data/rel.html | 16 +++++ src/plugin/exchange-jexl/README.md | 17 ++++++ src/plugin/index-links/README.md | 17 ++++++ .../sample/testIndexReplace.html | 16 +++++ src/plugin/indexer-cloudsearch/README.md | 17 ++++++ .../indexer-cloudsearch/createCSDomain.sh | 15 +++++ src/plugin/indexer-csv/README.md | 17 ++++++ src/plugin/indexer-dummy/README.md | 17 ++++++ src/plugin/indexer-elastic/README.md | 17 ++++++ ...wto_upgrade_es.txt => howto_upgrade_es.md} | 17 ++++++ src/plugin/indexer-opensearch-1x/README.md | 17 ++++++ ...search.txt => howto_upgrade_opensearch.md} | 17 ++++++ src/plugin/indexer-rabbit/README.md | 17 ++++++ src/plugin/indexer-solr/README.md | 17 ++++++ ...upgrade_solr.txt => howto_upgrade_solr.md} | 17 ++++++ src/plugin/lib-selenium/README.md | 17 ++++++ .../lib-selenium/howto_upgrade_selenium.md | 32 ++++++++++ .../lib-selenium/howto_upgrade_selenium.txt | 15 ----- src/plugin/parse-ext/command | 15 +++++ .../sample/parse_embedded_js_test.html | 16 +++++ .../parse-js/sample/parse_pure_js_test.js | 15 +++++ .../parse-metatags/sample/testMetatags.html | 16 +++++ .../sample/testMultivalueMetatags.html | 16 +++++ ...upgrade_tika.txt => howto_upgrade_tika.md} | 17 ++++++ src/plugin/parse-tika/sample/nutch.html | 16 +++++ .../data/regex-parsefilter.txt | 15 +++++ 
.../protocol-file/sample/testprotocolfile.txt | 15 +++++ .../sample/testprotocolfile_(encoded).txt | 15 +++++ .../protocol-interactiveselenium/README.md | 17 ++++++ ...ade_okhttp.txt => howto_upgrade_okhttp.md} | 17 ++++++ src/plugin/protocol-selenium/README.md | 17 ++++++ .../sample/Benchmarks.rules | 15 +++++ .../sample/Benchmarks.urls | 15 +++++ .../sample/IntranetCrawling.rules | 15 +++++ .../sample/IntranetCrawling.urls | 15 +++++ .../sample/WholeWebCrawling.rules | 15 +++++ .../sample/WholeWebCrawling.urls | 15 +++++ src/plugin/urlfilter-domain/data/hosts.txt | 15 +++++ .../urlfilter-domaindenylist/data/hosts.txt | 15 +++++ src/plugin/urlfilter-fast/README.md | 16 +++++ .../urlfilter-fast/sample/Benchmarks.urls | 15 +++++ .../sample/fast-urlfilter-benchmark.txt | 15 +++++ .../sample/fast-urlfilter-test.txt | 15 +++++ src/plugin/urlfilter-fast/sample/test.urls | 15 +++++ src/plugin/urlfilter-ignoreexempt/README.md | 17 ++++++ .../urlfilter-regex/sample/Benchmarks.rules | 15 +++++ .../urlfilter-regex/sample/Benchmarks.urls | 15 +++++ .../sample/IntranetCrawling.rules | 15 +++++ .../sample/IntranetCrawling.urls | 15 +++++ .../sample/WholeWebCrawling.rules | 15 +++++ .../sample/WholeWebCrawling.urls | 15 +++++ .../urlfilter-regex/sample/nutch1838.rules | 15 +++++ .../urlfilter-regex/sample/nutch1838.urls | 15 +++++ src/plugin/urlnormalizer-host/data/hosts.txt | 15 +++++ .../urlnormalizer-protocol/data/protocols.txt | 15 +++++ .../sample/regex-normalize-default.test | 15 +++++ .../sample/regex-normalize-scope1.test | 15 +++++ .../urlnormalizer-slash/data/slashes.txt | 15 +++++ src/test/crawl-tests.xml | 16 +++++ src/test/filter-all.txt | 15 +++++ src/test/log4j.properties | 15 +++++ src/test/nutch-site.xml | 16 +++++ .../fetch-test-site/dup_of_pagea.html | 16 +++++ .../fetch-test-site/exception.html | 16 +++++ src/testresources/fetch-test-site/index.html | 16 +++++ .../fetch-test-site/nested_spider_trap.html | 16 +++++ src/testresources/fetch-test-site/pagea.html 
| 16 +++++ src/testresources/fetch-test-site/pageb.html | 16 +++++ src/testresources/fetch-test-site/robots.txt | 14 +++++ 79 files changed, 1272 insertions(+), 45 deletions(-) create mode 100644 .github/workflows/dependency-check.yml rename src/plugin/indexer-elastic/{howto_upgrade_es.txt => howto_upgrade_es.md} (61%) rename src/plugin/indexer-opensearch-1x/{howto_upgrade_opensearch.txt => howto_upgrade_opensearch.md} (62%) rename src/plugin/indexer-solr/{howto_upgrade_solr.txt => howto_upgrade_solr.md} (60%) create mode 100644 src/plugin/lib-selenium/howto_upgrade_selenium.md delete mode 100644 src/plugin/lib-selenium/howto_upgrade_selenium.txt rename src/plugin/parse-tika/{howto_upgrade_tika.txt => howto_upgrade_tika.md} (73%) rename src/plugin/protocol-okhttp/{howto_upgrade_okhttp.txt => howto_upgrade_okhttp.md} (52%) diff --git a/.github/workflows/dependency-check.yml b/.github/workflows/dependency-check.yml new file mode 100644 index 0000000000..ae20abef99 --- /dev/null +++ b/.github/workflows/dependency-check.yml @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: dependency check +on: + schedule: + - cron: '0 0 * * *' # every day at midnight +jobs: + dependency-check: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: temurin + - name: Dependency check + run: ant clean dependency-check -buildfile build.xml diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index ba1d470ece..3703868d80 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -1,4 +1,3 @@ -# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -13,28 +12,64 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# -name: master pr build +name: master pull request ci -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] +on: + push: + branches: [master] + pull_request: + branches: [master] jobs: - build: - runs-on: ubuntu-latest + javadoc: strategy: matrix: - java: [ '11' ] - + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: temurin + - name: Javadoc + run: ant clean javadoc -buildfile build.xml + rat: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: temurin + - name: Run Apache Rat + run: ant clean run-rat -buildfile build.xml + # Line 18 of the Rat report is expected to hold the "N Unknown Licenses" summary; export it through GITHUB_ENV so the next step can read env.unknown_licenses + - name: Cache unknown licenses + run: echo "unknown_licenses=$(sed -n 18p build/apache-rat-report.txt)" >> "$GITHUB_ENV" + - name: Fail if any unknown licenses + if: ${{ env.unknown_licenses != '0 Unknown Licenses' }} + run: exit 1 + test: + strategy: + matrix: + java: ['11'] + os: [ubuntu-latest, windows-latest, macos-latest] + runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} - - name: Build with Ant - run: ant clean nightly javadoc -buildfile build.xml + distribution: temurin + - name: Test + run: ant clean test -buildfile build.xml diff --git a/.gitignore b/.gitignore index b466908527..12365dd0d4 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ csvindexwriter lib/spotbugs-* ivy/dependency-check-ant/* .gradle* 
+ivy/apache-rat-* diff --git a/build.xml b/build.xml index b44581405a..c4e14230a4 100644 --- a/build.xml +++ b/build.xml @@ -38,7 +38,7 @@ - + @@ -48,7 +48,7 @@ - + @@ -640,13 +640,15 @@ - + + reportformat="ALL" + assemblyAnalyzerEnabled="false" + 
failBuildOnCVSS="1"> @@ -1025,7 +1027,7 @@ - - + @@ -1047,8 +1049,22 @@ - - + + + + + + + + + + + + + + + + diff --git a/ivy/dependency-check-ant/dependency-check-suppressions.xml b/ivy/dependency-check-ant/dependency-check-suppressions.xml index e7de8febb2..a7f4ca16df 100644 --- a/ivy/dependency-check-ant/dependency-check-suppressions.xml +++ b/ivy/dependency-check-ant/dependency-check-suppressions.xml @@ -1,8 +1,3 @@ - - only applies to tika-server < 1.18 - ^org\.(apache\.tika:tika-(core|parsers)|gagravarr:vorbis-java-tika):.*$ - CVE-2018-1335 - diff --git a/src/java/overview.html b/src/java/overview.html index 11321417ba..3de53a7d28 100644 --- a/src/java/overview.html +++ b/src/java/overview.html @@ -1,3 +1,19 @@ + Apache Nutch diff --git a/src/plugin/creativecommons/conf/crawl-urlfilter.txt b/src/plugin/creativecommons/conf/crawl-urlfilter.txt index 324617f07a..eb6786e4b4 100644 --- a/src/plugin/creativecommons/conf/crawl-urlfilter.txt +++ b/src/plugin/creativecommons/conf/crawl-urlfilter.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # Creative Commnons crawl filter # Each non-comment, non-blank line contains a regular expression diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml index e28e12a9a8..91ffded76a 100644 --- a/src/plugin/creativecommons/conf/nutch-site.xml +++ b/src/plugin/creativecommons/conf/nutch-site.xml @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/data/anchor.html b/src/plugin/creativecommons/data/anchor.html index 90b522759d..3267bc9ea8 100755 --- a/src/plugin/creativecommons/data/anchor.html +++ b/src/plugin/creativecommons/data/anchor.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/data/rdf.html b/src/plugin/creativecommons/data/rdf.html index fb2c34dfe5..60c27cc541 100755 --- a/src/plugin/creativecommons/data/rdf.html +++ b/src/plugin/creativecommons/data/rdf.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/creativecommons/data/rel.html b/src/plugin/creativecommons/data/rel.html index 413d52f869..3d11572d82 100755 --- a/src/plugin/creativecommons/data/rel.html +++ b/src/plugin/creativecommons/data/rel.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/exchange-jexl/README.md b/src/plugin/exchange-jexl/README.md index 2d2024276f..35a711b90c 100644 --- a/src/plugin/exchange-jexl/README.md +++ b/src/plugin/exchange-jexl/README.md @@ -1,3 +1,20 @@ + + exchange-jexl plugin for Nutch ============================== diff --git a/src/plugin/index-links/README.md b/src/plugin/index-links/README.md index f25d1cf6da..ac0f071f45 100644 --- a/src/plugin/index-links/README.md +++ b/src/plugin/index-links/README.md @@ -1,3 +1,20 @@ + + indexer-links plugin for Nutch ============================== diff --git a/src/plugin/index-replace/sample/testIndexReplace.html b/src/plugin/index-replace/sample/testIndexReplace.html index 0b90fc2110..fb2ef03a59 100644 --- a/src/plugin/index-replace/sample/testIndexReplace.html +++ b/src/plugin/index-replace/sample/testIndexReplace.html @@ -1,3 +1,19 @@ + Testing the power 
of the index-replace plugin diff --git a/src/plugin/indexer-cloudsearch/README.md b/src/plugin/indexer-cloudsearch/README.md index 10b5daa901..a0609c0fbb 100644 --- a/src/plugin/indexer-cloudsearch/README.md +++ b/src/plugin/indexer-cloudsearch/README.md @@ -1,3 +1,20 @@ + + AWS CloudSearch plugin for Nutch ================================ diff --git a/src/plugin/indexer-cloudsearch/createCSDomain.sh b/src/plugin/indexer-cloudsearch/createCSDomain.sh index 24fb0156c6..1cb8481fe0 100644 --- a/src/plugin/indexer-cloudsearch/createCSDomain.sh +++ b/src/plugin/indexer-cloudsearch/createCSDomain.sh @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # example of domain configuration for CloudSearch DOMAIN="$1" diff --git a/src/plugin/indexer-csv/README.md b/src/plugin/indexer-csv/README.md index 80220974a7..4d1288b198 100644 --- a/src/plugin/indexer-csv/README.md +++ b/src/plugin/indexer-csv/README.md @@ -1,3 +1,20 @@ + + indexer-csv plugin for Nutch ============================ diff --git a/src/plugin/indexer-dummy/README.md b/src/plugin/indexer-dummy/README.md index 2a4b2bd156..a7fa530090 100644 --- a/src/plugin/indexer-dummy/README.md +++ b/src/plugin/indexer-dummy/README.md @@ -1,3 +1,20 @@ + + indexer-dummy plugin for Nutch ============================== diff --git a/src/plugin/indexer-elastic/README.md b/src/plugin/indexer-elastic/README.md index 466762e1c7..3dfd888ff8 100644 --- a/src/plugin/indexer-elastic/README.md +++ b/src/plugin/indexer-elastic/README.md @@ -1,3 +1,20 @@ + + indexer-elastic plugin for Nutch ================================ diff --git a/src/plugin/indexer-elastic/howto_upgrade_es.txt b/src/plugin/indexer-elastic/howto_upgrade_es.md similarity index 61% rename from src/plugin/indexer-elastic/howto_upgrade_es.txt rename to src/plugin/indexer-elastic/howto_upgrade_es.md index a8156444c6..b57e0c02fa 100644 --- a/src/plugin/indexer-elastic/howto_upgrade_es.txt +++ b/src/plugin/indexer-elastic/howto_upgrade_es.md @@ -1,3 +1,20 @@ + + 1. Upgrade Elasticsearch dependency in src/plugin/indexer-elastic/ivy.xml 2. 
Upgrade the Elasticsearch specific dependencies in src/plugin/indexer-elastic/plugin.xml diff --git a/src/plugin/indexer-opensearch-1x/README.md b/src/plugin/indexer-opensearch-1x/README.md index 52e5844af8..e5e76f0b60 100644 --- a/src/plugin/indexer-opensearch-1x/README.md +++ b/src/plugin/indexer-opensearch-1x/README.md @@ -1,3 +1,20 @@ + + indexer-opensearch1x plugin for Nutch ================================ diff --git a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md similarity index 62% rename from src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt rename to src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md index 0725900445..c9b723ffcf 100644 --- a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt +++ b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md @@ -1,3 +1,20 @@ + + 1. Upgrade OpenSearch dependency in src/plugin/indexer-opensearch-1x/ivy.xml 2. Upgrade the OpenSearch specific dependencies in src/plugin/indexer-opensearch-1x/plugin.xml diff --git a/src/plugin/indexer-rabbit/README.md b/src/plugin/indexer-rabbit/README.md index 6ea09a9151..8040cd6c76 100644 --- a/src/plugin/indexer-rabbit/README.md +++ b/src/plugin/indexer-rabbit/README.md @@ -1,3 +1,20 @@ + + indexer-rabbit plugin for Nutch =============================== diff --git a/src/plugin/indexer-solr/README.md b/src/plugin/indexer-solr/README.md index c3a4601e1b..3a27e4116c 100644 --- a/src/plugin/indexer-solr/README.md +++ b/src/plugin/indexer-solr/README.md @@ -1,3 +1,20 @@ + + indexer-solr plugin for Nutch ============================= diff --git a/src/plugin/indexer-solr/howto_upgrade_solr.txt b/src/plugin/indexer-solr/howto_upgrade_solr.md similarity index 60% rename from src/plugin/indexer-solr/howto_upgrade_solr.txt rename to src/plugin/indexer-solr/howto_upgrade_solr.md index b2a7eb5c89..905fb84a9e 100644 --- a/src/plugin/indexer-solr/howto_upgrade_solr.txt +++ 
b/src/plugin/indexer-solr/howto_upgrade_solr.md @@ -1,3 +1,20 @@ + + 1. Upgrade Solr dependency in src/plugin/indexer-solr/ivy.xml 2. Upgrade the Solr specific dependencies in src/plugin/indexer-solr/plugin.xml diff --git a/src/plugin/lib-selenium/README.md b/src/plugin/lib-selenium/README.md index 1c6b37c5f8..5054d7ad8e 100644 --- a/src/plugin/lib-selenium/README.md +++ b/src/plugin/lib-selenium/README.md @@ -1,3 +1,20 @@ + + # Updates * The use of phantomjs has been deprecated. Check [Wikipedia](https://en.wikipedia.org/wiki/PhantomJS) for more info. * The updated code for Safari webriver is under development as starting Safari 10 on OS X El Capitan and macOS Sierra, Safari comes bundled with a new driver implementation. diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.md b/src/plugin/lib-selenium/howto_upgrade_selenium.md new file mode 100644 index 0000000000..3071c74cbf --- /dev/null +++ b/src/plugin/lib-selenium/howto_upgrade_selenium.md @@ -0,0 +1,32 @@ + + +1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml + +2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml + + To get a list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib | sed 's/^/ \n \n <\/library>/g' + + Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). + + N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows + + $ brew install gnu-sed --with-default-names + + You can then restart your terminal and the Regex + Sed command should work just fine! diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.txt b/src/plugin/lib-selenium/howto_upgrade_selenium.txt deleted file mode 100644 index 1892a6275e..0000000000 --- a/src/plugin/lib-selenium/howto_upgrade_selenium.txt +++ /dev/null @@ -1,15 +0,0 @@ -1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml - -2. 
Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml - - To get a list of dependencies and their versions execute: - $ ant -f ./build-ivy.xml - $ ls lib | sed 's/^/ \n \n <\/library>/g' - - Note that all dependent libraries are exported for a "library" plugin ("lib-selenium"). - - N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows - - $ brew install gnu-sed --with-default-names - - You can then restart your terminal and the Regex + Sed command should work just fine! diff --git a/src/plugin/parse-ext/command b/src/plugin/parse-ext/command index f42c055311..329d58d96d 100644 --- a/src/plugin/parse-ext/command +++ b/src/plugin/parse-ext/command @@ -1,3 +1,18 @@ #!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # # Sample bash script as external command invoked by parse-ext plugin diff --git a/src/plugin/parse-js/sample/parse_embedded_js_test.html b/src/plugin/parse-js/sample/parse_embedded_js_test.html index 351beacc35..0409bba53b 100644 --- a/src/plugin/parse-js/sample/parse_embedded_js_test.html +++ b/src/plugin/parse-js/sample/parse_embedded_js_test.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/parse-js/sample/parse_pure_js_test.js b/src/plugin/parse-js/sample/parse_pure_js_test.js index f196313f85..0e486a8793 100644 --- a/src/plugin/parse-js/sample/parse_pure_js_test.js +++ b/src/plugin/parse-js/sample/parse_pure_js_test.js @@ -1,3 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ // test data for link extraction from "pure" JavaScript function selectProvider(form) { diff --git a/src/plugin/parse-metatags/sample/testMetatags.html b/src/plugin/parse-metatags/sample/testMetatags.html index e9e8e6bd0c..4dc86c194b 100644 --- a/src/plugin/parse-metatags/sample/testMetatags.html +++ b/src/plugin/parse-metatags/sample/testMetatags.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html index ca8b737c2b..36d2c8814a 100644 --- a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html +++ b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.md similarity index 73% rename from src/plugin/parse-tika/howto_upgrade_tika.txt rename to src/plugin/parse-tika/howto_upgrade_tika.md index 46d075948b..8ed6c3f3cd 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.md @@ -1,3 +1,20 @@ + + We are currently using a shim (https://github.com/tballison/hadoop-safe-tika because of binary conflicts in commons-io versions between what Hadoop supports and the more modern features that Apache Tika and Apache POI were using in commons-io. diff --git a/src/plugin/parse-tika/sample/nutch.html b/src/plugin/parse-tika/sample/nutch.html index 0aa7c98959..8098535126 100644 --- a/src/plugin/parse-tika/sample/nutch.html +++ b/src/plugin/parse-tika/sample/nutch.html @@ -1,3 +1,19 @@ + diff --git a/src/plugin/parsefilter-regex/data/regex-parsefilter.txt b/src/plugin/parsefilter-regex/data/regex-parsefilter.txt index 9d15cd899b..fbc7dd3039 100644 --- a/src/plugin/parsefilter-regex/data/regex-parsefilter.txt +++ b/src/plugin/parsefilter-regex/data/regex-parsefilter.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Example configuration file for parsefilter-regex # # Parse metadata field is set to true if the HTML matches the regex. The diff --git a/src/plugin/protocol-file/sample/testprotocolfile.txt b/src/plugin/protocol-file/sample/testprotocolfile.txt index fbe8a8acf2..5e684e2f47 100644 --- a/src/plugin/protocol-file/sample/testprotocolfile.txt +++ b/src/plugin/protocol-file/sample/testprotocolfile.txt @@ -1 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ Protocol File Test diff --git a/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt b/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt index fbe8a8acf2..5e684e2f47 100644 --- a/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt +++ b/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt @@ -1 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + Protocol File Test diff --git a/src/plugin/protocol-interactiveselenium/README.md b/src/plugin/protocol-interactiveselenium/README.md index dd43ee7948..545efb830a 100644 --- a/src/plugin/protocol-interactiveselenium/README.md +++ b/src/plugin/protocol-interactiveselenium/README.md @@ -1,3 +1,20 @@ + + Nutch Interactive Selenium ========================== diff --git a/src/plugin/protocol-okhttp/howto_upgrade_okhttp.txt b/src/plugin/protocol-okhttp/howto_upgrade_okhttp.md similarity index 52% rename from src/plugin/protocol-okhttp/howto_upgrade_okhttp.txt rename to src/plugin/protocol-okhttp/howto_upgrade_okhttp.md index b3b6f1f223..16ae70d71d 100644 --- a/src/plugin/protocol-okhttp/howto_upgrade_okhttp.txt +++ b/src/plugin/protocol-okhttp/howto_upgrade_okhttp.md @@ -1,3 +1,20 @@ + + 1. 
Upgrade OkHttp dependency in src/plugin/protocol-okhttp/ivy.xml 2. Upgrade OkHttp's own dependencies in src/plugin/protocol-okhttp/plugin.xml diff --git a/src/plugin/protocol-selenium/README.md b/src/plugin/protocol-selenium/README.md index 05132b9ef1..4d43c330d5 100644 --- a/src/plugin/protocol-selenium/README.md +++ b/src/plugin/protocol-selenium/README.md @@ -1,3 +1,20 @@ + + Nutch Selenium ============== diff --git a/src/plugin/urlfilter-automaton/sample/Benchmarks.rules b/src/plugin/urlfilter-automaton/sample/Benchmarks.rules index a2f6da03b2..e26fbfa9fa 100644 --- a/src/plugin/urlfilter-automaton/sample/Benchmarks.rules +++ b/src/plugin/urlfilter-automaton/sample/Benchmarks.rules @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # The url filter file used by the crawl command. # Better for intranet crawling. diff --git a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls index 6a0e822bdd..3b1b157a54 100644 --- a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls +++ b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +http://www.hostip.info/ -http://www.elanceur.org/Articles/OntologieSurfaite.html +http://www.opensymphony.com/quartz/ diff --git a/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules b/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules index 896618339e..770d1241b1 100644 --- a/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules +++ b/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # The url filter file used by the crawl command. 
# Better for intranet crawling. diff --git a/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls b/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls index b1ad9b7d38..6551ff8616 100644 --- a/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls +++ b/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + -file://home/jc/nutch/index.html -ftp://ftp.apache.org/nutch.html -mailto:jerome.charron@gmail.com diff --git a/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules b/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules index dfae8b0577..cee394b46b 100644 --- a/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules +++ b/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # The default url filter. # Better for whole-internet crawling. diff --git a/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls b/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls index d3b1bf3829..5a52ac282a 100644 --- a/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls +++ b/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ -file://home/jc/nutch/index.html -ftp://ftp.apache.org/nutch.html -mailto:jerome.charron@gmail.com diff --git a/src/plugin/urlfilter-domain/data/hosts.txt b/src/plugin/urlfilter-domain/data/hosts.txt index 2b88c3b050..8cf43745fa 100644 --- a/src/plugin/urlfilter-domain/data/hosts.txt +++ b/src/plugin/urlfilter-domain/data/hosts.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # comments start with the pound sign net apache.org diff --git a/src/plugin/urlfilter-domaindenylist/data/hosts.txt b/src/plugin/urlfilter-domaindenylist/data/hosts.txt index 2b88c3b050..8cf43745fa 100644 --- a/src/plugin/urlfilter-domaindenylist/data/hosts.txt +++ b/src/plugin/urlfilter-domaindenylist/data/hosts.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # comments start with the pound sign net apache.org diff --git a/src/plugin/urlfilter-fast/README.md b/src/plugin/urlfilter-fast/README.md index 46b293fe87..2e58605752 100644 --- a/src/plugin/urlfilter-fast/README.md +++ b/src/plugin/urlfilter-fast/README.md @@ -1,3 +1,19 @@ + Filters URLs based on a file of regular expressions using host/domains matching first. The default policy is to accept a URL if no matches diff --git a/src/plugin/urlfilter-fast/sample/Benchmarks.urls b/src/plugin/urlfilter-fast/sample/Benchmarks.urls index 6a0e822bdd..3b1b157a54 100644 --- a/src/plugin/urlfilter-fast/sample/Benchmarks.urls +++ b/src/plugin/urlfilter-fast/sample/Benchmarks.urls @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +http://www.hostip.info/ -http://www.elanceur.org/Articles/OntologieSurfaite.html +http://www.opensymphony.com/quartz/ diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt index 27a918bb60..6257c87a94 100644 --- a/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt +++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # port of urlfilter-regex benchmarks to urlfilter-fast # cf. # src/plugin/urlfilter-regex/sample/Benchmarks.rules diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt index 9f26529379..edeb4a17b8 100644 --- a/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt +++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + Host www.example.org DenyPath ^/path/to/be/excluded DenyPath ^/some/other/path/excluded diff --git a/src/plugin/urlfilter-fast/sample/test.urls b/src/plugin/urlfilter-fast/sample/test.urls index 3aa4354a63..b26ff8b0e7 100644 --- a/src/plugin/urlfilter-fast/sample/test.urls +++ b/src/plugin/urlfilter-fast/sample/test.urls @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ -https://www.example.org/path/to/be/excluded -https://www.example.org/path/to/be/excluded/continued -https://www.example.org/some/other/path/excluded diff --git a/src/plugin/urlfilter-ignoreexempt/README.md b/src/plugin/urlfilter-ignoreexempt/README.md index d48b6729f6..a8f932e759 100644 --- a/src/plugin/urlfilter-ignoreexempt/README.md +++ b/src/plugin/urlfilter-ignoreexempt/README.md @@ -1,3 +1,20 @@ + + urlfilter-ignoreexempt ====================== This plugin allows certain urls to be exempted when the external links are configured to be ignored. diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.rules b/src/plugin/urlfilter-regex/sample/Benchmarks.rules index 6a85118c51..0b9fff1ac0 100644 --- a/src/plugin/urlfilter-regex/sample/Benchmarks.rules +++ b/src/plugin/urlfilter-regex/sample/Benchmarks.rules @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # The url filter file used by the crawl command. # Better for intranet crawling. 
diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.urls b/src/plugin/urlfilter-regex/sample/Benchmarks.urls index 6a0e822bdd..3b1b157a54 100644 --- a/src/plugin/urlfilter-regex/sample/Benchmarks.urls +++ b/src/plugin/urlfilter-regex/sample/Benchmarks.urls @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +http://www.hostip.info/ -http://www.elanceur.org/Articles/OntologieSurfaite.html +http://www.opensymphony.com/quartz/ diff --git a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules index e651dd509d..6db8a85908 100644 --- a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules +++ b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # The url filter file used by the crawl command. # Better for intranet crawling. diff --git a/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls b/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls index b1ad9b7d38..6551ff8616 100644 --- a/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls +++ b/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ -file://home/jc/nutch/index.html -ftp://ftp.apache.org/nutch.html -mailto:jerome.charron@gmail.com diff --git a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules index ac9ad60a88..4274e8a67d 100644 --- a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules +++ b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # The default url filter. # Better for whole-internet crawling. diff --git a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls index ccb626954e..c0593bafc4 100644 --- a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls +++ b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + -file://home/jc/nutch/index.html -ftp://ftp.apache.org/nutch.html -mailto:jerome.charron@gmail.com diff --git a/src/plugin/urlfilter-regex/sample/nutch1838.rules b/src/plugin/urlfilter-regex/sample/nutch1838.rules index f7b0d13f23..6e88006bc2 100644 --- a/src/plugin/urlfilter-regex/sample/nutch1838.rules +++ b/src/plugin/urlfilter-regex/sample/nutch1838.rules @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # Skip all url's containing skip for example.org > www.example.org -skip diff --git a/src/plugin/urlfilter-regex/sample/nutch1838.urls b/src/plugin/urlfilter-regex/sample/nutch1838.urls index c6f29d1bb7..78a0e44c5f 100644 --- a/src/plugin/urlfilter-regex/sample/nutch1838.urls +++ b/src/plugin/urlfilter-regex/sample/nutch1838.urls @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + -http://www.example.org/skip-me-now +http://www.example.com/noone-can-skip-me -http://www.example.nl/i-am-filtered \ No newline at end of file diff --git a/src/plugin/urlnormalizer-host/data/hosts.txt b/src/plugin/urlnormalizer-host/data/hosts.txt index c7e0ccfe6c..b81edae147 100644 --- a/src/plugin/urlnormalizer-host/data/hosts.txt +++ b/src/plugin/urlnormalizer-host/data/hosts.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Force all sub domains to www. *.example.com example.com diff --git a/src/plugin/urlnormalizer-protocol/data/protocols.txt b/src/plugin/urlnormalizer-protocol/data/protocols.txt index fc7d86cbd9..1599172528 100644 --- a/src/plugin/urlnormalizer-protocol/data/protocols.txt +++ b/src/plugin/urlnormalizer-protocol/data/protocols.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # Example configuration file for urlnormalizer-protocol # # URL's of hosts listed in the configuration are normalized to the target diff --git a/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test b/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test index 7867ad80ea..8560961c0a 100644 --- a/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test +++ b/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # test simple removal of session id, keeping parameters before and after http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php?f=2 diff --git a/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test b/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test index 9d928802e8..9905e683d0 100644 --- a/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test +++ b/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # test removal of subdomains http://www.foo.bar.com/ http://bar.com/ diff --git a/src/plugin/urlnormalizer-slash/data/slashes.txt b/src/plugin/urlnormalizer-slash/data/slashes.txt index d3bd70a666..efcdafb630 100644 --- a/src/plugin/urlnormalizer-slash/data/slashes.txt +++ b/src/plugin/urlnormalizer-slash/data/slashes.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # Both domains have duplicate URL's, some with slashes and some without # We prefer this domain with slashes diff --git a/src/test/crawl-tests.xml b/src/test/crawl-tests.xml index 01fc683012..3fa1bf1382 100644 --- a/src/test/crawl-tests.xml +++ b/src/test/crawl-tests.xml @@ -1,3 +1,19 @@ + diff --git a/src/test/filter-all.txt b/src/test/filter-all.txt index 4ed567ab1c..d738aec76a 100644 --- a/src/test/filter-all.txt +++ b/src/test/filter-all.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Config file for urlfilter-suffix plugin # Filter away all urls diff --git a/src/test/log4j.properties b/src/test/log4j.properties index 3ff115f46f..08e272c712 100644 --- a/src/test/log4j.properties +++ b/src/test/log4j.properties @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # log4j configuration used during build and unit tests log4j.rootLogger=info,stdout diff --git a/src/test/nutch-site.xml b/src/test/nutch-site.xml index dd408739dc..dd999ee045 100644 --- a/src/test/nutch-site.xml +++ b/src/test/nutch-site.xml @@ -1,3 +1,19 @@ + diff --git a/src/testresources/fetch-test-site/dup_of_pagea.html b/src/testresources/fetch-test-site/dup_of_pagea.html index 6444c41225..63c4e61537 100644 --- a/src/testresources/fetch-test-site/dup_of_pagea.html +++ b/src/testresources/fetch-test-site/dup_of_pagea.html @@ -1,3 +1,19 @@ + page a diff --git a/src/testresources/fetch-test-site/exception.html b/src/testresources/fetch-test-site/exception.html index e1192a176b..66f134ee25 100644 --- a/src/testresources/fetch-test-site/exception.html +++ b/src/testresources/fetch-test-site/exception.html @@ -1,3 +1,19 @@ + diff --git a/src/testresources/fetch-test-site/index.html b/src/testresources/fetch-test-site/index.html index d73ff3f691..3fc6e61e5a 100644 --- a/src/testresources/fetch-test-site/index.html +++ b/src/testresources/fetch-test-site/index.html @@ -1,3 +1,19 @@ + front page diff --git a/src/testresources/fetch-test-site/nested_spider_trap.html b/src/testresources/fetch-test-site/nested_spider_trap.html index 5dcf7c2209..dd32ee2362 100644 --- a/src/testresources/fetch-test-site/nested_spider_trap.html +++ b/src/testresources/fetch-test-site/nested_spider_trap.html @@ -1,3 +1,19 @@ + nested spider trap diff --git a/src/testresources/fetch-test-site/pagea.html b/src/testresources/fetch-test-site/pagea.html index 6444c41225..63c4e61537 100644 
--- a/src/testresources/fetch-test-site/pagea.html +++ b/src/testresources/fetch-test-site/pagea.html @@ -1,3 +1,19 @@ + page a diff --git a/src/testresources/fetch-test-site/pageb.html b/src/testresources/fetch-test-site/pageb.html index 66e3725ef0..cf77ff4f75 100644 --- a/src/testresources/fetch-test-site/pageb.html +++ b/src/testresources/fetch-test-site/pageb.html @@ -1,3 +1,19 @@ + bage b diff --git a/src/testresources/fetch-test-site/robots.txt b/src/testresources/fetch-test-site/robots.txt index e69de29bb2..fc590f9733 100644 --- a/src/testresources/fetch-test-site/robots.txt +++ b/src/testresources/fetch-test-site/robots.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file