diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index afb5519d5..34bd06e1f 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -87,6 +87,37 @@ Default: ``'frontera.contrib.backends.memory.FIFO'`` The :class:`Backend ` to be used by the frontier. For more info see :ref:`Activating a backend `. + +.. setting:: BC_MIN_REQUESTS + +BC_MIN_REQUESTS +--------------- + +Default: ``64`` + +Broad crawling queue get operation will keep retrying until specified number of requests is collected. Maximum number +of retries is hard-coded to 3. + +.. setting:: BC_MIN_HOSTS + +BC_MIN_HOSTS +------------ + +Default: ``24`` + +Keep retrying when getting requests from queue, until there are requests for specified minimum number of hosts +collected. Maximum number of retries is hard-coded and equals 3. + +.. setting:: BC_MAX_REQUESTS_PER_HOST + +BC_MAX_REQUESTS_PER_HOST +------------------------ + +Default: ``128`` + +Don't include (if possible) batches of requests containing requests for specific host if there are already more than +specified count of maximum requests per host. This is a suggestion for broad crawling queue get algorithm. + .. setting:: CANONICAL_SOLVER CANONICAL_SOLVER diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst index 90fa37874..706da8377 100644 --- a/docs/source/topics/frontier-backends.rst +++ b/docs/source/topics/frontier-backends.rst @@ -283,6 +283,12 @@ tunning a block cache to fit states within one block for average size website. T to achieve documents closeness within the same host. This function can be selected with :setting:`URL_FINGERPRINT_FUNCTION` setting. +.. TODO: document details of block cache tuning, + BC* settings and queue get operation concept, + hbase tables schema and data flow + Queue exploration + shuffling with MR jobs + .. _FIFO: http://en.wikipedia.org/wiki/FIFO .. 
_LIFO: http://en.wikipedia.org/wiki/LIFO_(computing) .. _DFS: http://en.wikipedia.org/wiki/Depth-first_search diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index e804fcc96..b9b88b44f 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -376,6 +376,10 @@ def __init__(self, manager): port = settings.get('HBASE_THRIFT_PORT') hosts = settings.get('HBASE_THRIFT_HOST') namespace = settings.get('HBASE_NAMESPACE') + self._min_requests = settings.get('BC_MIN_REQUESTS') + self._min_hosts = settings.get('BC_MIN_HOSTS') + self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST') + self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS') host = choice(hosts) if type(hosts) in [list, tuple] else hosts kwargs = { @@ -456,8 +460,10 @@ def get_next_requests(self, max_next_requests, **kwargs): for partition_id in range(0, self.queue_partitions): if partition_id not in partitions: continue - results = self.queue.get_next_requests(max_next_requests, partition_id, min_requests=64, - min_hosts=24, max_requests_per_host=128) + results = self.queue.get_next_requests(max_next_requests, partition_id, + min_requests=self._min_requests, + min_hosts=self._min_hosts, + max_requests_per_host=self._max_requests_per_host) next_pages.extend(results) self.logger.debug("Got %d requests for partition id %d", len(results), partition_id) return next_pages diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index 7511e3eae..d5f2d8c1f 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -3,6 +3,9 @@ AUTO_START = True BACKEND = 'frontera.contrib.backends.memory.FIFO' +BC_MIN_REQUESTS = 64 +BC_MIN_HOSTS = 24 +BC_MAX_REQUESTS_PER_HOST = 128 CANONICAL_SOLVER = 'frontera.contrib.canonicalsolvers.Basic' DELAY_ON_EMPTY = 5.0 DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1'