diff --git a/twrh-dataset/.gitignore b/twrh-dataset/.gitignore
index 087ae8cc..0788a6ca 100644
--- a/twrh-dataset/.gitignore
+++ b/twrh-dataset/.gitignore
@@ -4,3 +4,5 @@ crawler/settings.py
 *.log
 *.csv
 rental_house.json
+devop
+datas/*.zip
diff --git a/twrh-dataset/django/crawlerrequest/management/commands/deduprequest.py b/twrh-dataset/django/crawlerrequest/management/commands/deduprequest.py
new file mode 100644
index 00000000..b94a4281
--- /dev/null
+++ b/twrh-dataset/django/crawlerrequest/management/commands/deduprequest.py
@@ -0,0 +1,35 @@
+"""Remove duplicated requests from the request_ts table."""
+from django.core.management.base import BaseCommand
+from django.db import connection
+
+# Rows are treated as duplicates when they share year, month, day and the
+# value of `seed->>0`. Within each such group only the row with the highest
+# id is kept; every other row is deleted, so groups holding three or more
+# copies are fully cleaned up in a single run.
+SQL = """
+delete from request_ts where id in (
+    select id from (
+        select
+            id,
+            row_number() over (
+                partition by year, month, day, (seed->>0)
+                order by id desc
+            ) as rn
+        from request_ts
+    )
+    as t where rn > 1
+);
+"""
+
+
+class Command(BaseCommand):
+    """Management command that deletes duplicated rows of request_ts."""
+
+    help = 'Remove duplicated request'
+
+    def handle(self, *args, **options):
+        """Run the dedup statement and report how many rows were deleted."""
+        with connection.cursor() as cursor:
+            cursor.execute(SQL)
+            removed = cursor.rowcount
+        self.stdout.write('Removed {} duplicated request(s)'.format(removed))
diff --git a/twrh-dataset/remove-duplicated-task.sh b/twrh-dataset/remove-duplicated-task.sh
new file mode 100644
index 00000000..cb407ed4
--- /dev/null
+++ b/twrh-dataset/remove-duplicated-task.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Run the `deduprequest` management command and append its output to
+# ../logs/deduprequest.log (relative to this script's directory).
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# Quote the path and stop if the directory cannot be entered, so the command
+# never runs from an unexpected working directory.
+cd "$DIR" || exit 1
+
+now=$(date +'%Y.%m.%d.%H%M')
+echo "------ $now ------" >> ../logs/deduprequest.log
+pipenv run python manage.py deduprequest >> ../logs/deduprequest.log