feat: filman_crawler for filmweb
suchencjusz committed Sep 14, 2024
1 parent a741432 commit 90673da
Showing 7 changed files with 191 additions and 147 deletions.
29 changes: 20 additions & 9 deletions src/filman_crawler/Dockerfile
@@ -1,15 +1,26 @@
FROM python:3.11-slim-bullseye
# FROM python:3.11-slim-bullseye

RUN apt update
RUN apt install -y pkg-config default-libmysqlclient-dev build-essential
# RUN apt update
# RUN apt install -y pkg-config default-libmysqlclient-dev build-essential

WORKDIR /src/filman_crawler
# WORKDIR /src/filman_crawler

COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt
# COPY requirements.txt requirements.txt
# RUN pip3 install -r requirements.txt

# COPY /src/filman_crawler .

COPY /src/filman_crawler .
# ENV PYTHONPATH=/src

ENV PYTHONPATH=/src
# CMD [ "python3", "-m", "filman_crawler.main"]

CMD [ "python3", "-m", "filman_crawler.main"]
FROM python:3.11-slim-bullseye
RUN apt update
RUN apt install -y pkg-config default-libmysqlclient-dev build-essential
WORKDIR /src
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt
COPY /src/filman_server /src/filman_server
COPY /src/filman_crawler /src/filman_crawler
ENV PYTHONPATH=/src/filman_server:/src/filman_crawler
CMD [ "python3", "-m", "filman_crawler.main" ]
46 changes: 17 additions & 29 deletions src/filman_crawler/main.py
@@ -11,6 +11,8 @@
from fake_useragent import UserAgent
from pydantic import BaseModel

from filman_server.database.schemas import Task, TaskTypes, TaskStatus

from filman_crawler.tasks.scrap_movie import Scraper as movie_scrapper
from filman_crawler.tasks.scrap_series import Scraper as series_scrapper
from filman_crawler.tasks.scrap_user_watched_movies import (
@@ -70,33 +72,6 @@
"TE": "trailers",
}


class TaskTypes(str, Enum):
SCRAP_USER = "scrap_user"
SCRAP_FILMWEB_MOVIE = "scrap_filmweb_movie"
SCRAP_FILMWEB_SERIES = "scrap_filmweb_series"
SCRAP_FILMWEB_USER_WATCHED_MOVIES = "scrap_filmweb_user_watched_movies"
SCRAP_FILMWEB_USER_WATCHED_SERIES = "scrap_filmweb_user_watched_series"
SEND_DISCORD_NOTIFICATION = "send_discord_notification"


class TaskStatus(str, Enum):
QUEUED = "queued"
RUNNING = "running"
COMPLETED = "completed"
ERROR = "error"


class Task(BaseModel):
task_id: int
task_status: TaskStatus
task_type: TaskTypes
task_job: str
task_created: datetime
task_started: Optional[datetime] = None
task_finished: Optional[datetime] = None


ALLOWED_TASKS = [
TaskTypes.SCRAP_FILMWEB_MOVIE,
TaskTypes.SCRAP_FILMWEB_SERIES,
@@ -110,7 +85,7 @@ class Task(BaseModel):
def check_there_are_any_tasks():
try:
r = requests.head(
f"{CORE_ENDPOINT}/tasks/get/task/to_do",
f"{CORE_ENDPOINT}/tasks/get/to_do",
params={"task_types": TASK_TYPES},
headers=HEADERS,
)
Expand All @@ -128,7 +103,7 @@ def check_there_are_any_tasks():
def get_task_to_do() -> Task:
try:
r = requests.get(
f"{CORE_ENDPOINT}/tasks/get/task/to_do",
f"{CORE_ENDPOINT}/tasks/get/to_do",
params={"task_types": TASK_TYPES},
headers=HEADERS,
)
@@ -165,6 +140,16 @@ def do_task(task: Task):
else:
logging.error(f"Unknown task type: {task.task_type}")

def check_connection() -> bool:
try:
r = requests.get(CORE_ENDPOINT, headers=HEADERS)
if r.status_code == 200:
return True
return False
except Exception as e:
logging.error(f"Error checking connection: {e}")
return False


def main():
logging.info("Program started")
@@ -191,4 +176,7 @@ def main():


if __name__ == "__main__":
while not check_connection():
logging.error("Connection not established, retrying in 3 seconds")
time.sleep(3)
main()
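
The new check_connection gate blocks startup until the core API answers, retrying every 3 seconds without limit. A hedged variant of the same pattern with a bounded retry budget (wait_for_core and max_attempts are illustrative names, not part of the commit):

import logging
import time

import requests

def wait_for_core(endpoint: str, headers: dict, max_attempts: int = 20, delay: float = 3.0) -> bool:
    # Poll the core API until it returns HTTP 200 or the attempt budget runs out.
    for attempt in range(1, max_attempts + 1):
        try:
            if requests.get(endpoint, headers=headers).status_code == 200:
                return True
        except requests.RequestException as e:
            logging.error(f"Connection attempt {attempt}/{max_attempts} failed: {e}")
        time.sleep(delay)
    return False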
86 changes: 64 additions & 22 deletions src/filman_crawler/tasks/scrap_user_watched_movies.py
@@ -4,7 +4,17 @@
import requests
import ujson

from .utils import DiscordNotifications, FilmWeb, Task, Tasks, TaskStatus, TaskTypes
from .utils import (
DiscordNotifications,
FilmWeb,
Task,
Tasks,
TaskStatus,
TaskTypes,
fetch,
FilmWebUserWatchedMovie,
FilmWebMovie,
)

# https://www.filmweb.pl/api/v1/user/tirstus/vote/film

@@ -22,40 +32,68 @@ def __init__(self, headers=None, endpoint_url=None):
self.headers = headers
self.endpoint_url = endpoint_url

def fetch(self, url, params=None):
response = requests.get(url, headers=self.headers, params=params)
if response.status_code != 200:
logging.error(f"Error fetching {url}: HTTP {response.status_code}")
return None
# def fetch(self, url, params=None):
# response = requests.get(url, headers=self.headers, params=params)
# if response.status_code != 200:
# logging.error(f"Error fetching {url}: HTTP {response.status_code}")
# return None

# r = requests.get(
# f"{CORE_ENDPOINT}/tasks/get/task/to_do",
# params={"task_types": TASK_TYPES},
# headers=HEADERS,
# )

return response.text
# return response.text

def scrap(self, task):
logging.info(f"Scraping user watched movies for user: {task.task_job}")

filmweb = FilmWeb(self.headers, self.endpoint_url)
tasks = Tasks(self.headers, self.endpoint_url)

logging.info("NR 2")

last_100_watched = f"https://www.filmweb.pl/api/v1/user/{task.task_job}/vote/film"
user_already_watched = f"{self.endpoint_url}/filmweb/watched/movies/get/ids"
user_already_watched = f"{self.endpoint_url}/filmweb/user/watched/movies/get"

try:
logging.info(f"Fetching user already watched movies from: {user_already_watched}")
user_already_watched_data = fetch(user_already_watched, params={"filmweb_id": task.task_job})
logging.info(f"Fetched user already watched movies: {user_already_watched_data}")
user_already_watched_ids = [movie["movie"]["id"] for movie in user_already_watched_data]
except Exception as e:
logging.error(f"Error fetching user already watched movies: {e}")
return "Error fetching user already watched movies"

logging.info("NR 3")

try:
last_100_watched_data = self.fetch(last_100_watched)
user_already_watched_data = self.fetch(user_already_watched, params={"filmweb_id": task.task_job})
logging.info(f"Fetching last 100 watched movies from: {last_100_watched}")
last_100_watched_data = fetch(last_100_watched)
logging.info(f"Fetched last 100 watched movies: {last_100_watched_data}")
user_already_watched_data = user_already_watched_ids
except Exception as e:
logging.error(f"Error fetching data: {e}")
return "Error fetching data"
logging.error(f"Error fetching last 100 watched movies: {e}")
return "Error fetching last 100 watched movies"

if last_100_watched_data is None:
self.update_task_status_done(task.id_task)
logging.error(f"Error fetching last 100 watched for {task.task_job}")
return "Private profile"

last_100_watched_data = ujson.loads(last_100_watched_data)
logging.info(f"Last 100 watched: {last_100_watched_data}")
logging.debug(f"Type of last_100_watched_data: {type(last_100_watched_data)}")

try:
if isinstance(last_100_watched_data, str):
last_100_watched_data = ujson.loads(last_100_watched_data)
logging.debug(f"Parsed last 100 watched data: {last_100_watched_data}")
else:
logging.warning("last_100_watched_data is not a string, skipping JSON parsing.")
except Exception as e:
logging.error(f"Error parsing last 100 watched data: {e}")
return "Error parsing last 100 watched data"

user_already_watched_data = set(user_already_watched_data or [])

@@ -66,30 +104,34 @@ def scrap(self, task):
logging.info(f"Found {len(new_movies_watched)} new movies watched")

new_movies_watched_parsed = []

        # this part is a real mess - the logic is right, but it is a mess; TODO: rewrite
        # likewise, what is the difference between id and id_media? need to check the models, I don't remember
for movie in new_movies_watched:
# f"https://www.filmweb.pl/api/v1/user/{task.job}/vote/film/{movie_id}"

try:
movie_rate_data = self.fetch(f"https://www.filmweb.pl/api/v1/user/{task.task_job}/vote/film/{movie[0]}")
logging.debug(f"Fetching movie rate data for movie: {movie[0]}")
movie_rate_data = fetch(f"https://www.filmweb.pl/api/v1/user/{task.task_job}/vote/film/{movie[0]}")

if movie_rate_data is None:
logging.warning(f"No movie rate data found for movie: {movie[0]}")
continue

movie_rate_data = ujson.loads(movie_rate_data)
logging.debug(f"Parsed movie rate data: {movie_rate_data}")

filmweb_movie = FilmWebMovie(id_media=movie[0])

new_movies_watched_parsed.append(
filmweb.FilmWebUserWatchedMovie(
id_media=movie[0],
id_filmweb=task.task_job,
FilmWebUserWatchedMovie(
movie=filmweb_movie,
filmweb_id=task.task_job,
date=datetime.datetime.fromtimestamp(movie_rate_data["timestamp"] / 1000),
rate=movie_rate_data.get("rate", 0),
comment=movie_rate_data.get("comment", None),
favorite=bool(movie_rate_data.get("favorite", False)),
)
)
except Exception as e:
logging.error(f"Error parsing movie data: {e}")
logging.error(f"{movie_rate_data}")
logging.error(f"Error parsing movie data for movie {movie[0]}: {e}")
continue

logging.info(f"Found {len(new_movies_watched_parsed)} new movies parsed")