-
Notifications
You must be signed in to change notification settings - Fork 0
/
nicht_lustig_archive.py
101 lines (73 loc) · 3.3 KB
/
nicht_lustig_archive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import itertools
import json
import random
import re
import requests
from bs4 import BeautifulSoup
# Landing page; get_cartoons_list() scrapes the cartoon list from its inline JavaScript.
BASE_URL = "https://joscha.com/nichtlustig"
# Common prefix of every panel image URI.
MEDIA_URL = "https://joscha.com/data/media/cartoons/"
# str.format templates: fill the ``image`` field with a panel's file name.
IMAGE_URL = MEDIA_URL + "{image}"
BONUS_URL = MEDIA_URL + "bonus/" + "{image}"
def get_cartoons_list() -> list:
    """Fetch the nichtlustig landing page and extract the embedded list of cartoon panels.

    The page assigns a JavaScript array literal to ``var cartoonList`` inside
    one or more ``<script>`` tags. Every such literal is rewritten into valid
    JSON and parsed, and the entries of all literals are merged.

    - **return**, **return types** and **raises**::
    :return: the parsed cartoon entries in page order, each distinct entry once
    :rtype: list (the rest of this module treats every entry as a dictionary)
    :raises requests.RequestException: if the page cannot be fetched (connection
        problem, timeout or HTTP error status)
    :raises json.JSONDecodeError: if a literal is still not valid JSON after cleanup
    """
    response = requests.get(BASE_URL, timeout=30)
    # Fail loudly on an error page instead of silently returning an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # The list lives inside JavaScript, so bs4 can only hand us the script
    # tags; the array literal itself has to be cut out with a regex.
    pattern = re.compile(r"var cartoonList = (\[.*?\]);")

    cartoons_list = []
    seen = set()
    for tag in soup.find_all("script"):
        # JSON only knows double-quoted strings, so convert single quotes first.
        match = pattern.search(tag.text.replace("'", '"'))
        if match is None:
            continue
        literal = match.group(1)
        # Strip decorative whitespace: tabs and spaces hugging the quotes.
        literal = literal.replace("\t", "").replace(' "', '"').replace('" ', '"')
        # JavaScript tolerates a trailing comma before "]", JSON does not.
        literal = literal.replace(",]", "]")
        for cartoon in json.loads(literal):
            # Dictionaries are unhashable, so a canonical JSON dump serves as
            # the identity key. This removes duplicates anywhere in the list,
            # not only directly adjacent ones.
            key = json.dumps(cartoon, sort_keys=True)
            if key not in seen:
                seen.add(key)
                cartoons_list.append(cartoon)
    return cartoons_list
def get_bonus_cartoons_list() -> list:
    """Return the cartoons whose ``public_bonus`` entry is truthy.

    The complete list is obtained from get_cartoons_list() on every call.

    - **return** and **return types**::
    :return: the matching cartoon entries, in the order get_cartoons_list() yields them
    :rtype: list
    """
    return [cartoon for cartoon in get_cartoons_list() if cartoon["public_bonus"]]
def get_uri(*cartoons) -> list:
    """Build the image URIs for the given cartoon dictionaries.

    For every cartoon the URI of its ``image`` is emitted; if its
    ``public_bonus`` entry is truthy, the URI of its ``bonus_image`` follows
    directly after it.

    - **parameters**, **types**, **return** and **return types**::
    :param cartoons: Any amount of dictionaries describing cartoon panels, as
        produced by get_cartoons_list()
    :type cartoons: dictionaries
    :return: URIs of all requested panels (including publicly accessible bonus
        panels); empty if no cartoon is passed
    :rtype: list containing str
    :raises KeyError: if a cartoon lacks ``image`` or ``public_bonus``, or lacks
        ``bonus_image`` although ``public_bonus`` is truthy
    """
    uri_list = []
    for cartoon in cartoons:
        uri_list.append(IMAGE_URL.format(image=cartoon["image"]))
        if cartoon["public_bonus"]:
            uri_list.append(BONUS_URL.format(image=cartoon["bonus_image"]))
    return uri_list
def get_random_cartoon(count: int = 1):
    """Pick ``count`` distinct cartoons at random and return their panel URIs.

    A chosen cartoon with a public bonus panel contributes two URIs, so the
    result can hold more than ``count`` entries.

    - **parameters**, **types**, **return** and **return types**::
    :param count: Number of cartoons to draw
    :type count: int
    :return: URIs of the drawn cartoons' panels
    :rtype: list containing str
    """
    available = get_cartoons_list()
    chosen = random.sample(available, count)
    return get_uri(*chosen)
def get_random_bonus_cartoon(count: int = 1):
    """Pick ``count`` distinct cartoons with a public bonus panel and return their URIs.

    Each drawn cartoon yields its regular panel followed by its bonus panel,
    i.e. 2*count URIs in total.

    - **parameters**, **types**, **return** and **return types**::
    :param count: Number of cartoons to draw
    :type count: int
    :return: URIs of the drawn cartoons' panels
    :rtype: list containing str
    """
    with_bonus = get_bonus_cartoons_list()
    chosen = random.sample(with_bonus, count)
    return get_uri(*chosen)
def get_cartoons_list_by_tag(tag: str) -> list:
    """Return the cartoons for which ``tag in cartoon['tags']`` holds.

    - **parameters**, **types**, **return** and **return types**::
    :param tag: Tag to look for
    :type tag: str
    :return: the matching cartoon entries, in the order get_cartoons_list() yields them
    :rtype: list
    """
    # NOTE(review): if ``tags`` is a plain string rather than a list, ``in``
    # is a substring match -- confirm against the data the site delivers.
    return [cartoon for cartoon in get_cartoons_list() if tag in cartoon["tags"]]
def get_bonus_cartoons_list_by_tag(tag: str) -> list:
    """Return the public-bonus cartoons for which ``tag in cartoon['tags']`` holds.

    - **parameters**, **types**, **return** and **return types**::
    :param tag: Tag to look for
    :type tag: str
    :return: the matching cartoon entries, in the order get_bonus_cartoons_list() yields them
    :rtype: list
    """
    # NOTE(review): if ``tags`` is a plain string rather than a list, ``in``
    # is a substring match -- confirm against the data the site delivers.
    return [cartoon for cartoon in get_bonus_cartoons_list() if tag in cartoon["tags"]]
def main():
    """Print the URI list of one randomly drawn cartoon."""
    uris = get_random_cartoon()
    print(uris)


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()