-
Notifications
You must be signed in to change notification settings - Fork 1
/
films_descriptions
109 lines (94 loc) · 3.84 KB
/
films_descriptions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/bin/bash
#
# Script setup: validates the command line and initializes the globals used
# by the rest of the script.
#
# Usage:
#   <script> in_file_list in_file_descriptions out_file [update]
#
#   $1  file read line by line in the main loop, one film per line
#   $2  descriptions file that is searched for each film
#   $3  destination of the resulting CSV (required, consumed at the end)
#   $4  optional literal "update": forces re-processing of every film

IN_FILE_LIST=$1
IN_FILE_DESCR=$2

if [[ -z "$IN_FILE_LIST" || -z "$IN_FILE_DESCR" || -z "$3" ]]; then
  # A usage error is a failure: report on stderr and exit non-zero so that
  # callers and cron jobs can detect it.
  echo "Format isn't correct!" >&2
  echo "in_file_list in_file_descriptions out_file update(optional)" >&2
  exit 1
fi

# User-Agent header sent by wget when fetching poster images.
USER_AGENT="Mozilla/5.0 (Windows; U; Windows NT 5.1; AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.437.3 Safari/534.1"
#cat $1 | head -n 47 | tail -n 3 > /tmp/init_list_fd

# Field delimiter of the descriptions file, taken as the 16th character of
# the first line that contains "2012".
# NOTE(review): assumes a fixed layout where that position holds the
# separator — confirm against the actual descriptions file.
DELIM=$(grep -m 1 "2012" "$IN_FILE_DESCR" | cut -c 16)

# Number of lines in the film list; denominator of the progress percentage.
# An empty pattern matches every line, unlike a leading "*" in an ERE whose
# meaning is undefined by POSIX.
FILMS_NUMBER=$(grep -c "" "$IN_FILE_LIST")

# Progress accumulator; print_progress adds 100 per processed film.
FILMS_I=0
#######################################
# Advances the progress counter by one film and redraws the percentage.
# Globals:
#   FILMS_I      (read/written) grows by 100 per call, so dividing it by the
#                film count directly yields an integer percentage
#   FILMS_NUMBER (read) total number of films
#   PROGRESS     (written) percentage that was just printed
# Outputs:
#   "\rProgress: N% " on stdout without a trailing newline, so successive
#   calls overwrite the same terminal line.
#######################################
function print_progress() {
  FILMS_I=$(( FILMS_I + 100 ))
  # Only divide by a strictly positive decimal count; an empty or zero total
  # would otherwise be a division error.
  if [[ "${FILMS_NUMBER:-}" =~ ^[1-9][0-9]*$ ]]; then
    PROGRESS=$(( FILMS_I / FILMS_NUMBER ))
  else
    PROGRESS=0
  fi
  printf '\rProgress: %s%% ' "$PROGRESS"
}
#######################################
# Makes sure the cached description page and poster image of the current
# film are present under $HOME/Dropbox, downloading them when needed.
# Globals:
#   HOME       (read) base of the Dropbox cache directories
#   NUMBER     (read) film id used in URLs and cache file names
#   FILM       (read) film title, used only in the status message
#   USER_AGENT (read) User-Agent passed to wget for the image download
# Outputs:
#   "Film $FILM downloaded" on stdout each time the page is (re)fetched.
#######################################
function file_download() {
  local about_file="$HOME/Dropbox/films_about/$NUMBER.html"
  local image_file="$HOME/Dropbox/films_images/$NUMBER.jpg"

  # (Re)fetch the page when there is no usable cached copy: the file is
  # missing, is empty (left behind by a failed download), or contains the
  # "СЗоР" marker.
  # NOTE(review): presumably "СЗоР" identifies a placeholder/error page
  # served instead of the real one — confirm.
  if [[ ! -s "$about_file" ]] || grep -q -- "СЗоР" "$about_file"; then
    # The page is fetched with elinks and converted from cp1251 to UTF-8.
    # (An earlier wget-based fetch of the same URL was disabled in favour of
    # this pipeline.)
    elinks -source "http://m.kinopoisk.ru/movie/$NUMBER/" \
      | iconv -f cp1251 -t utf8 > "$about_file"
    echo "Film $FILM downloaded"
  fi

  # Same rule for the poster: an empty file counts as "not downloaded yet".
  if [[ ! -s "$image_file" ]]; then
    wget -q -O "$image_file" -U "$USER_AGENT" \
      "http://st.kinopoisk.ru/images/film/$NUMBER.jpg"
  fi
}
#######################################
# Prints the text of one line of the cached film page, taken between the
# first ">" and the following "<" on that line.
# Globals:
#   FILE_ABOUT (read) path of the cached HTML page
# Arguments:
#   $1 - 1-based line number to extract
# Outputs:
#   The extracted text on stdout; nothing when the file has fewer than $1
#   lines (instead of silently returning the file's last line).
# Returns:
#   1 when $1 is not a positive integer.
#######################################
function html_parsing() {
  local line_no=$1
  [[ "$line_no" =~ ^[1-9][0-9]*$ ]] || return 1
  # Print exactly line N and stop reading; a line without ">" or "<" passes
  # through cut unchanged, as cut prints delimiter-less lines whole.
  sed -n "${line_no}{p;q;}" < "$FILE_ABOUT" \
    | cut -d ">" -f2 \
    | cut -d "<" -f1
}
# ---------------------------------------------------------------------------
# Main pass: builds two pipe-separated tables from the film list.
#   * films_full.csv     -> moved to the path given as $3
#   * films_full_gd.csv  -> moved to the current directory; its first column
#     is an =Image(...) formula (presumably for a spreadsheet import —
#     confirm).
# Both are assembled in a private temporary directory so that concurrent
# runs, or other users of /tmp, cannot clobber or pre-create them.
# ---------------------------------------------------------------------------

#######################################
# Trims $1 and collapses inner runs of blanks to single spaces — the effect
# of an unquoted expansion — without performing pathname expansion.
# Arguments: $1 - single-line string
# Outputs:   the normalized string on stdout
#######################################
function squeeze_ws() {
  local IFS=$' \t\n'
  local -a words=()
  read -r -a words <<< "$1"
  printf '%s\n' "${words[*]}"
}

WORK_DIR=$(mktemp -d) || exit 1
trap 'rm -rf -- "$WORK_DIR"' EXIT
OUT_CSV="$WORK_DIR/films_full.csv"
OUT_CSV_GD="$WORK_DIR/films_full_gd.csv"

echo "Название|ВЭБ|Дата|Длительность|Жанр|Страна|Оригинальный заголовок|Описание|Картинка" > "$OUT_CSV"
echo "Обложка|Название|ВЭБ|Дата|Длительность|Жанр|Страна|Оригинальный заголовок|Описание" > "$OUT_CSV_GD"

while read -r FILM; do
  print_progress

  # First line of the descriptions file mentioning this film. The title is
  # matched literally (-F): titles routinely contain regex metacharacters.
  STRING=$(grep -m 1 -F -- "$FILM" "$IN_FILE_DESCR")
  URL=$(printf '%s\n' "$STRING" | cut -d "$DELIM" -f2)
  # The film id is the 7th "/"-separated component of the URL.
  NUMBER=$(squeeze_ws "$URL" | cut -d "/" -f7)
  # Third field is non-empty when the film already has its data filled in.
  DATA_EXIST=$(printf '%s\n' "$STRING" | cut -d "$DELIM" -f3)

  if [[ "$4" == "update" || -z "$DATA_EXIST" ]]; then
    if [[ -z "$NUMBER" ]]; then
      # Without an id the download would hit ".../movie//" and cache the
      # result as ".html"/".jpg"; skip the film instead.
      printf '\nNo film id found for "%s"; skipping\n' "$FILM" >&2
      continue
    fi

    file_download
    FILE_ABOUT="$HOME/Dropbox/films_about/$NUMBER.html"

    # Fixed line numbers of the cached page.
    # NOTE(review): these depend on the page layout staying the same —
    # confirm whenever the site markup changes.
    NAME_RU=$(html_parsing 41)
    # Line 42 holds comma-separated "[original title,] year, duration".
    META_LINE=$(squeeze_ws "$(html_parsing 42)")
    GENRE=$(html_parsing 46)
    COUNTRY=$(html_parsing 47)
    DESCR=$(grep "descr" "$FILE_ABOUT" | cut -d ">" -f2 | cut -d "<" -f1)
    PICT="$HOME/Dropbox/films_images/$NUMBER.jpg"

    if grep -q -E "Россия|СССР|Украина|Беларусь" <<< "$COUNTRY"; then
      # For these countries line 42 carries no separate original title:
      # reuse the Russian one and read year/duration from fields 1 and 2.
      NAME_EN="$NAME_RU"
      YEAR=$(cut -d "," -f1 <<< "$META_LINE")
      DURATION=$(cut -d "," -f2 <<< "$META_LINE")
    else
      NAME_EN=$(cut -d "," -f1 <<< "$META_LINE")
      YEAR=$(cut -d "," -f2 <<< "$META_LINE")
      case "$YEAR" in
        " The" | " Le" | " La" | " Man" | " Robot")
          # The original title itself contains a comma (e.g. "Title, The"):
          # glue the second field back and shift year/duration by one.
          NAME_EN="$NAME_EN,$YEAR"
          YEAR=$(cut -d "," -f3 <<< "$META_LINE")
          DURATION=$(cut -d "," -f4 <<< "$META_LINE")
          ;;
        *)
          DURATION=$(cut -d "," -f3 <<< "$META_LINE")
          ;;
      esac
    fi

    IMG="=Image(\"http://st.kinopoisk.ru/images/film/$NUMBER.jpg\",1)"
    printf '%s\n' "$FILM|$URL|$YEAR|$DURATION|$GENRE|$COUNTRY|$NAME_EN|$DESCR|$PICT" >> "$OUT_CSV"
    printf '%s\n' "$IMG|$FILM|$URL|$YEAR|$DURATION|$GENRE|$COUNTRY|$NAME_EN|$DESCR" >> "$OUT_CSV_GD"
  else
    # Already described: copy the existing line through unchanged (only the
    # main table receives it).
    printf '%s\n' "$STRING" >> "$OUT_CSV"
  fi
done < "$IN_FILE_LIST"

mv -- "$OUT_CSV" "$3"
mv -- "$OUT_CSV_GD" films_full_gd.csv
# Finish the in-place progress line and leave one blank line after it.
printf '\n\n'