-
Notifications
You must be signed in to change notification settings - Fork 1
/
fileparser.py
129 lines (101 loc) · 3.55 KB
/
fileparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
class Parse:
    """Read-only accessor around one parsed article mapping.

    Metadata is read from the named keys 'title', 'date', 'url', 'summary',
    'keyword', 'related' and 'english_url'.  Body sections are read from
    every key that converts to an integer with ``int()``; each section is
    itself a mapping that may contain 'subheading', 'para' and 'images'.
    """

    # Errors that mean "this field is absent or the container is unusable".
    _FIELD_ERRORS = (KeyError, IndexError, TypeError)
    # Section handling additionally sees ValueError from int() on
    # non-numeric keys such as 'title'.
    _SECTION_ERRORS = (KeyError, IndexError, TypeError, ValueError)

    def __init__(self, article):
        """Store the article mapping; it is neither copied nor validated."""
        self.article = article

    def _field(self, key, default):
        """Return ``self.article[key]``, or *default* if it cannot be read."""
        try:
            return self.article[key]
        except self._FIELD_ERRORS:
            return default

    def _sections(self):
        """Yield ``(index, section)`` for every integer-convertible key.

        Keys are visited in the article's own iteration order.  Keys that
        fail ``int()`` or cannot be looked up are skipped silently.
        """
        for key in self.article:
            try:
                index = int(key)
                section = self.article[key]
            except self._SECTION_ERRORS:
                continue
            yield index, section

    def get_title(self):
        """Return the 'title' entry, or ``[]`` when it is missing."""
        return self._field('title', [])

    def get_date(self):
        """Return the 'date' entry, or ``[]`` when it is missing."""
        return self._field('date', [])

    def get_url(self):
        """Return the 'url' entry, or ``[]`` when it is missing."""
        return self._field('url', [])

    def get_summary(self):
        """Return the 'summary' entry, or ``[]`` when it is missing."""
        return self._field('summary', [])

    def get_text(self):
        """Return the paragraph text of all sections, subheadings excluded.

        Each section contributes its 'para' items joined without a
        separator and followed by a newline.  Sections without a usable
        'para' contribute nothing.
        """
        parts = []
        for _, section in self._sections():
            try:
                parts.append(''.join(section['para']) + '\n')
            except self._SECTION_ERRORS:
                continue
        return ''.join(parts)

    def get_textall(self):
        """Return all text, treating subheadings as part of the text.

        For each section the subheading (if usable) is emitted on its own
        line, followed by the joined 'para' items (if usable) and a newline.
        """
        parts = []
        for _, section in self._sections():
            # The two pieces are independent: a section may have either,
            # both, or neither.
            try:
                parts.append(section['subheading'] + '\n')
            except self._SECTION_ERRORS:
                pass
            try:
                parts.append(''.join(section['para']) + '\n')
            except self._SECTION_ERRORS:
                pass
        return ''.join(parts)

    def get_subheading(self):
        """Return a list of all subheadings.

        The section whose key converts to 0 is always skipped; sections
        without a 'subheading' entry are skipped as well.
        """
        subheadings = []
        for index, section in self._sections():
            if index == 0:
                continue
            try:
                subheadings.append(section['subheading'])
            except self._SECTION_ERRORS:
                continue
        return subheadings

    def get_images(self, path):
        """Return the article's image entries whose file exists in *path*.

        Image entries are gathered from the 'images' list of every section.
        An entry is kept when its first item (the file name) is listed by
        ``os.listdir(path)``; duplicates are dropped and the original order
        is preserved.  Entries are returned exactly as stored in the
        article (presumably name/caption pairs).

        Raises whatever ``os.listdir`` raises for an unreadable *path*.
        """
        images = []
        for _, section in self._sections():
            try:
                if 'images' in section:
                    images.extend(section['images'])
            except self._SECTION_ERRORS:
                continue

        # Build the directory listing once as a set for O(1) name checks.
        # TODO: a precomputed article -> images index would avoid listing
        # the directory on every call.
        available = set(os.listdir(path))
        kept = []
        for image in images:
            name = image[0]
            try:
                on_disk = name in available
            except TypeError:
                # An unhashable name can never equal a directory entry.
                on_disk = False
            if on_disk and image not in kept:
                kept.append(image)
        return kept

    def get_keywords(self):
        """Return the 'keyword' entry, or ``[]`` when it is missing."""
        return self._field('keyword', [])

    def get_related(self):
        """Return the 'related' entry (related-article URLs), or ``[]``."""
        return self._field('related', [])

    def get_en_url(self):
        """Return the parallel English URL ('english_url'), or ``''``."""
        return self._field('english_url', '')