-
Notifications
You must be signed in to change notification settings - Fork 1
/
rssToLinks.py
executable file
·38 lines (32 loc) · 1.06 KB
/
rssToLinks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/python
#
# This program reads a feed and extracts its entries and the links they contain.
# The idea is to have, for each entry, a list of its links at the end.
# Each entry consists of the text followed by a list of links. In the text,
# each link is followed by a number in brackets, [x], that references the
# corresponding link in the list at the end. The output has no HTML markup.
import feedparser, re
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Tag
url = 'https://fernand0.github.io/feed.xml'  # Put your RSS feed here
feed = feedparser.parse(url)

# For every entry of the feed: tag each textual link in the entry with an
# inline "[n]" marker, print the entry as plain text, and then print the
# numbered list of link targets collected for that entry.
for i, entry in enumerate(feed.entries):
    # Entries without a summary are treated as empty text instead of
    # aborting the whole run. The parser is named explicitly so the result
    # does not depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(entry.get("summary", ""), "html.parser")
    links = soup("a")
    j = 0
    linksTxt = ""
    for link in links:
        # An empty anchor (<a></a>) has no contents to index; skip it
        # rather than failing with IndexError.
        if not link.contents:
            continue
        # We want to avoid embedded tags (mainly <img ... )
        if isinstance(link.contents[0], Tag):
            continue
        # An anchor without an href has no target to list; check this
        # before touching the anchor so it is left unnumbered.
        href = link.get('href')
        if href is None:
            continue
        # Append the marker to the anchor so it shows up in get_text().
        link.append(" [" + str(j) + "]")
        linksTxt = linksTxt + "[" + str(j) + "] " + link.contents[0] + "\n"
        linksTxt = linksTxt + " " + href + "\n"
        j = j + 1
    # Single-argument print(...) calls behave identically on Python 2 and 3;
    # print("") emits the blank line the bare print statement used to.
    print("Entry " + str(i) + ":")
    print(soup.get_text())
    if linksTxt != "":
        print("")
        print("Links :")
        print(linksTxt)
    print("")