forked from streamlit/example-app-langchain-rag
-
Notifications
You must be signed in to change notification settings - Fork 0
/
remote_loader.py
78 lines (54 loc) · 2.13 KB
/
remote_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
from urllib.parse import urlsplit

import requests
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_community.document_loaders import WebBaseLoader, WikipediaLoader

from local_loader import get_document_text
# Directory that download_file() saves into. By default this is the directory
# containing this module, so downloads land next to the source file.
CONTENT_DIR = os.path.dirname(__file__)
# Alternative: save into the system temporary directory (/tmp or equivalent).
# Enabling the line below also requires `import tempfile`.
# CONTENT_DIR = tempfile.gettempdir()
def load_web_page(page_url):
    """Load ``page_url`` with ``WebBaseLoader`` and return the loader's result.

    Args:
        page_url: Passed unchanged to the ``WebBaseLoader`` constructor.

    Returns:
        Whatever the loader's ``load()`` call returns; ``main`` iterates over
        it and prints each item.
    """
    return WebBaseLoader(page_url).load()
def load_online_pdf(pdf_url):
    """Load the PDF at ``pdf_url`` with ``OnlinePDFLoader`` and return the result.

    Args:
        pdf_url: Passed unchanged to the ``OnlinePDFLoader`` constructor.

    Returns:
        Whatever the loader's ``load()`` call returns; ``main`` iterates over
        it and prints each item.
    """
    return OnlinePDFLoader(pdf_url).load()
def filename_from_url(url):
    """Return the file name implied by the last path segment of ``url``.

    Only the path component of the URL is considered, so a query string or
    fragment (``?download=1``, ``#page=2``) never ends up in the file name,
    and a trailing slash does not produce an empty name.

    Args:
        url: The URL (or plain slash-separated path) to derive a name from.

    Returns:
        The last non-empty path segment, or ``""`` when the URL has no path
        at all (for example ``"https://example.com/"``).
    """
    # urlsplit separates the path from the query string and the fragment.
    path = urlsplit(url).path
    # Strip trailing slashes first so ".../dir/" yields "dir" rather than "".
    return path.rstrip("/").rsplit("/", 1)[-1]
def download_file(url, filename=None, *, timeout=30):
    """Download ``url`` into ``CONTENT_DIR`` and return the saved file's path.

    Args:
        url: Address of the file to fetch.
        filename: Name to save under, joined onto ``CONTENT_DIR``. When
            omitted or empty, the name is derived with ``filename_from_url``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot hang the caller forever.

    Returns:
        The real (symlink-resolved) path of the written file.

    Raises:
        ValueError: If no file name was given and none can be derived
            from ``url``.
        requests.HTTPError: If the server answers with an error status.
    """
    if not filename:
        filename = filename_from_url(url)
    if not filename:
        # With an empty name, os.path.join would return the directory itself
        # and open() would fail with a confusing error.
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    full_path = os.path.join(CONTENT_DIR, filename)

    # stream=True avoids holding the whole body in memory; using the response
    # as a context manager releases the connection when we are done.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Check the status before opening the target, so an HTML error page
        # is never written to disk as if it were the requested file.
        response.raise_for_status()
        with open(full_path, mode="wb") as f:
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)

    download_path = os.path.realpath(full_path)
    print(f"Downloaded file {filename} to {download_path}")
    return download_path
def get_wiki_docs(query, load_max_docs=2):
    """Load documents for ``query`` via ``WikipediaLoader``, printing each title.

    Args:
        query: Passed to ``WikipediaLoader`` as its ``query`` argument.
        load_max_docs: Passed to ``WikipediaLoader`` as ``load_max_docs``.

    Returns:
        The value returned by the loader's ``load()`` call.
    """
    pages = WikipediaLoader(query=query, load_max_docs=load_max_docs).load()
    for page in pages:
        # Echo the title of every result so the caller can see what was found.
        print(page.metadata["title"])
    return pages
def main():
    """Exercise each remote loading helper once, printing what it returns."""

    def show(documents):
        # Print every loaded item with its own print() call.
        for item in documents:
            print(item)

    # 1. An HTML page fetched through load_web_page.
    russell_html_url = "https://www.gutenberg.org/ebooks/5827.html.images"
    show(load_web_page(russell_html_url))

    # 2. A PDF saved to disk by download_file, then read back from the local
    #    copy with get_document_text.
    boole_pdf_url = "https://www.gutenberg.org/files/36884/36884-pdf.pdf"
    saved_pdf = download_file(boole_pdf_url)
    with open(saved_pdf, "rb") as pdf_handle:
        show(get_document_text(pdf_handle, title="Analysis of Logic"))

    # 3. A PDF loaded straight from its URL through load_online_pdf.
    russell_pdf_url = "https://s3-us-west-2.amazonaws.com/pressbooks-samplefiles/LewisTheme/The-Problems-of-Philosophy-LewisTheme.pdf"
    show(load_online_pdf(russell_pdf_url))


if __name__ == "__main__":
    main()