Skip to content

Commit

Permalink
Merge pull request #870 from PrimozGodec/import-documents-fix-metadat…
Browse files Browse the repository at this point in the history
…a-matching

[FIX] Import Documents - fix metadata matching
  • Loading branch information
PrimozGodec authored Jun 21, 2022
2 parents cc48a0d + c3ffba7 commit 1a153fe
Showing 1 changed file with 13 additions and 11 deletions.
24 changes: 13 additions & 11 deletions orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from Orange.data.io import detect_encoding, sanitize_variable
from Orange.data.util import get_unique_names
from Orange.util import Registry, dummy_callback
from Orange.misc.utils.embedder_utils import get_proxies

from orangecontrib.text.corpus import Corpus

Expand Down Expand Up @@ -195,6 +196,16 @@ def read_file(self):
self.content = pd.read_csv(self.path, delimiter="\t")


def quote_url(u):
    """Percent-encode the non-ASCII bytes of a URL string.

    Leading and trailing whitespace is stripped, the result is encoded as
    UTF-8, and every byte >= 0x80 is replaced by its ``%XX`` escape
    (uppercase hex). ASCII bytes are passed through untouched.

    Parameters
    ----------
    u : str
        URL (or URL-like path) to quote.

    Returns
    -------
    str
        The stripped URL with non-ASCII bytes percent-encoded.
    """
    pieces = []
    for byte in u.strip().encode("utf-8"):
        if byte < 0x80:
            # ASCII is kept verbatim so that URLs with a query or fragment,
            # like http://filename.txt?a=1&b=2#c=3, keep working.
            pieces.append(chr(byte))
        else:
            pieces.append("%{:02X}".format(byte))
    return "".join(pieces)


ResponseType = Tuple[Optional[Reader], Optional[TextData], Optional[str]]


Expand Down Expand Up @@ -228,23 +239,14 @@ def read_files(

@staticmethod
async def _read_files(urls: List[str], callback: Callable) -> List[ResponseType]:
async with httpx.AsyncClient(timeout=10.0) as client:
async with httpx.AsyncClient(timeout=10.0, proxies=get_proxies()) as client:
req = [UrlProxyReader._read_file(url, client, callback) for url in urls]
return await asyncio.gather(*req)

@staticmethod
async def _read_file(
url: str, client: httpx.AsyncClient, callback: Callable
) -> ResponseType:
def quote_url(u):
u = u.strip()
# Support URL with query or fragment like http://filename.txt?a=1&b=2#c=3

def quote_byte(b):
return chr(b) if b < 0x80 else "%{:02X}".format(b)

return "".join(map(quote_byte, u.encode("utf-8")))

# repeat if unsuccessful (can be due to network error)
for _ in range(3):
try:
Expand Down Expand Up @@ -549,7 +551,7 @@ def _add_metadata(self, corpus: Corpus) -> Corpus:
path_column = corpus.get_column_view("utterance")[0]
else:
df = self._meta_data.set_index(
self.startdir + self._meta_data[self.META_DATA_FILE_KEY]
self.startdir + self._meta_data[self.META_DATA_FILE_KEY].apply(quote_url)
)
path_column = corpus.get_column_view("path")[0]

Expand Down

0 comments on commit 1a153fe

Please sign in to comment.