
Commit ccbe716
Merge pull request #142 from OpenUpSA/fix/file-uploading-casing
Fixed file upload file encoding
milafrerichs authored Oct 30, 2020
2 parents 8e40954 + d1faea7 commit ccbe716
Showing 3 changed files with 53 additions and 34 deletions.
8 changes: 8 additions & 0 deletions tests/datasets/factories.py
@@ -1,4 +1,5 @@
 import factory
+from factory.django import FileField
 
 from wazimap_ng.datasets import models
 
@@ -59,6 +60,13 @@ class Meta:
 
     dataset = factory.SubFactory(DatasetFactory)
 
+class DatasetFileFactory(factory.django.DjangoModelFactory):
+    class Meta:
+        model = models.DatasetFile
+
+
+    document = factory.django.FileField()
+
 
 class DatasetDataFactory(factory.django.DjangoModelFactory):
     class Meta:
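The new DatasetFileFactory populates the document field through factory_boy's FileField, so tests can hand it raw bytes directly. A minimal usage sketch (the byte string below is illustrative, not from the repo):

    # Hypothetical example: build a DatasetFile backed by in-memory bytes,
    # the same pattern create_datasetfile uses in the tests below.
    datasetfile = DatasetFileFactory(
        document__data=b"Geography,field1,field2,count\nGEO_1,a,b,1\n"
    )
    datasetfile.document.open("rb").read()  # -> the bytes passed in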
48 changes: 27 additions & 21 deletions tests/datasets/tasks/test_process_uploaded_file.py
@@ -1,22 +1,28 @@
 from unittest.mock import patch
 import csv
-from pathlib import Path
-import tempfile
+import codecs
+from io import BytesIO
 
 import pytest
 
-from wazimap_ng.datasets.tasks.process_uploaded_file import process_csv
-from tests.datasets.factories import DatasetFactory, GeographyFactory, GeographyHierarchyFactory
+from wazimap_ng.datasets.tasks.process_uploaded_file import process_csv, detect_encoding
+from tests.datasets.factories import DatasetFactory, GeographyFactory, GeographyHierarchyFactory, DatasetFileFactory
 
 def generate_file(data, encoding="utf8"):
-    fp = tempfile.NamedTemporaryFile("w", delete=False, encoding=encoding)
-    writer = csv.writer(fp)
+    buffer = BytesIO()
+    StreamWriter = codecs.getwriter(encoding)
+    text_buffer = StreamWriter(buffer)
+
+    writer = csv.writer(text_buffer)
     writer.writerow(["Geography", "field1", "field2", "count"])
     writer.writerows(data)
-    filename = fp.name
-    fp.close()
-
-    return filename
+    buffer.seek(0)
+    return buffer
 
 
+def create_datasetfile(csv_data, encoding):
+    buffer = generate_file(csv_data, encoding)
+    return DatasetFileFactory(document__data=buffer.read())
+
+
 @pytest.fixture
@@ -51,29 +57,29 @@ def dataset(geography_hierarchy):
     ("GEOCODE_2", "€ŠF1_value_2", "F2_value_2®®", 222),
 ]
 
-@pytest.fixture(params=[good_data, data_with_different_case, data_with_different_encodings])
+@pytest.fixture(params=[(good_data, "utf8"), (data_with_different_case, "utf8"), (data_with_different_encodings, "Windows-1252")])
 def data(request):
     return request.param
 
+def test_detect_encoding():
+    buffer = generate_file(data_with_different_encodings, "Windows-1252")
+    encoding = detect_encoding(buffer)
+    assert encoding == "Windows-1252"
+
 @pytest.mark.django_db
 class TestUploadFile:
 
     def test_process_csv(self, dataset, data, geographies):
-        filename = generate_file(data)
-        process_csv(dataset, filename)
+        csv_data, encoding = data
+        datasetfile = create_datasetfile(csv_data, encoding)
+
+        process_csv(dataset, datasetfile.document.open("rb"))
         datasetdata = dataset.datasetdata_set.all()
 
-        assert len(datasetdata) == len(data)
+        assert len(datasetdata) == len(csv_data)
 
-        for dd, ed in zip(datasetdata, data):
+        for dd, ed in zip(datasetdata, csv_data):
             assert dd.geography.code == ed[0]
             assert dd.data["field1"] == ed[1]
             assert dd.data["field2"] == ed[2]
             assert dd.data["count"] == str(ed[3])
-
-        path = Path(filename)
-        if path.exists():
-            path.unlink()


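The rewritten generate_file builds the CSV entirely in memory: codecs.getwriter(encoding) wraps a BytesIO so csv.writer can write text while the buffer accumulates bytes in the target encoding, and test_detect_encoding then checks that chardet recovers that encoding from the raw bytes. A standalone sketch of the same round trip (variable names here are illustrative):

    import codecs
    import csv
    from io import BytesIO

    from chardet.universaldetector import UniversalDetector

    buffer = BytesIO()
    writer = csv.writer(codecs.getwriter("Windows-1252")(buffer))
    writer.writerow(["Geography", "field1", "field2", "count"])
    writer.writerow(["GEOCODE_1", "€F1_value_1", "F2_value_1®", 111])
    buffer.seek(0)

    detector = UniversalDetector()
    for line in buffer:  # feed raw bytes line by line
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    print(detector.result["encoding"])  # "Windows-1252" for this sample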
31 changes: 18 additions & 13 deletions wazimap_ng/datasets/tasks/process_uploaded_file.py
@@ -1,3 +1,4 @@
+import codecs
 import os
 import logging
 
@@ -6,20 +7,19 @@
 from chardet.universaldetector import UniversalDetector
 import pandas as pd
 
-from ..dataloader import loaddata
-from .. import models
-
 from wazimap_ng.general.services.permissions import assign_perms_to_group
 from wazimap_ng.general.services.csv_helpers import csv_logger
 
+from ..dataloader import loaddata
+from .. import models
+
 logger = logging.getLogger(__name__)
 
-def detect_encoding(filename):
+def detect_encoding(buffer):
     detector = UniversalDetector()
-    with open(filename, "rb") as fp:
-        for line in fp:
-            detector.feed(line)
-            if detector.done: break
+    for line in buffer:
+        detector.feed(line)
+        if detector.done: break
     detector.close()
     return detector.result["encoding"]

@@ -28,16 +28,21 @@ def process_file_data(df, dataset, row_number):
     datasource = (dict(d[1]) for d in df.iterrows())
     return loaddata(dataset, datasource, row_number)
 
-def process_csv(dataset, filename, chunksize=1000000):
-    encoding = detect_encoding(filename)
+def process_csv(dataset, buffer, chunksize=1000000):
+    encoding = detect_encoding(buffer)
+    StreamReader = codecs.getreader(encoding)
+    wrapper_file = StreamReader(buffer)
+    wrapper_file.seek(0)
+
     row_number = 1
-    df = pd.read_csv(filename, nrows=1, dtype=str, sep=",", encoding=encoding)
+    df = pd.read_csv(wrapper_file, nrows=1, dtype=str, sep=",", encoding=encoding)
     df.dropna(how='all', axis='columns', inplace=True)
     columns = df.columns.str.lower()
     error_logs = [];
     warning_logs = [];
 
-    for df in pd.read_csv(filename, chunksize=chunksize, dtype=str, sep=",", header=None, skiprows=1, encoding=encoding):
+    wrapper_file.seek(0)
+    for df in pd.read_csv(wrapper_file, chunksize=chunksize, dtype=str, sep=",", header=None, skiprows=1, encoding=encoding):
         df.dropna(how='all', axis='columns', inplace=True)
         df.columns = columns
         errors, warnings = process_file_data(df, dataset, row_number)
@@ -75,7 +80,7 @@ def process_uploaded_file(dataset_file, dataset, **kwargs):
 
     if ".csv" in filename:
         logger.debug(f"Processing as csv")
-        csv_output = process_csv(dataset, dataset_file.document.name, chunksize)
+        csv_output = process_csv(dataset, dataset_file.document.open("rb"), chunksize)
        error_logs = csv_output["error_logs"]
        warning_logs = csv_output["warning_logs"]
        columns = csv_output["columns"]
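Taken together, process_csv now detects the encoding from the raw byte stream, then wraps the same buffer with codecs.getreader so pandas consumes decoded text; each seek(0) rewinds the shared buffer between the detection pass, the header read, and the chunked body read. A condensed sketch of the pattern (the file path and helper name below are illustrative, not from the repo):

    import codecs

    import pandas as pd
    from chardet.universaldetector import UniversalDetector

    def sniff_encoding(buffer):
        detector = UniversalDetector()
        for line in buffer:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        return detector.result["encoding"]

    with open("upload.csv", "rb") as fp:  # illustrative path
        encoding = sniff_encoding(fp)
        reader = codecs.getreader(encoding)(fp)
        reader.seek(0)  # rewinds the underlying binary stream
        for chunk in pd.read_csv(reader, chunksize=1000, dtype=str, sep=","):
            print(len(chunk))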
