Skip to content
Snippets Groups Projects
Unverified Commit 9ca2f13a authored by Henning Bredel's avatar Henning Bredel Committed by GitHub
Browse files

Enumerate unzipped files (#10842)

* Keep all uploaded zip content accessible

The order of the items returned by iterdir() is platform dependent, i.e. it
may differ between platforms. When a zip file contains multiple base_file
candidates, base_file is overridden by the last one found (which one that is
varies between platforms).

Also, different files with the same extension (file1.csv, file2.csv) will not
be accessible from file_paths as they get overridden, too.

The fix enumerates all files to make them accessible from file_paths.

* Sorts files during unzip

Ensures that unpacked content is sorted before getting handled

* Resolve minor issues

* Ensure extensions found multiple times get an index
parent 47a9aa6f
Branches
No related tags found
No related merge requests found
......@@ -205,17 +205,34 @@ class DataRetriever(object):
at the end the zip is deleted
"""
zip_file = self.file_paths["base_file"]
the_zip = zipfile.ZipFile(zip_file, allowZip64=True)
the_zip.extractall(self.temporary_folder)
with zipfile.ZipFile(zip_file, allowZip64=True) as the_zip:
the_zip.extractall(self.temporary_folder)
available_choices = get_allowed_extensions()
not_main_files = ["xml", "sld", "zip", "kmz"]
base_file_choices = [x for x in available_choices if x not in not_main_files]
for _file in Path(self.temporary_folder).iterdir():
if any([_file.name.endswith(_ext) for _ext in base_file_choices]):
self.file_paths["base_file"] = Path(str(_file))
elif not zipfile.is_zipfile(str(_file)):
sorted_files = sorted(Path(self.temporary_folder).iterdir())
for _file in sorted_files:
if not zipfile.is_zipfile(str(_file)):
if any([_file.name.endswith(_ext) for _ext in base_file_choices]):
self.file_paths["base_file"] = Path(str(_file))
ext = _file.name.split(".")[-1]
self.file_paths[f"{ext}_file"] = Path(str(_file))
if f"{ext}_file" in self.file_paths:
existing = self.file_paths[f"{ext}_file"]
self.file_paths[f"{ext}_file"] = [
Path(str(_file)),
*(existing if isinstance(existing, list) else [existing]),
]
else:
self.file_paths[f"{ext}_file"] = Path(str(_file))
tmp = self.file_paths.copy()
for key, value in self.file_paths.items():
if isinstance(value, list):
for index, file_path in enumerate(value):
n = f"{key}_{index}" if index > 0 else key
tmp[n] = file_path
self.file_paths = tmp
# removing the zip file
os.remove(zip_name)
......
......@@ -573,7 +573,21 @@ class TestDataRetriever(TestCase):
self.assertIsNotNone(storage_manager.data_retriever.temporary_folder)
_files = storage_manager.get_retrieved_paths()
self.assertTrue("example.csv" in _files.get("base_file"))
# Selected base_file is not defined in case of multiple csv files
self.assertTrue(_files.get("base_file").endswith(".csv"))
def test_zip_file_should_correctly_index_file_extensions(self):
    """
    Retrieving the example zip must expose both a ``csv_file`` entry and an
    indexed ``csv_file_1`` entry among the retrieved paths.
    """
    # build a fresh storage manager that points at the zipped fixture
    zip_path = os.path.join(f"{self.project_root}", "tests/data/example.zip")
    manager = self.sut(remote_files={"base_file": zip_path})
    manager.clone_remote_files()
    self.assertIsNotNone(manager.data_retriever.temporary_folder)

    retrieved = manager.get_retrieved_paths()
    # an extension found more than once gets an indexed key next to the plain one
    for expected_key in ("csv_file", "csv_file_1"):
        self.assertIsNotNone(retrieved.get(expected_key))
@override_settings(
SUPPORTED_DATASET_FILE_TYPES=[
......
No preview for this file type
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment