diff --git a/schema-api/files/serializers.py b/schema-api/files/serializers.py
index 9dfbf9e..6e8c936 100644
--- a/schema-api/files/serializers.py
+++ b/schema-api/files/serializers.py
@@ -89,3 +89,7 @@ def get_name(self, obj):
         if issubclass(obj.__class__, Directory):
             name += '/'
         return name
+
+class UnzipRequestSerializer(serializers.Serializer):
+    zip_path = serializers.CharField()
+    destination_path = serializers.CharField()
\ No newline at end of file
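The new serializer is plain DRF: two required string fields and nothing else. A minimal sketch of validating an unzip payload with it in isolation — the payload values are invented for illustration:

    from files.serializers import UnzipRequestSerializer

    # Hypothetical request body; both fields are required CharFields,
    # so a missing or blank value fails validation.
    serializer = UnzipRequestSerializer(data={
        'zip_path': 'archives/batch.zip',
        'destination_path': 'extracted/batch'
    })
    serializer.is_valid(raise_exception=True)  # raises ValidationError if either field is absent
    print(serializer.validated_data)
    # {'zip_path': 'archives/batch.zip', 'destination_path': 'extracted/batch'}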
diff --git a/schema-api/files/services.py b/schema-api/files/services.py
index ac2033c..192de51 100644
--- a/schema-api/files/services.py
+++ b/schema-api/files/services.py
@@ -10,6 +10,15 @@ from files.models import Directory, File, FileMetadata
 from util.exceptions import ApplicationError, ApplicationNotFoundError, ApplicationDuplicateError
 
+import zipfile  # File unzip functionality
+import tempfile  # File unzip functionality
+import shutil  # File unzip functionality
+import logging  # File unzip functionality
+from zipfile import BadZipFile  # Catch malformed zip files
+
+logger = logging.getLogger(
+    __name__)  # File unzip -> logging progress functionality
+
 
 class S3BucketService:
@@ -21,14 +30,14 @@ def __init__(self, auth_entity: AuthEntity):
         self.auth_entity = auth_entity
         self.bucket = str(self.auth_entity.uuid)
-        self.s3_client = boto3.client('s3',
-                                      endpoint_url=settings.S3['URL'],
-                                      aws_access_key_id=settings.S3['ACCESS_KEY_ID'],
-                                      aws_secret_access_key=settings.S3['SECRET_ACCESS_KEY'],
-                                      config=boto3.session.Config(signature_version='s3v4'),
-                                      verify=settings.S3['USE_SSL'],
-                                      use_ssl=settings.S3['USE_SSL']
-                                      )
+        self.s3_client = boto3.client(
+            's3',
+            endpoint_url=settings.S3['URL'],
+            aws_access_key_id=settings.S3['ACCESS_KEY_ID'],
+            aws_secret_access_key=settings.S3['SECRET_ACCESS_KEY'],
+            config=boto3.session.Config(signature_version='s3v4'),
+            verify=settings.S3['USE_SSL'],
+            use_ssl=settings.S3['USE_SSL'])
         self._create_bucket_if_not_exists()
 
     def _create_bucket_if_not_exists(self):
@@ -49,10 +58,12 @@ def _stat_object(self, key: str) -> File:
             response = self.s3_client.head_object(Bucket=self.bucket, Key=key)
         except ClientError as ce:
             if ce.response['Error']['Code'] == '404':
-                raise ApplicationNotFoundError(f'File `{key}` does not exist') from ce
+                raise ApplicationNotFoundError(
+                    f'File `{key}` does not exist') from ce
             raise
         d = Directory()
-        metadata = FileMetadata(size=response['ContentLength'], ts_modified=response['LastModified'])
+        metadata = FileMetadata(size=response['ContentLength'],
+                                ts_modified=response['LastModified'])
         return d.create_entity_on_path(File, key, metadata=metadata)
 
     def issue_upload_urls(self, size: int, file_path: str):
@@ -68,28 +79,42 @@
         if size > max_part_size:
-            part_sizes = [max_part_size] * (size // max_part_size) + [size % max_part_size]
+            part_sizes = [max_part_size] * (size // max_part_size) + [
+                size % max_part_size
+            ]
 
             # Issue new multipart upload creation
-            response = self.s3_client.create_multipart_upload(Bucket=self.bucket, Key=key, Expires=expiry)
+            response = self.s3_client.create_multipart_upload(
+                Bucket=self.bucket, Key=key, Expires=expiry)
             upload_id = response['UploadId']
             urls = []
             for i in range(len(part_sizes)):
-                url = self.s3_client.generate_presigned_url(ClientMethod='upload_part',
-                                                            Params={'Bucket': self.bucket, 'Key': key,
-                                                                    'PartNumber': i + 1,
-                                                                    'UploadId': upload_id,
-                                                                    'ContentLength': part_sizes[i]},
-                                                            ExpiresIn=validity_period_seconds)
-                urls.append({'part': i + 1, 'url': url, 'n_bytes': part_sizes[i]})
-
-            complete_url = self.s3_client.generate_presigned_url(ClientMethod='complete_multipart_upload',
-                                                                 Params={'Bucket': self.bucket, 'Key': key,
-                                                                         'UploadId': upload_id},
-                                                                 ExpiresIn=validity_period_seconds
-                                                                 )
+                url = self.s3_client.generate_presigned_url(
+                    ClientMethod='upload_part',
+                    Params={
+                        'Bucket': self.bucket,
+                        'Key': key,
+                        'PartNumber': i + 1,
+                        'UploadId': upload_id,
+                        'ContentLength': part_sizes[i]
+                    },
+                    ExpiresIn=validity_period_seconds)
+                urls.append({
+                    'part': i + 1,
+                    'url': url,
+                    'n_bytes': part_sizes[i]
+                })
+
+            complete_url = self.s3_client.generate_presigned_url(
+                ClientMethod='complete_multipart_upload',
+                Params={
+                    'Bucket': self.bucket,
+                    'Key': key,
+                    'UploadId': upload_id
+                },
+                ExpiresIn=validity_period_seconds)
             return {
                 'type': 'multipart',
                 'expiry': expiry,
@@ -98,14 +123,15 @@ def issue_upload_urls(self, size: int, file_path: str):
                     'finalize': complete_url
                 }
             }
-        url = self.s3_client.generate_presigned_url(ClientMethod='put_object',
-                                                    Params={'Bucket': self.bucket, 'Key': key, 'ContentLength': size},
-                                                    ExpiresIn=validity_period_seconds)
-        return {
-            'type': 'simple',
-            'expiry': expiry,
-            'url': url
-        }
+        url = self.s3_client.generate_presigned_url(
+            ClientMethod='put_object',
+            Params={
+                'Bucket': self.bucket,
+                'Key': key,
+                'ContentLength': size
+            },
+            ExpiresIn=validity_period_seconds)
+        return {'type': 'simple', 'expiry': expiry, 'url': url}
 
     def issue_download_urls(self, file_path: str):
         key = self._normalize_path(file_path)
@@ -115,12 +141,16 @@ def issue_download_urls(self, file_path: str):
         expiry = current_ref_ts + validity_period
         self._stat_object(key)
         return {
-            'expiry': expiry,
-            'url': self.s3_client.generate_presigned_url(
+            'expiry':
+            expiry,
+            'url':
+            self.s3_client.generate_presigned_url(
                 ClientMethod='get_object',
-                Params={'Bucket': self.bucket, 'Key': key},
-                ExpiresIn=validity_period_seconds
-            )
+                Params={
+                    'Bucket': self.bucket,
+                    'Key': key
+                },
+                ExpiresIn=validity_period_seconds)
         }
 
     def list_objects(self, subdir: str = '.') -> Directory:
@@ -139,17 +169,21 @@ def list_objects(self, subdir: str = '.') -> Directory:
         directory = Directory()
         for page in page_iterator:
             for obj in page.get('Contents', []):
-                metadata = FileMetadata(size=obj['Size'], ts_modified=obj['LastModified'])
+                metadata = FileMetadata(size=obj['Size'],
+                                        ts_modified=obj['LastModified'])
                 directory.create_entity_on_path(
                     File,
-                    obj['Key'][len(prefix):] if prefix != '' and obj['Key'].startswith(prefix) else obj['Key'],
-                    metadata=metadata
-                )
+                    obj['Key'][len(prefix):] if prefix != ''
+                    and obj['Key'].startswith(prefix) else obj['Key'],
+                    metadata=metadata)
         return directory
 
     # - same as copy and delete
-    def move_object(self, old_path: str, new_path: str, overwrite=False) -> File:
+    def move_object(self,
+                    old_path: str,
+                    new_path: str,
+                    overwrite=False) -> File:
         old_key = self._normalize_path(old_path)
         new_key = self._normalize_path(new_path)
@@ -165,12 +199,20 @@
     def retrieve_object(self, path: str) -> FileMetadata:
         key = self._normalize_path(path)
         return self._stat_object(key).metadata
 
+    def retrieve_object_bytes(self, path: str) -> bytes:
+        key = self._normalize_path(path)
+        response = self.s3_client.get_object(Bucket=self.bucket, Key=key)
+        return response['Body'].read()
+
     def delete_object(self, path: str) -> None:
         key = self._normalize_path(path)
         self._stat_object(key)
         self.s3_client.delete_object(Bucket=self.bucket, Key=key)
 
-    def copy_object(self, source_path: str, destination_path: str, overwrite: bool = False) -> File:
+    def copy_object(self,
+                    source_path: str,
+                    destination_path: str,
+                    overwrite: bool = False) -> File:
         source_key = self._normalize_path(source_path)
         destination_key = self._normalize_path(destination_path)
@@ -182,17 +224,108 @@ def copy_object(self, source_path: str, destination_path: str, overwrite: bool =
         except ApplicationNotFoundError:
             pass
         if found:
-            raise ApplicationDuplicateError({'destination': f'File `{destination_key}` already exists'})
+            raise ApplicationDuplicateError({
+                'destination':
+                f'File `{destination_key}` already exists'
+            })
         try:
-            self.s3_client.copy_object(
-                Bucket=self.bucket,
-                CopySource={'Bucket': self.bucket, 'Key': source_key},
-                Key=destination_key
-            )
+            self.s3_client.copy_object(Bucket=self.bucket,
+                                       CopySource={
+                                           'Bucket': self.bucket,
+                                           'Key': source_key
+                                       },
+                                       Key=destination_key)
         except ClientError as ce:
             if ce.response['Error']['Code'] == 'NoSuchKey':
-                raise ApplicationNotFoundError(f'File `{source_key}` does not exist') from ce
+                raise ApplicationNotFoundError(
+                    f'File `{source_key}` does not exist') from ce
             raise
-        return Directory().create_entity_on_path(File,destination_key)
+        return Directory().create_entity_on_path(File, destination_key)
+
+    def unzip_file_to_s3_folder(self,
+                                zip_path: str,
+                                destination_folder: str = "",
+                                progress_callback=None) -> list[dict]:
+        """
+        Unzips a zip file stored in S3 to a temporary directory and uploads its contents to S3.
+        Returns metadata of uploaded files. Optionally accepts a progress callback.
+
+        :param zip_path: Path to the .zip file in S3
+        :param destination_folder: Destination path prefix in S3
+        :param progress_callback: Optional callable(metadata_dict, index, total)
+        :return: List of metadata dicts with 's3_path', 'filename', 'size'
+        """
+        key = self._normalize_path(zip_path)
+        destination_folder = self._normalize_path(destination_folder)
+
+        uploaded_files_metadata = []
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            zip_tmp_path = os.path.join(tmpdir, "archive.zip")
+
+            logger.info(f"Downloading zip from S3: {zip_path}")
+            try:
+                with open(zip_tmp_path, "wb") as f:
+                    response = self.s3_client.get_object(Bucket=self.bucket,
+                                                         Key=key)
+                    shutil.copyfileobj(response["Body"], f)
+            except ClientError as ce:
+                if ce.response["Error"]["Code"] == "NoSuchKey":
+                    raise ApplicationNotFoundError(
+                        f"File `{key}` does not exist") from ce
+                raise
+
+            logger.info(f"Extracting zip file to temp dir: {tmpdir}")
+            try:
+                with zipfile.ZipFile(zip_tmp_path, 'r') as zip_ref:
+                    zip_ref.extractall(tmpdir)
+            except BadZipFile as e:
+                logger.error(
+                    f"File at {zip_path} is not a valid ZIP archive: {e}")
+                raise ValueError(
+                    "The provided file is not a valid zip archive.") from e
+
+            # Collect all files first
+            extracted_files = []
+            for root, _, files in os.walk(tmpdir):
+                for file_name in files:
+                    abs_file_path = os.path.join(root, file_name)
+                    if abs_file_path != zip_tmp_path:
+                        extracted_files.append(abs_file_path)
+
+            total_files = len(extracted_files)
+            logger.info(f"Uploading {total_files} extracted files to S3...")
+
+            for index, abs_file_path in enumerate(extracted_files, start=1):
+                rel_path = os.path.relpath(abs_file_path,
+                                           tmpdir).replace("\\", "/")
+                s3_key = os.path.join(destination_folder,
+                                      rel_path).replace("\\", "/")
+
+                with open(abs_file_path, "rb") as f:
+                    file_size = os.path.getsize(abs_file_path)
+                    self.s3_client.put_object(Bucket=self.bucket,
+                                              Key=s3_key,
+                                              Body=f)
+
+                metadata = {
+                    "s3_path": s3_key,
+                    "filename": os.path.basename(abs_file_path),
+                    "size": file_size
+                }
+                uploaded_files_metadata.append(metadata)
+
+                logger.info(
+                    f"[{index}/{total_files}] Uploaded: {s3_key} ({file_size} bytes)"
+                )
+
+                if progress_callback:
+                    try:
+                        progress_callback(metadata, index, total_files)
+                    except Exception as e:
+                        logger.warning(f"Progress callback failed: {e}")
+
+        logger.info("Unzip and upload complete.")
+        return uploaded_files_metadata
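A sketch of driving the new service method end-to-end, assuming an already-authenticated `AuthEntity` instance (here `auth_entity`, a stand-in — constructing one is outside this diff). The callback signature is the one the docstring documents, `(metadata_dict, index, total)`, and callback exceptions are only logged, so the upload loop keeps going:

    from files.services import S3BucketService

    def report(metadata, index, total):
        # Invoked once per uploaded file, after its put_object call.
        print(f"{index}/{total}: {metadata['s3_path']} ({metadata['size']} bytes)")

    service = S3BucketService(auth_entity)  # auth_entity: assumed existing AuthEntity
    uploaded = service.unzip_file_to_s3_folder(
        zip_path='archives/batch.zip',          # hypothetical object in this bucket
        destination_folder='extracted/batch',
        progress_callback=report)
    # uploaded -> [{'s3_path': ..., 'filename': ..., 'size': ...}, ...]

Note that the archive is staged and extracted in a local temporary directory before the per-file uploads, so the whole extracted tree must fit on local disk. For scale, the multipart arithmetic earlier in this file splits a 230,686,720-byte upload against a 104,857,600-byte part cap into parts of [104857600, 104857600, 20971520] — the same numbers that appear in the 'multi-part' example in views.py below.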
diff --git a/schema-api/files/urls.py b/schema-api/files/urls.py
index cc46962..2ae6b64 100644
--- a/schema-api/files/urls.py
+++ b/schema-api/files/urls.py
@@ -1,8 +1,12 @@
 from django.urls import path
 
-from files.views import FilesListAPIView, FileDetailsAPIView
+from files.views import FilesListAPIView, FileDetailsAPIView, FilePreviewAPIView, FileUnzipView
 
 urlpatterns = [
     path('files', FilesListAPIView.as_view(), name='files_list'),
-    path('files/<path:path>', FileDetailsAPIView.as_view(), name='file_details')
+    path('files/preview', FilePreviewAPIView.as_view(), name='file-preview'),
+    path("files/unzip/", FileUnzipView.as_view(), name="file-unzip"),
+    path('files/<path:path>',
+         FileDetailsAPIView.as_view(),
+         name='file_details'),
 ]
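Route order matters here: Django resolves urlpatterns top-down, and the catch-all `files/<path:path>` route would swallow `files/preview` and `files/unzip/` if it came first, so the literal routes are registered ahead of it. A hedged sketch of calling the two new endpoints — the host and the `Authorization` header scheme are placeholders, not something this diff pins down:

    import requests

    BASE = 'https://api.example.org'                 # placeholder host
    HEADERS = {'Authorization': 'Bearer MY_TOKEN'}   # scheme depends on ApiTokenAuthentication

    # Preview the first rows of a CSV already stored in the caller's bucket
    r = requests.get(f'{BASE}/files/preview',
                     params={'path': 'results/output.csv'},
                     headers=HEADERS)
    print(r.json())  # e.g. {'type': 'csv', 'preview': [['id', 'name'], ...]}

    # Unzip an uploaded archive into a destination prefix
    r = requests.post(f'{BASE}/files/unzip/',
                      json={'zip_path': 'archives/batch.zip',
                            'destination_path': 'extracted/batch'},
                      headers=HEADERS)
    print(r.json())  # {'status': 'success', 'extracted_files': [...]}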
diff --git a/schema-api/files/views.py b/schema-api/files/views.py
index 0a3215b..8b7a4ce 100644
--- a/schema-api/files/views.py
+++ b/schema-api/files/views.py
@@ -13,102 +13,110 @@
     FileMetadataSerializer
 from files.services import S3BucketService
 
+import csv, io, mimetypes  # File preview functionality
+from rest_framework.exceptions import NotFound, ValidationError  # File preview functionality
+from util.exceptions import ApplicationNotFoundError  # File preview functionality
+
 
 class FilesListAPIView(APIView):
-    authentication_classes = [ApiTokenAuthentication] if settings.USE_AUTH else []
-    permission_classes = [IsAuthenticated, IsUser, IsActive] if settings.USE_AUTH else []
+    authentication_classes = [ApiTokenAuthentication
+                              ] if settings.USE_AUTH else []
+    permission_classes = [IsAuthenticated, IsUser, IsActive
+                          ] if settings.USE_AUTH else []
 
     @extend_schema(
         summary='List directory contents',
-        description='Endpoint that allows to list directories and files under a specific sub-directory. Also may list'
-                    'all files that are recursively found in the specified subdirectory\'s tree',
+        description=
+        'Endpoint that allows to list directories and files under a specific sub-directory. Also may list'
+        'all files that are recursively found in the specified subdirectory\'s tree',
         tags=['Files'],
         parameters=[
-            OpenApiParameter('subdir', OpenApiTypes.STR, OpenApiParameter.QUERY, required=False, allow_blank=False,
+            OpenApiParameter('subdir',
+                             OpenApiTypes.STR,
+                             OpenApiParameter.QUERY,
+                             required=False,
+                             allow_blank=False,
                              many=False),
-            OpenApiParameter('recursive', OpenApiTypes.STR, OpenApiParameter.QUERY, required=False, allow_blank=True,
+            OpenApiParameter('recursive',
+                             OpenApiTypes.STR,
+                             OpenApiParameter.QUERY,
+                             required=False,
+                             allow_blank=True,
                              many=False)
         ],
         responses={
-            200: OpenApiResponse(
-                description='Stderr of the task specified by the UUID are returned',
+            200:
+            OpenApiResponse(
+                description=
+                'Stderr of the task specified by the UUID are returned',
                 response=FileNamedSerializer,
                 examples=[
                     OpenApiExample(
                         'directories-and-files',
                         summary='Listing of files and directories',
-                        value=[
-                            {
-                                "type": "file",
-                                "metadata": {
-                                    "size": 1258291200,
-                                    "ts_modified": "2024-04-13T20:41:08.419000Z"
-                                },
-                                "name": "bigfile.bin"
-                            },
-                            {
-                                "type": "directory",
-                                "metadata": {},
-                                "name": "img/"
+                        value=[{
+                            "type": "file",
+                            "metadata": {
+                                "size": 1258291200,
+                                "ts_modified": "2024-04-13T20:41:08.419000Z"
                             },
-                            {
-                                "type": "file",
-                                "metadata": {
-                                    "size": 95,
-                                    "ts_modified": "2024-06-26T11:37:56.492000Z"
-                                },
-                                "name": "report.txt"
+                            "name": "bigfile.bin"
+                        }, {
+                            "type": "directory",
+                            "metadata": {},
+                            "name": "img/"
+                        }, {
+                            "type": "file",
+                            "metadata": {
+                                "size": 95,
+                                "ts_modified": "2024-06-26T11:37:56.492000Z"
                             },
-                            {
-                                "type": "directory",
-                                "metadata": {},
-                                "name": "bin/"
-                            }
-                        ],
+                            "name": "report.txt"
+                        }, {
+                            "type": "directory",
+                            "metadata": {},
+                            "name": "bin/"
+                        }],
                         request_only=False,
                         response_only=True,
                     ),
                     OpenApiExample(
                         'recursive-listing-of-files',
                         summary='Recursive listing of files',
-                        value=[
-                            {
-                                "path": "directory0/directory1/stderr.log",
-                                "metadata": {
-                                    "size": 27,
-                                    "ts_modified": "2024-07-04T07:57:19.087000Z"
-                                }
-                            },
-                            {
-                                "path": "directory0/output.txt",
-                                "metadata": {
-                                    "size": 1196,
-                                    "ts_modified": "2024-06-25T09:05:23.853000Z"
-                                }
-                            },
-                            {
-                                "path": "upload.csv",
-                                "metadata": {
-                                    "size": 21330,
-                                    "ts_modified": "2024-02-19T14:49:32.209000Z"
-                                }
+                        value=[{
+                            "path": "directory0/directory1/stderr.log",
+                            "metadata": {
+                                "size": 27,
+                                "ts_modified": "2024-07-04T07:57:19.087000Z"
                             }
-                        ],
+                        }, {
+                            "path": "directory0/output.txt",
+                            "metadata": {
+                                "size": 1196,
+                                "ts_modified": "2024-06-25T09:05:23.853000Z"
+                            }
+                        }, {
+                            "path": "upload.csv",
+                            "metadata": {
+                                "size": 21330,
+                                "ts_modified": "2024-02-19T14:49:32.209000Z"
+                            }
+                        }],
                         request_only=False,
                         response_only=True,
                     )
-                ]
-            ),
-            400: OpenApiResponse(
-                description='Request was invalid. Response will contain information about potential errors in the '
-                            'request.'
-            ),
-            401: OpenApiResponse(
-                description='Authentication failed. Perhaps no API token was provided in the `Authorization` header, '
-                            'or the API token was invalid.'
-            )
-        }
-    )
+                ]),
+            400:
+            OpenApiResponse(
+                description=
+                'Request was invalid. Response will contain information about potential errors in the '
+                'request.'),
+            401:
+            OpenApiResponse(
+                description=
+                'Authentication failed. Perhaps no API token was provided in the `Authorization` header, '
+                'or the API token was invalid.')
+        })
     def get(self, request):
         qp_serializer = FilesListQPSerializer(data=request.query_params)
         qp_serializer.is_valid(raise_exception=True)
"upload.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRmGyKqKZrRd2W%2" + "F20240219%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240219T144625Z&X-Amz-Ex" + "pires=86400&X-Amz-SignedHeaders=content-length%3Bhost&X-Amz-Signature=6e6de51d0" + "c13569282cec13c70d8b50b4088b137f4baee6efab57274f912b7fc" } }, request_only=False, - response_only=True - ), + response_only=True), OpenApiExample( 'multi-part', summary='Multi-part upload URLs', @@ -214,72 +231,78 @@ def get(self, request): "type": "multipart", "expiry": "2024-01-01T14:52:37.290554", "urls": { - "parts": [ - { - "part": 1, - "url": "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b512" - "65e10/files/big_file.dat?partNumber=1&uploadId=OWFkYTZiMTEtMDlmMi00" - "NGMxLTg4MjgtNWFmNmMzZmMzNGFkLjM3MTI5YjE0LWJhZGEtNGVlZS05YjQzLWM3MTM" - "zZjljY2EwOQ&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRm" - "GyKqKZrRd2W%2F20240219%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=2" - "0240219T145237Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=content-len" - "gth%3Bhost&X-Amz-Signature=1ab96c54effa5618fc736a8033477a775ea998be" - "aa9c678a231adab75a690ade", - "n_bytes": 104857600 - }, - { - "part": 2, - "url": "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b512" - "65e10/files/big_file.dat?partNumber=2&uploadId=OWFkYTZiMTEtMDlmMi00" - "NGMxLTg4MjgtNWFmNmMzZmMzNGFkLjM3MTI5YjE0LWJhZGEtNGVlZS05YjQzLWM3MTM" - "zZjljY2EwOQ&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRm" - "GyKqKZrRd2W%2F20240219%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=2" - "0240219T145237Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=content-len" - "gth%3Bhost&X-Amz-Signature=c7a21db4a8ad5d73d6599d6919f273543372b7dd" - "d5247c87a01be458ccf4cff6", - "n_bytes": 104857600 - }, - { - "part": 3, - "url": "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b512" - "65e10/files/big_file.dat?partNumber=3&uploadId=OWFkYTZiMTEtMDlmMi00" - "NGMxLTg4MjgtNWFmNmMzZmMzNGFkLjM3MTI5YjE0LWJhZGEtNGVlZS05YjQzLWM3MTM" - "zZjljY2EwOQ&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRm" - "GyKqKZrRd2W%2F20240219%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=2" - "0240219T145237Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=content-len" - "gth%3Bhost&X-Amz-Signature=8b335fa92031c9e52bc66a2062623e8e0243151e" - "20fdf04e5da55410fb6ffa7f", - "n_bytes": 20971520 - } - ], - "finalize": "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b51265e" - "10/files/big_file.dat?uploadId=OWFkYTZiMTEtMDlmMi00NGMxLTg4MjgtNWFmNmM" - "zZmMzNGFkLjM3MTI5YjE0LWJhZGEtNGVlZS05YjQzLWM3MTMzZjljY2EwOQ&X-Amz-Algo" - "rithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRmGyKqKZrRd2W%2F20240219%2F" - "us-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240219T145237Z&X-Amz-Expire" - "s=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=d48e7ca7b6133729ec3dd" - "a3c27d36aa16d3e862c6fe6288f8368c3e7c709ce3e" + "parts": [{ + "part": + 1, + "url": + "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b512" + "65e10/files/big_file.dat?partNumber=1&uploadId=OWFkYTZiMTEtMDlmMi00" + "NGMxLTg4MjgtNWFmNmMzZmMzNGFkLjM3MTI5YjE0LWJhZGEtNGVlZS05YjQzLWM3MTM" + "zZjljY2EwOQ&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRm" + "GyKqKZrRd2W%2F20240219%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=2" + "0240219T145237Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=content-len" + "gth%3Bhost&X-Amz-Signature=1ab96c54effa5618fc736a8033477a775ea998be" + "aa9c678a231adab75a690ade", + "n_bytes": + 104857600 + }, { + "part": + 2, + "url": + "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b512" + 
"65e10/files/big_file.dat?partNumber=2&uploadId=OWFkYTZiMTEtMDlmMi00" + "NGMxLTg4MjgtNWFmNmMzZmMzNGFkLjM3MTI5YjE0LWJhZGEtNGVlZS05YjQzLWM3MTM" + "zZjljY2EwOQ&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRm" + "GyKqKZrRd2W%2F20240219%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=2" + "0240219T145237Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=content-len" + "gth%3Bhost&X-Amz-Signature=c7a21db4a8ad5d73d6599d6919f273543372b7dd" + "d5247c87a01be458ccf4cff6", + "n_bytes": + 104857600 + }, { + "part": + 3, + "url": + "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b512" + "65e10/files/big_file.dat?partNumber=3&uploadId=OWFkYTZiMTEtMDlmMi00" + "NGMxLTg4MjgtNWFmNmMzZmMzNGFkLjM3MTI5YjE0LWJhZGEtNGVlZS05YjQzLWM3MTM" + "zZjljY2EwOQ&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRm" + "GyKqKZrRd2W%2F20240219%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=2" + "0240219T145237Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=content-len" + "gth%3Bhost&X-Amz-Signature=8b335fa92031c9e52bc66a2062623e8e0243151e" + "20fdf04e5da55410fb6ffa7f", + "n_bytes": + 20971520 + }], + "finalize": + "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b51265e" + "10/files/big_file.dat?uploadId=OWFkYTZiMTEtMDlmMi00NGMxLTg4MjgtNWFmNmM" + "zZmMzNGFkLjM3MTI5YjE0LWJhZGEtNGVlZS05YjQzLWM3MTMzZjljY2EwOQ&X-Amz-Algo" + "rithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRmGyKqKZrRd2W%2F20240219%2F" + "us-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240219T145237Z&X-Amz-Expire" + "s=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=d48e7ca7b6133729ec3dd" + "a3c27d36aa16d3e862c6fe6288f8368c3e7c709ce3e" } } }, request_only=False, - response_only=True - ), - ] - ), - 400: OpenApiResponse( - description='Request was invalid. Response will contain information about potential errors in the ' - 'request.' - ), - 401: OpenApiResponse( - description='Authentication failed. Perhaps no API token was provided in the `Authorization` header, ' - 'or the API token was invalid.' - ), - 409: OpenApiResponse( - description='Target file already exists. Consider using overwrite' - ) - } - ) + response_only=True), + ]), + 400: + OpenApiResponse( + description= + 'Request was invalid. Response will contain information about potential errors in the ' + 'request.'), + 401: + OpenApiResponse( + description= + 'Authentication failed. Perhaps no API token was provided in the `Authorization` header, ' + 'or the API token was invalid.'), + 409: + OpenApiResponse( + description= + 'Target file already exists. 
+            409:
+            OpenApiResponse(
+                description=
+                'Target file already exists. Consider using overwrite')
+        })
     def post(self, request):
         file_create_serializer = FileCreateSerializer(data=request.data)
         file_create_serializer.is_valid(raise_exception=True)
@@ -292,35 +315,51 @@ def get(self, request):
             qp_serializer.is_valid(raise_exception=True)
             qp_serializer_validated_data = qp_serializer.validated_data
 
-            file = s3_service.copy_object(source_path=source, destination_path=validated_data['path'],
-                                          **qp_serializer_validated_data)
+            file = s3_service.copy_object(
+                source_path=source,
+                destination_path=validated_data['path'],
+                **qp_serializer_validated_data)
             file_ref_serializer = FileRefSerializer(file)
-            return Response(status=status.HTTP_201_CREATED, data=file_ref_serializer.data)
+            return Response(status=status.HTTP_201_CREATED,
+                            data=file_ref_serializer.data)
         else:
-            upload_info = s3_service.issue_upload_urls(size=validated_data['size'],
-                                                       file_path=validated_data['path'])
-            return Response(status=status.HTTP_201_CREATED, data={
-                **validated_data,
-                'upload_info': upload_info
-            })
+            upload_info = s3_service.issue_upload_urls(
+                size=validated_data['size'], file_path=validated_data['path'])
+            return Response(status=status.HTTP_201_CREATED,
+                            data={
+                                **validated_data, 'upload_info': upload_info
+                            })
 
 
 class FileDetailsAPIView(APIView):
-    authentication_classes = [ApiTokenAuthentication] if settings.USE_AUTH else []
-    permission_classes = [IsAuthenticated, IsUser, IsActive] if settings.USE_AUTH else []
+    authentication_classes = [ApiTokenAuthentication
+                              ] if settings.USE_AUTH else []
+    permission_classes = [IsAuthenticated, IsUser, IsActive
+                          ] if settings.USE_AUTH else []
 
     @extend_schema(
         summary='Retrieve file URL/metadata',
-        description='Endpoint that allows to retrieve metadata for an existing file or issue a download URL',
+        description=
+        'Endpoint that allows to retrieve metadata for an existing file or issue a download URL',
         tags=['Files'],
         parameters=[
-            OpenApiParameter('action', OpenApiTypes.BOOL, OpenApiParameter.QUERY, allow_blank=True, required=False,
-                             many=False, enum=['download', 'stat']),
-            OpenApiParameter('path', OpenApiTypes.STR, OpenApiParameter.PATH, allow_blank=False, required=True,
+            OpenApiParameter('action',
+                             OpenApiTypes.BOOL,
+                             OpenApiParameter.QUERY,
+                             allow_blank=True,
+                             required=False,
+                             many=False,
+                             enum=['download', 'stat']),
+            OpenApiParameter('path',
+                             OpenApiTypes.STR,
+                             OpenApiParameter.PATH,
+                             allow_blank=False,
+                             required=True,
                              many=False)
         ],
         responses={
-            200: OpenApiResponse(
+            200:
+            OpenApiResponse(
                 response=FileMetadataSerializer,
                 examples=[
                     OpenApiExample(
@@ -337,30 +376,31 @@ class FileDetailsAPIView(APIView):
                         'download-urls',
                         summary='Download URL issued',
                         value={
-                            "path": "files/big_file.dat",
-                            "expiry": "2024-02-21T17:22:37.602298",
-                            "url": "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b51265e10/files/big_"
-                                   "file.dat?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRmGyKqKZrRd2W%2F20240"
-                                   "220%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240220T172238Z&X-Amz-Expires=86400"
-                                   "&X-Amz-SignedHeaders=host&X-Amz-Signature=74667bda38320889b0453374c0b8c02c3578f8ef2"
-                                   "5a28d99f7663b8b3cdd9d2b"
-                        }
-                    )
-                ]
-            ),
-            400: OpenApiResponse(
-                description='Request was invalid. Response will contain information about potential errors in the '
-                            'request.'
-            ),
-            401: OpenApiResponse(
-                description='Authentication failed. Perhaps no API token was provided in the `Authorization` header, '
-                            'or the API token was invalid.'
-            ),
-            404: OpenApiResponse(
-                description='Provided file path wasn\'t found'
-            )
-        }
-    )
+                            "path":
+                            "files/big_file.dat",
+                            "expiry":
+                            "2024-02-21T17:22:37.602298",
+                            "url":
+                            "https://s3.hypatia-comp.athenarc.gr/9cb85312-cfd0-48d4-8839-f36b51265e10/files/big_"
+                            "file.dat?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=zYdRmGyKqKZrRd2W%2F20240"
+                            "220%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240220T172238Z&X-Amz-Expires=86400"
+                            "&X-Amz-SignedHeaders=host&X-Amz-Signature=74667bda38320889b0453374c0b8c02c3578f8ef2"
+                            "5a28d99f7663b8b3cdd9d2b"
+                        })
+                ]),
+            400:
+            OpenApiResponse(
+                description=
+                'Request was invalid. Response will contain information about potential errors in the '
+                'request.'),
+            401:
+            OpenApiResponse(
+                description=
+                'Authentication failed. Perhaps no API token was provided in the `Authorization` header, '
+                'or the API token was invalid.'),
+            404:
+            OpenApiResponse(description='Provided file path wasn\'t found')
+        })
     def get(self, request, path):
         qp_serializer = FileDetailsQPSerializer(data=request.query_params)
         qp_serializer.is_valid(raise_exception=True)
@@ -371,36 +411,37 @@ def get(self, request, path):
         if qp_serializer_validated_data['action'] == 'stat':
             file_metadata = s3_service.retrieve_object(path)
             file_serializer = FileMetadataSerializer(file_metadata)
-            return Response(data=file_serializer.data, status=status.HTTP_200_OK)
+            return Response(data=file_serializer.data,
+                            status=status.HTTP_200_OK)
         else:
             download_info = s3_service.issue_download_urls(path)
-            return Response(status=status.HTTP_200_OK, data={
-                'path': path,
-                **download_info
-            })
+            return Response(status=status.HTTP_200_OK,
+                            data={
+                                'path': path,
+                                **download_info
+                            })
 
     @extend_schema(
         summary='Delete a file',
         description='Endpoint that allows to delete a file',
         tags=['Files'],
         responses={
-            204: OpenApiResponse(
-                description='File was successfully deleted'
-            ),
-            400: OpenApiResponse(
-                description='Request was invalid. Response will contain information about potential errors in the '
-                            'request.'
-            ),
-            401: OpenApiResponse(
-                description='Authentication failed. Perhaps no API token was provided in the `Authorization` header, '
-                            'or the API token was invalid.'
-            ),
-            404: OpenApiResponse(
-                description='Provided file path wasn\'t found'
-            )
-        }
-    )
+            204:
+            OpenApiResponse(description='File was successfully deleted'),
+            400:
+            OpenApiResponse(
+                description=
+                'Request was invalid. Response will contain information about potential errors in the '
+                'request.'),
+            401:
+            OpenApiResponse(
+                description=
+                'Authentication failed. Perhaps no API token was provided in the `Authorization` header, '
+                'or the API token was invalid.'),
+            404:
+            OpenApiResponse(description='Provided file path wasn\'t found')
+        })
     def delete(self, request, path):
         s3_service = S3BucketService(request.user)
@@ -413,55 +454,59 @@ def delete(self, request, path):
         tags=['Files'],
         request=FileRefSerializer,
         parameters=[
-            OpenApiParameter('overwrite', OpenApiTypes.BOOL, OpenApiParameter.QUERY, allow_blank=True, required=False,
+            OpenApiParameter('overwrite',
+                             OpenApiTypes.BOOL,
+                             OpenApiParameter.QUERY,
+                             allow_blank=True,
+                             required=False,
                              many=False),
-            OpenApiParameter('source', OpenApiTypes.STR, OpenApiParameter.PATH, allow_blank=False, required=True,
+            OpenApiParameter('source',
+                             OpenApiTypes.STR,
+                             OpenApiParameter.PATH,
+                             allow_blank=False,
+                             required=True,
                              many=False)
         ],
         examples=[
             OpenApiExample(
                 'move',
                 summary='Move',
-                description='In this example a file is moved to `dir1/file0.txt`.',
-                value={
-                    "path": "dir1/file0.txt"
-                },
+                description=
+                'In this example a file is moved to `dir1/file0.txt`.',
+                value={"path": "dir1/file0.txt"},
                 request_only=True,
-                response_only=False
-            )
+                response_only=False)
         ],
         responses={
-            202: OpenApiResponse(
-                description='File successfully moved',
-                response=FileRefSerializer,
-                examples=[
-                    OpenApiExample(
-                        'moved_file',
-                        summary='Moved file path returned',
-                        value={
-                            "path": "dir0/file1.txt"
-                        },
-                        request_only=False,
-                        response_only=True,
-                    ),
-                ]
-            ),
-            400: OpenApiResponse(
-                description='Request was invalid. Response will contain information about potential errors in the '
-                            'request.'
-            ),
-            401: OpenApiResponse(
-                description='Authentication failed. Perhaps no API token was provided in the `Authorization` header, '
-                            'or the API token was invalid.'
-            ),
-            404: OpenApiResponse(
-                description='Provided source file wasn\'t found'
-            ),
-            409: OpenApiResponse(
-                description='Target file already exists. Consider using overwrite'
-            )
-        }
-    )
+            202:
+            OpenApiResponse(description='File successfully moved',
+                            response=FileRefSerializer,
+                            examples=[
+                                OpenApiExample(
+                                    'moved_file',
+                                    summary='Moved file path returned',
+                                    value={"path": "dir0/file1.txt"},
+                                    request_only=False,
+                                    response_only=True,
+                                ),
+                            ]),
+            400:
+            OpenApiResponse(
+                description=
+                'Request was invalid. Response will contain information about potential errors in the '
+                'request.'),
+            401:
+            OpenApiResponse(
+                description=
+                'Authentication failed. Perhaps no API token was provided in the `Authorization` header, '
+                'or the API token was invalid.'),
+            404:
+            OpenApiResponse(description='Provided source file wasn\'t found'),
+            409:
+            OpenApiResponse(
+                description=
+                'Target file already exists. Consider using overwrite')
+        })
     def patch(self, request, path):
         qp_serializer = FileCreateQPSerializer(data=request.query_params)
         qp_serializer.is_valid(raise_exception=True)
zip_path=validated["zip_path"], + destination_folder=validated["destination_path"]) + return Response( + { + "status": "success", + "extracted_files": extracted_metadata + }, + status=status.HTTP_200_OK) + except Exception as e: + logger.exception("Unzipping failed") + return Response({ + "status": "error", + "message": str(e) + }, + status=status.HTTP_500_INTERNAL_SERVER_ERROR)