
improvement: brand new way of installing and using Spark #54

Open · wants to merge 107 commits into base: master

Commits (107)
cea4e07
Remove useless files/scripts
mobidyc Sep 2, 2024
5ceacfc
build: Add script to build docker images for spark and s3utils contai…
mobidyc Sep 2, 2024
0ff6ff9
chore: Add Dockerfile for Spark and PySpark containers
mobidyc Sep 2, 2024
be2193d
feat: Add entrypoint script for Spark containers to run master, worke…
mobidyc Sep 2, 2024
48362ae
Add py3 version of scality module
mobidyc Sep 2, 2024
a7323c8
remove legacy spark version
mobidyc Sep 10, 2024
2e459d5
remove useless scripts
mobidyc Sep 10, 2024
6f561dc
chore: Add .gitignore file to exclude *.jar, *.tar, and *.tgz files
mobidyc Sep 10, 2024
32aaecf
chore: Update Dockerfile to optimize package installation and add py3…
mobidyc Sep 10, 2024
1adcebd
feat: Add pyspark 3.5.2 to requirements.txt
mobidyc Sep 10, 2024
7ca754f
chore: Update spark_run.sh script to support starting, stopping, and …
mobidyc Sep 10, 2024
95bdb2b
chore: Add spark-defaults.conf file with default Spark configuration
mobidyc Sep 10, 2024
023c502
chore: Update listkey.py script to use Python 3.9+ compatible element…
mobidyc Sep 19, 2024
dd32a64
Add new scality library
mobidyc Sep 19, 2024
63d840e
chore: Refactor submit.py script
mobidyc Sep 19, 2024
5d59c35
chore: Add Spark script to calculate sum of squares
mobidyc Sep 19, 2024
a1fe7e6
chore: Update Spark configuration and scripts
mobidyc Sep 19, 2024
304fa9f
Add full example of conf file
mobidyc Sep 19, 2024
6b5c853
Refactor Dockerfile to create /spark/jars/ directory
mobidyc Oct 9, 2024
aad86e6
Manage /etc/hosts changes
mobidyc Oct 11, 2024
8ee1ed4
add a few tshoot packages to Dockerfile, fix some comments in spark_r…
scality-fno Oct 11, 2024
00962b3
fix dependency: add aws sdk bundle
scality-fno Oct 15, 2024
ffb9983
RING-46168 improve jq pipeline in export_s3_keys.sh for performance
scality-fno Mar 31, 2024
c0b008f
add EL8 CTR support to export_s3_keys.sh
scality-fno Oct 15, 2024
615073b
change spark webui ports from 7077 and 8088 to 17077 and 18088
scality-fno Nov 26, 2024
a299c6d
make spark logdir configurable. And fix minor things.
scality-fno Mar 10, 2025
8767668
improve config.yml template and fix arcdata reference in P2 revlookup
scality-fno Mar 11, 2025
80ce2a5
make export_s3_keys.sh more robust
scality-fno Mar 31, 2025
cc22515
fix p0 script with unhexlify
scality-fno Apr 2, 2025
ee65923
improve listkey.py
scality-fno Apr 2, 2025
883b3cd
update S3 FSCK scripts to use FullLoader for YAML and upgrade Hadoop …
scality-fno Apr 2, 2025
c1bc7fe
fix indentation in submit.py and ensure driver memory configuration i…
mobidyc Apr 2, 2025
130cb7b
add type ignore comment to disable urllib3 warnings in listkey.py
mobidyc Apr 2, 2025
e00cdc3
add type check for Node instance in listkeys function
mobidyc Apr 2, 2025
8f22e5d
refactor listkeys function to clarify parameter usage and prevent rem…
mobidyc Apr 2, 2025
8b65273
refactor listkeys function to initialize count variable correctly and…
mobidyc Apr 2, 2025
ab3d273
rename prepare_path function to initialize_csv_directory for clarity …
mobidyc Apr 2, 2025
19405eb
update listkey.py to rename PATH to CSV_PATH for clarity and adjust d…
mobidyc Apr 2, 2025
b324521
refactor SparkSession initialization in listkey.py for clarity and ma…
mobidyc Apr 2, 2025
bc7e9e9
refactor listkeys function to improve error handling and enhance logg…
mobidyc Apr 2, 2025
047bac5
update check_key.py to use FullLoader for YAML loading to improve sec…
mobidyc Apr 2, 2025
68c644e
refactor check_key.py to improve code formatting and enhance readability
mobidyc Apr 2, 2025
c62d12f
refactor check_key.py to rename RING variable to RING_NAME for clarit…
mobidyc Apr 2, 2025
ebb06a4
refactor listkey.py to enhance error handling for configuration loadi…
mobidyc Apr 2, 2025
5adf876
refactor listkey.py to improve code readability and enhance logging f…
mobidyc Apr 2, 2025
7e9b422
update .gitignore to include additional file patterns for tar.gz, pyc…
mobidyc Apr 2, 2025
91fd81b
refactor listkey.py to rename RING variable to RING_NAME for clarity …
mobidyc Apr 2, 2025
80c0a99
🔧 refactor(check_key): rename PATH and PROT variables for clarity
mobidyc Apr 2, 2025
5945c04
🔧 chore(requirements): add urllib3 to dependencies
mobidyc Apr 2, 2025
2ae3f3f
✨ chore(ruff): add configuration file for Ruff linter and formatter
mobidyc Apr 2, 2025
0ef14b3
✨ chore(vscode): add configuration files for Ruff and Python settings
mobidyc Apr 2, 2025
d53ad0d
🔧 refactor(check_key): improve readability by formatting DataFrame sh…
mobidyc Apr 2, 2025
bf4c668
🔧 refactor(dig): fix indentation and update print function syntax
mobidyc Apr 2, 2025
f162393
🔧 refactor(scripts): update YAML loading and improve Spark session co…
mobidyc Apr 2, 2025
42d1466
Spark now uses the magic committer, and submit.py allows to launch a …
scality-gdoumergue May 20, 2025
67861f5
Add a script to test the connectivity to the S3 cluster
scality-gdoumergue May 21, 2025
a0a99fd
Make use of the s3a magic committer to speed up writes to S3, and use…
scality-gdoumergue May 22, 2025
d6a0752
Remove redundant SparkSession options, that are already set in submit.py
scality-gdoumergue May 22, 2025
a88c832
Cleanup listkey.py: remove prints, remove redundant SkarkContext configs
scality-gdoumergue May 23, 2025
88b5f3f
Remove the config.yml, so that it doesn't overwrite the one previousl…
scality-gdoumergue May 23, 2025
02efaa5
Improve spark_run.sh: Check Spark image, variabilize and create 2 wor…
scality-gdoumergue May 26, 2025
398b205
no need for old way scripts/offline-archive-setup.sh
scality-gdoumergue May 26, 2025
d1242d7
Dockerfile: need curl-dev pkgs for pycurl
scality-gdoumergue May 26, 2025
ebb1c81
Add comment in config template
scality-gdoumergue May 26, 2025
4671cc6
spark_run.sh checks the images version
scality-gdoumergue May 27, 2025
02f6826
fixup
scality-gdoumergue May 27, 2025
b84ab16
Add nodejs into the image and a script that lists the name of all the…
scality-gdoumergue May 27, 2025
09bf15f
Hide more things from git
scality-gdoumergue May 27, 2025
a79e5fb
Add shyaml to parse config from shell scripts
scality-gdoumergue May 27, 2025
8b40664
Ignore config file
scality-gdoumergue May 28, 2025
c4bdfc8
Able to script a build.
scality-gdoumergue May 28, 2025
3634d1c
add comments to config template
scality-gdoumergue May 28, 2025
e2dd01c
harmless bug in spark_run.sh
scality-gdoumergue May 28, 2025
dbbf036
more comments in config templates
scality-gdoumergue May 28, 2025
4fd4ade
extract_metadata_keys_to_s3.sh: the extraction of the sproxyd keys fr…
scality-gdoumergue May 28, 2025
b13c178
Spark image entrypoint now allows to run a shell or a script
scality-gdoumergue May 28, 2025
623c098
Working Spark image
scality-gdoumergue May 28, 2025
7852607
No more need for common.sh
scality-gdoumergue May 28, 2025
04817d6
S3 MD extraction now works with RAFT_SESSIONS variable
scality-gdoumergue May 28, 2025
ce41022
Generate and upload S3 MD journal backups
scality-gdoumergue May 29, 2025
7709900
extracted keys objects follow the old naming convention
scality-gdoumergue May 30, 2025
43f3655
Some more comments
scality-gdoumergue May 30, 2025
8e61695
Need gawk - not awk - for better csv processing
scality-gdoumergue May 30, 2025
c2f59dd
count_ring_keys.sh is now fully automated (and safer)
scality-gdoumergue May 30, 2025
9b33092
The Four Horsemen: 4 scripts to check that p0 and P1 were successful.
scality-gdoumergue May 30, 2025
2f26f9f
local dir (or work dir, or tmp dir) now works
scality-gdoumergue Jun 2, 2025
593338d
allow private ssh key distribution for S3_FSCK/s3_fsck_p2_reverselook…
scality-gdoumergue Jun 2, 2025
c058b4b
Comment s3_fsck_p* scripts - source: https://github.com/scality/spark…
scality-gdoumergue Jun 3, 2025
968da10
count scripts are more precise
scality-gdoumergue Jun 4, 2025
effa292
check un-committed changes before creating tarballs
scality-gdoumergue Jun 4, 2025
b4fd2c2
bugfix: spark_run.sh was badly handling mounts
scality-gdoumergue Jun 4, 2025
010cda9
Spark can now automatically import TLS certs
scality-gdoumergue Jun 5, 2025
5693cc4
"spark_run.sh exec" now works, add docker commands for "spark_run.sh …
scality-gdoumergue Jun 5, 2025
2bb1970
config template must comply with spark_run.sh and doc
scality-gdoumergue Jun 6, 2025
4c91fe0
Run Spark on RHEL/CentOS 7
scality-gdoumergue Jun 6, 2025
b18cdb8
submit.py now takes the ring's name from the config before defaulting…
scality-gdoumergue Jun 10, 2025
af8f7e4
make scripts/S3_FSCK/s3mdjournalbackuphashes.sh RHEL/CentOS 7 compatible
scality-gdoumergue Jun 10, 2025
bc782a2
Bugfix: scripts/S3_FSCK/s3_fsck_p3.py didn't correctly handle keys wi…
scality-gdoumergue Jun 10, 2025
ab7f6a9
The driver command now shows a dedicated prompt
scality-gdoumergue Jun 10, 2025
e383775
s3a path-style access restored
scality-gdoumergue Jun 11, 2025
fccc182
S3 access bugfix: the workers must have access to the apps dir to fin…
scality-gdoumergue Jun 11, 2025
6807e52
harmless typo in scripts/S3_FSCK/extract_metadata_keys_to_s3.sh
scality-gdoumergue Jun 12, 2025
fbcacbc
Spark 3.5.2-12: improvement for decoupled architectures
scality-gdoumergue Jun 12, 2025
41c5822
Permit the addition of extra ip:hostname pairs within containers
scality-gdoumergue Jun 12, 2025
08b3abf
scripts/S3_FSCK/s3mdjournalbackuphashes.sh can now run on stateless-o…
scality-gdoumergue Jun 23, 2025
5ecca82
patch verifyBucketSproxydKeys.js during the build, to workaround RD-404
scality-gdoumergue Jun 26, 2025
7b2bf2f
spark_run.sh must add the short hostname to /etc/hosts for master to …
scality-gdoumergue Aug 4, 2025
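Several of the commits above (42d1466, a0a99fd, e383775) switch writes to S3 over to the s3a magic committer with path-style access. As a minimal sketch, the kind of options involved can be collected like this — the property names are standard Hadoop/Spark settings, but the idea that submit.py assembles them as a dict, and the endpoint value, are assumptions for illustration:

```python
def magic_committer_conf(endpoint: str) -> dict:
    """Sketch of the s3a magic-committer options the commits refer to.

    The keys are standard Hadoop/Spark property names; how submit.py
    actually passes them is not shown in this PR excerpt.
    """
    return {
        # Route s3a:// traffic to the RING S3 endpoint, path-style.
        "spark.hadoop.fs.s3a.endpoint": endpoint,
        "spark.hadoop.fs.s3a.path.style.access": "true",
        # Select the magic committer instead of rename-based commits.
        "spark.hadoop.fs.s3a.committer.name": "magic",
        "spark.hadoop.fs.s3a.committer.magic.enabled": "true",
        # Bind Spark's commit protocol to the Hadoop cloud committers
        # (shipped in spark-hadoop-cloud_2.13-3.5.2.jar, see Dockerfile).
        "spark.sql.sources.commitProtocolClass":
            "org.apache.spark.internal.io.cloud.PathOutputCommitProtocol",
        "spark.sql.parquet.output.committer.class":
            "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter",
    }

conf = magic_committer_conf("https://s3.example.local")
```

The magic committer avoids the copy-then-delete rename pattern that is slow and non-atomic on object stores, which is why the commits pair it with the spark-hadoop-cloud jar.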
12 changes: 12 additions & 0 deletions .gitignore
@@ -0,0 +1,12 @@
*.jar
*.tar
*.tar.gz
*.tgz
*.pyc
scripts/scality
scripts/py4j
*.dist-info
*.log
nodejs
s3utils
scripts/config/config.yml
5 changes: 5 additions & 0 deletions .vscode/extensions.json
@@ -0,0 +1,5 @@
{
"recommendations": [
"charliermarsh.ruff"
]
}
61 changes: 61 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,61 @@
{
"python.analysis.typeCheckingMode": "standard",
"editor.quickSuggestions": {
"strings": true
},
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit"
}
},
"git.openRepositoryInParentFolders": "never",
"git.autofetch": true,
"git.enableSmartCommit": true,
"git.replaceTagsWhenPull": true,
"github.copilot.editor.enableAutoCompletions": true,
"python.createEnvironment.trigger": "off",
"cSpell.enabled": false,
"python.testing.pytestArgs": ["tests"],
"python.testing.pytestEnabled": true,
"python.testing.unittestEnabled": false,
"git.inputValidation": true,
"git.inputValidationLength": 72,
"git.inputValidationSubjectLength": 72,
"github.copilot.chat.commitMessageGeneration.instructions": [
{
"text": "Use the Conventional Commits format for all commit messages."
},
{
"text": "The commit subject must follow this pattern: <type>(<scope>): <description>."
},
{
"text": "Replace <type> with one of the following: feat, fix, chore, docs, style, refactor, perf, test, build, ci, revert."
},
{
"text": "The <scope> should be the affected module, feature, or component (e.g., 'auth', 'api', 'ui')."
},
{
"text": "The <description> should be a concise summary of the change, written in imperative mood."
},
{
"text": "If a commit introduces breaking changes, append 'BREAKING CHANGE:' followed by a detailed explanation in the body."
},
{
"text": "If referencing an issue, add 'Closes #123' or 'Fixes #456' in the commit body."
},
{
"text": "Limit the subject line to 72 characters."
},
{
"text": "Separate the subject from the body with a blank line."
},
{
"text": "The commit body should explain what changed and why, wrapped at 72 characters per line."
},
{
"text": "Include Gitmojis where relevant, placed before the <type> in the subject line. Example: '✨ feat(auth): add login via Google'."
}
],
}
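The Copilot instructions above enforce a Conventional Commits subject — optional gitmoji, `<type>(<scope>): <description>`, at most 72 characters. A small sketch of the implied validation (the helper and regex are illustrative, not part of the repo):

```python
import re

# Pattern implied by the settings above: optional gitmoji prefix,
# then <type>(<scope>): <description>.
SUBJECT_RE = re.compile(
    r"^(?:\S+ )?"  # optional gitmoji, e.g. "✨ "
    r"(feat|fix|chore|docs|style|refactor|perf|test|build|ci|revert)"
    r"\([\w.-]+\): \S.*$"
)

def valid_subject(subject: str) -> bool:
    """Check a commit subject against the convention configured above."""
    return len(subject) <= 72 and SUBJECT_RE.match(subject) is not None
```

For example, `valid_subject("✨ feat(auth): add login via Google")` accepts the sample subject from the instructions, while a bare `"update stuff"` is rejected.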
133 changes: 133 additions & 0 deletions Dockerfile
@@ -0,0 +1,133 @@
ARG NODE_IMAGE=16.20.2-bullseye-slim
ARG NODE_VERSION=16.20.2

##############################
# builder: nodejs dependencies
##############################

# Multi-stage build: the builder stage keeps node/yarn build
# tooling out of the final image, so the runner stays lighter.
FROM node:${NODE_IMAGE} as builder

ENV NVM_DIR=/root/.nvm

RUN --mount=type=cache,sharing=locked,target=/var/cache/apt apt update \
&& apt-get install -y --no-install-recommends \
curl \
git \
build-essential \
python3 \
jq \
ssh \
ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

COPY nodejs ./nodejs

WORKDIR nodejs

# The node_version.txt file carries node's version to the runner stage:
# ARG values are scoped per build stage and are not inherited unless
# re-declared, so NODE_VERSION is not visible after the next FROM.
RUN yarn install --production --network-concurrency 1 && \
echo "${NODE_VERSION}" > node_version.txt

##########################################
#
# RUNNER
#
##########################################

FROM python:3.8-slim-bullseye

RUN --mount=type=cache,sharing=locked,target=/var/cache/apt apt update \
&& apt-get install -y --no-install-recommends \
ca-certificates \
sudo \
curl \
libcurl4-openssl-dev libssl-dev \
awscli \
inetutils-ping \
netcat-traditional \
wget \
vim \
unzip \
rsync \
openjdk-11-jdk \
build-essential \
software-properties-common \
ssh \
jq \
gawk \
net-tools \
less \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

ENV NVM_DIR=/opt/nvm
ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"}
ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
ENV SPARK_MASTER_HOST="spark-master"
ENV SPARK_MASTER_PORT="17077"
ENV SPARK_MASTER="spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT}"
ENV PYSPARK_PYTHON=python3
ENV PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH

RUN mkdir -p ${HADOOP_HOME} ${SPARK_HOME}/scality-tools /spark/jars/
WORKDIR ${SPARK_HOME}

# Install what's been yarned by the builder part
COPY --from=builder nodejs/ ./scality-tools/

## Install nodejs without yarn
RUN NVM_NODE_VERSION=$(cat ./scality-tools/node_version.txt) && \
mkdir -p "${NVM_DIR}" && \
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.3/install.sh | /bin/bash && \
. "${NVM_DIR}/nvm.sh" && nvm install ${NVM_NODE_VERSION} && \
nvm use v${NVM_NODE_VERSION} && \
nvm alias default v${NVM_NODE_VERSION}

ENV PATH="${NVM_DIR}/versions/node/v${NVM_NODE_VERSION}/bin/:${PATH}"

# Time to work on Spark & Python stuff

COPY requirements.txt /tmp/requirements.txt
COPY scality-0.1-py3-none-any.whl /tmp/
COPY --from=ghcr.io/astral-sh/uv:0.4.8 /uv /bin/uv

RUN --mount=type=cache,target=/root/.cache/uv \
uv pip compile /tmp/requirements.txt > /tmp/requirements-compiled.txt \
&& uv pip sync --system /tmp/requirements-compiled.txt \
&& uv pip install --system /tmp/scality-0.1-py3-none-any.whl
Review comment on lines +98 to +103 (Member):

question:

Is the legacy uv version a workaround to get pip 3.8 to install the requirements file without tracebacks?

With Python 3.8 EOL in October 2024, is there any reason not to take this opportunity to bump to Python 3.9, or to Python 3.11, the maximum version Spark 3.5.2 supports?



# glob pattern so the COPY does not fail when the file is absent locally
COPY spark-3.5.2-bin-hadoop3.tg[z] /tmp/
# -N (timestamping) skips the download if an up-to-date copy is already present
RUN cd /tmp \
&& wget -N https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz \
&& tar xvzf spark-3.5.2-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
&& rm -f spark-3.5.2-bin-hadoop3.tgz

COPY conf/spark-defaults.conf ${SPARK_HOME}/conf
COPY conf/spark-env.sh ${SPARK_HOME}/conf

# https://github.com/sayedabdallah/Read-Write-AWS-S3
# https://spot.io/blog/improve-apache-spark-performance-with-the-s3-magic-committer/
COPY aws-java-sdk-bundle-1.12.770.ja[r] /spark/jars/
COPY hadoop-aws-3.3.4.ja[r] /spark/jars/
COPY spark-hadoop-cloud_2.13-3.5.2.ja[r] /spark/jars/
RUN cd /spark/jars/ \
&& wget -N https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.770/aws-java-sdk-bundle-1.12.770.jar \
&& wget -N https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar \
&& wget -N https://repo1.maven.org/maven2/org/apache/spark/spark-hadoop-cloud_2.13/3.5.2/spark-hadoop-cloud_2.13-3.5.2.jar

# Misc
RUN chmod u+x /opt/spark/sbin/* /opt/spark/bin/* && \
aws configure set default.s3.multipart_threshold 64MB && \
aws configure set default.s3.multipart_chunksize 32MB

COPY entrypoint.sh .
ENTRYPOINT ["/opt/spark/entrypoint.sh"]
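The ENV lines above compose `SPARK_MASTER` from `SPARK_MASTER_HOST` and `SPARK_MASTER_PORT` (moved to 17077 in commit 615073b). As a sketch, the same composition — useful for any helper script that must agree with the image defaults — looks like this; the function itself is illustrative and not part of the image:

```python
import os

def spark_master_url(env=None) -> str:
    """Recompose the master URL the way the Dockerfile's ENV lines do.

    Defaults mirror SPARK_MASTER_HOST / SPARK_MASTER_PORT above.
    """
    if env is None:
        env = os.environ
    host = env.get("SPARK_MASTER_HOST", "spark-master")
    port = env.get("SPARK_MASTER_PORT", "17077")
    return f"spark://{host}:{port}"
```

With no overrides this yields `spark://spark-master:17077`, matching the `SPARK_MASTER` value baked into the image.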
17 changes: 0 additions & 17 deletions Dockerfile-master

This file was deleted.

34 changes: 0 additions & 34 deletions Dockerfile-worker

This file was deleted.

@@ -1,4 +1,4 @@
- master: "spark://{{ hostvars[groups['sparkmaster'][0]]['ansible_host'] }}:7077"
+ master: "spark://{{ hostvars[groups['sparkmaster'][0]]['ansible_host'] }}:17077"
ring: "DATA"
path: "{{ bucket_name }}"
protocol: s3a # Protocol can be either file or s3a.
(diff truncated)
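The template above pairs a `path` with a `protocol` that, per its comment, is either `file` or `s3a`. A minimal sketch of how a script might turn those two keys into a working URI — the helper name and validation are hypothetical, not taken from the repo:

```python
def data_path(protocol: str, path: str) -> str:
    """Build a URI from the template's 'protocol' and 'path' keys.

    Per the template comment, protocol is either 'file' or 's3a'.
    """
    if protocol not in ("file", "s3a"):
        raise ValueError(f"unsupported protocol: {protocol}")
    return f"{protocol}://{path}"
```

So `data_path("s3a", "mybucket")` gives `s3a://mybucket`, and `data_path("file", "/tmp/out")` gives `file:///tmp/out`.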
Binary file removed: aws-java-sdk-1.7.4.jar