diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8582fbb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+*.jar
+*.tar
+*.tar.gz
+*.tgz
+*.pyc
+scripts/scality
+scripts/py4j
+*.dist-info
+*.log
+nodejs
+s3utils
+scripts/config/config.yml
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..424b18f
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,5 @@
+{
+    "recommendations": [
+        "charliermarsh.ruff"
+    ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..7a1b1af
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,61 @@
+{
+    "python.analysis.typeCheckingMode": "standard",
+    "editor.quickSuggestions": {
+        "strings": true
+    },
+    "[python]": {
+        "editor.defaultFormatter": "charliermarsh.ruff",
+        "editor.formatOnSave": true,
+        "editor.codeActionsOnSave": {
+            "source.organizeImports": "explicit"
+        }
+    },
+    "git.openRepositoryInParentFolders": "never",
+    "git.autofetch": true,
+    "git.enableSmartCommit": true,
+    "git.replaceTagsWhenPull": true,
+    "github.copilot.editor.enableAutoCompletions": true,
+    "python.createEnvironment.trigger": "off",
+    "cSpell.enabled": false,
+    "python.testing.pytestArgs": ["tests"],
+    "python.testing.pytestEnabled": true,
+    "python.testing.unittestEnabled": false,
+    "git.inputValidation": true,
+    "git.inputValidationLength": 72,
+    "git.inputValidationSubjectLength": 72,
+    "github.copilot.chat.commitMessageGeneration.instructions": [
+        {
+            "text": "Use the Conventional Commits format for all commit messages."
+        },
+        {
+            "text": "The commit subject must follow this pattern: <type>(<scope>): <description>."
+        },
+        {
+            "text": "Replace <type> with one of the following: feat, fix, chore, docs, style, refactor, perf, test, build, ci, revert."
+        },
+        {
+            "text": "The <scope> should be the affected module, feature, or component (e.g., 'auth', 'api', 'ui')."
+        },
+        {
+            "text": "The <description> should be a concise summary of the change, written in imperative mood."
+        },
+        {
+            "text": "If a commit introduces breaking changes, append 'BREAKING CHANGE:' followed by a detailed explanation in the body."
+        },
+        {
+            "text": "If referencing an issue, add 'Closes #123' or 'Fixes #456' in the commit body."
+        },
+        {
+            "text": "Limit the subject line to 72 characters."
+        },
+        {
+            "text": "Separate the subject from the body with a blank line."
+        },
+        {
+            "text": "The commit body should explain what changed and why, wrapped at 72 characters per line."
+        },
+        {
+            "text": "Include Gitmojis where relevant, placed before the <type> in the subject line. Example: '✨ feat(auth): add login via Google'."
+        }
+    ]
+}
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..65d46bc
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,133 @@
+ARG NODE_IMAGE=16.20.2-bullseye-slim
+ARG NODE_VERSION=16.20.2
+
+##############################
+# builder: nodejs dependencies
+##############################
+
+# Multi-stage build: the builder stage keeps
+# the final image as light as possible.
+FROM node:${NODE_IMAGE} as builder + +ENV NVM_DIR=/root/.nvm + +RUN --mount=type=cache,sharing=locked,target=/var/cache/apt apt update \ + && apt-get install -y --no-install-recommends \ + curl \ + git \ + build-essential \ + python3 \ + jq \ + ssh \ + ca-certificates \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +COPY nodejs ./nodejs + +WORKDIR nodejs + +# The node_version.txt file brings node's version to the next steps +# because I don't know why the NODE_VERSION variable is not passed +# to the runner part +RUN yarn install --production --network-concurrency 1 && \ + echo "${NODE_VERSION}" > node_version.txt + +########################################## +# +# RUNNER +# +########################################## + +FROM python:3.8-slim-bullseye + +RUN --mount=type=cache,sharing=locked,target=/var/cache/apt apt update \ + && apt-get install -y --no-install-recommends \ + ca-certificates \ + sudo \ + curl \ + libcurl4-openssl-dev libssl-dev \ + awscli \ + inetutils-ping \ + netcat-traditional \ + wget \ + vim \ + unzip \ + rsync \ + openjdk-11-jdk \ + build-essential \ + software-properties-common \ + ssh \ + jq \ + gawk \ + net-tools \ + less \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENV NVM_DIR=/opt/nvm +ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" +ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"} +ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"} +ENV SPARK_MASTER_HOST="spark-master" +ENV SPARK_MASTER_PORT="17077" +ENV SPARK_MASTER="spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT}" +ENV PYSPARK_PYTHON=python3 +ENV PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH + +RUN mkdir -p ${HADOOP_HOME} ${SPARK_HOME}/scality-tools /spark/jars/ +WORKDIR ${SPARK_HOME} + +# Install what's been yarned by the builder part +COPY --from=builder nodejs/ ./scality-tools/ + +## Install nodejs without yarn +RUN NVM_NODE_VERSION=$(cat ./scality-tools/node_version.txt) && \ + mkdir -p "${NVM_DIR}" && \ + curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.3/install.sh | /bin/bash && \ + . 
"${NVM_DIR}/nvm.sh" && nvm install ${NVM_NODE_VERSION} && \ + nvm use v${NVM_NODE_VERSION} && \ + nvm alias default v${NVM_NODE_VERSION} + +ENV PATH="${NVM_DIR}/versions/node/v${NVM_NODE_VERSION}/bin/:${PATH}" + +# Time to work on Spark & Python stuff + +COPY requirements.txt /tmp/requirements.txt +COPY scality-0.1-py3-none-any.whl /tmp/ +COPY --from=ghcr.io/astral-sh/uv:0.4.8 /uv /bin/uv + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip compile /tmp/requirements.txt > /tmp/requirements-compiled.txt \ + && uv pip sync --system /tmp/requirements-compiled.txt \ + && uv pip install --system /tmp/scality-0.1-py3-none-any.whl + + +# globbing to not fail if not found +COPY spark-3.5.2-bin-hadoop3.tg[z] /tmp/ +# -N enable timestamping to condition download if already present or not +RUN cd /tmp \ + && wget -N https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz \ + && tar xvzf spark-3.5.2-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \ + && rm -f spark-3.5.2-bin-hadoop3.tgz + +COPY conf/spark-defaults.conf ${SPARK_HOME}/conf +COPY conf/spark-env.sh ${SPARK_HOME}/conf + +# https://github.com/sayedabdallah/Read-Write-AWS-S3 +# https://spot.io/blog/improve-apache-spark-performance-with-the-s3-magic-committer/ +COPY aws-java-sdk-bundle-1.12.770.ja[r] /spark/jars/ +COPY hadoop-aws-3.3.4.ja[r] /spark/jars/ +COPY spark-hadoop-cloud_2.13-3.5.2.ja[r] /spark/jars/ +RUN cd /spark/jars/ \ + && wget -N https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.770/aws-java-sdk-bundle-1.12.770.jar \ + && wget -N https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar \ + && wget -N https://repo1.maven.org/maven2/org/apache/spark/spark-hadoop-cloud_2.13/3.5.2/spark-hadoop-cloud_2.13-3.5.2.jar + +# Misc +RUN chmod u+x /opt/spark/sbin/* /opt/spark/bin/* && \ + aws configure set default.s3.multipart_threshold 64MB && \ + aws configure set default.s3.multipart_chunksize 32MB + +COPY entrypoint.sh . 
+ENTRYPOINT ["/opt/spark/entrypoint.sh"] diff --git a/Dockerfile-master b/Dockerfile-master deleted file mode 100644 index aebb2bf..0000000 --- a/Dockerfile-master +++ /dev/null @@ -1,17 +0,0 @@ -FROM openjdk:8-alpine -RUN apk --update add wget tar bash && \ - rm -rf /var/cache/apk/* -RUN apk --no-cache add bash procps python R && \ - rm -rf /var/cache/apk/* -#RUN wget http://apache.mirror.anlx.net/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz - -RUN cd /tmp && \ - wget https://archive.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz && \ - tar -xzf spark-2.4.3-bin-hadoop2.7.tgz && \ - mv spark-2.4.3-bin-hadoop2.7 /spark && \ - rm spark-2.4.3-bin-hadoop2.7.tgz - -COPY start-master.sh /start-master.sh -RUN chmod +x /start-master.sh - -ENTRYPOINT ["/start-master.sh"] diff --git a/Dockerfile-worker b/Dockerfile-worker deleted file mode 100644 index a527137..0000000 --- a/Dockerfile-worker +++ /dev/null @@ -1,34 +0,0 @@ -FROM openjdk:8-alpine -RUN apk --update add wget tar bash gcc openssl-dev make && \ - apk --update add python-dev && \ - apk --update add musl-dev libffi-dev && \ - apk --no-cache add bash procps python R py-pip && \ - apk --no-cache add curl libcurl curl-dev g++ && \ - rm -rf /var/cache/apk/* -ENV PYCURL_SSL_LIBRARY=openssl -RUN pip install requests && \ - pip install pyopenssl && \ - pip install certifi && \ - pip install pycurl && \ - pip install kazoo && \ - pip install pyzmq && \ - pip install python-dateutil && \ - pip install s3fs - -#RUN wget http://apache.mirror.anlx.net/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz -#COPY spark-2.4.3-bin-hadoop2.7.tgz /tmp/spark-2.4.3-bin-hadoop2.7.tgz - -RUN cd /tmp && \ - wget https://archive.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz && \ - tar -xzf /tmp/spark-2.4.3-bin-hadoop2.7.tgz && \ - mv spark-2.4.3-bin-hadoop2.7 /spark && \ - rm /tmp/spark-2.4.3-bin-hadoop2.7.tgz - -COPY aws-java-sdk-1.7.4.jar /spark/spark-2.4.3-bin-hadoop2.7/jars/ -COPY hadoop-aws-2.7.3.jar /spark/spark-2.4.3-bin-hadoop2.7/jars/ -COPY start-worker.sh /start-worker.sh -COPY scality-0.1-py2-none-any.whl /tmp/ -RUN pip install --no-index file:///tmp/scality-0.1-py2-none-any.whl -RUN chmod +x /start-worker.sh - -ENTRYPOINT ["/start-worker.sh"] diff --git a/ansible/roles/create-sample-config/templates/config-template.yml.j2 b/ansible/roles/create-sample-config/templates/config-template.yml.j2 index aab8ec7..d2683bb 100755 --- a/ansible/roles/create-sample-config/templates/config-template.yml.j2 +++ b/ansible/roles/create-sample-config/templates/config-template.yml.j2 @@ -1,4 +1,4 @@ -master: "spark://{{ hostvars[groups['sparkmaster'][0]]['ansible_host'] }}:7077" +master: "spark://{{ hostvars[groups['sparkmaster'][0]]['ansible_host'] }}:17077" ring: "DATA" path: "{{ bucket_name }}" protocol: s3a # Protocol can be either file or s3a. diff --git a/aws-java-sdk-1.7.4.jar b/aws-java-sdk-1.7.4.jar deleted file mode 100644 index 02233a8..0000000 Binary files a/aws-java-sdk-1.7.4.jar and /dev/null differ diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..42140a8 --- /dev/null +++ b/build.sh @@ -0,0 +1,138 @@ +#!/bin/bash + +# build.sh +# Builds the spark image used by Scality +# Author: Cédrick Gaillard +# Contributors: François Nomarski, Grégoire Doumergue + +# s3utils Tag for 9.4.2.1 +# Will surely be bumped -> TODO bump it here as well. +S3UTILS_TAG=1.15.3 + +# Get the spark image and version from spark_run.sh +. 
$(dirname $0)/spark_run.sh env + +SPARK_IMAGE_ARCHIVE="/tmp/spark-image-${VERSION}.tgz" +SPARK_SCRIPTS_ARCHIVE="/tmp/scality-spark-scripts-${VERSION}.tgz" + +container_command="$(basename "$(type -p docker)")" + +if [ -z "${container_command}" ];then + echo "Install docker (and docker-buildx if you're on debian-like OS)." + exit 1 +fi + +# Check that we have git +git_command="$(basename "$(type -p git)")" + +if [ -z "${git_command}" ];then + echo "Install git." + exit 1 +fi + +# Check that there's no un-committed changes (otherwise, they won't +# be included in the build) + +nb_uncommitted=$(git status --porcelain=v1 2>/dev/null | wc -l) + +if [ ${nb_uncommitted} -gt 0 ];then + echo "WARNING The following files were not committed - they won't be + included in ${SPARK_SCRIPTS_ARCHIVE}." + echo "" + git status --porcelain=v1 2>/dev/null + echo "" + echo "Do you want to proceed?" + select yn in "Yes" "No"; do + case $yn in + Yes ) break;; + No ) exit;; + esac +done + +fi + +if [ ! -f scality-0.1-py3-none-any.whl ] ; then + mkdir scality_tmp || exit 1 + cd scality_tmp || exit 1 + unzip ../scality-0.1-py2-none-any.whl + # we don't want to load rpm installed scality modules + mv scality scalityexport + + # Migrate from python2 to python3, 2to3 from python3-devel pkg. + 2to3 . -n -w . + + cat > setup.py << EOL +from setuptools import setup, find_packages + +setup( + name='scality', + version='0.1', + packages=find_packages(), + install_requires=[ + # List your package dependencies here + ], + entry_points={ + 'console_scripts': [ + # Add any command-line scripts here + ] + }, + # Metadata + author='CG', + description='scality 2to3', + license='Your License', + keywords='some, keywords', + url='http://your_package_url.com', +) +EOL + + # rebuild wheel + python setup.py bdist_wheel + + cp dist/scality-0.1-py3-none-any.whl ../ + + cd - || exit 1 +fi + +# Download Spark + +# https://github.com/sayedabdallah/Read-Write-AWS-S3 +wget -N https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz +wget -N https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.12.770/aws-java-sdk-bundle-1.12.770.jar +wget -N https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar +#Greg - adding magic committer +wget -N https://repo1.maven.org/maven2/org/apache/spark/spark-hadoop-cloud_2.13/3.5.2/spark-hadoop-cloud_2.13-3.5.2.jar + +# Download s3utils to extract the verifyBucketSproxydKeys.js script +# and its dependencies +mkdir nodejs +git clone git@github.com:scality/s3utils +git -C s3utils checkout "${S3UTILS_TAG}" +NODE_IMAGE=$(awk -F= '/NODE_VERSION=/ {print $2}' s3utils/Dockerfile) +NODE_VERSION=$(echo "${NODE_IMAGE}" | awk -F - '{print $1}' ) + +#cp s3utils/package.json nodejs/package.json +# Restrict dependencies to verifyBucketSproxydKeys.js deps only. +jq '.dependencies |= with_entries(select(.key | IN("async", "httpagent","JSONStream","arsenal","werelogs")))' s3utils/package.json > nodejs/package.json +cp -r s3utils/verifyBucketSproxydKeys.js s3utils/VerifyBucketSproxydKeys s3utils/CompareRaftMembers s3utils/yarn.lock nodejs + +# Patch verifyBucketSproxydKeys.js to workaround RD-404. +# +# +if [ -f "verifyBucketSproxydKeys_${S3UTILS_TAG}.js.patch" ];then + echo "Patching nodejs/verifyBucketSproxydKeys.js to workaround RD-404" + patch -p0 < verifyBucketSproxydKeys_${S3UTILS_TAG}.js.patch + +else + echo "No patch found to workaround RD-404. Are you sure s3utils v. ${S3UTILS_TAG} solves it?" 
+fi +echo "Building ${IMAGE_NAME}:${VERSION}" +docker build -f Dockerfile . -t ${IMAGE_NAME}:${VERSION} --build-arg NODE_IMAGE=${NODE_IMAGE} --build-arg NODE_VERSION=${NODE_VERSION} && \ +echo "Saving the image ... (long)" && \ +docker save ${IMAGE_NAME}:${VERSION} | gzip -c9 > ${SPARK_IMAGE_ARCHIVE} && \ +echo "Archiving the scripts ..." && \ +git archive --format=tgz HEAD > ${SPARK_SCRIPTS_ARCHIVE} + +echo "Upload ${SPARK_IMAGE_ARCHIVE} and ${SPARK_SCRIPTS_ARCHIVE} to the supervisor." + +rm -fr nodejs s3utils + diff --git a/conf/spark-defaults.conf b/conf/spark-defaults.conf new file mode 100644 index 0000000..1d0276c --- /dev/null +++ b/conf/spark-defaults.conf @@ -0,0 +1,4 @@ +spark.master spark://spark-master:17077 +spark.eventLog.enabled true +spark.eventLog.dir /opt/spark/spark-events +spark.history.fs.logDirectory /opt/spark/spark-events diff --git a/conf/spark-env.sh b/conf/spark-env.sh new file mode 100644 index 0000000..aafc7a7 --- /dev/null +++ b/conf/spark-env.sh @@ -0,0 +1,5 @@ + +export SPARK_LOCAL_DIRS=/opt/spark/tmp +export SPARK_EXECUTOR_DIRS=/opt/spark/tmp +export SPARK_LOG_DIR=/opt/spark/spark-events + diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100755 index 0000000..620e368 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +SPARK_WORKLOAD=$1 + +if [ -d /opt/spark/apps/config/tls_ca_certs ];then + + cp /opt/spark/apps/config/tls_ca_certs/* /usr/local/share/ca-certificates/ + rm -f /usr/local/share/ca-certificates/README.txt + /usr/sbin/update-ca-certificates > /var/log/update-ca-certificates.log 2>&1 +fi + +case "$SPARK_WORKLOAD" +in + master) + + start-master.sh \ + --ip spark-master \ + --port 17077 \ + --webui-port 18088 + ;; + worker) + + start-worker.sh --webui-port 18088 spark://spark-master:17077 + ;; + history) + + start-history-server.sh + ;; + driver) + export PS1='driver> ' + /bin/bash --norc + ;; + exec) + shift + "$@" + ;; +esac diff --git a/hadoop-aws-2.7.3.jar b/hadoop-aws-2.7.3.jar deleted file mode 100644 index 2c773ab..0000000 Binary files a/hadoop-aws-2.7.3.jar and /dev/null differ diff --git a/master.sh b/master.sh deleted file mode 100755 index 6721689..0000000 --- a/master.sh +++ /dev/null @@ -1,4 +0,0 @@ -#docker run --rm -it --name spark-master --hostname spark-master \ -# -p 7077:7077 -p 8080:8080 spark /spark/bin/spark-class org.apache.spark.deploy.master.Master --ip `hostname` --port 7077 --webui-port 8080 - -docker run -d --rm -it --net=host --name spark-master --hostname spark-master --add-host spark-master:178.33.63.238 spark-master diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e43a90f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +pandas +numpy +requests +pyyaml +pyopenssl +certifi +pycurl +kazoo +pyzmq +python-dateutil +s3fs +pyspark==3.5.2 +urllib3 +shyaml diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..c36b1f8 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,67 @@ +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", + ".venv", +] + +# Activer les correctifs automatiques +fix = true +include = ["*.py"] +indent-width = 4 +line-length = 120 +preview = true + +[lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by 
default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +#select = ["I", "E4", "E7", "E9", "F"] +select = ["I", "E", "W", "F", "B"] +ignore = ["E266", "F401"] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +logger-objects = ["resources.logging_setup.logger"] +# Inclure les règles liées aux imports + + +[format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + + +[lint.isort] +required-imports = ["import logging"] +order-by-type = true diff --git a/scality-0.1-py3-none-any.whl b/scality-0.1-py3-none-any.whl new file mode 100644 index 0000000..c037490 Binary files /dev/null and b/scality-0.1-py3-none-any.whl differ diff --git a/scripts/S3_FSCK/README.md b/scripts/S3_FSCK/README.md index 80f32c2..c20b32b 100644 --- a/scripts/S3_FSCK/README.md +++ b/scripts/S3_FSCK/README.md @@ -67,10 +67,10 @@ docker run --rm -dit --net=host --name spark-master \ ### Configuration -Edit scripts/config/config.yaml and fill out the master field with the IP address of the endpoint you ran the master container. The port must be defined as 7077. +Edit scripts/config/config.yaml and fill out the master field with the IP address of the endpoint you ran the master container. The port must be defined as 17077. ``` -master: "spark://178.33.63.238:7077" +master: "spark://178.33.63.238:17077" ``` ## How to submit a job to the cluster diff --git a/scripts/S3_FSCK/count_extracted_metadata_keys.sh b/scripts/S3_FSCK/count_extracted_metadata_keys.sh new file mode 100644 index 0000000..65e4688 --- /dev/null +++ b/scripts/S3_FSCK/count_extracted_metadata_keys.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +CONFIG_FILE="$(dirname $0)/../config/config.yml" + +if [ ! -f "${CONFIG_FILE}" ];then + + echo "${CONFIG_FILE} not found. Did you mount the scripts directory?" + exit 1 +fi + +WORKDIR="$(shyaml get-value datadir.container < ${CONFIG_FILE})" + +if [ ! -d "${WORKDIR}" ];then + echo "${WORKDIR} must be mounted by docker/ctr." + exit 1 +fi + +S3_ENDPOINT="$(shyaml get-value s3.endpoint < ${CONFIG_FILE})" +export AWS_ACCESS_KEY_ID="$(shyaml get-value s3.access_key < ${CONFIG_FILE})" +export AWS_SECRET_ACCESS_KEY="$(shyaml get-value s3.secret_key < ${CONFIG_FILE})" +ORPHANS_BUCKET="$(shyaml get-value path < ${CONFIG_FILE})" +RING="$(shyaml get-value ring < ${CONFIG_FILE})" + +######### RUN + +TMP_DIR="${WORKDIR}/count_extracted_md_keys" +mkdir "${TMP_DIR}" + +LIST_PATH="${RING}/s3-bucketd" +SOURCE="s3://${ORPHANS_BUCKET}/${LIST_PATH}/" + +TOTAL_COUNT=$(aws --endpoint-url "${S3_ENDPOINT}" s3 ls --recursive "${SOURCE}" | wc -l) + +echo "-- Counting" +TOTAL=0 +COUNT=1 +for F in $(aws --endpoint-url "${S3_ENDPOINT}" s3 ls --recursive "${SOURCE}" | awk '{print $4}');do + + echo "--- [${COUNT}/${TOTAL_COUNT}] Counting from ${F}" + aws --endpoint-url "${S3_ENDPOINT}" s3 cp "s3://${ORPHANS_BUCKET}/${F}" "${TMP_DIR}/count_me.csv" > /dev/null + TOTAL=$(( ${TOTAL}+$(wc -l "${TMP_DIR}/count_me.csv" |awk '{print $1}') )) + COUNT=$(( ${COUNT} + 1 )) + rm -f "${TMP_DIR}/count_me.csv" + +done + +rm -fr "${TMP_DIR}" + +echo "" +echo "$TOTAL keys extracted from S3 Metadata." 
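The loop above copies every CSV part object to a temporary file just to run `wc -l` on it. For reference, here is a minimal boto3 sketch of the same count that streams instead of copying; it assumes the same config.yml fields the script reads, the config path is illustrative, and boto3 itself is an extra dependency that is not in requirements.txt:

```python
# Sketch: count the lines of every object under <ring>/s3-bucketd/
# without writing temp files. Assumes the config.yml keys used above;
# boto3 is an extra dependency (not in requirements.txt).
import boto3
import yaml

with open("scripts/config/config.yml") as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)

s3 = boto3.client(
    "s3",
    endpoint_url=cfg["s3"]["endpoint"],
    aws_access_key_id=cfg["s3"]["access_key"],
    aws_secret_access_key=cfg["s3"]["secret_key"],
)

bucket = cfg["path"]
prefix = "%s/s3-bucketd/" % cfg["ring"]

total = 0
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    for obj in page.get("Contents", []):
        body = s3.get_object(Bucket=bucket, Key=obj["Key"])["Body"]
        total += sum(1 for _ in body.iter_lines())

print("%d keys extracted from S3 Metadata." % total)
```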
+
diff --git a/scripts/S3_FSCK/count_p0-dig-keys.sh b/scripts/S3_FSCK/count_p0-dig-keys.sh
index d2ab963..e1b4c10 100644
--- a/scripts/S3_FSCK/count_p0-dig-keys.sh
+++ b/scripts/S3_FSCK/count_p0-dig-keys.sh
@@ -1,7 +1,75 @@
 #!/bin/bash
 
+CONFIG_FILE="$(dirname $0)/../config/config.yml"
+
+if [ ! -f "${CONFIG_FILE}" ];then
+
+    echo "${CONFIG_FILE} not found. Did you mount the scripts directory?"
+    exit 1
+fi
+
+WORKDIR="$(shyaml get-value datadir.container < ${CONFIG_FILE})"
+
+if [ ! -d "${WORKDIR}" ];then
+    echo "${WORKDIR} must be mounted by docker/ctr."
+    exit 1
+fi
+
+S3_ENDPOINT="$(shyaml get-value s3.endpoint < ${CONFIG_FILE})"
+export AWS_ACCESS_KEY_ID="$(shyaml get-value s3.access_key < ${CONFIG_FILE})"
+export AWS_SECRET_ACCESS_KEY="$(shyaml get-value s3.secret_key < ${CONFIG_FILE})"
+ORPHANS_BUCKET="$(shyaml get-value path < ${CONFIG_FILE})"
+RING="$(shyaml get-value ring < ${CONFIG_FILE})"
+
+######### RUN
+
+TMP_DIR="${WORKDIR}/count_p0_dig_keys"
+mkdir "${TMP_DIR}"
+
+LIST_PATH="${RING}/s3fsck/s3-dig-keys.csv"
+SOURCE="s3://${ORPHANS_BUCKET}/${LIST_PATH}/"
+
+# Check that P0 was a success
+HEAD_RESULT=$(aws --endpoint-url "${S3_ENDPOINT}" s3api head-object --bucket "${ORPHANS_BUCKET}" --key "${LIST_PATH}/_SUCCESS" | jq -r '.LastModified')
+
+if [ -n "${HEAD_RESULT}" ];then
+    echo "-- P0 was a SUCCESS on ${HEAD_RESULT}"
+else
+    echo "ERROR Couldn't find ${SOURCE}_SUCCESS. Run s3_fsck_p0.py again."
+    exit 1
+fi
+
+TOTAL_COUNT=$(aws --endpoint-url "${S3_ENDPOINT}" s3 ls --recursive "${SOURCE}" | grep -cv _SUCCESS)
+
+echo "-- Counting"
+TOTAL_SINGLE=0
+TOTAL_SPLIT=0
+TOTAL_NOK=0
 TOTAL=0
-SINGLE=$(awk 'BEGIN {count=0} ; /SINGLE/ {count++} END {print count}' part* )
-SPLIT=$(awk 'BEGIN {count=0} ; !/subkey/ && !/SINGLE/ && !seen[$1]++ {count++} END {print count}' part* )
-let TOTAL+=${SINGLE}+${SPLIT}
-echo "$TOTAL s3 sproxyd dig keys from p0 output"
+COUNT=1
+SPLIT_LISTING="${TMP_DIR}/splits.csv"
+
+rm -f "${SPLIT_LISTING}"
+
+for F in $(aws --endpoint-url "${S3_ENDPOINT}" s3 ls --recursive "${SOURCE}" | awk '$4 !~ /_SUCCESS/ {print $4}');do
+
+    echo "--- [${COUNT}/${TOTAL_COUNT}] Counting from ${F}"
+    PROCESS_FILE="${TMP_DIR}/count_me.csv"
+    aws --endpoint-url "${S3_ENDPOINT}" s3 cp "s3://${ORPHANS_BUCKET}/${F}" "${PROCESS_FILE}" > /dev/null
+    TOTAL_SINGLE=$(( ${TOTAL_SINGLE} + $(grep -v subkey "${PROCESS_FILE}" | grep -v NOK | awk 'BEGIN {count=0} ; /SINGLE/ {count++} END {print count}') ))
+    grep -v "subkey" "${PROCESS_FILE}" | grep -v NOK | grep -v "SINGLE" >> "${SPLIT_LISTING}"
+    TOTAL_NOK=$(( ${TOTAL_NOK} + $(grep -c NOK "${PROCESS_FILE}") ))
+    COUNT=$(( ${COUNT} + 1 ))
+    rm -f "${PROCESS_FILE}"
+
+done
+
+echo "-- Deduplicating split listing"
+TOTAL_SPLIT=$(awk '!seen[$1]++ {count++} END {print count}' "${SPLIT_LISTING}")
+TOTAL=$(( ${TOTAL_SINGLE} + ${TOTAL_SPLIT} ))
+rm -fr "${TMP_DIR}"
+
+echo ""
+echo "${TOTAL_NOK} keys with NOK or NOK_HTTP status"
+echo "${TOTAL} keys computed by P0."
+
diff --git a/scripts/S3_FSCK/count_p1-arc-keys.sh b/scripts/S3_FSCK/count_p1-arc-keys.sh
index a8790d8..0f92cfd 100644
--- a/scripts/S3_FSCK/count_p1-arc-keys.sh
+++ b/scripts/S3_FSCK/count_p1-arc-keys.sh
@@ -1,7 +1,67 @@
 #!/bin/bash
 
+CONFIG_FILE="$(dirname $0)/../config/config.yml"
+
+if [ ! -f "${CONFIG_FILE}" ];then
+
+    echo "${CONFIG_FILE} not found. Did you mount the scripts directory?"
+    exit 1
+fi
+
+WORKDIR="$(shyaml get-value datadir.container < ${CONFIG_FILE})"
+
+if [ ! -d "${WORKDIR}" ];then
+    echo "${WORKDIR} must be mounted by docker/ctr."
+ exit 1 +fi + +S3_ENDPOINT="$(shyaml get-value s3.endpoint < ${CONFIG_FILE})" +export AWS_ACCESS_KEY_ID="$(shyaml get-value s3.access_key < ${CONFIG_FILE})" +export AWS_SECRET_ACCESS_KEY="$(shyaml get-value s3.secret_key < ${CONFIG_FILE})" +ORPHANS_BUCKET="$(shyaml get-value path < ${CONFIG_FILE})" +RING="$(shyaml get-value ring < ${CONFIG_FILE})" + +######### RUN + +TMP_DIR="${WORKDIR}/count_p1_arc_keys" +mkdir "${TMP_DIR}" + +LIST_PATH="${RING}/s3fsck/arc-keys.csv" +SOURCE="s3://${ORPHANS_BUCKET}/${LIST_PATH}/" + +# Check that P1 was a success +HEAD_RESULT=$(aws --endpoint-url "${S3_ENDPOINT}" s3api head-object --bucket "${ORPHANS_BUCKET}" --key "${LIST_PATH}/_SUCCESS" | jq -r '.LastModified') + +if [ -n "${HEAD_RESULT}" ];then + echo "-- P1 was a SUCCESS on ${HEAD_RESULT}" +else + echo "ERROR Couldn't find ${SOURCE}_SUCCESS. Run s3_fsck_p1.py again." + exit 1 +fi + +# TOTAL_COUNT has two purposes: +# - Indicate how many csv files will be processed +# - Substract the number of headers from the grand total +TOTAL_COUNT=$(aws --endpoint-url "${S3_ENDPOINT}" s3 ls "${SOURCE}" | grep -cv _SUCCESS) + +echo "-- Counting" TOTAL=0 -NUM_HEADERS=$(ls part* | wc -l) -LINES=$(cat part* | wc -l) -let TOTAL+=${LINES}-${NUM_HEADERS} -echo "$TOTAL arc keys parsed from arc-keys.csv" \ No newline at end of file +COUNT=1 +for F in $(aws --endpoint-url "${S3_ENDPOINT}" s3 ls --recursive "${SOURCE}" | awk '$4 !~ /_SUCCESS/ {print $4}');do + + echo "--- [${COUNT}/${TOTAL_COUNT}] Counting from ${F}" + aws --endpoint-url "${S3_ENDPOINT}" s3 cp "s3://${ORPHANS_BUCKET}/${F}" "${TMP_DIR}/count_me.csv" > /dev/null + TOTAL=$(( ${TOTAL}+$(wc -l "${TMP_DIR}/count_me.csv" |awk '{print $1}') )) + COUNT=$(( ${COUNT} + 1 )) + rm -f "${TMP_DIR}/count_me.csv" + +done + +# We remove the count of headers included in the S3 objects. +TOTAL=$(( ${TOTAL} - ${TOTAL_COUNT} )) + +rm -fr "${TMP_DIR}" + +echo "" +echo "$TOTAL keys computed by P1." + diff --git a/scripts/S3_FSCK/export_s3_keys.sh b/scripts/S3_FSCK/export_s3_keys.sh index c0541f0..5ab0ed8 100644 --- a/scripts/S3_FSCK/export_s3_keys.sh +++ b/scripts/S3_FSCK/export_s3_keys.sh @@ -1,32 +1,163 @@ -#!/bin/bash +#!/usr/bin/env bash +# ----------------------------------------------------------------------------- +# Script Name: export_s3_keys.sh +# Description: This script exports a list of (s3key, sproxydkey) for a given raft session. +# It retrieves bucket information from a specified Raft session ID +# and processes the keys using a containerized utility. +# +# Prerequisites: +# - The `jq` command-line JSON processor must be installed. +# - Either `docker` or `ctr` must be available for running the containerized utility. +# +# Usage: +# ./export_s3_keys.sh (-r|--rid) [(-b|--bucketd) ] +# +# Arguments: +# -r|--rid : The Raft session ID to retrieve bucket information. +# -b|--bucketd : (Optional) The address and port of the Scality BucketD service. +# If not provided, the script attempts to determine it automatically. +# +# Workflow: +# 1. Validates the presence of required tools (`jq`, `docker`/`ctr`). +# 2. Determines the default bucketd address:port if not provided. +# 3. Retrieves the list of buckets for the specified Raft session ID. +# 4. For each bucket: +# - Runs a containerized utility to verify and export Sproxyd keys. +# - Processes the output to extract relevant key information. +# - Saves the processed keys to a file in the working directory. +# +# Output: +# - Processed keys are saved in `/var/tmp/bucketSproxydKeys/_keys.txt`. 
+# - Temporary raw output files are removed after processing. +# +# Notes: +# - Buckets containing "mpuShadowBucket" or "users..bucket" in their names are skipped. +# - The script creates the working directory `/var/tmp/bucketSproxydKeys` if it does not exist. +# +# Error Handling: +# - Exits with an error message if prerequisites are not met or required arguments are missing. +# - Logs errors if the bucketd address cannot be determined or if no keys file is generated. +# +# Limitations: +# - The script does not handle errors from the containerized utility: s3utils, esp. if verifyBucketSproxydKeys.js fails. +# - The script does not handle errors from the `jq` command. +# - The script does not handle errors from the `docker` or `ctr` commands. +# - The script does not handle errors from the `curl` command. +# ----------------------------------------------------------------------------- + +# Check if jq is installed +if ! command -v jq &> /dev/null; then + echo "Error: jq is not installed. Please install jq to proceed." + exit 1 +fi + +# Default value for BUCKETD address:port +DEFAULT_BUCKETD="$(ss -Hnlp sport :9000 | awk 'match($0,/([^ ]+:9000)/,B){print B[1]}')" +if [ -z "$DEFAULT_BUCKETD" ]; then + echo "Error: Could not determine DEFAULT_BUCKETD (address:port for scality-bucketd). Please specify it manually using the (-b|--bucketd) option." + exit 1 +fi + +# Parse arguments +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -r|--rid) + RID="$2" + shift + shift + ;; + -b|--bucketd) + BUCKETD="$2" + shift + shift + ;; + *) + echo "Unknown option: $1" + echo "Usage: $0 (-r|--rid) [(-b|--bucketd) ]" + exit 1 + ;; + esac +done + +# Check if RID is provided +if [ -z "$RID" ]; then + echo "Error: Missing required argument (-r|--rid) . Please provide the Raft session ID." + echo "Usage: $0 (-r|--rid) [(-b|--bucketd) ]" + exit 1 +fi + +# Use default value if BUCKETD is not provided +BUCKETD="${BUCKETD:-$DEFAULT_BUCKETD}" -RID=$1 export WORKDIR=/var/tmp/bucketSproxydKeys +container_command="$(basename "$(type -p docker || type -p ctr)")" +test -z "$container_command" && echo "docker or CTR not found!" && exit 1 if ! [ -d "${WORKDIR}" ] then mkdir -pv ${WORKDIR} fi -for bucket in $(curl --silent http://localhost:9000/_/raft_sessions/${RID}/bucket | jq -r '.[] | select (. | contains("mpuShadowBucket") | not) | select (. | contains("users..bucket") | not)') +for bucket in $(curl --silent http://${BUCKETD}/_/raft_sessions/${RID}/bucket | jq -r '.[] | select (. | contains("mpuShadowBucket") | not) | select (. | contains("users..bucket") | not)') do echo "--- Starting on ${bucket} ---" - docker run \ - --rm \ - -it \ - --net host \ - --entrypoint /usr/local/bin/node \ - -e 'BUCKETD_HOSTPORT=127.0.0.1:9000' \ - -e "BUCKETS=${bucket}" \ - -e 'NO_MISSING_KEY_CHECK=1' \ - -e 'VERBOSE=1' \ - ghcr.io/scality/s3utils:1.14.6 \ - verifyBucketSproxydKeys.js \ - > ${WORKDIR}/raw_${bucket}_keys.txt - - echo "--- Processing output... ---" - jq -r '. 
| select(.message | contains("sproxyd key")) + {"bucket": .objectUrl } | .bucket |= sub("s3://(?.*)/.*"; .bname) | .objectUrl |= sub("s3://.*/(?.*)$"; .oname) | [.bucket, .objectUrl, .sproxydKey] | @csv' ${WORKDIR}/raw_${bucket}_keys.txt > ${WORKDIR}/${bucket}_keys.txt + case $container_command + in + docker) echo "Running $container_command" + $container_command run \ + --rm \ + -it \ + --net host \ + --entrypoint /usr/local/bin/node \ + -e "BUCKETD_HOSTPORT=${BUCKETD}" \ + -e "BUCKETS=${bucket}" \ + -e 'NO_MISSING_KEY_CHECK=1' \ + -e 'VERBOSE=1' \ + registry.scality.com/s3utils/s3utils:1.14.101 \ + verifyBucketSproxydKeys.js \ + > ${WORKDIR}/raw_${bucket}_keys.txt + ;; + ctr) echo "Running $container_command" + # Define cleanup function for trap + cleanup() { + echo "Cleaning up ctr task, snapshot and container..." + $container_command task kill EXPORT_S3_KEYS 2>/dev/null || true + $container_command snapshot rm EXPORT_S3_KEYS 2>/dev/null || true + $container_command container rm EXPORT_S3_KEYS 2>/dev/null || true + exit 1 + } + + # Set trap for CTRL+C + trap cleanup INT + + $container_command run \ + --rm \ + --net-host \ + --mount='type=bind,src=/root/spark-apps,dst=/opt/spark/apps,options=rbind:rw' \ + -cwd=/usr/src/app \ + --env "BUCKETD_HOSTPORT=${BUCKETD}" \ + --env "BUCKETS=${bucket}" \ + --env 'NO_MISSING_KEY_CHECK=1' \ + --env 'VERBOSE=1' \ + registry.scality.com/s3utils/s3utils:1.14.101 \ + EXPORT_S3_KEYS \ + /usr/local/bin/node verifyBucketSproxydKeys.js \ + > ${WORKDIR}/raw_${bucket}_keys.txt + + # Remove trap after successful execution + trap - INT + ;; + esac + + echo "--- Processing output of ${bucket}... ---" + jq -r 'select(.message | contains("sproxyd key"))| [(.objectUrl | split("s3://") | .[1] | split("/") | .[0]),(.objectUrl | split("s3://") | .[1] | split("/") | .[-1]),.sproxydKey]| @csv' ${WORKDIR}/raw_${bucket}_keys.txt > ${WORKDIR}/${bucket}_keys.txt rm -f ${WORKDIR}/raw_${bucket}_keys.txt - echo + if [ -f "${WORKDIR}/${bucket}_keys.txt" ]; then + echo "--- Lines count of ${bucket}: $(wc -l ${WORKDIR}/${bucket}_keys.txt) ---" + else + echo "--- No keys file found for ${bucket}. Skipping line count. ---" + fi done + diff --git a/scripts/S3_FSCK/extract_metadata_keys_to_s3.sh b/scripts/S3_FSCK/extract_metadata_keys_to_s3.sh new file mode 100755 index 0000000..6d90182 --- /dev/null +++ b/scripts/S3_FSCK/extract_metadata_keys_to_s3.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# Extract location keys from S3 objects Metadata +# And automatically upload them to an S3 bucket +# +# Author: Gregoire Doumergue +# +# Warning: This script only works from within +# a SPARK container. + +# The outcome of this script is SUPER TOUCHY. +# Any failure needs to BREAK the workflow +# to be sure we end up with the complete list +# of sproxyd keys from S3 Metadata. +# Do NOT comment out this line. + +set -e +trap 'echo ""; + echo "################################################################"; + echo "Something wrong happened, do NOT proceed until fixed."; + echo ""; + exit 1' ERR + +CONFIG_FILE="$(dirname $0)/../config/config.yml" + +# Need to know where node is installed +. /opt/nvm/nvm.sh + +# Need to find node_modules dir +cd /opt/spark/scality-tools + +if [ ! -f "${CONFIG_FILE}" ];then + + echo "${CONFIG_FILE} not found. Did you mount the scripts directory?" + exit 1 +fi + +if [ -z "${RAFT_SESSIONS}" -a -z "${BUCKETS}" ];then + + echo "Provide either RAFT_SESSIONS or BUCKETS to extact." + exit 1 +fi + +WORKDIR="$(shyaml get-value datadir.container < ${CONFIG_FILE})" + +if [ ! 
-d "${WORKDIR}" ];then + echo "${WORKDIR} must be mounted by docker/ctr." + exit 1 +fi + +BUCKETD_HOSTPORT="$(shyaml get-value bucketd.url < ${CONFIG_FILE} | awk -F / '{print $NF}')" +if [ -z "${BUCKETD_HOSTPORT}" ];then + + BUCKETD_HOSTPORT="$(netstat -lnt | awk '/:9000 / {print $4}')" + if [[ -z "${BUCKETD_HOSTPORT}" ]]; then + echo "Error: Failed to determine bucketd's listening ip:port locally." + exit 1 + fi + echo "No bucketd.url provided. Using locally determined value: ${BUCKETD_HOSTPORT}" +else + echo "Using provided bucketd.url: ${BUCKETD_HOSTPORT}" +fi +export BUCKETD_HOSTPORT + +S3_ENDPOINT="$(shyaml get-value s3.endpoint_on_light < ${CONFIG_FILE})" +[ -z "${S3_ENDPOINT}" ] && S3_ENDPOINT="$(shyaml get-value s3.endpoint < ${CONFIG_FILE})" +export AWS_ACCESS_KEY_ID="$(shyaml get-value s3.access_key < ${CONFIG_FILE})" +export AWS_SECRET_ACCESS_KEY="$(shyaml get-value s3.secret_key < ${CONFIG_FILE})" +ORPHANS_BUCKET="$(shyaml get-value path < ${CONFIG_FILE})" +RING="$(shyaml get-value ring < ${CONFIG_FILE})" + +######### RUN + +# verifyBucketSproxydKeys.js parameters to list sproxyd keys of buckets. +export VERBOSE=1 +export NO_MISSING_KEY_CHECK=1 + +# Even though verifyBucketSproxydKeys.js has a RAFT_SESSIONS parameter that scans +# all the buckets of the given raft sessions, we prefer having 1 file per bucket. +# That's why we build a bucket list. + +BUCKET_LIST="" +if [ -n "${RAFT_SESSIONS}" ];then + + for RS_ID in $(echo "${RAFT_SESSIONS}" | sed "s#,# #g");do + BUCKET_LIST="${BUCKET_LIST} \ + $(curl --silent http://${BUCKETD_HOSTPORT}/_/raft_sessions/${RS_ID}/bucket | jq -r '.[] | select (. | contains("mpuShadowBucket") | not) | select (. | contains("users..bucket") | not)')" + done + unset RAFT_SESSIONS +else + + BUCKET_LIST=$(echo "${BUCKETS}" | sed "s#,# #g") +fi + +BUCKET_COUNT=$(echo "${BUCKET_LIST}" | wc -w) +COUNTER=1 +for BUCKET in ${BUCKET_LIST};do + + # Re-write BUCKETS (yes, ugly) for verifyBucketSproxydKeys.js to scan the BUCKET + export BUCKETS="${BUCKET}" + RAW_OUTPUT=$(mktemp "${WORKDIR}/raw_${BUCKET}_XXX.csv") + OUTPUT=$(mktemp "${WORKDIR}/output_${BUCKET}_XXX.csv") + S3_PATH="s3://${ORPHANS_BUCKET}/${RING}/s3-bucketd/${BUCKET}_keys.csv" + echo "--- [${COUNTER}/${BUCKET_COUNT}] Starting extraction for ${BUCKET} ---" + node verifyBucketSproxydKeys.js > ${RAW_OUTPUT} + + echo "--- [${COUNTER}/${BUCKET_COUNT}] Processing output for ${BUCKET} ---" + jq -r 'select(.message | contains("sproxyd key"))| [(.objectUrl | split("s3://") | .[1] | split("/") | .[0]),(.objectUrl | split("s3://") | .[1] | split("/") | .[-1]),.sproxydKey]| @csv' "${RAW_OUTPUT}" > "${OUTPUT}" + + rm -f "${RAW_OUTPUT}" + + echo "--- [${COUNTER}/${BUCKET_COUNT}] Uploading ${BUCKET}.csv to ${S3_PATH}, endpoint ${S3_ENDPOINT} ---" + aws --endpoint-url "${S3_ENDPOINT}" s3 cp "${OUTPUT}" "${S3_PATH}" + rm -f "${OUTPUT}" + COUNTER=$(( ${COUNTER} + 1 )) + +done diff --git a/scripts/S3_FSCK/s3_fsck_p0.py b/scripts/S3_FSCK/s3_fsck_p0.py index 6b67e68..e1c58fc 100644 --- a/scripts/S3_FSCK/s3_fsck_p0.py +++ b/scripts/S3_FSCK/s3_fsck_p0.py @@ -12,7 +12,7 @@ config_path = "%s/%s" % ( sys.path[0] ,"../config/config.yml") with open(config_path, 'r') as ymlfile: - cfg = yaml.load(ymlfile) + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: RING = sys.argv[1] @@ -31,27 +31,15 @@ COS = cfg["cos_protection"] PARTITIONS = int(cfg["spark.executor.instances"]) * int(cfg["spark.executor.cores"]) +# The arcindex maps the ARC Schema to the hex value found in the ringkey, in the 24 bits preceding the 
last 8 bits of the key +# e.g. FD770A344D6A6D259F92C500000000512040C070 +# FD770A344D6A6D259F92C50000000051XXXXXX70 where XXXXXX : 2040C0 arcindex = {"4+2": "102060", "8+4": "2040C0", "9+3": "2430C0", "7+5": "1C50C0", "5+7": "1470C0"} -os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' - spark = SparkSession.builder \ .appName("s3_fsck_p0.py:Translate the S3 ARC keys :" + RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ - .config("spark.hadoop.fs.s3a.access.key", ACCESS_KEY) \ - .config("spark.hadoop.fs.s3a.secret.key", SECRET_KEY) \ - .config("spark.hadoop.fs.s3a.endpoint", ENDPOINT_URL) \ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", cfg["path"]) \ .getOrCreate() - - def pad2(n): x = '%s' % (n,) return ('0' * (len(x) % 2)) + x @@ -118,6 +106,7 @@ def sparse(f): def check_split(key): + """With srebuildd, check if the RING key is split or not. Return True if split, False if not split, None if error (422, 404, 50X, etc.)""" url = "http://%s:81/%s/%s" % (SREBUILDD_IP, SREBUILDD_ARC_PATH, str(key.zfill(40))) r = requests.head(url) if r.status_code == 200: @@ -126,13 +115,19 @@ def check_split(key): def blob(row): + """Return a list of dict with the sproxyd input key, its subkey if it exists and digkey""" + # set key from row._c2 (column 3) which contains an sproxyd input key + # input structure: (bucket name, s3 object key, sproxyd input key) key = row._c2 + # use the sproxyd input key to find out if the key is split or not + # check_split(key) is used to transform the input key into a RING key, assess if it exists AND whether it is a SPLIT. 
split = check_split(key) if not split['result']: - return [{"key":key, "subkey":"NOK_HTTP", "digkey":"NOK_HTTP"}] + # If the key is not found, return a dict with the key, subkey and digkey set to NOK_HTTP + return [{"key": key, "subkey": "NOK_HTTP", "digkey": "NOK_HTTP"}] if split['is_split']: try: - header = { "x-scal-split-policy": "raw" } + header = {"x-scal-split-policy": "raw"} url = "http://%s:81/%s/%s" % ( SREBUILDD_IP, SREBUILDD_ARC_PATH, @@ -140,31 +135,67 @@ def blob(row): ) response = requests.get(url, headers=header, stream=True) if response.status_code == 200: - chunks = "" + chunks = b"" for chunk in response.iter_content(chunk_size=1024000000): if chunk: - chunks = chunk + chunk - chunkshex = chunks.encode("hex") + chunks += chunk + chunkshex = binascii.hexlify(chunks).decode('utf-8') rtlst = [] + # the k value is the subkey, a subkey is the sproxyd input key for each stripe of the split for k in list(set(sparse(chunkshex))): + # "key": key == primary sproxyd input key of a split object + # "subkey": k == subkey sproxyd input key of an individual stripe of a split object + # "digkey": gen_md5_from_id(k)[:26] == md5 of the subkey + # digkey: the unique part of a main chunk before service id, + # arc schema, and class are appended rtlst.append( {"key": key, "subkey": k, "digkey": gen_md5_from_id(k)[:26]} ) + # If the key is split and request is OK: + # return a list of dicts with the key (primary sproxyd input key), + # subkey (sproxyd input key of a split stripe) and + # digkey, (md5 of the subkey) + # digkey: the unqiue part of a main chunk before service id, + # arc schema, and class are appended return rtlst + # If the key is split and request is not OK: + # return a dict with the key (primary sproxyd input key) + # with both subkey and digkey columns set to NOK return [{"key": key, "subkey": "NOK", "digkey": "NOK"}] except requests.exceptions.ConnectionError as e: + # If there is a Connection Error in the HTTP request: + # return a dict with the key(primary sproxyd input key), + # with both subkey and digkey set to NOK return [{"key": key, "subkey": "NOK_HTTP", "digkey": "NOK_HTTP"}] if not split['is_split']: + # If the key is not split: + # return a dict with the key (primary sproxyd input key), + # subkey set to SINGLE and + # digkey, (md5 of the subkey) + # digkey: the unique part of a main chunk before service id, + # arc schema, and class are appended return [{"key": key, "subkey": "SINGLE", "digkey": gen_md5_from_id(key)[:26]}] + new_path = os.path.join(PATH, RING, "s3-bucketd") files = "%s://%s" % (PROTOCOL, new_path) +# reading without a header, +# columns _c0, _c1, _c2 are the default column names of +# columns 1, 2, 3 for the csv +# input structure: (bucket name, s3 object key, sproxyd input key) +# e.g. test,48K_object.01,9BC9C6080ED24A42C2F1A9C78F6BCD5967F70220 +# Required Fields: +# - _c2 (sproxyd input key) df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").option("delimiter", ",").load(files) +# repartition the dataframe to have the same number of partitions as the number of executors * cores df = df.repartition(PARTITIONS) +# Return a new Resilient Distributed Dataset (RDD) by applying a function to each element of this RDD. 
rdd = df.rdd.map(lambda x : blob(x)) dfnew = rdd.flatMap(lambda x: x).toDF() single = "%s://%s/%s/s3fsck/s3-dig-keys.csv" % (PROTOCOL, PATH, RING) +# Upload the dataframe to csv-formatted S3 objects with a header +# output structure: (digkey, sproxyd input key, subkey if available) dfnew.write.format("csv").mode("overwrite").options(header="true").save(single) diff --git a/scripts/S3_FSCK/s3_fsck_p1.py b/scripts/S3_FSCK/s3_fsck_p1.py index 8d7b5ac..9d0c940 100644 --- a/scripts/S3_FSCK/s3_fsck_p1.py +++ b/scripts/S3_FSCK/s3_fsck_p1.py @@ -7,7 +7,7 @@ config_path = "%s/%s" % ( sys.path[0] ,"../config/config.yml") with open(config_path, 'r') as ymlfile: - cfg = yaml.load(ymlfile) + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: RING = sys.argv[1] @@ -20,56 +20,92 @@ SECRET_KEY = cfg["s3"]["secret_key"] ENDPOINT_URL = cfg["s3"]["endpoint"] COS = cfg["cos_protection"] +SPARK_EXECUTOR_INSTANCES = cfg["spark.executor.instances"] +SPARK_EXECUTOR_MEMORY = cfg["spark.executor.memory"] +SPARK_EXECUTOR_CORES = cfg["spark.executor.cores"] -os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' spark = SparkSession.builder \ .appName("s3_fsck_p1.py:Build RING keys :" + RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ - .config("spark.hadoop.fs.s3a.access.key", ACCESS_KEY)\ - .config("spark.hadoop.fs.s3a.secret.key", SECRET_KEY)\ - .config("spark.hadoop.fs.s3a.endpoint", ENDPOINT_URL) \ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", cfg["path"]) \ .getOrCreate() +# .config("spark.sql.shuffle.partitions", 100) \ +# .config("spark.shuffle.file.buffer", "1m") \ +# Calculate the number of partitions +num_executors = SPARK_EXECUTOR_INSTANCES +num_cores_per_executor = SPARK_EXECUTOR_CORES +total_cores = num_executors * num_cores_per_executor +partitions_per_core = 1 +num_partitions = total_cores * partitions_per_core files = "%s://%s/%s/listkeys.csv" % (PROTOCOL, PATH, RING) +# columns _c0, _c1, _c2, _c3 are the default column names of +# columns 1, 2, 3, 4 for the csv +# REQUIRED N, Y, N, Y +# input structure: (RING key, main chunk, disk, flag) +# e.g. 555555A4948FAA554034E155555555A61470C07A,8000004F3F3A54FFEADF8C00000000511470C070,g1disk1,0 +# Required Fields: +# - _c1 (main chunk) +# - _c3 (flag) df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").option("delimiter", ",").load(files) -#list the ARC SPLIT main chunks +# Repartition the DataFrame +df = df.repartition(num_partitions) + +# list the ARC_REPLICATED main chunks from column 2 with service ID 50 and flag = 0 (new), into a single column (vector) named "_c1" +# ARC_REPLICATED = COS-protected keys that were uploaded with the arc ring driver. df_split = df.filter(df["_c1"].rlike(r".*000000..50........$") & df["_c3"].rlike("0")).select("_c1") +# NOTE: dfARCsingle will never find any key, because it searches for keys that end with 70. 
dfARCsingle = df_split.filter(df["_c1"].rlike(r".*70$")) dfARCsingle = dfARCsingle.groupBy("_c1").count().filter("count > 3") dfARCsingle = dfARCsingle.withColumn("ringkey",dfARCsingle["_c1"]) +# In df_split (a vector of main chunks), filter column named "_c1" RING key main chunk for the configured COS protection dfCOSsingle = df_split.filter(df["_c1"].rlike(r".*" + str(COS) + "0$")) +# count the number of chunks in column named "_c1" found for each key, creating the "count" column on the fly +# dfCOSsingle now has two columns ("_c1", "count") dfCOSsingle = dfCOSsingle.groupBy("_c1").count() +# in dfCOSsingle, duplicate column named "_c1" into a new "ringkey" (aka. "main chunk") column +# dfCOSsingle now has three columns ("_c1", "count", "ringkey") dfCOSsingle = dfCOSsingle.withColumn("ringkey",dfCOSsingle["_c1"]) +# in dfCOSsingle, do an in-place substring operation on column "_c1": get the 26 first characters of the main chunk (MD5 hash of the input key + 4 extra chars) +# FIXME: say why we need those 4 extra characters (about 18% more weight than the 22-char md5 alone) dfCOSsingle = dfCOSsingle.withColumn("_c1",F.expr("substring(_c1, 1, length(_c1)-14)")) +# Union the dfCOSsingle and dfARCsingle dataframes ("_c1", "count", "ringkey") +# NOTE: dfARCsingle should not exist in the first place - we "union" an empty list with a legit list. dfARCsingle = dfARCsingle.union(dfCOSsingle) -#list the ARC KEYS +# list the ARC_SINGLE keys with service ID 51 +# repeat the same logic as before, with a different initial mask +# NOTE: Here we don't filter with _c3 (=flag), hence, we include DELETED keys. +# It's harmless. During p3, we don't count their size. +# Output is a three-column matrix that will be unioned with the previous dataframe dfARCsingle df_sync = df.filter(df["_c1"].rlike(r".*000000..51........$")).select("_c1") +# Match keys which end in 70 from a single column named "_c1" dfARCSYNC = df_sync.filter(df["_c1"].rlike(r".*70$")) +# Filter out when less than 3 stripe chunks (RING orphans) dfARCSYNC = dfARCSYNC.groupBy("_c1").count().filter("count > 3") +# dfARCSYNC "_c1" column is duplicated into a "ringkey" column dfARCSYNC = dfARCSYNC.withColumn("ringkey",dfARCSYNC["_c1"]) +# in dfARCSYNC, do an in-place substring operation on column "_c1": get the 26 first characters of the main chunk (MD5 hash of the input key + 4 extra chars) dfARCSYNC = dfARCSYNC.withColumn("_c1",F.expr("substring(_c1, 1, length(_c1)-14)")) +# filter "_c1" for configured COS protection dfCOCSYNC = df_sync.filter(df["_c1"].rlike(r".*" + str(COS) + "0$")) +# count the number of chunks in "_c1" found for each key dfCOCSYNC = dfCOCSYNC.groupBy("_c1").count() +# dfCOCSYNC "_c1" column is duplicated into a "ringkey" column dfCOCSYNC = dfCOCSYNC.withColumn("ringkey",dfCOCSYNC["_c1"]) +# in dfCOCSYNC, do an in-place substring operation on column "_c1": get the 26 first characters of the main chunk (MD5 hash of the input key + 4 extra chars) dfCOCSYNC = dfCOCSYNC.withColumn("_c1",F.expr("substring(_c1, 1, length(_c1)-14)")) +# union the two previous dataframes dfARCSYNC = dfARCSYNC.union(dfCOCSYNC) +# union again the two outstanding dataframes dfARCSYNC and dfARCSINGLE into a dftotal dataframe dftotal = dfARCSYNC.union(dfARCsingle) +# Upload into the S3 bucket total = "%s://%s/%s/s3fsck/arc-keys.csv" % (PROTOCOL, PATH, RING) dftotal.write.format("csv").mode("overwrite").options(header="true").save(total) diff --git a/scripts/S3_FSCK/s3_fsck_p2.py b/scripts/S3_FSCK/s3_fsck_p2.py index 22eeb39..5c29cd7 100644 --- 
a/scripts/S3_FSCK/s3_fsck_p2.py +++ b/scripts/S3_FSCK/s3_fsck_p2.py @@ -9,7 +9,7 @@ config_path = "%s/%s" % ( sys.path[0], "../config/config.yml") with open(config_path, "r") as ymlfile: - cfg = yaml.load(ymlfile) + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: RING = sys.argv[1] @@ -21,34 +21,68 @@ ACCESS_KEY = cfg["s3"]["access_key"] SECRET_KEY = cfg["s3"]["secret_key"] ENDPOINT_URL = cfg["s3"]["endpoint"] +SPARK_EXECUTOR_INSTANCES = cfg["spark.executor.instances"] +SPARK_EXECUTOR_MEMORY = cfg["spark.executor.memory"] +SPARK_EXECUTOR_CORES = cfg["spark.executor.cores"] -os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' spark = SparkSession.builder \ .appName("s3_fsck_p2.py:Union the S3 keys and the RING keys :" + RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ - .config("spark.hadoop.fs.s3a.access.key", ACCESS_KEY)\ - .config("spark.hadoop.fs.s3a.secret.key", SECRET_KEY)\ - .config("spark.hadoop.fs.s3a.endpoint", ENDPOINT_URL) \ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", cfg["path"]) \ .getOrCreate() +# .config("spark.sql.shuffle.partitions", 100) \ +# .config("spark.shuffle.file.buffer", "1m") \ +# Calculate the number of partitions +num_executors = SPARK_EXECUTOR_INSTANCES +num_cores_per_executor = SPARK_EXECUTOR_CORES +total_cores = num_executors * num_cores_per_executor +partitions_per_core = 1 +num_partitions = total_cores * partitions_per_core +# s3keys are generated by verifySproxydKeys.js script and processed by s3_fsck_p0.py s3keys = "%s://%s/%s/s3fsck/s3-dig-keys.csv" % (PROTOCOL, PATH, RING) +# ringkeys are generated by the listkeys.py (or ringsh dump) script and processed by s3_fsck_p1.py ringkeys = "%s://%s/%s/s3fsck/arc-keys.csv" % (PROTOCOL, PATH, RING) +# reading with a header, the columns are named. +# columns digkey, sproxyd input key, subkey are the actual column names of +# columns 1, 2, 3 for the csv +# input structure: (digkey, sproxyd input key, subkey) +# e.g. 7359114991482315D0A5890000,BDE4B9BBEB45711EC2F1A9C78F6BCD59E02C6220,SINGLE +# Required Fields: +# - digkey +# - sproxyd input key dfs3keys = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(s3keys) +# Repartition the DataFrame +dfs3keys = dfs3keys.repartition(num_partitions) + + +# reading with a header, the columns are named. +# columns _c1, count, ringkey (main chunk) are the actual column names of +# columns 1, 2, 3 for the csv +# input structure: (digkey, count, ringkey (main chunk)) +# e.g. 
907024530554A8DB3167280000,12,907024530554A8DB31672800000000512430C070 +# Required Fields: +# - digkey +# - ringkey (main chunk) dfringkeys = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(ringkeys) +# Repartition the DataFrame +dfringkeys = dfringkeys.repartition(num_partitions) +# rename the column _c1 to digkey, the next write will output a header that uses digkey instead of _c1 +# digkey: the unique part of a main chunk before service id, arc schema, and class are appended dfringkeys = dfringkeys.withColumnRenamed("_c1","digkey") +# inner join the s3keys (sproxyd input key) and ringkeys (the main chunk of the strip or replica) +# on the digkey column. The result will be a dataframe with the columns ringkey, digkey +# the inner join leftani will not return rows that are present in both dataframes, +# eliminating ringkeys (main chunks) that have metadata in s3 (not application orphans). +# digkey: the unique part of a main chunk before service id, arc schema, and class are appended +# ringkey: the main chunk of the strip or replica inner_join_false = dfringkeys.join(dfs3keys,["digkey"], "leftanti").withColumn("is_present", F.lit(int(0))).select("ringkey", "is_present", "digkey") + +# Create the final dataframe with only the ringkey (the main chunk of the strip or replica) df_final = inner_join_false.select("ringkey") +# Upload the final dataframe to csv-formatted S3 objects allmissing = "%s://%s/%s/s3fsck/s3objects-missing.csv" % (PROTOCOL, PATH, RING) df_final.write.format("csv").mode("overwrite").options(header="false").save(allmissing) diff --git a/scripts/S3_FSCK/s3_fsck_p2_reverselookup.py b/scripts/S3_FSCK/s3_fsck_p2_reverselookup.py index 9115950..c5be920 100644 --- a/scripts/S3_FSCK/s3_fsck_p2_reverselookup.py +++ b/scripts/S3_FSCK/s3_fsck_p2_reverselookup.py @@ -10,7 +10,7 @@ config_path = "%s/%s" % (sys.path[0], "../config/config.yml") with open(config_path, "r") as ymlfile: - cfg = yaml.safe_load(ymlfile) + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: RING = sys.argv[1] @@ -25,23 +25,12 @@ REVLOOKUP_BIN = cfg["nasdk_tools"]["scalarcrevlookupid_bin"] SPLITGETUSERMD_BIN = cfg["nasdk_tools"]["scalsplitgetusermd_bin"] PORT_RANGE = cfg["nasdk_tools"]["port_range"] -ARCDATA_DRIVER = cfg["nasdk_tools"]["arcdata_path"] +SSH_KEY = cfg["nasdk_tools"]["ssh_key"] +ARCDATA_DRIVER = cfg["nasdk_tools"]["arcdata_driver_type"] -os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' spark = SparkSession.builder \ .appName("s3_fsck_p2_reverselookup.py: lookup the input keys for the newly found orphaned DATA RING main chunks. If their service ID is SPLIT, look the top input key up. 
RING: " + RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ - .config("spark.hadoop.fs.s3a.access.key", ACCESS_KEY)\ - .config("spark.hadoop.fs.s3a.secret.key", SECRET_KEY)\ - .config("spark.hadoop.fs.s3a.endpoint", ENDPOINT_URL) \ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", cfg["path"]) \ .getOrCreate() #spark.sparkContext.setLogLevel("DEBUG") @@ -66,7 +55,7 @@ def get_random_port(): def rev_lookup(key): try: ip_port = "{}:{}".format(get_local_ip(), get_random_port()) - cmd = "ssh -qT $(hostname -i) {} -b '{}' {}".format(REVLOOKUP_BIN, ip_port, key) + cmd = "ssh -qT localhost -o StrictHostKeyChecking=no -i {} {} -b '{}' {}".format(SSH_KEY,REVLOOKUP_BIN, ip_port, key) print(" Executing rev_lookup command: {}".format(cmd)) output = subprocess.check_output(cmd, shell=True, stderr=open(os.devnull, 'wb')) print(" rev_lookup output: {}".format(output)) diff --git a/scripts/S3_FSCK/s3_fsck_p3.py b/scripts/S3_FSCK/s3_fsck_p3.py index 6243e85..fae6494 100644 --- a/scripts/S3_FSCK/s3_fsck_p3.py +++ b/scripts/S3_FSCK/s3_fsck_p3.py @@ -10,7 +10,7 @@ config_path = "%s/%s" % ( sys.path[0] ,"../config/config.yml") with open(config_path, 'r') as ymlfile: - cfg = yaml.load(ymlfile) + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: @@ -29,50 +29,69 @@ ARC = cfg["arc_protection"] COS = cfg["cos_protection"] -os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' spark = SparkSession.builder \ .appName("s3_fsck_p3.py:Compute the total sizes to be deleted :" + RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ - .config("spark.hadoop.fs.s3a.access.key", ACCESS_KEY)\ - .config("spark.hadoop.fs.s3a.secret.key", SECRET_KEY)\ - .config("spark.hadoop.fs.s3a.endpoint", ENDPOINT_URL) \ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", PATH) \ .getOrCreate() - +# Use of the arcindex limits the inspection to a specific ARC protection scheme. +# If there were more than one cluster with different ARC protection schemes then this would limit the check to a specific scheme. +# FOOD FOR THOUGHT: limits finding keys which may have been written after a schema change or any bug did not honor the schema. 
+# The arcindex is a dictionary that contains the ARC protection scheme and the hex value found in the ringkey arcindex = {"4+2": "102060", "8+4": "2040C0", "9+3": "2430C0", "7+5": "1C50C0", "5+7": "1470C0"} + +# The arcdatakeypattern is a regular expression that matches the ARC data keys arcdatakeypattern = re.compile(r'[0-9a-fA-F]{38}70') def statkey(row): + """ statkey takes a row from the dataframe and returns a tuple with the key, status_code, size""" key = row._c0 try: url = "%s/%s" % (SREBUILDD_URL, str(key.zfill(40))) r = requests.head(url) if r.status_code == 200: - if re.search(arcdatakeypattern, key): + if r.headers.get("X-Scal-Attr-Is-Deleted") == "integer=1": + # The key is found but already marked as Deleted. Return the key, the status code, and 0 for the size + return ( key, r.status_code, 0) + if re.search(arcdatakeypattern, key): # Should consider changing this to match any entry in the arcindex + # The size of an ARC data key is estimated as 12 times the reported size. + # At this point there is no longer access to the qty of keys found, so + # it simply computes based on the presumed schema of 12 chunks per key. size = int(r.headers.get("X-Scal-Size", False))*12 else: + # For keys that do not match the ARC data pattern, the estimated size is the reported size plus COS additional copies of it (size * (1 + COS)). + # At this point there is no longer access to the qty of keys found, so + # it simply computes based on the presumed schema of COS additional copies per key. + # If there are orphans which are not matching the arcdatakeypattern they will + # be computed as if they were COS. size = int(r.headers.get("X-Scal-Size",False)) + int(r.headers.get("X-Scal-Size",False))*int(COS) return ( key, r.status_code, size) else: + # If the key is not found (HTTP code != 200) then return the key, the status code, and 0 for the size return ( key, r.status_code, 0) except requests.exceptions.ConnectionError as e: + # If there is a connection error then return the key, the status code, and 0 for the size return ( key, "HTTP_ERROR", 0) files = "%s://%s/%s/s3fsck/s3objects-missing.csv" % (PROTOCOL, PATH, RING) + +# reading without a header, +# _c0 is the default column name assigned to +# the first column of the csv +# input structure: _c0 (main chunk) +# e.g. 
998C4DF2FC7389A7C82A9600000000512040C070 +# Required Fields: +# - _c0 (main chunk) df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(files) + +# Create a resilient distributed dataset (RDD) from the dataframe (logical partitions of data) +# The rdd is a collection of tuples returned from statkey (key, status_code, size) rdd = df.rdd.map(statkey) #rdd1 = rdd.toDF() +# The size_computed is the sum of the size column in the rdd size_computed= rdd.map(lambda x: (2,int(x[2]))).reduceByKey(lambda x,y: x + y).collect()[0][1] string = "The total computed size of the not indexed keys is: %d bytes" % size_computed banner = '\n' + '-' * len(string) + '\n' diff --git a/scripts/S3_FSCK/s3_fsck_p4.py b/scripts/S3_FSCK/s3_fsck_p4.py index df61463..edcc04b 100644 --- a/scripts/S3_FSCK/s3_fsck_p4.py +++ b/scripts/S3_FSCK/s3_fsck_p4.py @@ -12,7 +12,7 @@ config_path = "%s/%s" % ( sys.path[0], "../config/config.yml") with open(config_path, "r") as ymlfile: - cfg = yaml.load(ymlfile) + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: @@ -37,20 +37,8 @@ arcindex = {"4+2": "102060", "8+4": "2040C0", "9+3": "2430C0", "7+5": "1C50C0", "5+7": "1470C0"} arcdatakeypattern = re.compile(r'[0-9a-fA-F]{38}70') -os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' spark = SparkSession.builder \ .appName("s3_fsck_p4.py:Clean the extra keys :" + RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ - .config("spark.hadoop.fs.s3a.access.key", ACCESS_KEY)\ - .config("spark.hadoop.fs.s3a.secret.key", SECRET_KEY)\ - .config("spark.hadoop.fs.s3a.endpoint", ENDPOINT_URL) \ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", PATH) \ .getOrCreate() @@ -68,9 +56,24 @@ def deletekey(row): files = "%s://%s/%s/s3fsck/s3objects-missing.csv" % (PROTOCOL, PATH, RING) + +# reading without a header, +# columns _c0 is the default column names of +# column 1 for the csv +# input structure: _c0 (main chunk) +# e.g. 998C4DF2FC7389A7C82A9600000000512040C070 +# Required Fields: +# - _c0 (main chunk) df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(files) +# rename the column _c0 (column 1) to ringkey df = df.withColumnRenamed("_c0","ringkey") + +# repartition the dataframe to the number of partitions (executors * cores) df = df.repartition(PARTITIONS) + +# map the deletekey function to the dataframe: blindly delete keys on the RING. 
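Before the deletion step of s3_fsck_p4.py continues below, the size accounting performed by statkey() in s3_fsck_p3.py above can be made concrete. This is only a sketch that mirrors that arithmetic: COS = 3, the key strings and the X-Scal-Size value of 1000 are hypothetical stand-ins for cos_protection and the header returned by srebuildd.

import re

COS = 3  # stand-in for cfg["cos_protection"]
arcdatakeypattern = re.compile(r'[0-9a-fA-F]{38}70')

def estimated_footprint(key, x_scal_size):
    # Mirrors the statkey() arithmetic above; x_scal_size stands in for the
    # X-Scal-Size header returned by srebuildd (value made up here).
    if re.search(arcdatakeypattern, key):
        # ARC data key: presumed schema of 12 chunks per key.
        return x_scal_size * 12
    # Otherwise: the reported size plus COS additional copies of it.
    return x_scal_size + x_scal_size * int(COS)

print(estimated_footprint("9" * 38 + "70", 1000))  # -> 12000
print(estimated_footprint("9" * 38 + "20", 1000))  # -> 4000

The real statkey() additionally short-circuits keys already flagged by X-Scal-Attr-Is-Deleted, non-200 responses and connection errors to a size of 0, as shown in the hunk above.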
rdd = df.rdd.map(deletekey).toDF() deletedorphans = "%s://%s/%s/s3fsck/deleted-s3-orphans.csv" % (PROTOCOL, PATH, RING) + +# Upload the dataframe to csv-formatted S3 objects with the results of the deletekey function rdd.write.format("csv").mode("overwrite").options(header="false").save(deletedorphans) diff --git a/scripts/S3_FSCK/s3mdjournalbackuphashes.sh b/scripts/S3_FSCK/s3mdjournalbackuphashes.sh new file mode 100644 index 0000000..f213086 --- /dev/null +++ b/scripts/S3_FSCK/s3mdjournalbackuphashes.sh @@ -0,0 +1,195 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2154,SC2164,SC2128,SC2166,SC2086,SC2207,SC2188,SC2068,SC2016 +# script name: s3mdjournalbackuphashes.sh +# goal of the script: When run on an S3 connector that hosts stateless components bucketd and vault, this script computes the list of theoretically available S3 MD journals backups sproxyd keys. +# +# author: francois.nomarski@scality.com +# contributors: gregoire.doumergue@scality.com +# version history: +# - 1.0 (2023-11-27) : generate input keys in a fully sequential fashion +# - 2.0 (2024-05-17) : rename the script to s3mdjournalbackuphashes.sh and introduce xargs for parallelism +# - 2.1 (2024-05-22) : revert some parallelisation to keep it functional +# - 3.0 (2025-03-17) : poll `scality-bucketd` instead of `repd` +# - 3.1 (2025-05-02) : add support for `--bucketd` argument to specify the IP:PORT of the bucketd service +# - 4.0 (2025-06-13) : automatically detect Vault topology file with a function, use netstat to be compatible with RHEL/CentOS 7 +#=== +# ref: MetaData repo + MD-706 set of scripts to generate ad-hoc S3 MD backup input keys +# Iterate through every locally available raft session: +# - get its latest bseq value, +# - derive the list of theoretically available S3 MD journals backup input keys +# Output: unsorted CSV file with "s3mdbseq","installid/raftid/bseq/copyid","input key" +#=== +# FIXME: parallelisation is imperfect +# FIXME: implement sanity-check of output file: compute the expected number of lines, compare it against the output file +# TODO: if the current flavour is not scalable or too aggressive, allow to at least proceed on a per-raft session basis + +declare SID # UKS service ID + +# In most cases, a single Install ID is used (0). +declare INSTALLID # Federation: "env_metadata_installID" padded to 2 hex chars in ".installID" in repd_*.json, retrieved from configuration + +declare CLUSTERTYPE # Federation: text +declare ADMINPORT # repd +declare REPDHOST # repd +declare RAFTSESSIONID # repd +declare BSEQ # repd +declare COS # repd +declare JSON_DATA # configuration retrieved from repd_*.json +declare h # _the_ hash +declare VAULT_TOPOLOGY +declare OUTPUT_FILE +declare -a bseq_array +declare BUCKETD_HOSTPORT + + +usage() { + echo "Computes the list of theoretically available S3 MD journals backup sproxyd keys." + echo "Usage: $0 [-b BUCKETD_HOSTPORT | --bucketd BUCKETD_HOSTPORT] [-h | --help]" + echo + echo "Optional arguments:" + echo " -b, --bucketd Specify the IP:PORT of the bucketd service (e.g., 192.168.1.100:9000)." + echo " If not provided, the script will attempt to determine it locally." + echo + echo "Description:" + echo " This script computes the list of theoretically available S3 MD journals backup sproxyd keys." + echo " It outputs a CSV file with the format: \"s3mdbseq\",\"INSTALLID/RAFTSESSIONID/bseq/copyid\",\"input key\"." 
+ echo " Output file: /var/tmp/s3mdbseq_keys.txt" + echo + echo "Examples:" + echo " Run with a specific BUCKETD_HOSTPORT:" + echo " bash $0 -b 192.168.1.100:9000" + echo + echo " Run without specifying BUCKETD_HOSTPORT (local detection):" + echo " bash $0" + echo + exit 1 +} + +vault_topology() { + +CONTAINER_COMMAND="$(basename "$(type -p docker || type -p ctrctl)")" +test -z "${CONTAINER_COMMAND}" && echo "docker or containerd not found!" && exit 1 + +CONTAINER_NAME=$(${CONTAINER_COMMAND} ps | grep -m 1 -o "scality-vault[^ ]*") + +${CONTAINER_COMMAND} exec ${CONTAINER_NAME} cat /conf/topology.json + +} + +# Default value for BUCKETD_HOSTPORT +BUCKETD_HOSTPORT="" + +# Parse command-line arguments +while [[ "$#" -gt 0 ]]; do + case "$1" in + -b|--bucketd) + if [[ -n "$2" && "$2" != -* ]]; then + BUCKETD_HOSTPORT="$2" + shift 2 + else + echo "Error: Missing value for $1" + usage + fi + ;; + -h|--help) + usage + ;; + *) + echo "Error: Unknown argument $1" + usage + ;; + esac +done + +# Determine BUCKETD_HOSTPORT if not provided as a command-line argument +if [[ -z "$BUCKETD_HOSTPORT" ]]; then + BUCKETD_HOSTPORT="$(netstat -lnt | awk '/:9000 / {print $4}')" + if [[ -z "$BUCKETD_HOSTPORT" ]]; then + echo "Error: Failed to determine BUCKETD_HOSTPORT locally. Please provide it explicitly using -b or --bucketd." + exit 1 + fi + echo "No BUCKETD_HOSTPORT provided. Using locally determined value: $BUCKETD_HOSTPORT" +else + echo "Using provided BUCKETD_HOSTPORT: $BUCKETD_HOSTPORT" +fi + +SID="5A" # 0x5A the service ID in the input keys responsible for pointing to S3 MD journals backups +COS=2 # constant in RaftSession.js:cos +mapfile -t RAFT_SESSIONS < <(curl -Ls "${BUCKETD_HOSTPORT}/_/raft_sessions/"| jq -r .[].id) # bucketd + +VAULT_TOPOLOGY=$(vault_topology) + +if [ -z "${VAULT_TOPOLOGY}" ];then + echo "Error: Failed to get Vault configuration." 
+ exit 1 +fi + +OUTPUT_FILE="/var/tmp/s3mdbseq_keys.txt" +echo "Output will be written to OUTPUT_FILE=$OUTPUT_FILE" + +> "$OUTPUT_FILE" +exec 3>> "$OUTPUT_FILE" + +# loop: for every raft session +# explicitly remove the double-quotes for globbing (* wildcard in the path) +for i in "${RAFT_SESSIONS[@]}" vault; do + echo "raft: $i" + # assess raft configuration and raft state + # - assess raft configuration + if [[ $i =~ ^[0-9]+$ ]] ; then + mapfile -t JSON_DATA < <(curl -Ls "${BUCKETD_HOSTPORT}/_/raft_sessions/$i/leader" | jq -cr '.adminPort, .host') + read -r ADMINPORT REPDHOST <<< "$(echo -n ${JSON_DATA[@]})" + RAFTSESSIONID="$i" + INSTALLID="$(curl -Ls "${REPDHOST}:${ADMINPORT}/_/configuration/installID")" + INSTALLID="$(printf "%02X" "${INSTALLID}")" + CLUSTERTYPE="$(curl -Ls "${REPDHOST}:${ADMINPORT}/_/configuration/cluster")" + echo " CLUSTERTYPE: $CLUSTERTYPE" + elif [ "$i" = "vault" ] ; then + echo " vault loop" + mapfile -t JSON_DATA < <(echo "${VAULT_TOPOLOGY}" | jq -r '."0".repds[0] | .adminPort, .host') + # shellcheck disable=SC2145 + echo " JSON_DATA: ${JSON_DATA[@]}" + read -r ADMINPORT REPDHOST <<< "$(echo -n ${JSON_DATA[@]})" + echo " vault ADMINPORT: $ADMINPORT ; REPDHOST: $REPDHOST" + RAFTSESSIONID="$i" + echo " vault RAFTSESSIONID: $RAFTSESSIONID" + INSTALLID="$(curl -Ls "${REPDHOST}:${ADMINPORT}/_/configuration/installID")" + INSTALLID="$(printf "%02X" "${INSTALLID}")" + echo " vault INSTALLID: $INSTALLID" + CLUSTERTYPE="$(curl -Ls "${REPDHOST}:${ADMINPORT}/_/configuration/cluster")" + echo " vault CLUSTERTYPE: $CLUSTERTYPE" + fi + # - assess raft state + HTTP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" "${REPDHOST}:${ADMINPORT}/_/raft/state") + if [[ "$HTTP_RESPONSE" -ne 200 ]]; then + echo " WARNING: Failed to retrieve raft state from ${REPDHOST}:${ADMINPORT}. HTTP response code: $HTTP_RESPONSE" + BSEQ=0 + else + BSEQ="$(curl -Ls "${REPDHOST}:${ADMINPORT}/_/raft/state" | jq -r '.committing / 10000 | floor')" + if [[ "$BSEQ" -eq 0 ]]; then + echo " WARNING: Computed BSEQ is 0 for raft session $RAFTSESSIONID. No journal backups available." + fi + fi + echo " latest BSEQ: $BSEQ" + # build _the_ hash, sliced and diced, later found in the s3mdbseq input keys + h="$(printf "%s" "${CLUSTERTYPE}/${RAFTSESSIONID}"|md5sum|awk '{print toupper($1)}')" ; + # expected mask for a given session: + #echo " e.g.: ${h:0:6} ???????????????? ${h:6:8} ${SID} ${h:14:2} ${INSTALLID} 0 ?${COS}0" + # Populate the bseq_array with the theoretically existing `BSEQ` values sequence + bseq_array=( $(seq 1 $BSEQ) ) + #echo ${bseq_array[@]} + chunk_size=432 + + for ((bseq=0; bseq<${#bseq_array[@]}; bseq+=chunk_size)); do + chunk=("${bseq_array[@]:bseq:chunk_size}") + echo ${chunk[@]} | xargs -P 0 -n1 bash -c ' + for ((copyid=0; copyid<='$COS'; copyid++)); do + echo "\"s3mdbseq\",\"'"${INSTALLID}/${RAFTSESSIONID}"'/$0/$copyid\",\"'${h:0:6}'$(printf "%016X" "$0")'${h:6:8}${SID}${h:14:2}${INSTALLID}0'${copyid}'${COS}'0\"" + done + ' + done >&3 + #done | xargs -0 -n1 -I line -P0 echo line >&3 # FIXME: this parallelisation fails, gets a signal 13 after it's ballooned to "argument line too long" + unset bseq_array +done +exec 3>&-; # sort -t',' -k3,3 -k2,2 -n "$OUTPUT_FILE" > "${OUTPUT_FILE}.new" && \mv -vf "${OUTPUT_FILE}"{.new,} # not necessary +echo "Output written to $OUTPUT_FILE" diff --git a/scripts/S3_FSCK/test_s3.py b/scripts/S3_FSCK/test_s3.py new file mode 100644 index 0000000..fad3c7d --- /dev/null +++ b/scripts/S3_FSCK/test_s3.py @@ -0,0 +1,46 @@ +''' +Author: G. 
Doumergue + +This script tests the connectivity between Spark workers (running in containers), +and S3. +Make sure the endpoint, credentials, and bucket name are correctly set in the +config/config.yml file before running this script. +''' + +from pyspark.sql import SparkSession, Row, SQLContext +from pyspark.sql.types import IntegerType +import pyspark.sql.functions as F +import os +import sys +import re +import binascii +import hashlib +import base64 +import yaml + +config_path = "%s/%s" % ( sys.path[0] ,"../config/config.yml") +with open(config_path, 'r') as ymlfile: + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) + +PROTOCOL = cfg["protocol"] +BUCKET = cfg["path"] +TEST_PREFIX="test-spark" + +spark = SparkSession.builder.appName("Test S3 connection from Spark").getOrCreate() + +files = "%s://%s/%s" % (PROTOCOL, BUCKET, TEST_PREFIX) + +df = spark.createDataFrame([{"a":1},{"b":2},{"c":3}]) + +#Upload to S3 +print(f"Uploading to {files}") +df.write.format("csv").option("delimiter",",").mode("overwrite").options(header="false").save(files) +print(f"Success") + +#Reading from S3 +print(f"Loading dataset from {files}") +dfread=spark.read.format("csv").option("header", "false").option("inferSchema", "true").option("delimiter", ",").load(files) +print(f"Success") +dfread.show() +print(f"Successfully used {files} in read and write mode, you can remove it now.") + diff --git a/scripts/S3_FSCK/upload_metadata_backup_keys_to_s3.sh b/scripts/S3_FSCK/upload_metadata_backup_keys_to_s3.sh new file mode 100755 index 0000000..19c9d22 --- /dev/null +++ b/scripts/S3_FSCK/upload_metadata_backup_keys_to_s3.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Upload S3 Metadata backup keys to S3 +# +# Author: Gregoire Doumergue +# +# Warning: This script only works from within +# a SPARK container. + +# The outcome of this script is SUPER TOUCHY. +# Any failure needs to BREAK the workflow +# to be sure we end up with the complete list +# of sproxyd keys from S3 Metadata. +# Do NOT comment out this line. + +set -e +trap 'echo ""; + echo "################################################################"; + echo "Something wrong happened, do NOT proceed until fixed."; + echo ""; + exit 1' ERR + +CONFIG_FILE="$(dirname $0)/../config/config.yml" + +if [ ! -f "${CONFIG_FILE}" ];then + + echo "${CONFIG_FILE} not found. Did you mount the scripts directory?" + exit 1 +fi + +S3_ENDPOINT="$(shyaml get-value s3.endpoint_on_light < ${CONFIG_FILE})" +[ -z "${S3_ENDPOINT}" ] && S3_ENDPOINT="$(shyaml get-value s3.endpoint < ${CONFIG_FILE})" +export AWS_ACCESS_KEY_ID="$(shyaml get-value s3.access_key < ${CONFIG_FILE})" +export AWS_SECRET_ACCESS_KEY="$(shyaml get-value s3.secret_key < ${CONFIG_FILE})" +ORPHANS_BUCKET="$(shyaml get-value path < ${CONFIG_FILE})" +RING="$(shyaml get-value ring < ${CONFIG_FILE})" + +# We use an exotic path for WORKDIR to make sure the container +# was started with the documented command line. + +WORKDIR=/opt/spark/tmp + +if [ ! -d "${WORKDIR}" ];then + echo "ERROR ${WORKDIR} must be mounted by docker/ctr." + exit 1 +fi + +# This file should have been previously generated by s3mdjournalbackuphashes.sh +UPLOAD_FILE="${WORKDIR}/s3mdbseq_keys.txt" + +S3_PATH="s3://${ORPHANS_BUCKET}/${RING}/s3-bucketd/s3mdbseq_keys.csv" + +######### RUN + +if [ ! -f "${UPLOAD_FILE}" ];then + echo "Run s3mdjournalbackuphashes.sh to generate the Metadata backup keys list." 
+ exit 1 +fi + +aws --endpoint-url "${S3_ENDPOINT}" s3 cp "${UPLOAD_FILE}" "${S3_PATH}" diff --git a/scripts/check_key.py b/scripts/check_key.py index 824c3c3..663311b 100755 --- a/scripts/check_key.py +++ b/scripts/check_key.py @@ -4,61 +4,62 @@ import re import sys import yaml -from pyspark.sql import SparkSession, SQLContext -from pyspark import SparkContext +from pyspark.sql import SparkSession config_path = "%s/%s" % ( sys.path[0] ,"./config/config.yml") with open(config_path, 'r') as ymlfile: - cfg = yaml.load(ymlfile) - + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: - RING = sys.argv[1] + RING_NAME = sys.argv[1] else: - RING = cfg["ring"] + RING_NAME = cfg["ring"] -PATH = cfg["path"] -PROT = cfg["protocol"] +BUCKET = cfg["path"] +PROTO = cfg["protocol"] srebuildd_ip = cfg["srebuildd_ip"] srebuildd_path = cfg["srebuildd_path"] -srebuildd_url = "http://%s:81/%s/" % ( srebuildd_ip, srebuildd_path) -os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' -spark = SparkSession.builder \ - .appName("Check ring keys:"+RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ - .config("spark.hadoop.fs.s3a.access.key", cfg["s3"]["access_key"])\ - .config("spark.hadoop.fs.s3a.secret.key", cfg["s3"]["secret_key"])\ - .config("spark.hadoop.fs.s3a.endpoint", cfg["s3"]["endpoint"])\ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", cfg["path"]) \ - .getOrCreate() +os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:3.3.4" pyspark-shell' +os.environ['AWS_JAVA_V1_DISABLE_DEPRECATION_ANNOUNCEMENT'] = 'true' + +spark_base = SparkSession.builder.appName("Check ring keys:" + RING_NAME) # type: ignore +spark = spark_base \ + .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ + .config("spark.hadoop.fs.s3a.access.key", cfg["s3"]["access_key"])\ + .config("spark.hadoop.fs.s3a.secret.key", cfg["s3"]["secret_key"])\ + .config("spark.hadoop.fs.s3a.endpoint", cfg["s3"]["endpoint"])\ + .config("spark.executor.instances", cfg["spark.executor.instances"]) \ + .config("spark.executor.memory", cfg["spark.executor.memory"]) \ + .config("spark.executor.cores", cfg["spark.executor.cores"]) \ + .config("spark.driver.memory", cfg["spark.driver.memory"]) \ + .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ + .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ + .config("spark.local.dir", cfg["path"]) \ + .getOrCreate() + def headkey(row): key = row._c0 try: - r = requests.head(srebuildd_url+str(key.zfill(40)),timeout=10) - if r.status_code in (200,422,404): - return (key,r.status_code) - else: - return(key,"UNKNOWN|RING_FAILURE|SREBUILDD_DOWN") + zero_filled_key = key.zfill(40) + r = requests.head(f"http://{srebuildd_ip}:81/{srebuildd_path}/{zero_filled_key}", timeout=10,) + if r.status_code in (200, 404, 422): + return (key, r.status_code) + else: + return(key, "UNKNOWN|RING_FAILURE|SREBUILDD_DOWN") except requests.exceptions.ConnectionError as e: - return (key,"ERROR_HTTP") + return (key, "ERROR_HTTP") -filenamearc = 
"%s://%s/listkeys-%s.csv" % (PROT, PATH, RING) -df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(filenamearc) -df.show(10,False) +key_list_csv_path = f"{PROTO}://{BUCKET}/listkeys-{RING_NAME}.csv" +df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(key_list_csv_path) +df.show(10, False) + rdd = df.rdd.map(headkey).toDF() -rdd.show(10,False) +rdd.show(10, False) df_final_ok = rdd.filter(rdd["_2"] != "404") -df_final_ok.show(100,False) - +df_final_ok.show(100, False) diff --git a/scripts/config/config-template.yml b/scripts/config/config-template.yml index 951cea8..28c7e4c 100755 --- a/scripts/config/config-template.yml +++ b/scripts/config/config-template.yml @@ -1,32 +1,88 @@ -master: "" # spark master ip:port, usually port=7077 +# Don't touch it. All Spark runners will be able to resolve `spark-master`. +master: "spark://spark-master:17077" +spark.driver.bindAddress: "" # Ip address of the first node, from where you will run the scripts + +## Where we'll centrally store Spark work +# Put the name of the RING which stores the data ring: "DATA" -path: "/fs/spark/" # bucket name, can be named "orphans" -protocol: file # Protocol can be either file or s3a. -# Protocol file requires SOFS+DLM & path is the folder to write to within the SOFS volume. -# Protocol s3a requires access/secret keys & endpoint URL & path is the bucket name within s3 to use. + +protocol: s3a # Protocol can be either `file` (when chasing SOFS orphans) or `s3a` (when chasing S3 orphans). + # Protocol `file` requires SOFS+DLM & path is the folder to write to within the SOFS volume. + # Protocol `s3a` requires access/secret keys & endpoint URL & path is the bucket name + # within s3 to use. + +# Depending on the protocol, `path` is either: +# - `s3a`: The name of an S3 bucket +# - `file`: The path to a shared directory +path: " / " + srebuildd_ip: "127.0.0.1" srebuildd_chord_path: "rebuild/chord-DATA" srebuildd_arc_path: "rebuild/arc-DATA" srebuildd_arcdata_path: "rebuild/arcdata-DATA" +retention: 604800 + +# In S3 mode, refer to the location_constraints setting in env/s3config/group_vars/all. +# arc_protection: + +# cos_protection: +arc_protection: 8+4 +cos_protection: 3 + +# The whole nasdk_tools block is optional - focus on it only +# if you want to run S3_FSCK/s3_fsck_p2_reverselookup.py nasdk_tools: scalarcrevlookupid_bin: "/usr/bin/scalarcrevlookupid" scalsplitgetusermd_bin: "/usr/bin/scalsplitgetusermd" port_range: [4244, 4249] # typical DATA RING bizstorenode port range - arcdata_path: "arcdata-DATA" -retention: 604800 -arc_protection: 8+4 -cos_protection: 3 + arcdata_driver_type: "arcdata" + ssh_key: "/opt/spark/apps/config/" + + +## With protocol `s3a`, Spark tools need to access to an S3 bucket +# Set access key pair that allows full rights on the bucket configured +# in the `path` variable. +# Set the `endpoint` which the Spark tools must connect to. s3: access_key: "" secret_key: "" + # Don't forget to prepend the endpoint with http:// or https:// endpoint: "" + # Set endpoint_on_light if you previously defined hosts in the + # [runners_light] group in the inventory file. + # Most common value is http://127.0.0.1:8000. 
+ endpoint_on_light: "" + +## Access to the supervisor API to get the list +# of storage nodes sup: - url: "https://127.0.0.1:2443" + url: "https://:2443" login: "root" - password: "" + password: "" # `grep spass /etc/scality/sagentd.yaml` On a storage node + +## Configuration for direct access to S3 +# internal bucketd API (now only used to extract sproxyd keys +# from S3 Metadata) +# Leave blank for automatic setting. +bucketd: + url: "" + +## Spark tuning +# Number of cores you want Spark to use on each machine spark.executor.cores: 6 +# Number of machines that you want Spark to run on spark.executor.instances: 6 -spark.executor.memory: "8g" -spark.driver.memory: "8g" +spark.executor.memory: "3g" +spark.driver.memory: "3g" spark.memory.offHeap.enabled: True spark.memory.offHeap.size: "6g" + +## Work directories. +# Change the `container` entries only if you know +# what you're doing. +logdir: + container: "/opt/spark/spark-events" # where spark writes its logs, in the container + host: "/root/spark-logs" # for future use: bind this path to the container path +datadir: + container: "/opt/spark/tmp" # where spark writes temporary work files, in the container + host: "/scality/ssd01/spark" # Directory in all servers hosting spark-worker, where spark writes temporary work files + # /!\ It must be identical to the `datadir` variable in spark_run.sh diff --git a/scripts/config/tls_ca_certs/README.txt b/scripts/config/tls_ca_certs/README.txt new file mode 100644 index 0000000..16834ae --- /dev/null +++ b/scripts/config/tls_ca_certs/README.txt @@ -0,0 +1,2 @@ +Put any Certificate authority files in this directory +to be able to connect to self-signed HTTPs endpoints. diff --git a/scripts/count-flag-uniq.py b/scripts/count-flag-uniq.py index 969f77f..d64f833 100755 --- a/scripts/count-flag-uniq.py +++ b/scripts/count-flag-uniq.py @@ -1,40 +1,42 @@ -from pyspark.sql import SparkSession, Row, SQLContext +from pyspark.sql import SparkSession import pyspark.sql.functions as F -from pyspark import SparkContext import sys import yaml import os config_path = "%s/%s" % ( sys.path[0] ,"./config/config.yml") with open(config_path, 'r') as ymlfile: - cfg = yaml.load(ymlfile) + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: - RING = sys.argv[1] + RING_NAME = sys.argv[1] else: - RING = cfg["ring"] - -PATH = cfg["path"] -PROT = cfg["protocol"] - -os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' -spark = SparkSession.builder \ - .appName("Count flags uniq ring::"+RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ - .config("spark.hadoop.fs.s3a.access.key", cfg["s3"]["access_key"])\ - .config("spark.hadoop.fs.s3a.secret.key", cfg["s3"]["secret_key"])\ - .config("spark.hadoop.fs.s3a.endpoint", cfg["s3"]["endpoint"])\ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", cfg["path"]) \ - .getOrCreate() - - -files = "%s://%s/listkeys-%s.csv" % (PROT, PATH, RING) -df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(files) - -print 
df.groupBy("_c3").agg(F.countDistinct("_c1")).show() + RING_NAME = cfg["ring"] + +BUCKET = cfg["path"] +PROTO = cfg["protocol"] + +os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:3.3.4" pyspark-shell' +os.environ['AWS_JAVA_V1_DISABLE_DEPRECATION_ANNOUNCEMENT'] = 'true' + +spark_base = SparkSession.builder.appName(f"Count flags uniq ring: {RING_NAME}") # type: ignore +spark = spark_base \ + .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ + .config("spark.hadoop.fs.s3a.access.key", cfg["s3"]["access_key"])\ + .config("spark.hadoop.fs.s3a.secret.key", cfg["s3"]["secret_key"])\ + .config("spark.hadoop.fs.s3a.endpoint", cfg["s3"]["endpoint"])\ + .config("spark.executor.instances", cfg["spark.executor.instances"]) \ + .config("spark.executor.memory", cfg["spark.executor.memory"]) \ + .config("spark.executor.cores", cfg["spark.executor.cores"]) \ + .config("spark.driver.memory", cfg["spark.driver.memory"]) \ + .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ + .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ + .config("spark.local.dir", cfg["path"]) \ + .getOrCreate() + + +key_list_csv_path = f"{PROTO}://{BUCKET}/listkeys-{RING_NAME}.csv" +df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(key_list_csv_path) +df.show(10, False) + +df.groupBy("_c3").agg(F.countDistinct("_c1")).show() diff --git a/scripts/count-flag.py b/scripts/count-flag.py index b9128b1..f2eb9ea 100755 --- a/scripts/count-flag.py +++ b/scripts/count-flag.py @@ -7,7 +7,7 @@ config_path = "%s/%s" % ( sys.path[0] ,"./config/config.yml") with open(config_path, 'r') as ymlfile: - cfg = yaml.load(ymlfile) + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: RING = sys.argv[1] @@ -19,22 +19,22 @@ os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' spark = SparkSession.builder \ - .appName("Count flags ring::"+RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ - .config("spark.hadoop.fs.s3a.access.key", cfg["s3"]["access_key"])\ - .config("spark.hadoop.fs.s3a.secret.key", cfg["s3"]["secret_key"])\ - .config("spark.hadoop.fs.s3a.endpoint", cfg["s3"]["endpoint"])\ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", cfg["path"]) \ - .getOrCreate() + .appName("Count flags ring::"+RING) \ + .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\ + .config("spark.hadoop.fs.s3a.access.key", cfg["s3"]["access_key"])\ + .config("spark.hadoop.fs.s3a.secret.key", cfg["s3"]["secret_key"])\ + .config("spark.hadoop.fs.s3a.endpoint", cfg["s3"]["endpoint"])\ + .config("spark.executor.instances", cfg["spark.executor.instances"]) \ + .config("spark.executor.memory", cfg["spark.executor.memory"]) \ + .config("spark.executor.cores", cfg["spark.executor.cores"]) \ + .config("spark.driver.memory", cfg["spark.driver.memory"]) \ + .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ + .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ + 
.config("spark.local.dir", cfg["path"]) \ + .getOrCreate() files = "%s://%s/listkeys-%s.csv" % (PROT, PATH, RING) df = spark.read.format("csv").option("header", "false").option("inferSchema", "true").load(files) -print df.groupBy("_c3").agg(F.count("_c1")).show() +df.groupBy("_c3").agg(F.count("_c1")).show() diff --git a/scripts/count_ring_keys.sh b/scripts/count_ring_keys.sh index 5a31e44..1f579e1 100644 --- a/scripts/count_ring_keys.sh +++ b/scripts/count_ring_keys.sh @@ -1,7 +1,111 @@ #!/bin/bash -TOTAL=0 -ARC_REPLICA=$(awk -F',' 'BEGIN {count=0} ; !seen[$2]++ && $2 ~ "[A-F0-9]{30}50[A-F0-9]{6}30" && $4 == "0" {count++} END {print count}' *) -ARC_STRIPE=$(awk -F',' 'BEGIN {count=0} ; !seen[$2]++ && $2 ~ "[A-F0-9]{30}51[A-F0-9]{6}70" {count++} END {print count}' *) -let TOTAL+=${ARC_REPLICA}+${ARC_STRIPE} -echo "$TOTAL ring keys dumped from the listkeys.csv" \ No newline at end of file +# +# Script used to check p1 is correct. +# It can only run from within a container, with the Spark image. +# +# Authors: Francesco Triti and Grégoire Doumergue +# csv files processing inspired from https://github.com/scality/fra-private/blob/master/bash/scripts/spark-ring-keys-test/count_ring_keys_with_split.sh +# + + +CONFIG_FILE="$(dirname $0)/config/config.yml" + +if [ ! -f "${CONFIG_FILE}" ];then + + echo "${CONFIG_FILE} not found. Did you mount the scripts directory?" + exit 1 +fi + +WORKDIR="$(shyaml get-value datadir.container < ${CONFIG_FILE})" + +if [ ! -d "${WORKDIR}" ];then + echo "${WORKDIR} must be mounted by docker/ctr." + exit 1 +fi + +S3_ENDPOINT="$(shyaml get-value s3.endpoint < ${CONFIG_FILE})" +export AWS_ACCESS_KEY_ID="$(shyaml get-value s3.access_key < ${CONFIG_FILE})" +export AWS_SECRET_ACCESS_KEY="$(shyaml get-value s3.secret_key < ${CONFIG_FILE})" +ORPHANS_BUCKET="$(shyaml get-value path < ${CONFIG_FILE})" +RING="$(shyaml get-value ring < ${CONFIG_FILE})" +COS_PROTECTION="$(shyaml get-value cos_protection < ${CONFIG_FILE})" + +SUBSPLIT_KEYS=0 +ARC_REPLICA_PREFIX="ARC_R" +ARC_STRIPE_PREFIX="ARC_S" + +######## FUNCTIONS + +############################################################################### +# FUNCTION process_file +# split the input file in multiple files according to the type of +# the key and if the sub splitting is configured +############################################################################### + +process_file() { + + filename=$(basename "$1") + awk -F',' -vfilename="${filename}" -vtmpdir="${TMP_DIR}" -varprefix="${ARC_REPLICA_PREFIX}" -vasprefix="${ARC_STRIPE_PREFIX}" -vsp="${SUBSPLIT_KEYS}" '{ + if ($2 ~ "[A-F0-9]{30}50[A-F0-9]{6}'${COS_PROTECTION}'0" && $4 == "0") { + (sp == 0 ) ? f = tmpdir"/"arprefix"_"filename : f = tmpdir"/"arprefix"_"substr($2, 1, sp)"_"filename + print $2 >> f + } + if ( $2 ~ "[A-F0-9]{30}51[A-F0-9]{6}70") { + (sp == 0 ) ? 
f = tmpdir"/"asprefix"_"filename : f = tmpdir"/"asprefix"_"substr($2, 1, sp)"_"filename + print $2 >> f + } + + }' "${1}" +} + +############################################################################### +# FUNCTION dedup_keys +# print on the stdout the number of keys seen only one time per +# group of files passed as args +############################################################################### +dedup_keys() { + awk 'BEGIN {count=0} ; !u[$0]++ {count++} END {print count}' "${1}"* +} + +######### RUN + +TMP_DIR="${WORKDIR}/count_ring_keys" +mkdir "${TMP_DIR}" + +LIST_PATH="${RING}/listkeys.csv" +SOURCE="s3://${ORPHANS_BUCKET}/${LIST_PATH}/" + +# Check that listkeys.py was a success +HEAD_RESULT=$(aws --endpoint-url "${S3_ENDPOINT}" s3api head-object --bucket "${ORPHANS_BUCKET}" --key "${LIST_PATH}/_SUCCESS" | jq -r '.LastModified') + +if [ -n "${HEAD_RESULT}" ];then + echo "-- listkeys.py was a SUCCESS on ${HEAD_RESULT}" +else + echo "ERROR Couldn't find ${SOURCE}_SUCCESS. Run listkeys.py again." + exit 1 +fi + +TOTAL_COUNT=$(aws --endpoint-url "${S3_ENDPOINT}" s3 ls --recursive "${SOURCE}" | grep -cv _SUCCESS) + +echo "-- Counting RING main chunks" +COUNT=1 +for F in $(aws --endpoint-url "${S3_ENDPOINT}" s3 ls --recursive "${SOURCE}" | awk '$4 !~ /_SUCCESS/ {print $4}');do + + echo "--- [${COUNT}/${TOTAL_COUNT}] Processing ${F}" + PROCESS_FILE="${TMP_DIR}/$(basename "${F}")" + aws --endpoint-url "${S3_ENDPOINT}" s3 cp "s3://${ORPHANS_BUCKET}/${F}" "${PROCESS_FILE}" > /dev/null + process_file "${PROCESS_FILE}" && \ + rm -f "${PROCESS_FILE}" + COUNT=$(( ${COUNT} + 1 )) + +done +echo "-- Computing REPLICA count" +NB_REPLICA=$(dedup_keys "${TMP_DIR}/${ARC_REPLICA_PREFIX}") +echo "-- Computing STRIPE count" +NB_STRIPE=$(dedup_keys "${TMP_DIR}/${ARC_STRIPE_PREFIX}") +TOTAL=$(( ${NB_REPLICA} + ${NB_STRIPE} )) + +rm -fr "${TMP_DIR}" + +echo "$TOTAL ring keys dumped from the listkeys.csv" diff --git a/scripts/dig.py b/scripts/dig.py index bf15521..25bd33b 100755 --- a/scripts/dig.py +++ b/scripts/dig.py @@ -3,11 +3,11 @@ import struct def pad2(n): - x = '%s' % (n,) - return ('0' * (len(x) % 2)) + x + x = '%s' % (n,) + return ('0' * (len(x) % 2)) + x def to_bytes(h): - return binascii.unhexlify(h) + return binascii.unhexlify(h) def get_digest(name): m = hashlib.md5() @@ -55,5 +55,5 @@ def gen_md5_from_id(key): lst["7CCC4C0D687B4980E1DA5A9ADD60EC0801463820"] = "7CCC4C0D687B4980E1DA5A9ADD60EC0801463820" for i in lst: - print i, gen_md5_from_id(i) , lst[i] + print(i, gen_md5_from_id(i), lst[i]) assert gen_md5_from_id(i) == lst[i] diff --git a/scripts/export_ring_keys.sh b/scripts/export_ring_keys.sh index dac7cfc..b4ef826 100644 --- a/scripts/export_ring_keys.sh +++ b/scripts/export_ring_keys.sh @@ -2,6 +2,12 @@ RING=$1 BUCKET=$2 +# +# This script is an alternative to the Spark script listkey.py +# It needs aws-cli installed and configured +# +# + h=$(hostname) for NODE in ${RING}-${h}-n{1..6} do diff --git a/scripts/listkey.py b/scripts/listkey.py index 33cf0d7..d086228 100755 --- a/scripts/listkey.py +++ b/scripts/listkey.py @@ -1,22 +1,25 @@ +""" +WARNING: does not work on python 3.9+ due to a change in elementtree used in supervisor.py +This script is used to generate a list of keys in a ring. + +Step 1. The driver reads the configuration from config.yml, sets the variables and get the list of nodes from the supervisor. +Step 2. The driver creates a Spark session and reads the list of nodes into a DataFrame to store the nodes's keys into a CSV file. 
+""" + import os import sys import shutil import requests import time -requests.packages.urllib3.disable_warnings() - -from pyspark.sql import SparkSession, Row, SQLContext -from pyspark import SparkContext - +import yaml +import ssl +from pyspark.sql import SparkSession from scality.supervisor import Supervisor -from scality.daemon import DaemonFactory , ScalFactoryExceptionTypeNotFound -from scality.key import Key -from scality.storelib.storeutils import uks_parse +from scality.daemon import DaemonFactory +from scality.node import Node +requests.packages.urllib3.disable_warnings() # type: ignore -import yaml - -import ssl try: _create_unverified_https_context = ssl._create_unverified_context except AttributeError: @@ -27,14 +30,15 @@ ssl._create_default_https_context = _create_unverified_https_context -config_path = "%s/%s" % ( sys.path[0] ,"config/config.yml") +config_path = "%s/%s" % ( os.getcwd(),"config/config.yml") +print(f"config_path: {config_path}") with open(config_path, 'r') as ymlfile: - cfg = yaml.load(ymlfile) + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) if len(sys.argv) >1: - RING = sys.argv[1] + RING_NAME = sys.argv[1] else: - RING = cfg["ring"] + RING_NAME = cfg["ring"] # CLI and Config derived arguments using CAPITALS USER = cfg["sup"]["login"] @@ -46,57 +50,115 @@ SECRET_KEY = cfg["s3"]["secret_key"] ENDPOINT_URL = cfg["s3"]["endpoint"] RETENTION = cfg.get("retention", 604800) -PATH = "%s/%s/listkeys.csv" % (CPATH, RING) PARTITIONS = int(cfg["spark.executor.instances"]) * int(cfg["spark.executor.cores"]) -files = "%s://%s/%s/listkeys.csv" % (PROTOCOL, CPATH, RING) - -spark = SparkSession.builder.appName("Generate Listkeys ring:" + RING) \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ - .config("spark.hadoop.fs.s3a.access.key", ACCESS_KEY) \ - .config("spark.hadoop.fs.s3a.secret.key", SECRET_KEY) \ - .config("spark.hadoop.fs.s3a.endpoint", ENDPOINT_URL) \ - .config("spark.executor.instances", cfg["spark.executor.instances"]) \ - .config("spark.executor.memory", cfg["spark.executor.memory"]) \ - .config("spark.executor.cores", cfg["spark.executor.cores"]) \ - .config("spark.driver.memory", cfg["spark.driver.memory"]) \ - .config("spark.memory.offHeap.enabled", cfg["spark.memory.offHeap.enabled"]) \ - .config("spark.memory.offHeap.size", cfg["spark.memory.offHeap.size"]) \ - .config("spark.local.dir", cfg["path"]) \ - .getOrCreate() - -# s3 = s3fs.S3FileSystem(anon=False, key=ACCESS_KEY, secret=SECRET_KEY, client_kwargs={'endpoint_url': ENDPOINT_URL}) -os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.3" pyspark-shell' - -def prepare_path(): - try: - shutil.rmtree(PATH) - except: - pass - if not os.path.exists(PATH): - os.makedirs(PATH) +files = "%s://%s/%s/listkeys.csv" % (PROTOCOL, CPATH, RING_NAME) + +spark_base = SparkSession.builder.appName("Generate Listkeys ring:" + RING_NAME) # type: ignore +spark = spark_base.getOrCreate() + +# Create an accumulator for the grand total of keys +grand_total_keys = spark.sparkContext.accumulator(0) + def listkeys(row, now): + print(f" -- Entering listkeys function") + print(f" -- Row: {row}") klist = [] - n = DaemonFactory().get_daemon("node", login=USER, passwd=PASSWORD, url='https://{0}:{1}'.format(row.ip, row.adminport), chord_addr=row.ip, chord_port=row.chordport, dso=RING) - params = { "mtime_min":"123456789", "mtime_max":now, "loadmetadata":"browse"} - for k in n.listKeysIter(extra_params=params): - if len(k.split(",")[0]) > 30 : - klist.append([ 
k.rstrip().split(',')[i] for i in [0,1,2,3] ]) - # data = [ k.rstrip().split(',')[i] for i in [0,1,2,3] ] - # data = ",".join(data) - # print >> f, data - print [( row.ip, row.adminport, 'OK')] + + try: + n = DaemonFactory().get_daemon("node", login=USER, passwd=PASSWORD, url=f'https://{row.ip}:{row.adminport}', chord_addr=row.ip, chord_port=row.chordport, dso=RING_NAME,) + if not isinstance(n, Node): + print(f" -- Error: {n}") + raise TypeError(f"Expected `Node` instance, got {type(n)}") + + print(f" -- n : {n}") + + # Never remove the "loadmetadata":"browse" if you plan to filter on the *time* keys + # params = {"mtime_min": "1738400902", "mtime_max": now, "loadmetadata":"browse"} + params = {} + + count = 0 + for k in n.listKeysIter(extra_params=params): + try: + if len(k.split(",")[0]) > 30: + klist.append([k.rstrip().split(',')[i] for i in [0, 1, 2, 3]]) + count += 1 + + # Increment the accumulator for each key + grand_total_keys.add(1) + + # Sleep to not overload the nodes + if count % 50000 == 0: + time.sleep(0.1) + + except IndexError: + print(f"Malformed key: {k}") + + print(f"Node {row.ip}:{row.adminport} - keys found: {len(klist)}") + + except Exception as e: + print(f"Error processing node {row.ip}:{row.adminport}: {e}") + return klist +# We try to not get the latest keys, please set the retention to at least 1 day. now = int(str(time.time()).split('.')[0]) - RETENTION -prepare_path() + s = Supervisor(url=URL, login=USER, passwd=PASSWORD) -listm = sorted(s.supervisorConfigDso(dsoname=RING)['nodes']) +config_dict = s.supervisorConfigDso(dsoname=RING_NAME) + +if not isinstance(config_dict, dict): + raise TypeError(f"Expected `dict` instance, got {type(config_dict)} for {config_dict}") + +if "nodes" not in config_dict: + raise ValueError(f"Error: no nodes found in the configuration file.") + +NodeKeyFields = ["name", "ip", "chordport", "adminport"] +config_dict = [ { key:node[key] for key in NodeKeyFields } for node in config_dict['nodes'] ] + +listm = sorted(config_dict, key=lambda x: x['name']) +print(f"List of RING nodes: {listm}") + df = spark.createDataFrame(listm) -print df.show(36, False) +print(f"Number of RING nodes: {df.count()}") + +### +# You may want to force the number of partitions to the number of nodes (Optional). +### + +df = df.repartition(PARTITIONS) +# df = df.repartition(int(df.count())) + +# print(df.explain(True)) # Debug line to check the execution plan +print(f"df show 50:") +df.show(50, False) + +key_count = df.count() +if key_count == 0: + print("DataFrame is empty. Exiting.") + exit(1) +else: + print(f"DataFrame count: {key_count}") + +### +# The code above is executed on the "driver" only +# Starting this point, the following code is executed on the "workers" +### + +# Map the listkeys function over the RDD +dfklist = df.rdd.map(lambda x:listkeys(x, now)) + +# Trigger an action to ensure the RDD is processed +try: + print(f"rdd count before collect: {dfklist.count()}") + # print(" -- Keys collected:", dfklist.collect()) # dangerous debugging line to check if any keys are collected, It can happen if RETENTION is too high for example. 
+except Exception as e: + print(f" -- Error printing the keys collected: {e}") + +# Print the grand total of keys on the driver +print(f"Grand total of keys listed: {grand_total_keys.value}") -dfnew = df.repartition(36) -dfklist = dfnew.rdd.map(lambda x:listkeys(x, now)) +# Flatten the RDD and write the results to a CSV dfklist = dfklist.flatMap(lambda x: x).toDF() dfklist.write.format("csv").mode("overwrite").options(header="true").save(files) diff --git a/scripts/offline-archive-setup.sh b/scripts/offline-archive-setup.sh deleted file mode 100644 index 94327cc..0000000 --- a/scripts/offline-archive-setup.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -if [[ -z "${SUDO}" ]] ; then - if [[ $EUID -ne 0 ]]; then - echo -e "This script is not being run as root. We will preface each" - echo -e "command with sudo. If this user does not have sudo permissions" - echo -e "extract the offline archive as root." - export sudo=sudo - else - export sudo='' - fi -else - echo "SUDO Enironment variable set. Using sudo to execute all commands." -fi - -${sudo} cd ../ && mv ./staging /var/tmp/ -${sudo} tar -C /root -x -v -f /var/tmp/staging/spark-repo.tar spark/ansible -${sudo} echo -e "spark-offline-archive.run has extracted itself.\n" -${sudo} echo -e "To continue with the staged deployment cd into ~/spark/ansible" -${sudo} echo -e "and execute run.yml without the staging tag." diff --git a/scripts/scality-0.1-py3-none-any.whl b/scripts/scality-0.1-py3-none-any.whl new file mode 100644 index 0000000..fb9f5d9 Binary files /dev/null and b/scripts/scality-0.1-py3-none-any.whl differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/LICENSE b/scripts/spark-2.4.3-bin-hadoop2.7/LICENSE deleted file mode 100644 index 1346f06..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/LICENSE +++ /dev/null @@ -1,517 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- ------------------------------------------------------------------------------------- -This project bundles some components that are also licensed under the Apache -License Version 2.0: - -commons-beanutils:commons-beanutils -org.apache.zookeeper:zookeeper -oro:oro -commons-configuration:commons-configuration -commons-digester:commons-digester -com.chuusai:shapeless_2.11 -com.googlecode.javaewah:JavaEWAH -com.twitter:chill-java -com.twitter:chill_2.11 -com.univocity:univocity-parsers -javax.jdo:jdo-api -joda-time:joda-time -net.sf.opencsv:opencsv -org.apache.derby:derby -org.objenesis:objenesis -org.roaringbitmap:RoaringBitmap -org.scalanlp:breeze-macros_2.11 -org.scalanlp:breeze_2.11 -org.typelevel:macro-compat_2.11 -org.yaml:snakeyaml -org.apache.xbean:xbean-asm5-shaded -com.squareup.okhttp3:logging-interceptor -com.squareup.okhttp3:okhttp -com.squareup.okio:okio -org.apache.spark:spark-catalyst_2.11 -org.apache.spark:spark-kvstore_2.11 -org.apache.spark:spark-launcher_2.11 -org.apache.spark:spark-mllib-local_2.11 -org.apache.spark:spark-network-common_2.11 -org.apache.spark:spark-network-shuffle_2.11 -org.apache.spark:spark-sketch_2.11 -org.apache.spark:spark-tags_2.11 -org.apache.spark:spark-unsafe_2.11 -commons-httpclient:commons-httpclient -com.vlkan:flatbuffers -com.ning:compress-lzf -io.airlift:aircompressor -io.dropwizard.metrics:metrics-core -io.dropwizard.metrics:metrics-ganglia -io.dropwizard.metrics:metrics-graphite -io.dropwizard.metrics:metrics-json -io.dropwizard.metrics:metrics-jvm -org.iq80.snappy:snappy -com.clearspring.analytics:stream -com.jamesmurty.utils:java-xmlbuilder -commons-codec:commons-codec -commons-collections:commons-collections -io.fabric8:kubernetes-client -io.fabric8:kubernetes-model -io.netty:netty -io.netty:netty-all -net.hydromatic:eigenbase-properties -net.sf.supercsv:super-csv -org.apache.arrow:arrow-format -org.apache.arrow:arrow-memory -org.apache.arrow:arrow-vector -org.apache.calcite:calcite-avatica -org.apache.calcite:calcite-core -org.apache.calcite:calcite-linq4j -org.apache.commons:commons-crypto -org.apache.commons:commons-lang3 -org.apache.hadoop:hadoop-annotations -org.apache.hadoop:hadoop-auth -org.apache.hadoop:hadoop-client -org.apache.hadoop:hadoop-common -org.apache.hadoop:hadoop-hdfs -org.apache.hadoop:hadoop-mapreduce-client-app -org.apache.hadoop:hadoop-mapreduce-client-common -org.apache.hadoop:hadoop-mapreduce-client-core -org.apache.hadoop:hadoop-mapreduce-client-jobclient -org.apache.hadoop:hadoop-mapreduce-client-shuffle -org.apache.hadoop:hadoop-yarn-api -org.apache.hadoop:hadoop-yarn-client -org.apache.hadoop:hadoop-yarn-common -org.apache.hadoop:hadoop-yarn-server-common -org.apache.hadoop:hadoop-yarn-server-web-proxy -org.apache.httpcomponents:httpclient -org.apache.httpcomponents:httpcore -org.apache.orc:orc-core -org.apache.orc:orc-mapreduce -org.mortbay.jetty:jetty -org.mortbay.jetty:jetty-util -com.jolbox:bonecp -org.json4s:json4s-ast_2.11 -org.json4s:json4s-core_2.11 -org.json4s:json4s-jackson_2.11 -org.json4s:json4s-scalap_2.11 -com.carrotsearch:hppc -com.fasterxml.jackson.core:jackson-annotations -com.fasterxml.jackson.core:jackson-core -com.fasterxml.jackson.core:jackson-databind -com.fasterxml.jackson.dataformat:jackson-dataformat-yaml -com.fasterxml.jackson.module:jackson-module-jaxb-annotations -com.fasterxml.jackson.module:jackson-module-paranamer -com.fasterxml.jackson.module:jackson-module-scala_2.11 -com.github.mifmif:generex -com.google.code.findbugs:jsr305 -com.google.code.gson:gson 
-com.google.inject:guice -com.google.inject.extensions:guice-servlet -com.twitter:parquet-hadoop-bundle -commons-cli:commons-cli -commons-dbcp:commons-dbcp -commons-io:commons-io -commons-lang:commons-lang -commons-logging:commons-logging -commons-net:commons-net -commons-pool:commons-pool -io.fabric8:zjsonpatch -javax.inject:javax.inject -javax.validation:validation-api -log4j:apache-log4j-extras -log4j:log4j -net.sf.jpam:jpam -org.apache.avro:avro -org.apache.avro:avro-ipc -org.apache.avro:avro-mapred -org.apache.commons:commons-compress -org.apache.commons:commons-math3 -org.apache.curator:curator-client -org.apache.curator:curator-framework -org.apache.curator:curator-recipes -org.apache.directory.api:api-asn1-api -org.apache.directory.api:api-util -org.apache.directory.server:apacheds-i18n -org.apache.directory.server:apacheds-kerberos-codec -org.apache.htrace:htrace-core -org.apache.ivy:ivy -org.apache.mesos:mesos -org.apache.parquet:parquet-column -org.apache.parquet:parquet-common -org.apache.parquet:parquet-encoding -org.apache.parquet:parquet-format -org.apache.parquet:parquet-hadoop -org.apache.parquet:parquet-jackson -org.apache.thrift:libfb303 -org.apache.thrift:libthrift -org.codehaus.jackson:jackson-core-asl -org.codehaus.jackson:jackson-mapper-asl -org.datanucleus:datanucleus-api-jdo -org.datanucleus:datanucleus-core -org.datanucleus:datanucleus-rdbms -org.lz4:lz4-java -org.spark-project.hive:hive-beeline -org.spark-project.hive:hive-cli -org.spark-project.hive:hive-exec -org.spark-project.hive:hive-jdbc -org.spark-project.hive:hive-metastore -org.xerial.snappy:snappy-java -stax:stax-api -xerces:xercesImpl -org.codehaus.jackson:jackson-jaxrs -org.codehaus.jackson:jackson-xc -org.eclipse.jetty:jetty-client -org.eclipse.jetty:jetty-continuation -org.eclipse.jetty:jetty-http -org.eclipse.jetty:jetty-io -org.eclipse.jetty:jetty-jndi -org.eclipse.jetty:jetty-plus -org.eclipse.jetty:jetty-proxy -org.eclipse.jetty:jetty-security -org.eclipse.jetty:jetty-server -org.eclipse.jetty:jetty-servlet -org.eclipse.jetty:jetty-servlets -org.eclipse.jetty:jetty-util -org.eclipse.jetty:jetty-webapp -org.eclipse.jetty:jetty-xml - -core/src/main/java/org/apache/spark/util/collection/TimSort.java -core/src/main/resources/org/apache/spark/ui/static/bootstrap* -core/src/main/resources/org/apache/spark/ui/static/jsonFormatter* -core/src/main/resources/org/apache/spark/ui/static/vis* -docs/js/vendor/bootstrap.js - - ------------------------------------------------------------------------------------- -This product bundles various third-party components under other open source licenses. -This section summarizes those components and their licenses. See licenses-binary/ -for text of these licenses. 
- - -BSD 2-Clause ------------- - -com.github.luben:zstd-jni -javolution:javolution -com.esotericsoftware:kryo-shaded -com.esotericsoftware:minlog -com.esotericsoftware:reflectasm -com.google.protobuf:protobuf-java -org.codehaus.janino:commons-compiler -org.codehaus.janino:janino -jline:jline -org.jodd:jodd-core - - -BSD 3-Clause ------------- - -dk.brics.automaton:automaton -org.antlr:antlr-runtime -org.antlr:ST4 -org.antlr:stringtemplate -org.antlr:antlr4-runtime -antlr:antlr -com.github.fommil.netlib:core -com.thoughtworks.paranamer:paranamer -org.scala-lang:scala-compiler -org.scala-lang:scala-library -org.scala-lang:scala-reflect -org.scala-lang.modules:scala-parser-combinators_2.11 -org.scala-lang.modules:scala-xml_2.11 -org.fusesource.leveldbjni:leveldbjni-all -net.sourceforge.f2j:arpack_combined_all -xmlenc:xmlenc -net.sf.py4j:py4j -org.jpmml:pmml-model -org.jpmml:pmml-schema - -python/lib/py4j-*-src.zip -python/pyspark/cloudpickle.py -python/pyspark/join.py -core/src/main/resources/org/apache/spark/ui/static/d3.min.js - -The CSS style for the navigation sidebar of the documentation was originally -submitted by Óscar Nájera for the scikit-learn project. The scikit-learn project -is distributed under the 3-Clause BSD license. - - -MIT License ------------ - -org.spire-math:spire-macros_2.11 -org.spire-math:spire_2.11 -org.typelevel:machinist_2.11 -net.razorvine:pyrolite -org.slf4j:jcl-over-slf4j -org.slf4j:jul-to-slf4j -org.slf4j:slf4j-api -org.slf4j:slf4j-log4j12 -com.github.scopt:scopt_2.11 - -core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js -core/src/main/resources/org/apache/spark/ui/static/*dataTables* -core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js -ore/src/main/resources/org/apache/spark/ui/static/jquery* -core/src/main/resources/org/apache/spark/ui/static/sorttable.js -docs/js/vendor/anchor.min.js -docs/js/vendor/jquery* -docs/js/vendor/modernizer* - - -Common Development and Distribution License (CDDL) 1.0 ------------------------------------------------------- - -javax.activation:activation http://www.oracle.com/technetwork/java/javase/tech/index-jsp-138795.html -javax.xml.stream:stax-api https://jcp.org/en/jsr/detail?id=173 - - -Common Development and Distribution License (CDDL) 1.1 ------------------------------------------------------- - -javax.annotation:javax.annotation-api https://jcp.org/en/jsr/detail?id=250 -javax.servlet:javax.servlet-api https://javaee.github.io/servlet-spec/ -javax.transaction:jta http://www.oracle.com/technetwork/java/index.html -javax.ws.rs:javax.ws.rs-api https://github.com/jax-rs -javax.xml.bind:jaxb-api https://github.com/javaee/jaxb-v2 -org.glassfish.hk2:hk2-api https://github.com/javaee/glassfish -org.glassfish.hk2:hk2-locator (same) -org.glassfish.hk2:hk2-utils -org.glassfish.hk2:osgi-resource-locator -org.glassfish.hk2.external:aopalliance-repackaged -org.glassfish.hk2.external:javax.inject -org.glassfish.jersey.bundles.repackaged:jersey-guava -org.glassfish.jersey.containers:jersey-container-servlet -org.glassfish.jersey.containers:jersey-container-servlet-core -org.glassfish.jersey.core:jersey-client -org.glassfish.jersey.core:jersey-common -org.glassfish.jersey.core:jersey-server -org.glassfish.jersey.media:jersey-media-jaxb - - -Mozilla Public License (MPL) 1.1 --------------------------------- - -com.github.rwl:jtransforms https://sourceforge.net/projects/jtransforms/ - - -Python Software Foundation License ----------------------------------- - -pyspark/heapq3.py - - -Public Domain 
-------------- - -aopalliance:aopalliance -net.iharder:base64 -org.tukaani:xz - - -Creative Commons CC0 1.0 Universal Public Domain Dedication ------------------------------------------------------------ -(see LICENSE-CC0.txt) - -data/mllib/images/kittens/29.5.a_b_EGDP022204.jpg -data/mllib/images/kittens/54893.jpg -data/mllib/images/kittens/DP153539.jpg -data/mllib/images/kittens/DP802813.jpg -data/mllib/images/multi-channel/chr30.4.184.jpg diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/NOTICE b/scripts/spark-2.4.3-bin-hadoop2.7/NOTICE deleted file mode 100644 index b707c43..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/NOTICE +++ /dev/null @@ -1,1174 +0,0 @@ -Apache Spark -Copyright 2014 and onwards The Apache Software Foundation. - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - -Export Control Notice ---------------------- - -This distribution includes cryptographic software. The country in which you currently reside may have -restrictions on the import, possession, use, and/or re-export to another country, of encryption software. -BEFORE using any encryption software, please check your country's laws, regulations and policies concerning -the import, possession, or use, and re-export of encryption software, to see if this is permitted. See -<http://www.wassenaar.org/> for more information. - -The U.S. Government Department of Commerce, Bureau of Industry and Security (BIS), has classified this -software as Export Commodity Control Number (ECCN) 5D002.C.1, which includes information security software -using or performing cryptographic functions with asymmetric algorithms. The form and manner of this Apache -Software Foundation distribution makes it eligible for export under the License Exception ENC Technology -Software Unrestricted (TSU) exception (see the BIS Export Administration Regulations, Section 740.13) for -both object code and source code. - -The following provides more details on the included cryptographic software: - -This software uses Apache Commons Crypto (https://commons.apache.org/proper/commons-crypto/) to -support authentication, and encryption and decryption of data sent across the network between -services. - - -// ------------------------------------------------------------------ -// NOTICE file corresponding to the section 4d of The Apache License, -// Version 2.0, in this case for -// ------------------------------------------------------------------ - -Hive Beeline -Copyright 2016 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - -Apache Avro -Copyright 2009-2014 The Apache Software Foundation - -This product currently only contains code developed by authors -of specific components, as identified by the source code files; -if such notes are missing files have been created by -Tatu Saloranta. - -For additional credits (generally to people who reported problems) -see CREDITS file. - -Apache Commons Compress -Copyright 2002-2012 The Apache Software Foundation - -This product includes software developed by -The Apache Software Foundation (http://www.apache.org/).
- -Apache Avro Mapred API -Copyright 2009-2014 The Apache Software Foundation - -Apache Avro IPC -Copyright 2009-2014 The Apache Software Foundation - -Objenesis -Copyright 2006-2013 Joe Walnes, Henri Tremblay, Leonardo Mesquita - -Apache XBean :: ASM 5 shaded (repackaged) -Copyright 2005-2015 The Apache Software Foundation - --------------------------------------- - -This product includes software developed at -OW2 Consortium (http://asm.ow2.org/) - -This product includes software developed by The Apache Software -Foundation (http://www.apache.org/). - -The binary distribution of this product bundles binaries of -org.iq80.leveldb:leveldb-api (https://github.com/dain/leveldb), which has the -following notices: -* Copyright 2011 Dain Sundstrom -* Copyright 2011 FuseSource Corp. http://fusesource.com - -The binary distribution of this product bundles binaries of -org.fusesource.hawtjni:hawtjni-runtime (https://github.com/fusesource/hawtjni), -which has the following notices: -* This product includes software developed by FuseSource Corp. - http://fusesource.com -* This product includes software developed at - Progress Software Corporation and/or its subsidiaries or affiliates. -* This product includes software developed by IBM Corporation and others. - -The binary distribution of this product bundles binaries of -Gson 2.2.4, -which has the following notices: - - The Netty Project - ================= - -Please visit the Netty web site for more information: - - * http://netty.io/ - -Copyright 2014 The Netty Project - -The Netty Project licenses this file to you under the Apache License, -version 2.0 (the "License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at: - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -License for the specific language governing permissions and limitations -under the License. - -Also, please refer to each LICENSE.<component>.txt file, which is located in -the 'license' directory of the distribution file, for the license terms of the -components that this product depends on. - -------------------------------------------------------------------------------- -This product contains the extensions to Java Collections Framework which has -been derived from the works by JSR-166 EG, Doug Lea, and Jason T.
Greene: - - * LICENSE: - * license/LICENSE.jsr166y.txt (Public Domain) - * HOMEPAGE: - * http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/ - * http://viewvc.jboss.org/cgi-bin/viewvc.cgi/jbosscache/experimental/jsr166/ - -This product contains a modified version of Robert Harder's Public Domain -Base64 Encoder and Decoder, which can be obtained at: - - * LICENSE: - * license/LICENSE.base64.txt (Public Domain) - * HOMEPAGE: - * http://iharder.sourceforge.net/current/java/base64/ - -This product contains a modified portion of 'Webbit', an event based -WebSocket and HTTP server, which can be obtained at: - - * LICENSE: - * license/LICENSE.webbit.txt (BSD License) - * HOMEPAGE: - * https://github.com/joewalnes/webbit - -This product contains a modified portion of 'SLF4J', a simple logging -facade for Java, which can be obtained at: - - * LICENSE: - * license/LICENSE.slf4j.txt (MIT License) - * HOMEPAGE: - * http://www.slf4j.org/ - -This product contains a modified portion of 'ArrayDeque', written by Josh -Bloch of Google, Inc: - - * LICENSE: - * license/LICENSE.deque.txt (Public Domain) - -This product contains a modified portion of 'Apache Harmony', an open source -Java SE, which can be obtained at: - - * LICENSE: - * license/LICENSE.harmony.txt (Apache License 2.0) - * HOMEPAGE: - * http://archive.apache.org/dist/harmony/ - -This product contains a modified version of Roland Kuhn's ASL2 -AbstractNodeQueue, which is based on Dmitriy Vyukov's non-intrusive MPSC queue. -It can be obtained at: - - * LICENSE: - * license/LICENSE.abstractnodequeue.txt (Public Domain) - * HOMEPAGE: - * https://github.com/akka/akka/blob/wip-2.2.3-for-scala-2.11/akka-actor/src/main/java/akka/dispatch/AbstractNodeQueue.java - -This product contains a modified portion of 'jbzip2', a Java bzip2 compression -and decompression library written by Matthew J. Francis. It can be obtained at: - - * LICENSE: - * license/LICENSE.jbzip2.txt (MIT License) - * HOMEPAGE: - * https://code.google.com/p/jbzip2/ - -This product contains a modified portion of 'libdivsufsort', a C API library to construct -the suffix array and the Burrows-Wheeler transformed string for any input string of -a constant-size alphabet written by Yuta Mori. It can be obtained at: - - * LICENSE: - * license/LICENSE.libdivsufsort.txt (MIT License) - * HOMEPAGE: - * https://code.google.com/p/libdivsufsort/ - -This product contains a modified portion of Nitsan Wakart's 'JCTools', Java Concurrency Tools for the JVM, - which can be obtained at: - - * LICENSE: - * license/LICENSE.jctools.txt (ASL2 License) - * HOMEPAGE: - * https://github.com/JCTools/JCTools - -This product optionally depends on 'JZlib', a re-implementation of zlib in -pure Java, which can be obtained at: - - * LICENSE: - * license/LICENSE.jzlib.txt (BSD style License) - * HOMEPAGE: - * http://www.jcraft.com/jzlib/ - -This product optionally depends on 'Compress-LZF', a Java library for encoding and -decoding data in LZF format, written by Tatu Saloranta. It can be obtained at: - - * LICENSE: - * license/LICENSE.compress-lzf.txt (Apache License 2.0) - * HOMEPAGE: - * https://github.com/ning/compress - -This product optionally depends on 'lz4', a LZ4 Java compression -and decompression library written by Adrien Grand. 
It can be obtained at: - - * LICENSE: - * license/LICENSE.lz4.txt (Apache License 2.0) - * HOMEPAGE: - * https://github.com/jpountz/lz4-java - -This product optionally depends on 'lzma-java', a LZMA Java compression -and decompression library, which can be obtained at: - - * LICENSE: - * license/LICENSE.lzma-java.txt (Apache License 2.0) - * HOMEPAGE: - * https://github.com/jponge/lzma-java - -This product contains a modified portion of 'jfastlz', a Java port of FastLZ compression -and decompression library written by William Kinney. It can be obtained at: - - * LICENSE: - * license/LICENSE.jfastlz.txt (MIT License) - * HOMEPAGE: - * https://code.google.com/p/jfastlz/ - -This product contains a modified portion of and optionally depends on 'Protocol Buffers', Google's data -interchange format, which can be obtained at: - - * LICENSE: - * license/LICENSE.protobuf.txt (New BSD License) - * HOMEPAGE: - * http://code.google.com/p/protobuf/ - -This product optionally depends on 'Bouncy Castle Crypto APIs' to generate -a temporary self-signed X.509 certificate when the JVM does not provide the -equivalent functionality. It can be obtained at: - - * LICENSE: - * license/LICENSE.bouncycastle.txt (MIT License) - * HOMEPAGE: - * http://www.bouncycastle.org/ - -This product optionally depends on 'Snappy', a compression library produced -by Google Inc, which can be obtained at: - - * LICENSE: - * license/LICENSE.snappy.txt (New BSD License) - * HOMEPAGE: - * http://code.google.com/p/snappy/ - -This product optionally depends on 'JBoss Marshalling', an alternative Java -serialization API, which can be obtained at: - - * LICENSE: - * license/LICENSE.jboss-marshalling.txt (GNU LGPL 2.1) - * HOMEPAGE: - * http://www.jboss.org/jbossmarshalling - -This product optionally depends on 'Caliper', Google's micro- -benchmarking framework, which can be obtained at: - - * LICENSE: - * license/LICENSE.caliper.txt (Apache License 2.0) - * HOMEPAGE: - * http://code.google.com/p/caliper/ - -This product optionally depends on 'Apache Commons Logging', a logging -framework, which can be obtained at: - - * LICENSE: - * license/LICENSE.commons-logging.txt (Apache License 2.0) - * HOMEPAGE: - * http://commons.apache.org/logging/ - -This product optionally depends on 'Apache Log4J', a logging framework, which -can be obtained at: - - * LICENSE: - * license/LICENSE.log4j.txt (Apache License 2.0) - * HOMEPAGE: - * http://logging.apache.org/log4j/ - -This product optionally depends on 'Aalto XML', an ultra-high performance -non-blocking XML processor, which can be obtained at: - - * LICENSE: - * license/LICENSE.aalto-xml.txt (Apache License 2.0) - * HOMEPAGE: - * http://wiki.fasterxml.com/AaltoHome - -This product contains a modified version of 'HPACK', a Java implementation of -the HTTP/2 HPACK algorithm written by Twitter. 
It can be obtained at: - - * LICENSE: - * license/LICENSE.hpack.txt (Apache License 2.0) - * HOMEPAGE: - * https://github.com/twitter/hpack - -This product contains a modified portion of 'Apache Commons Lang', a Java library -provides utilities for the java.lang API, which can be obtained at: - - * LICENSE: - * license/LICENSE.commons-lang.txt (Apache License 2.0) - * HOMEPAGE: - * https://commons.apache.org/proper/commons-lang/ - -The binary distribution of this product bundles binaries of -Commons Codec 1.4, -which has the following notices: - * src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.javacontains test data from http://aspell.net/test/orig/batch0.tab.Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org) - =============================================================================== - The content of package org.apache.commons.codec.language.bm has been translated - from the original php source code available at http://stevemorse.org/phoneticinfo.htm - with permission from the original authors. - Original source copyright:Copyright (c) 2008 Alexander Beider & Stephen P. Morse. - -The binary distribution of this product bundles binaries of -Commons Lang 2.6, -which has the following notices: - * This product includes software from the Spring Framework,under the Apache License 2.0 (see: StringUtils.containsWhitespace()) - -The binary distribution of this product bundles binaries of -Apache Log4j 1.2.17, -which has the following notices: - * ResolverUtil.java - Copyright 2005-2006 Tim Fennell - Dumbster SMTP test server - Copyright 2004 Jason Paul Kitchen - TypeUtil.java - Copyright 2002-2012 Ramnivas Laddad, Juergen Hoeller, Chris Beams - -The binary distribution of this product bundles binaries of -Jetty 6.1.26, -which has the following notices: - * ============================================================== - Jetty Web Container - Copyright 1995-2016 Mort Bay Consulting Pty Ltd. - ============================================================== - - The Jetty Web Container is Copyright Mort Bay Consulting Pty Ltd - unless otherwise noted. - - Jetty is dual licensed under both - - * The Apache 2.0 License - http://www.apache.org/licenses/LICENSE-2.0.html - - and - - * The Eclipse Public 1.0 License - http://www.eclipse.org/legal/epl-v10.html - - Jetty may be distributed under either license. - - ------ - Eclipse - - The following artifacts are EPL. - * org.eclipse.jetty.orbit:org.eclipse.jdt.core - - The following artifacts are EPL and ASL2. - * org.eclipse.jetty.orbit:javax.security.auth.message - - The following artifacts are EPL and CDDL 1.0. - * org.eclipse.jetty.orbit:javax.mail.glassfish - - ------ - Oracle - - The following artifacts are CDDL + GPLv2 with classpath exception. - https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html - - * javax.servlet:javax.servlet-api - * javax.annotation:javax.annotation-api - * javax.transaction:javax.transaction-api - * javax.websocket:javax.websocket-api - - ------ - Oracle OpenJDK - - If ALPN is used to negotiate HTTP/2 connections, then the following - artifacts may be included in the distribution or downloaded when ALPN - module is selected. - - * java.sun.security.ssl - - These artifacts replace/modify OpenJDK classes. The modififications - are hosted at github and both modified and original are under GPL v2 with - classpath exceptions. 
- http://openjdk.java.net/legal/gplv2+ce.html - - ------ - OW2 - - The following artifacts are licensed by the OW2 Foundation according to the - terms of http://asm.ow2.org/license.html - - org.ow2.asm:asm-commons - org.ow2.asm:asm - - ------ - Apache - - The following artifacts are ASL2 licensed. - - org.apache.taglibs:taglibs-standard-spec - org.apache.taglibs:taglibs-standard-impl - - ------ - MortBay - - The following artifacts are ASL2 licensed. Based on selected classes from - following Apache Tomcat jars, all ASL2 licensed. - - org.mortbay.jasper:apache-jsp - org.apache.tomcat:tomcat-jasper - org.apache.tomcat:tomcat-juli - org.apache.tomcat:tomcat-jsp-api - org.apache.tomcat:tomcat-el-api - org.apache.tomcat:tomcat-jasper-el - org.apache.tomcat:tomcat-api - org.apache.tomcat:tomcat-util-scan - org.apache.tomcat:tomcat-util - - org.mortbay.jasper:apache-el - org.apache.tomcat:tomcat-jasper-el - org.apache.tomcat:tomcat-el-api - - ------ - Mortbay - - The following artifacts are CDDL + GPLv2 with classpath exception. - - https://glassfish.dev.java.net/nonav/public/CDDL+GPL.html - - org.eclipse.jetty.toolchain:jetty-schemas - - ------ - Assorted - - The UnixCrypt.java code implements the one way cryptography used by - Unix systems for simple password protection. Copyright 1996 Aki Yoshida, - modified April 2001 by Iris Van den Broeke, Daniel Deville. - Permission to use, copy, modify and distribute UnixCrypt - for non-commercial or commercial purposes and without fee is - granted provided that the copyright notice appears in all copies./ - -The binary distribution of this product bundles binaries of -Snappy for Java 1.0.4.1, -which has the following notices: - * This product includes software developed by Google - Snappy: http://code.google.com/p/snappy/ (New BSD License) - - This product includes software developed by Apache - PureJavaCrc32C from apache-hadoop-common http://hadoop.apache.org/ - (Apache 2.0 license) - - This library contains statically linked libstdc++. This inclusion is allowed by - "GCC RUntime Library Exception" - http://gcc.gnu.org/onlinedocs/libstdc++/manual/license.html - - == Contributors == - * Tatu Saloranta - * Providing benchmark suite - * Alec Wysoker - * Performance and memory usage improvement - -The binary distribution of this product bundles binaries of -Xerces2 Java Parser 2.9.1, -which has the following notices: - * ========================================================================= - == NOTICE file corresponding to section 4(d) of the Apache License, == - == Version 2.0, in this case for the Apache Xerces Java distribution. == - ========================================================================= - - Apache Xerces Java - Copyright 1999-2007 The Apache Software Foundation - - This product includes software developed at - The Apache Software Foundation (http://www.apache.org/). - - Portions of this software were originally based on the following: - - software copyright (c) 1999, IBM Corporation., http://www.ibm.com. - - software copyright (c) 1999, Sun Microsystems., http://www.sun.com. - - voluntary contributions made by Paul Eng on behalf of the - Apache Software Foundation that were originally developed at iClick, Inc., - software copyright (c) 1999. 
- -Apache Commons Collections -Copyright 2001-2015 The Apache Software Foundation - -Apache Commons Configuration -Copyright 2001-2008 The Apache Software Foundation - -Apache Jakarta Commons Digester -Copyright 2001-2006 The Apache Software Foundation - -Apache Commons BeanUtils -Copyright 2000-2008 The Apache Software Foundation - -ApacheDS Protocol Kerberos Codec -Copyright 2003-2013 The Apache Software Foundation - -ApacheDS I18n -Copyright 2003-2013 The Apache Software Foundation - -Apache Directory API ASN.1 API -Copyright 2003-2013 The Apache Software Foundation - -Apache Directory LDAP API Utilities -Copyright 2003-2013 The Apache Software Foundation - -Curator Client -Copyright 2011-2015 The Apache Software Foundation - -htrace-core -Copyright 2015 The Apache Software Foundation - - ========================================================================= - == NOTICE file corresponding to section 4(d) of the Apache License, == - == Version 2.0, in this case for the Apache Xerces Java distribution. == - ========================================================================= - - Portions of this software were originally based on the following: - - software copyright (c) 1999, IBM Corporation., http://www.ibm.com. - - software copyright (c) 1999, Sun Microsystems., http://www.sun.com. - - voluntary contributions made by Paul Eng on behalf of the - Apache Software Foundation that were originally developed at iClick, Inc., - software copyright (c) 1999. - -# Jackson JSON processor - -Jackson is a high-performance, Free/Open Source JSON processing library. -It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has -been in development since 2007. -It is currently developed by a community of developers, as well as supported -commercially by FasterXML.com. - -## Licensing - -Jackson core and extension components may licensed under different licenses. -To find the details that apply to this artifact see the accompanying LICENSE file. -For more information, including possible other licensing options, contact -FasterXML.com (http://fasterxml.com). - -## Credits - -A list of contributors may be found from CREDITS file, which is included -in some artifacts (usually source distributions); but is always available -from the source code management (SCM) system project uses. - -Apache HttpCore -Copyright 2005-2017 The Apache Software Foundation - -Curator Recipes -Copyright 2011-2015 The Apache Software Foundation - -Curator Framework -Copyright 2011-2015 The Apache Software Foundation - -Apache Commons Lang -Copyright 2001-2016 The Apache Software Foundation - -This product includes software from the Spring Framework, -under the Apache License 2.0 (see: StringUtils.containsWhitespace()) - -Apache Commons Math -Copyright 2001-2015 The Apache Software Foundation - -This product includes software developed for Orekit by -CS Systèmes d'Information (http://www.c-s.fr/) -Copyright 2010-2012 CS Systèmes d'Information - -Apache log4j -Copyright 2007 The Apache Software Foundation - -# Compress LZF - -This library contains efficient implementation of LZF compression format, -as well as additional helper classes that build on JDK-provided gzip (deflat) -codec. - -Library is licensed under Apache License 2.0, as per accompanying LICENSE file. - -## Credit - -Library has been written by Tatu Saloranta (tatu.saloranta@iki.fi). 
-It was started at Ning, inc., as an official Open Source process used by -platform backend, but after initial versions has been developed outside of -Ning by supporting community. - -Other contributors include: - -* Jon Hartlaub (first versions of streaming reader/writer; unit tests) -* Cedrik Lime: parallel LZF implementation - -Various community members have contributed bug reports, and suggested minor -fixes; these can be found from file "VERSION.txt" in SCM. - -Apache Commons Net -Copyright 2001-2012 The Apache Software Foundation - -Copyright 2011 The Netty Project - -http://www.apache.org/licenses/LICENSE-2.0 - -This product contains a modified version of 'JZlib', a re-implementation of -zlib in pure Java, which can be obtained at: - - * LICENSE: - * license/LICENSE.jzlib.txt (BSD Style License) - * HOMEPAGE: - * http://www.jcraft.com/jzlib/ - -This product contains a modified version of 'Webbit', a Java event based -WebSocket and HTTP server: - -This product optionally depends on 'Protocol Buffers', Google's data -interchange format, which can be obtained at: - -This product optionally depends on 'SLF4J', a simple logging facade for Java, -which can be obtained at: - -This product optionally depends on 'Apache Log4J', a logging framework, -which can be obtained at: - -This product optionally depends on 'JBoss Logging', a logging framework, -which can be obtained at: - - * LICENSE: - * license/LICENSE.jboss-logging.txt (GNU LGPL 2.1) - * HOMEPAGE: - * http://anonsvn.jboss.org/repos/common/common-logging-spi/ - -This product optionally depends on 'Apache Felix', an open source OSGi -framework implementation, which can be obtained at: - - * LICENSE: - * license/LICENSE.felix.txt (Apache License 2.0) - * HOMEPAGE: - * http://felix.apache.org/ - -Jackson core and extension components may be licensed under different licenses. -To find the details that apply to this artifact see the accompanying LICENSE file. -For more information, including possible other licensing options, contact -FasterXML.com (http://fasterxml.com). - -Apache Ivy (TM) -Copyright 2007-2014 The Apache Software Foundation - -Portions of Ivy were originally developed at -Jayasoft SARL (http://www.jayasoft.fr/) -and are licensed to the Apache Software Foundation under the -"Software Grant License Agreement" - -SSH and SFTP support is provided by the JCraft JSch package, -which is open source software, available under -the terms of a BSD style license. -The original software and related information is available -at http://www.jcraft.com/jsch/. - - -ORC Core -Copyright 2013-2018 The Apache Software Foundation - -Apache Commons Lang -Copyright 2001-2011 The Apache Software Foundation - -ORC MapReduce -Copyright 2013-2018 The Apache Software Foundation - -Apache Parquet Format -Copyright 2017 The Apache Software Foundation - -Arrow Vectors -Copyright 2017 The Apache Software Foundation - -Arrow Format -Copyright 2017 The Apache Software Foundation - -Arrow Memory -Copyright 2017 The Apache Software Foundation - -Apache Commons CLI -Copyright 2001-2009 The Apache Software Foundation - -Google Guice - Extensions - Servlet -Copyright 2006-2011 Google, Inc. - -Apache Commons IO -Copyright 2002-2012 The Apache Software Foundation - -Google Guice - Core Library -Copyright 2006-2011 Google, Inc. 
- -mesos -Copyright 2017 The Apache Software Foundation - -Apache Parquet Hadoop Bundle (Incubating) -Copyright 2015 The Apache Software Foundation - -Hive Query Language -Copyright 2016 The Apache Software Foundation - -Apache Extras Companion for log4j 1.2. -Copyright 2007 The Apache Software Foundation - -Hive Metastore -Copyright 2016 The Apache Software Foundation - -Apache Commons Logging -Copyright 2003-2013 The Apache Software Foundation - -========================================================================= -== NOTICE file corresponding to section 4(d) of the Apache License, == -== Version 2.0, in this case for the DataNucleus distribution. == -========================================================================= - -=================================================================== -This product includes software developed by many individuals, -including the following: -=================================================================== -Erik Bengtson -Andy Jefferson - -=================================================================== -This product has included contributions from some individuals, -including the following: -=================================================================== - -=================================================================== -This product includes software developed by many individuals, -including the following: -=================================================================== -Andy Jefferson -Erik Bengtson -Joerg von Frantzius -Marco Schulze - -=================================================================== -This product has included contributions from some individuals, -including the following: -=================================================================== -Barry Haddow -Ralph Ullrich -David Ezzio -Brendan de Beer -David Eaves -Martin Taal -Tony Lai -Roland Szabo -Anton Troshin (Timesten) - -=================================================================== -This product also includes software developed by the TJDO project -(http://tjdo.sourceforge.net/). -=================================================================== - -=================================================================== -This product also includes software developed by the Apache Commons project -(http://commons.apache.org/). -=================================================================== - -Apache Commons Pool -Copyright 1999-2009 The Apache Software Foundation - -Apache Commons DBCP -Copyright 2001-2010 The Apache Software Foundation - -Apache Java Data Objects (JDO) -Copyright 2005-2006 The Apache Software Foundation - -Apache Jakarta HttpClient -Copyright 1999-2007 The Apache Software Foundation - -Calcite Avatica -Copyright 2012-2015 The Apache Software Foundation - -Calcite Core -Copyright 2012-2015 The Apache Software Foundation - -Calcite Linq4j -Copyright 2012-2015 The Apache Software Foundation - -Apache HttpClient -Copyright 1999-2017 The Apache Software Foundation - -Apache Commons Codec -Copyright 2002-2014 The Apache Software Foundation - -src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java -contains test data from http://aspell.net/test/orig/batch0.tab. -Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org) - -=============================================================================== - -The content of package org.apache.commons.codec.language.bm has been translated -from the original php source code available at http://stevemorse.org/phoneticinfo.htm -with permission from the original authors. 
-Original source copyright: -Copyright (c) 2008 Alexander Beider & Stephen P. Morse. - -============================================================================= -= NOTICE file corresponding to section 4d of the Apache License Version 2.0 = -============================================================================= -This product includes software developed by -Joda.org (http://www.joda.org/). - -=================================================================== -This product has included contributions from some individuals, -including the following: -=================================================================== -Joerg von Frantzius -Thomas Marti -Barry Haddow -Marco Schulze -Ralph Ullrich -David Ezzio -Brendan de Beer -David Eaves -Martin Taal -Tony Lai -Roland Szabo -Marcus Mennemeier -Xuan Baldauf -Eric Sultan - -Apache Thrift -Copyright 2006-2010 The Apache Software Foundation. - -========================================================================= -== NOTICE file corresponding to section 4(d) of the Apache License, -== Version 2.0, in this case for the Apache Derby distribution. -== -== DO NOT EDIT THIS FILE DIRECTLY. IT IS GENERATED -== BY THE buildnotice TARGET IN THE TOP LEVEL build.xml FILE. -== -========================================================================= - -Apache Derby -Copyright 2004-2015 The Apache Software Foundation - -========================================================================= - -Portions of Derby were originally developed by -International Business Machines Corporation and are -licensed to the Apache Software Foundation under the -"Software Grant and Corporate Contribution License Agreement", -informally known as the "Derby CLA". -The following copyright notice(s) were affixed to portions of the code -with which this file is now or was at one time distributed -and are placed here unaltered. - -(C) Copyright 1997,2004 International Business Machines Corporation. All rights reserved. - -(C) Copyright IBM Corp. 2003. - -The portion of the functionTests under 'nist' was originally -developed by the National Institute of Standards and Technology (NIST), -an agency of the United States Department of Commerce, and adapted by -International Business Machines Corporation in accordance with the NIST -Software Acknowledgment and Redistribution document at -http://www.itl.nist.gov/div897/ctg/sql_form.htm - -The JDBC apis for small devices and JDBC3 (under java/stubs/jsr169 and -java/stubs/jdbc3) were produced by trimming sources supplied by the -Apache Harmony project. In addition, the Harmony SerialBlob and -SerialClob implementations are used. The following notice covers the Harmony sources: - -Portions of Harmony were originally developed by -Intel Corporation and are licensed to the Apache Software -Foundation under the "Software Grant and Corporate Contribution -License Agreement", informally known as the "Intel Harmony CLA". - -The Derby build relies on source files supplied by the Apache Felix -project. The following notice covers the Felix files: - - Apache Felix Main - Copyright 2008 The Apache Software Foundation - - I. Included Software - - This product includes software developed at - The Apache Software Foundation (http://www.apache.org/). - Licensed under the Apache License 2.0. - - This product includes software developed at - The OSGi Alliance (http://www.osgi.org/). - Copyright (c) OSGi Alliance (2000, 2007). - Licensed under the Apache License 2.0. - - This product includes software from http://kxml.sourceforge.net. 
- Copyright (c) 2002,2003, Stefan Haustein, Oberhausen, Rhld., Germany. - Licensed under BSD License. - - II. Used Software - - This product uses software developed at - The OSGi Alliance (http://www.osgi.org/). - Copyright (c) OSGi Alliance (2000, 2007). - Licensed under the Apache License 2.0. - - III. License Summary - - Apache License 2.0 - - BSD License - -The Derby build relies on jar files supplied by the Apache Lucene -project. The following notice covers the Lucene files: - -Apache Lucene -Copyright 2013 The Apache Software Foundation - -Includes software from other Apache Software Foundation projects, -including, but not limited to: - - Apache Ant - - Apache Jakarta Regexp - - Apache Commons - - Apache Xerces - -ICU4J, (under analysis/icu) is licensed under an MIT styles license -and Copyright (c) 1995-2008 International Business Machines Corporation and others - -Some data files (under analysis/icu/src/data) are derived from Unicode data such -as the Unicode Character Database. See http://unicode.org/copyright.html for more -details. - -Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is -BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ - -The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were -automatically generated with the moman/finenight FSA library, created by -Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, -see http://sites.google.com/site/rrettesite/moman and -http://bitbucket.org/jpbarrette/moman/overview/ - -The class org.apache.lucene.util.WeakIdentityMap was derived from -the Apache CXF project and is Apache License 2.0. - -The Google Code Prettify is Apache License 2.0. -See http://code.google.com/p/google-code-prettify/ - -JUnit (junit-4.10) is licensed under the Common Public License v. 1.0 -See http://junit.sourceforge.net/cpl-v10.html - -This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin -g Package (jaspell): http://jaspell.sourceforge.net/ -License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) - -The snowball stemmers in - analysis/common/src/java/net/sf/snowball -were developed by Martin Porter and Richard Boulton. -The snowball stopword lists in - analysis/common/src/resources/org/apache/lucene/analysis/snowball -were developed by Martin Porter and Richard Boulton. -The full snowball package is available from - http://snowball.tartarus.org/ - -The KStem stemmer in - analysis/common/src/org/apache/lucene/analysis/en -was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) -under the BSD-license. - -The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default -stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: -analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, -analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, -analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, -analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, -analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt -See http://members.unine.ch/jacques.savoy/clef/index.html. - -The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers -(common) are based on BSD-licensed reference implementations created by Jacques Savoy and -Ljiljana Dolamic. 
These files reside in: -analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java -analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java - -The Stempel analyzer (stempel) includes BSD-licensed software developed -by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, -and Edmond Nolan. - -The Polish analyzer (stempel) comes with a default -stopword list that is BSD-licensed created by the Carrot2 project. The file resides -in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. -See http://project.carrot2.org/license.html. - -The SmartChineseAnalyzer source code (smartcn) was -provided by Xiaoping Gao and copyright 2009 by www.imdict.net. - -WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) -is derived from Unicode data such as the Unicode Character Database. -See http://unicode.org/copyright.html for more details. - -The Morfologik analyzer (morfologik) includes BSD-licensed software -developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/). - -Morfologik uses data from Polish ispell/myspell dictionary -(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia) -LGPL and Creative Commons ShareAlike. - -Morfologic includes data from BSD-licensed dictionary of Polish (SGJP) -(http://sgjp.pl/morfeusz/) - -Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original -source code for this can be found at http://www.eclipse.org/jetty/downloads.php - -=========================================================================== -Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration -=========================================================================== - -This software includes a binary and/or source version of data from - - mecab-ipadic-2.7.0-20070801 - -which can be obtained from - - http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz - -or - - http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz - -=========================================================================== -mecab-ipadic-2.7.0-20070801 Notice -=========================================================================== - -Nara Institute of Science and Technology (NAIST), -the copyright holders, disclaims all warranties with regard to this -software, including all implied warranties of merchantability and -fitness, in no event shall NAIST be liable for -any special, indirect or consequential damages or any damages -whatsoever resulting from loss of use, data or profits, whether in an -action of contract, negligence or other tortuous action, arising out -of or in connection with the use or performance of this software. 
- -A large portion of the dictionary entries -originate from ICOT Free Software. The following conditions for ICOT -Free Software applies to the current dictionary as well. - -Each User may also freely distribute the Program, whether in its -original form or modified, to any third party or parties, PROVIDED -that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear -on, or be attached to, the Program, which is distributed substantially -in the same form as set out herein and that such intended -distribution, if actually made, will neither violate or otherwise -contravene any of the laws and regulations of the countries having -jurisdiction over the User or the intended distribution itself. - -NO WARRANTY - -The program was produced on an experimental basis in the course of the -research and development conducted during the project and is provided -to users as so produced on an experimental basis. Accordingly, the -program is provided without any warranty whatsoever, whether express, -implied, statutory or otherwise. The term "warranty" used herein -includes, but is not limited to, any warranty of the quality, -performance, merchantability and fitness for a particular purpose of -the program and the nonexistence of any infringement or violation of -any right of any third party. - -Each user of the program will agree and understand, and be deemed to -have agreed and understood, that there is no warranty whatsoever for -the program and, accordingly, the entire risk arising from or -otherwise connected with the program is assumed by the user. - -Therefore, neither ICOT, the copyright holder, or any other -organization that participated in or was otherwise related to the -development of the program and their respective officials, directors, -officers and other employees shall be held liable for any and all -damages, including, without limitation, general, special, incidental -and consequential damages, arising out of or otherwise in connection -with the use or inability to use the program or any product, material -or result produced or otherwise obtained by using the program, -regardless of whether they have been advised of, or otherwise had -knowledge of, the possibility of such damages at any time during the -project or thereafter. Each user will be deemed to have agreed to the -foregoing by his or her commencement of use of the program. The term -"use" as used herein includes, but is not limited to, the use, -modification, copying and distribution of the program and the -production of secondary products from the program. - -In the case where the program, whether in its original form or -modified, was distributed or delivered to or received by a user from -any person, organization or entity other than ICOT, unless it makes or -grants independently of ICOT any specific warranty to the user in -writing, such person, organization or entity, will also be exempted -from and not be held liable to the user for any such damages as noted -above as far as the program is concerned. - -The Derby build relies on a jar file supplied by the JSON Simple -project, hosted at https://code.google.com/p/json-simple/. -The JSON simple jar file is licensed under the Apache 2.0 License. - -Hive CLI -Copyright 2016 The Apache Software Foundation - -Hive JDBC -Copyright 2016 The Apache Software Foundation - - -Chill is a set of Scala extensions for Kryo. -Copyright 2012 Twitter, Inc. 
- -Third Party Dependencies: - -Kryo 2.17 -BSD 3-Clause License -http://code.google.com/p/kryo - -Commons-Codec 1.7 -Apache Public License 2.0 -http://hadoop.apache.org - - - -Breeze is distributed under an Apache License V2.0 (See LICENSE) - -=============================================================================== - -Proximal algorithms outlined in Proximal.scala (package breeze.optimize.proximal) -are based on https://github.com/cvxgrp/proximal (see LICENSE for details) and distributed with -Copyright (c) 2014 by Debasish Das (Verizon), all rights reserved. - -=============================================================================== - -QuadraticMinimizer class in package breeze.optimize.proximal is distributed with Copyright (c) -2014, Debasish Das (Verizon), all rights reserved. - -=============================================================================== - -NonlinearMinimizer class in package breeze.optimize.proximal is distributed with Copyright (c) -2015, Debasish Das (Verizon), all rights reserved. - - -stream-lib -Copyright 2016 AddThis - -This product includes software developed by AddThis. - -This product also includes code adapted from: - -Apache Solr (http://lucene.apache.org/solr/) -Copyright 2014 The Apache Software Foundation - -Apache Mahout (http://mahout.apache.org/) -Copyright 2014 The Apache Software Foundation diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/DESCRIPTION b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/DESCRIPTION deleted file mode 100644 index a4937d9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/DESCRIPTION +++ /dev/null @@ -1,35 +0,0 @@ -Package: SparkR -Type: Package -Version: 2.4.3 -Title: R Front End for 'Apache Spark' -Description: Provides an R Front end for 'Apache Spark' . -Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), - email = "shivaram@cs.berkeley.edu"), - person("Xiangrui", "Meng", role = "aut", - email = "meng@databricks.com"), - person("Felix", "Cheung", role = "aut", - email = "felixcheung@apache.org"), - person(family = "The Apache Software Foundation", role = c("aut", "cph"))) -License: Apache License (== 2.0) -URL: https://www.apache.org/ https://spark.apache.org/ -BugReports: https://spark.apache.org/contributing.html -SystemRequirements: Java (== 8) -Depends: R (>= 3.0), methods -Suggests: knitr, rmarkdown, testthat, e1071, survival -Collate: 'schema.R' 'generics.R' 'jobj.R' 'column.R' 'group.R' 'RDD.R' - 'pairRDD.R' 'DataFrame.R' 'SQLContext.R' 'WindowSpec.R' - 'backend.R' 'broadcast.R' 'catalog.R' 'client.R' 'context.R' - 'deserialize.R' 'functions.R' 'install.R' 'jvm.R' - 'mllib_classification.R' 'mllib_clustering.R' 'mllib_fpm.R' - 'mllib_recommendation.R' 'mllib_regression.R' 'mllib_stat.R' - 'mllib_tree.R' 'mllib_utils.R' 'serialize.R' 'sparkR.R' - 'stats.R' 'streaming.R' 'types.R' 'utils.R' 'window.R' -RoxygenNote: 6.1.0 -VignetteBuilder: knitr -NeedsCompilation: no -Author: Shivaram Venkataraman [aut, cre], - Xiangrui Meng [aut], - Felix Cheung [aut], - The Apache Software Foundation [aut, cph] -Maintainer: Shivaram Venkataraman <shivaram@cs.berkeley.edu> -Built: R 3.4.4; ; 2019-05-01 05:09:34 UTC; unix diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/INDEX b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/INDEX deleted file mode 100644 index 63e37ad..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/INDEX +++ /dev/null @@ -1,318 +0,0 @@ -%<=>% %<=>% -%in% Match a column with given values.
-AFTSurvivalRegressionModel-class - S4 class that represents a - AFTSurvivalRegressionModel -ALSModel-class S4 class that represents an ALSModel -BisectingKMeansModel-class - S4 class that represents a BisectingKMeansModel -DecisionTreeClassificationModel-class - S4 class that represents a - DecisionTreeClassificationModel -DecisionTreeRegressionModel-class - S4 class that represents a - DecisionTreeRegressionModel -FPGrowthModel-class S4 class that represents a FPGrowthModel -GBTClassificationModel-class - S4 class that represents a - GBTClassificationModel -GBTRegressionModel-class - S4 class that represents a GBTRegressionModel -GaussianMixtureModel-class - S4 class that represents a GaussianMixtureModel -GeneralizedLinearRegressionModel-class - S4 class that represents a generalized linear - model -GroupedData-class S4 class that represents a GroupedData -IsotonicRegressionModel-class - S4 class that represents an - IsotonicRegressionModel -KMeansModel-class S4 class that represents a KMeansModel -KSTest-class S4 class that represents an KSTest -LDAModel-class S4 class that represents an LDAModel -LinearSVCModel-class S4 class that represents an LinearSVCModel -LogisticRegressionModel-class - S4 class that represents an - LogisticRegressionModel -MultilayerPerceptronClassificationModel-class - S4 class that represents a - MultilayerPerceptronClassificationModel -NaiveBayesModel-class S4 class that represents a NaiveBayesModel -RandomForestClassificationModel-class - S4 class that represents a - RandomForestClassificationModel -RandomForestRegressionModel-class - S4 class that represents a - RandomForestRegressionModel -SparkDataFrame-class S4 class that represents a SparkDataFrame -StreamingQuery-class S4 class that represents a StreamingQuery -WindowSpec-class S4 class that represents a WindowSpec -agg summarize -alias alias -approxQuantile Calculates the approximate quantiles of - numerical columns of a SparkDataFrame -arrange Arrange Rows by Variables -as.data.frame Download data from a SparkDataFrame into a R - data.frame -asc A set of operations working with SparkDataFrame - columns -attach,SparkDataFrame-method - Attach SparkDataFrame to R search path -avg avg -awaitTermination awaitTermination -between between -broadcast broadcast -cache Cache -cacheTable Cache Table -cancelJobGroup Cancel active jobs for the specified group -cast Casts the column to a different data type. -checkpoint checkpoint -clearCache Clear Cache -clearJobGroup Clear current job group ID and its description -coalesce Coalesce -collect Collects all the elements of a SparkDataFrame - and coerces them into an R data.frame. 
-colnames Column Names of SparkDataFrame -coltypes coltypes -column S4 class that represents a SparkDataFrame - column -column_aggregate_functions - Aggregate functions for Column operations -column_collection_functions - Collection functions for Column operations -column_datetime_diff_functions - Date time arithmetic functions for Column - operations -column_datetime_functions - Date time functions for Column operations -column_math_functions Math functions for Column operations -column_misc_functions Miscellaneous functions for Column operations -column_nonaggregate_functions - Non-aggregate functions for Column operations -column_string_functions - String functions for Column operations -column_window_functions - Window functions for Column operations -corr corr -count Count -cov cov -createDataFrame Create a SparkDataFrame -createExternalTable (Deprecated) Create an external table -createOrReplaceTempView - Creates a temporary view using the given name. -createTable Creates a table based on the dataset in a data - source -crossJoin CrossJoin -crosstab Computes a pair-wise frequency table of the - given columns -cube cube -currentDatabase Returns the current default database -dapply dapply -dapplyCollect dapplyCollect -describe describe -dim Returns the dimensions of SparkDataFrame -distinct Distinct -drop drop -dropDuplicates dropDuplicates -dropTempTable (Deprecated) Drop Temporary Table -dropTempView Drops the temporary view with the given view - name in the catalog. -dropna A set of SparkDataFrame functions working with - NA values -dtypes DataTypes -endsWith endsWith -except except -exceptAll exceptAll -explain Explain -filter Filter -first Return the first row of a SparkDataFrame -fitted Get fitted result from a k-means model -freqItems Finding frequent items for columns, possibly - with false positives -gapply gapply -gapplyCollect gapplyCollect -getLocalProperty Get a local property set in this thread, or - 'NULL' if it is missing. See - 'setLocalProperty'. -getNumPartitions getNumPartitions -glm,formula,ANY,SparkDataFrame-method - Generalized Linear Models (R-compliant) -group_by GroupBy -hashCode Compute the hashCode of an object -head Head -hint hint -histogram Compute histogram statistics for given column -insertInto insertInto -install.spark Download and Install Apache Spark to a Local - Directory -intersect Intersect -intersectAll intersectAll -isActive isActive -isLocal isLocal -isStreaming isStreaming -join Join -last last -lastProgress lastProgress -limit Limit -listColumns Returns a list of columns for the given - table/view in the specified database -listDatabases Returns a list of databases available -listFunctions Returns a list of functions registered in the - specified database -listTables Returns a list of tables or views in the - specified database -localCheckpoint localCheckpoint -merge Merges two data frames -mutate Mutate -ncol Returns the number of columns in a - SparkDataFrame -not ! -nrow Returns the number of rows in a SparkDataFrame -orderBy Ordering Columns in a WindowSpec -otherwise otherwise -over over -partitionBy partitionBy -persist Persist -pivot Pivot a column of the GroupedData and perform - the specified aggregation. -predict Makes predictions from a MLlib model -print.jobj Print a JVM object reference. -print.structField Print a Spark StructField. -print.structType Print a Spark StructType. 
-printSchema Print Schema of a SparkDataFrame -queryName queryName -randomSplit randomSplit -rangeBetween rangeBetween -rbind Union two or more SparkDataFrames -read.df Load a SparkDataFrame -read.jdbc Create a SparkDataFrame representing the - database table accessible via JDBC URL -read.json Create a SparkDataFrame from a JSON file. -read.ml Load a fitted MLlib model from the input path. -read.orc Create a SparkDataFrame from an ORC file. -read.parquet Create a SparkDataFrame from a Parquet file. -read.stream Load a streaming SparkDataFrame -read.text Create a SparkDataFrame from a text file. -recoverPartitions Recovers all the partitions in the directory of - a table and update the catalog -refreshByPath Invalidates and refreshes all the cached data - and metadata for SparkDataFrame containing path -refreshTable Invalidates and refreshes all the cached data - and metadata of the given table -registerTempTable (Deprecated) Register Temporary Table -rename rename -repartition Repartition -repartitionByRange Repartition by range -rollup rollup -rowsBetween rowsBetween -sample Sample -sampleBy Returns a stratified sample without replacement -saveAsTable Save the contents of the SparkDataFrame to a - data source as a table -schema Get schema object -select Select -selectExpr SelectExpr -setCheckpointDir Set checkpoint directory -setCurrentDatabase Sets the current default database -setJobDescription Set a human readable description of the current - job. -setJobGroup Assigns a group ID to all the jobs started by - this thread until the group ID is set to a - different value or cleared. -setLocalProperty Set a local property that affects jobs - submitted from this thread, such as the Spark - fair scheduler pool. -setLogLevel Set new log level -show show -showDF showDF -spark.addFile Add a file or directory to be downloaded with - this Spark job on every node. -spark.als Alternating Least Squares (ALS) for - Collaborative Filtering -spark.bisectingKmeans Bisecting K-Means Clustering Model -spark.decisionTree Decision Tree Model for Regression and - Classification -spark.fpGrowth FP-growth -spark.gaussianMixture Multivariate Gaussian Mixture Model (GMM) -spark.gbt Gradient Boosted Tree Model for Regression and - Classification -spark.getSparkFiles Get the absolute path of a file added through - spark.addFile. -spark.getSparkFilesRootDirectory - Get the root directory that contains files - added through spark.addFile. -spark.glm Generalized Linear Models -spark.isoreg Isotonic Regression Model -spark.kmeans K-Means Clustering Model -spark.kstest (One-Sample) Kolmogorov-Smirnov Test -spark.lapply Run a function over a list of elements, - distributing the computations with Spark -spark.lda Latent Dirichlet Allocation -spark.logit Logistic Regression Model -spark.mlp Multilayer Perceptron Classification Model -spark.naiveBayes Naive Bayes Models -spark.randomForest Random Forest Model for Regression and - Classification -spark.survreg Accelerated Failure Time (AFT) Survival - Regression Model -spark.svmLinear Linear SVM Model -sparkR.callJMethod Call Java Methods -sparkR.callJStatic Call Static Java Methods -sparkR.conf Get Runtime Config from the current active - SparkSession -sparkR.init (Deprecated) Initialize a new Spark Context -sparkR.newJObject Create Java Objects -sparkR.session Get the existing SparkSession or initialize a - new SparkSession. 
-sparkR.session.stop Stop the Spark Session and Spark Context -sparkR.uiWebUrl Get the URL of the SparkUI instance for the - current active SparkSession -sparkR.version Get version of Spark on which this application - is running -sparkRHive.init (Deprecated) Initialize a new HiveContext -sparkRSQL.init (Deprecated) Initialize a new SQLContext -sql SQL Query -startsWith startsWith -status status -stopQuery stopQuery -storageLevel StorageLevel -str Compactly display the structure of a dataset -structField structField -structType structType -subset Subset -substr substr -summary summary -tableNames Table Names -tableToDF Create a SparkDataFrame from a SparkSQL table - or view -tables Tables -take Take the first NUM rows of a SparkDataFrame and - return the results as a R data.frame -toJSON toJSON -uncacheTable Uncache Table -union Return a new SparkDataFrame containing the - union of rows -unionByName Return a new SparkDataFrame containing the - union of rows, matched by column names -unpersist Unpersist -windowOrderBy windowOrderBy -windowPartitionBy windowPartitionBy -with Evaluate a R expression in an environment - constructed from a SparkDataFrame -withColumn WithColumn -withWatermark withWatermark -write.df Save the contents of SparkDataFrame to a data - source. -write.jdbc Save the content of SparkDataFrame to an - external database table via JDBC. -write.json Save the contents of SparkDataFrame as a JSON - file -write.ml Saves the MLlib model to the input path -write.orc Save the contents of SparkDataFrame as an ORC - file, preserving the schema. -write.parquet Save the contents of SparkDataFrame as a - Parquet file, preserving the schema. -write.stream Write the streaming SparkDataFrame to a data - source. -write.text Save the content of SparkDataFrame in a text - file at the specified path. 
diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/Rd.rds b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/Rd.rds deleted file mode 100644 index a4e0988..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/Rd.rds and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/features.rds b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/features.rds deleted file mode 100644 index 4400f98..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/features.rds and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/hsearch.rds b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/hsearch.rds deleted file mode 100644 index 40405fd..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/hsearch.rds and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/links.rds b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/links.rds deleted file mode 100644 index cbbb857..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/links.rds and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/nsInfo.rds b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/nsInfo.rds deleted file mode 100644 index 04cb65a..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/nsInfo.rds and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/package.rds b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/package.rds deleted file mode 100644 index b237713..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/Meta/package.rds and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/NAMESPACE b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/NAMESPACE deleted file mode 100644 index d77c62a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/NAMESPACE +++ /dev/null @@ -1,503 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Imports from base R -# Do not include stats:: "rpois", "runif" - causes error at runtime -importFrom("methods", "setGeneric", "setMethod", "setOldClass") -importFrom("methods", "is", "new", "signature", "show") -importFrom("stats", "gaussian", "setNames") -importFrom("utils", "download.file", "object.size", "packageVersion", "tail", "untar") - -# Disable native libraries till we figure out how to package it -# See SPARKR-7839 -#useDynLib(SparkR, stringHashCode) - -# S3 methods exported -export("sparkR.session") -export("sparkR.init") -export("sparkR.stop") -export("sparkR.session.stop") -export("sparkR.conf") -export("sparkR.version") -export("sparkR.uiWebUrl") -export("print.jobj") - -export("sparkR.newJObject") -export("sparkR.callJMethod") -export("sparkR.callJStatic") - -export("install.spark") - -export("sparkRSQL.init", - "sparkRHive.init") - -# MLlib integration -exportMethods("glm", - "spark.glm", - "predict", - "summary", - "spark.kmeans", - "fitted", - "spark.mlp", - "spark.naiveBayes", - "spark.survreg", - "spark.lda", - "spark.posterior", - "spark.perplexity", - "spark.isoreg", - "spark.gaussianMixture", - "spark.als", - "spark.kstest", - "spark.logit", - "spark.decisionTree", - "spark.randomForest", - "spark.gbt", - "spark.bisectingKmeans", - "spark.svmLinear", - "spark.fpGrowth", - "spark.freqItemsets", - "spark.associationRules") - -# Job group lifecycle management methods -export("setJobGroup", - "clearJobGroup", - "cancelJobGroup", - "setJobDescription", - "setLocalProperty", - "getLocalProperty") - -# Export Utility methods -export("setLogLevel") - -exportClasses("SparkDataFrame") - -exportMethods("arrange", - "as.data.frame", - "attach", - "broadcast", - "cache", - "checkpoint", - "coalesce", - "collect", - "colnames", - "colnames<-", - "coltypes", - "coltypes<-", - "columns", - "count", - "cov", - "corr", - "covar_samp", - "covar_pop", - "createOrReplaceTempView", - "crossJoin", - "crosstab", - "cube", - "dapply", - "dapplyCollect", - "describe", - "dim", - "distinct", - "drop", - "dropDuplicates", - "dropna", - "dtypes", - "except", - "exceptAll", - "explain", - "fillna", - "filter", - "first", - "freqItems", - "gapply", - "gapplyCollect", - "getNumPartitions", - "group_by", - "groupBy", - "head", - "hint", - "insertInto", - "intersect", - "intersectAll", - "isLocal", - "isStreaming", - "join", - "limit", - "localCheckpoint", - "merge", - "mutate", - "na.omit", - "names", - "names<-", - "ncol", - "nrow", - "orderBy", - "persist", - "printSchema", - "randomSplit", - "rbind", - "registerTempTable", - "rename", - "repartition", - "repartitionByRange", - "rollup", - "sample", - "sample_frac", - "sampleBy", - "saveAsParquetFile", - "saveAsTable", - "saveDF", - "schema", - "select", - "selectExpr", - "show", - "showDF", - "storageLevel", - "subset", - "summarize", - "summary", - "take", - "toJSON", - "transform", - "union", - "unionAll", - "unionByName", - "unique", - "unpersist", - "where", - "with", - "withColumn", - "withColumnRenamed", - "withWatermark", - "write.df", - "write.jdbc", - "write.json", - "write.orc", - "write.parquet", - "write.stream", - "write.text", - "write.ml") - -exportClasses("Column") - -exportMethods("%<=>%", - "%in%", - "abs", - "acos", - "add_months", - "alias", - "approxCountDistinct", - "approxQuantile", - "array_contains", - "array_distinct", - "array_except", - "array_intersect", - "array_join", - "array_max", - "array_min", - "array_position", - "array_remove", - "array_repeat", - "array_sort", - "arrays_overlap", - "array_union", - 
"arrays_zip", - "asc", - "ascii", - "asin", - "atan", - "atan2", - "avg", - "base64", - "between", - "bin", - "bitwiseNOT", - "bround", - "cast", - "cbrt", - "ceil", - "ceiling", - "collect_list", - "collect_set", - "column", - "concat", - "concat_ws", - "contains", - "conv", - "cos", - "cosh", - "count", - "countDistinct", - "crc32", - "create_array", - "create_map", - "current_date", - "current_timestamp", - "hash", - "cume_dist", - "date_add", - "date_format", - "date_sub", - "date_trunc", - "datediff", - "dayofmonth", - "dayofweek", - "dayofyear", - "decode", - "dense_rank", - "desc", - "element_at", - "encode", - "endsWith", - "exp", - "explode", - "explode_outer", - "expm1", - "expr", - "factorial", - "first", - "flatten", - "floor", - "format_number", - "format_string", - "from_json", - "from_unixtime", - "from_utc_timestamp", - "getField", - "getItem", - "greatest", - "grouping_bit", - "grouping_id", - "hex", - "histogram", - "hour", - "hypot", - "ifelse", - "initcap", - "input_file_name", - "instr", - "isNaN", - "isNotNull", - "isNull", - "is.nan", - "isnan", - "kurtosis", - "lag", - "last", - "last_day", - "lead", - "least", - "length", - "levenshtein", - "like", - "lit", - "locate", - "log", - "log10", - "log1p", - "log2", - "lower", - "lpad", - "ltrim", - "map_from_arrays", - "map_keys", - "map_values", - "max", - "md5", - "mean", - "min", - "minute", - "monotonically_increasing_id", - "month", - "months_between", - "n", - "n_distinct", - "nanvl", - "negate", - "next_day", - "not", - "ntile", - "otherwise", - "over", - "percent_rank", - "pmod", - "posexplode", - "posexplode_outer", - "quarter", - "rand", - "randn", - "rank", - "regexp_extract", - "regexp_replace", - "repeat_string", - "reverse", - "rint", - "rlike", - "round", - "row_number", - "rpad", - "rtrim", - "second", - "sha1", - "sha2", - "shiftLeft", - "shiftRight", - "shiftRightUnsigned", - "shuffle", - "sd", - "sign", - "signum", - "sin", - "sinh", - "size", - "skewness", - "slice", - "sort_array", - "soundex", - "spark_partition_id", - "split_string", - "stddev", - "stddev_pop", - "stddev_samp", - "struct", - "sqrt", - "startsWith", - "substr", - "substring_index", - "sum", - "sumDistinct", - "tan", - "tanh", - "toDegrees", - "toRadians", - "to_date", - "to_json", - "to_timestamp", - "to_utc_timestamp", - "translate", - "trim", - "trunc", - "unbase64", - "unhex", - "unix_timestamp", - "upper", - "var", - "variance", - "var_pop", - "var_samp", - "weekofyear", - "when", - "window", - "year") - -exportClasses("GroupedData") -exportMethods("agg") -exportMethods("pivot") - -export("as.DataFrame", - "cacheTable", - "clearCache", - "createDataFrame", - "createExternalTable", - "createTable", - "currentDatabase", - "dropTempTable", - "dropTempView", - "jsonFile", - "listColumns", - "listDatabases", - "listFunctions", - "listTables", - "loadDF", - "parquetFile", - "read.df", - "read.jdbc", - "read.json", - "read.orc", - "read.parquet", - "read.stream", - "read.text", - "recoverPartitions", - "refreshByPath", - "refreshTable", - "setCheckpointDir", - "setCurrentDatabase", - "spark.lapply", - "spark.addFile", - "spark.getSparkFilesRootDirectory", - "spark.getSparkFiles", - "sql", - "str", - "tableToDF", - "tableNames", - "tables", - "uncacheTable", - "print.summary.GeneralizedLinearRegressionModel", - "read.ml", - "print.summary.KSTest", - "print.summary.DecisionTreeRegressionModel", - "print.summary.DecisionTreeClassificationModel", - "print.summary.RandomForestRegressionModel", - 
"print.summary.RandomForestClassificationModel", - "print.summary.GBTRegressionModel", - "print.summary.GBTClassificationModel") - -export("structField", - "structField.jobj", - "structField.character", - "print.structField", - "structType", - "structType.character", - "structType.jobj", - "structType.structField", - "print.structType") - -exportClasses("WindowSpec") - -export("partitionBy", - "rowsBetween", - "rangeBetween") - -export("windowPartitionBy", - "windowOrderBy") - -exportClasses("StreamingQuery") - -export("awaitTermination", - "isActive", - "lastProgress", - "queryName", - "status", - "stopQuery") - - -S3method(print, jobj) -S3method(print, structField) -S3method(print, structType) -S3method(print, summary.GeneralizedLinearRegressionModel) -S3method(print, summary.KSTest) -S3method(print, summary.DecisionTreeRegressionModel) -S3method(print, summary.DecisionTreeClassificationModel) -S3method(print, summary.RandomForestRegressionModel) -S3method(print, summary.RandomForestClassificationModel) -S3method(print, summary.GBTRegressionModel) -S3method(print, summary.GBTClassificationModel) -S3method(structField, character) -S3method(structField, jobj) -S3method(structType, character) -S3method(structType, jobj) -S3method(structType, structField) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/R/SparkR b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/R/SparkR deleted file mode 100644 index 3b65e3c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/R/SparkR +++ /dev/null @@ -1,27 +0,0 @@ -# File share/R/nspackloader.R -# Part of the R package, http://www.R-project.org -# -# Copyright (C) 1995-2012 The R Core Team -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# -# A copy of the GNU General Public License is available at -# http://www.r-project.org/Licenses/ - -local({ - info <- loadingNamespaceInfo() - pkg <- info$pkgname - ns <- .getNamespace(as.name(pkg)) - if (is.null(ns)) - stop("cannot find namespace environment for ", pkg, domain = NA); - dbbase <- file.path(info$libname, pkg, "R", pkg) - lazyLoad(dbbase, ns, filter = function(n) n != ".__NAMESPACE__.") -}) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/R/SparkR.rdb b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/R/SparkR.rdb deleted file mode 100644 index f9ec836..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/R/SparkR.rdb and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/R/SparkR.rdx b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/R/SparkR.rdx deleted file mode 100644 index 4e182c0..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/R/SparkR.rdx and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/AnIndex b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/AnIndex deleted file mode 100644 index 8e7daf8..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/AnIndex +++ /dev/null @@ -1,943 +0,0 @@ -!,Column-method not -$ select -$,SparkDataFrame-method select -$<- select -$<-,SparkDataFrame-method select -%<=>% eq_null_safe -%<=>%,Column-method eq_null_safe -%in% match -%in%,Column-method match -abs column_math_functions -abs,Column-method column_math_functions -acos column_math_functions -acos,Column-method column_math_functions -add_months column_datetime_diff_functions -add_months,Column,numeric-method column_datetime_diff_functions -AFTSurvivalRegressionModel-class AFTSurvivalRegressionModel-class -agg summarize -agg,GroupedData-method summarize -agg,SparkDataFrame-method summarize -alias alias -alias,Column-method alias -alias,SparkDataFrame-method alias -ALSModel-class ALSModel-class -approxCountDistinct column_aggregate_functions -approxCountDistinct,Column-method column_aggregate_functions -approxQuantile approxQuantile -approxQuantile,SparkDataFrame,character,numeric,numeric-method approxQuantile -arrange arrange -arrange,SparkDataFrame,character-method arrange -arrange,SparkDataFrame,Column-method arrange -arrays_overlap column_collection_functions -arrays_overlap,Column,Column-method column_collection_functions -arrays_overlap,Column-method column_collection_functions -arrays_zip column_collection_functions -arrays_zip,Column-method column_collection_functions -array_contains column_collection_functions -array_contains,Column-method column_collection_functions -array_distinct column_collection_functions -array_distinct,Column-method column_collection_functions -array_except column_collection_functions -array_except,Column,Column-method column_collection_functions -array_except,Column-method column_collection_functions -array_intersect column_collection_functions -array_intersect,Column,Column-method column_collection_functions -array_intersect,Column-method column_collection_functions -array_join column_collection_functions -array_join,Column,character-method column_collection_functions -array_join,Column-method column_collection_functions -array_max column_collection_functions -array_max,Column-method column_collection_functions -array_min column_collection_functions -array_min,Column-method column_collection_functions -array_position column_collection_functions -array_position,Column-method column_collection_functions -array_remove 
column_collection_functions -array_remove,Column-method column_collection_functions -array_repeat column_collection_functions -array_repeat,Column,numericOrColumn-method column_collection_functions -array_sort column_collection_functions -array_sort,Column-method column_collection_functions -array_union column_collection_functions -array_union,Column,Column-method column_collection_functions -array_union,Column-method column_collection_functions -as.data.frame as.data.frame -as.data.frame,SparkDataFrame-method as.data.frame -as.DataFrame createDataFrame -as.DataFrame.default createDataFrame -asc columnfunctions -ascii column_string_functions -ascii,Column-method column_string_functions -asin column_math_functions -asin,Column-method column_math_functions -associationRules,FPGrowthModel-method spark.fpGrowth -atan column_math_functions -atan,Column-method column_math_functions -atan2 column_math_functions -atan2,Column-method column_math_functions -attach attach -attach,SparkDataFrame-method attach -avg avg -avg,Column-method avg -awaitTermination awaitTermination -awaitTermination,StreamingQuery-method awaitTermination -base64 column_string_functions -base64,Column-method column_string_functions -between between -between,Column-method between -bin column_math_functions -bin,Column-method column_math_functions -BisectingKMeansModel-class BisectingKMeansModel-class -bitwiseNOT column_nonaggregate_functions -bitwiseNOT,Column-method column_nonaggregate_functions -broadcast broadcast -broadcast,SparkDataFrame-method broadcast -bround column_math_functions -bround,Column-method column_math_functions -cache cache -cache,SparkDataFrame-method cache -cacheTable cacheTable -cacheTable.default cacheTable -cancelJobGroup cancelJobGroup -cancelJobGroup.default cancelJobGroup -cast cast -cast,Column-method cast -cbrt column_math_functions -cbrt,Column-method column_math_functions -ceil column_math_functions -ceil,Column-method column_math_functions -ceiling column_math_functions -ceiling,Column-method column_math_functions -checkpoint checkpoint -checkpoint,SparkDataFrame-method checkpoint -clearCache clearCache -clearCache.default clearCache -clearJobGroup clearJobGroup -clearJobGroup.default clearJobGroup -coalesce coalesce -coalesce,Column-method column_nonaggregate_functions -coalesce,SparkDataFrame-method coalesce -collect collect -collect,SparkDataFrame-method collect -collect_list column_aggregate_functions -collect_list,Column-method column_aggregate_functions -collect_set column_aggregate_functions -collect_set,Column-method column_aggregate_functions -colnames columns -colnames,SparkDataFrame-method columns -colnames<- columns -colnames<-,SparkDataFrame-method columns -coltypes coltypes -coltypes,SparkDataFrame-method coltypes -coltypes<- coltypes -coltypes<-,SparkDataFrame,character-method coltypes -column column -column,character-method column -column,jobj-method column -Column-class column -columnfunctions columnfunctions -columns columns -columns,SparkDataFrame-method columns -column_aggregate_functions column_aggregate_functions -column_collection_functions column_collection_functions -column_datetime_diff_functions column_datetime_diff_functions -column_datetime_functions column_datetime_functions -column_math_functions column_math_functions -column_misc_functions column_misc_functions -column_nonaggregate_functions column_nonaggregate_functions -column_string_functions column_string_functions -column_window_functions column_window_functions -concat column_collection_functions 
-concat,Column-method column_collection_functions -concat_ws column_string_functions -concat_ws,character,Column-method column_string_functions -contains columnfunctions -conv column_math_functions -conv,Column,numeric,numeric-method column_math_functions -corr corr -corr,Column-method corr -corr,SparkDataFrame-method corr -cos column_math_functions -cos,Column-method column_math_functions -cosh column_math_functions -cosh,Column-method column_math_functions -count count -count,Column-method count -count,GroupedData-method count -count,SparkDataFrame-method nrow -countDistinct column_aggregate_functions -countDistinct,Column-method column_aggregate_functions -cov cov -cov,characterOrColumn-method cov -cov,SparkDataFrame-method cov -covar_pop cov -covar_pop,characterOrColumn,characterOrColumn-method cov -covar_samp cov -covar_samp,characterOrColumn,characterOrColumn-method cov -crc32 column_misc_functions -crc32,Column-method column_misc_functions -createDataFrame createDataFrame -createDataFrame.default createDataFrame -createExternalTable createExternalTable-deprecated -createExternalTable.default createExternalTable-deprecated -createOrReplaceTempView createOrReplaceTempView -createOrReplaceTempView,SparkDataFrame,character-method createOrReplaceTempView -createTable createTable -create_array column_nonaggregate_functions -create_array,Column-method column_nonaggregate_functions -create_map column_nonaggregate_functions -create_map,Column-method column_nonaggregate_functions -crossJoin crossJoin -crossJoin,SparkDataFrame,SparkDataFrame-method crossJoin -crosstab crosstab -crosstab,SparkDataFrame,character,character-method crosstab -cube cube -cube,SparkDataFrame-method cube -cume_dist column_window_functions -cume_dist,missing-method column_window_functions -currentDatabase currentDatabase -current_date column_datetime_functions -current_date,missing-method column_datetime_functions -current_timestamp column_datetime_functions -current_timestamp,missing-method column_datetime_functions -dapply dapply -dapply,SparkDataFrame,function,characterOrstructType-method dapply -dapplyCollect dapplyCollect -dapplyCollect,SparkDataFrame,function-method dapplyCollect -datediff column_datetime_diff_functions -datediff,Column-method column_datetime_diff_functions -date_add column_datetime_diff_functions -date_add,Column,numeric-method column_datetime_diff_functions -date_format column_datetime_diff_functions -date_format,Column,character-method column_datetime_diff_functions -date_sub column_datetime_diff_functions -date_sub,Column,numeric-method column_datetime_diff_functions -date_trunc column_datetime_functions -date_trunc,character,Column-method column_datetime_functions -dayofmonth column_datetime_functions -dayofmonth,Column-method column_datetime_functions -dayofweek column_datetime_functions -dayofweek,Column-method column_datetime_functions -dayofyear column_datetime_functions -dayofyear,Column-method column_datetime_functions -DecisionTreeClassificationModel-class DecisionTreeClassificationModel-class -DecisionTreeRegressionModel-class DecisionTreeRegressionModel-class -decode column_string_functions -decode,Column,character-method column_string_functions -dense_rank column_window_functions -dense_rank,missing-method column_window_functions -desc columnfunctions -describe describe -describe,SparkDataFrame,ANY-method describe -describe,SparkDataFrame,character-method describe -describe,SparkDataFrame-method describe -dim dim -dim,SparkDataFrame-method dim -distinct distinct 
-distinct,SparkDataFrame-method distinct -drop drop -drop,ANY-method drop -drop,SparkDataFrame-method drop -dropDuplicates dropDuplicates -dropDuplicates,SparkDataFrame-method dropDuplicates -dropna nafunctions -dropna,SparkDataFrame-method nafunctions -dropTempTable dropTempTable-deprecated -dropTempTable.default dropTempTable-deprecated -dropTempView dropTempView -dtypes dtypes -dtypes,SparkDataFrame-method dtypes -element_at column_collection_functions -element_at,Column-method column_collection_functions -encode column_string_functions -encode,Column,character-method column_string_functions -endsWith endsWith -endsWith,Column-method endsWith -except except -except,SparkDataFrame,SparkDataFrame-method except -exceptAll exceptAll -exceptAll,SparkDataFrame,SparkDataFrame-method exceptAll -exp column_math_functions -exp,Column-method column_math_functions -explain explain -explain,SparkDataFrame-method explain -explain,StreamingQuery-method explain -explode column_collection_functions -explode,Column-method column_collection_functions -explode_outer column_collection_functions -explode_outer,Column-method column_collection_functions -expm1 column_math_functions -expm1,Column-method column_math_functions -expr column_nonaggregate_functions -expr,character-method column_nonaggregate_functions -factorial column_math_functions -factorial,Column-method column_math_functions -fillna nafunctions -fillna,SparkDataFrame-method nafunctions -filter filter -filter,SparkDataFrame,characterOrColumn-method filter -first first -first,characterOrColumn-method first -first,SparkDataFrame-method first -fitted fitted -fitted,BisectingKMeansModel-method spark.bisectingKmeans -fitted,KMeansModel-method fitted -flatten column_collection_functions -flatten,Column-method column_collection_functions -floor column_math_functions -floor,Column-method column_math_functions -format_number column_string_functions -format_number,Column,numeric-method column_string_functions -format_string column_string_functions -format_string,character,Column-method column_string_functions -FPGrowthModel-class FPGrowthModel-class -freqItems freqItems -freqItems,SparkDataFrame,character-method freqItems -freqItemsets,FPGrowthModel-method spark.fpGrowth -from_json column_collection_functions -from_json,Column,characterOrstructType-method column_collection_functions -from_unixtime column_datetime_functions -from_unixtime,Column-method column_datetime_functions -from_utc_timestamp column_datetime_diff_functions -from_utc_timestamp,Column,character-method column_datetime_diff_functions -gapply gapply -gapply,GroupedData-method gapply -gapply,SparkDataFrame-method gapply -gapplyCollect gapplyCollect -gapplyCollect,GroupedData-method gapplyCollect -gapplyCollect,SparkDataFrame-method gapplyCollect -GaussianMixtureModel-class GaussianMixtureModel-class -GBTClassificationModel-class GBTClassificationModel-class -GBTRegressionModel-class GBTRegressionModel-class -GeneralizedLinearRegressionModel-class GeneralizedLinearRegressionModel-class -getField columnfunctions -getItem columnfunctions -getLocalProperty getLocalProperty -getNumPartitions getNumPartitions -getNumPartitions,SparkDataFrame-method getNumPartitions -glm glm -glm,formula,ANY,SparkDataFrame-method glm -greatest column_nonaggregate_functions -greatest,Column-method column_nonaggregate_functions -groupBy groupBy -groupBy,SparkDataFrame-method groupBy -groupedData GroupedData -GroupedData-class GroupedData -grouping_bit column_aggregate_functions -grouping_bit,Column-method 
column_aggregate_functions -grouping_id column_aggregate_functions -grouping_id,Column-method column_aggregate_functions -group_by groupBy -group_by,SparkDataFrame-method groupBy -hash column_misc_functions -hash,Column-method column_misc_functions -hashCode hashCode -head head -head,SparkDataFrame-method head -hex column_math_functions -hex,Column-method column_math_functions -hint hint -hint,SparkDataFrame,character-method hint -histogram histogram -histogram,SparkDataFrame,characterOrColumn-method histogram -hour column_datetime_functions -hour,Column-method column_datetime_functions -hypot column_math_functions -hypot,Column-method column_math_functions -ifelse column_nonaggregate_functions -ifelse,Column-method column_nonaggregate_functions -initcap column_string_functions -initcap,Column-method column_string_functions -input_file_name column_nonaggregate_functions -input_file_name,missing-method column_nonaggregate_functions -insertInto insertInto -insertInto,SparkDataFrame,character-method insertInto -install.spark install.spark -instr column_string_functions -instr,Column,character-method column_string_functions -intersect intersect -intersect,SparkDataFrame,SparkDataFrame-method intersect -intersectAll intersectAll -intersectAll,SparkDataFrame,SparkDataFrame-method intersectAll -is.nan column_nonaggregate_functions -is.nan,Column-method column_nonaggregate_functions -isActive isActive -isActive,StreamingQuery-method isActive -isLocal isLocal -isLocal,SparkDataFrame-method isLocal -isNaN columnfunctions -isnan column_nonaggregate_functions -isnan,Column-method column_nonaggregate_functions -isNotNull columnfunctions -isNull columnfunctions -IsotonicRegressionModel-class IsotonicRegressionModel-class -isStreaming isStreaming -isStreaming,SparkDataFrame-method isStreaming -join join -join,SparkDataFrame,SparkDataFrame-method join -jsonFile read.json -jsonFile.default read.json -KMeansModel-class KMeansModel-class -KSTest-class KSTest-class -kurtosis column_aggregate_functions -kurtosis,Column-method column_aggregate_functions -lag column_window_functions -lag,characterOrColumn-method column_window_functions -last last -last,characterOrColumn-method last -lastProgress lastProgress -lastProgress,StreamingQuery-method lastProgress -last_day column_datetime_functions -last_day,Column-method column_datetime_functions -LDAModel-class LDAModel-class -lead column_window_functions -lead,characterOrColumn,numeric-method column_window_functions -least column_nonaggregate_functions -least,Column-method column_nonaggregate_functions -length column_string_functions -length,Column-method column_string_functions -levenshtein column_string_functions -levenshtein,Column-method column_string_functions -like columnfunctions -limit limit -limit,SparkDataFrame,numeric-method limit -LinearSVCModel-class LinearSVCModel-class -listColumns listColumns -listDatabases listDatabases -listFunctions listFunctions -listTables listTables -lit column_nonaggregate_functions -lit,ANY-method column_nonaggregate_functions -loadDF read.df -loadDF.default read.df -localCheckpoint localCheckpoint -localCheckpoint,SparkDataFrame-method localCheckpoint -locate column_string_functions -locate,character,Column-method column_string_functions -log column_math_functions -log,Column-method column_math_functions -log10 column_math_functions -log10,Column-method column_math_functions -log1p column_math_functions -log1p,Column-method column_math_functions -log2 column_math_functions -log2,Column-method column_math_functions 
-LogisticRegressionModel-class LogisticRegressionModel-class -lower column_string_functions -lower,Column-method column_string_functions -lpad column_string_functions -lpad,Column,numeric,character-method column_string_functions -ltrim column_string_functions -ltrim,Column,character-method column_string_functions -ltrim,Column,missing-method column_string_functions -map_from_arrays column_collection_functions -map_from_arrays,Column,Column-method column_collection_functions -map_from_arrays,Column-method column_collection_functions -map_keys column_collection_functions -map_keys,Column-method column_collection_functions -map_values column_collection_functions -map_values,Column-method column_collection_functions -max column_aggregate_functions -max,Column-method column_aggregate_functions -md5 column_misc_functions -md5,Column-method column_misc_functions -mean column_aggregate_functions -mean,Column-method column_aggregate_functions -merge merge -merge,SparkDataFrame,SparkDataFrame-method merge -min column_aggregate_functions -min,Column-method column_aggregate_functions -minute column_datetime_functions -minute,Column-method column_datetime_functions -monotonically_increasing_id column_nonaggregate_functions -monotonically_increasing_id,missing-method column_nonaggregate_functions -month column_datetime_functions -month,Column-method column_datetime_functions -months_between column_datetime_diff_functions -months_between,Column-method column_datetime_diff_functions -MultilayerPerceptronClassificationModel-class MultilayerPerceptronClassificationModel-class -mutate mutate -mutate,SparkDataFrame-method mutate -n count -n,Column-method count -na.omit nafunctions -na.omit,SparkDataFrame-method nafunctions -NaiveBayesModel-class NaiveBayesModel-class -names columns -names,SparkDataFrame-method columns -names<- columns -names<-,SparkDataFrame-method columns -nanvl column_nonaggregate_functions -nanvl,Column-method column_nonaggregate_functions -ncol ncol -ncol,SparkDataFrame-method ncol -negate column_nonaggregate_functions -negate,Column-method column_nonaggregate_functions -next_day column_datetime_diff_functions -next_day,Column,character-method column_datetime_diff_functions -not not -not,Column-method not -nrow nrow -nrow,SparkDataFrame-method nrow -ntile column_window_functions -ntile,numeric-method column_window_functions -n_distinct column_aggregate_functions -n_distinct,Column-method column_aggregate_functions -orderBy orderBy -orderBy,SparkDataFrame,characterOrColumn-method arrange -orderBy,WindowSpec,character-method orderBy -orderBy,WindowSpec,Column-method orderBy -otherwise otherwise -otherwise,Column-method otherwise -over over -over,Column,WindowSpec-method over -parquetFile read.parquet -parquetFile.default read.parquet -partitionBy partitionBy -partitionBy,WindowSpec-method partitionBy -percent_rank column_window_functions -percent_rank,missing-method column_window_functions -persist persist -persist,SparkDataFrame,character-method persist -pivot pivot -pivot,GroupedData,character-method pivot -pmod column_math_functions -pmod,Column-method column_math_functions -posexplode column_collection_functions -posexplode,Column-method column_collection_functions -posexplode_outer column_collection_functions -posexplode_outer,Column-method column_collection_functions -predict predict -predict,AFTSurvivalRegressionModel-method spark.survreg -predict,ALSModel-method spark.als -predict,BisectingKMeansModel-method spark.bisectingKmeans -predict,DecisionTreeClassificationModel-method 
spark.decisionTree -predict,DecisionTreeRegressionModel-method spark.decisionTree -predict,FPGrowthModel-method spark.fpGrowth -predict,GaussianMixtureModel,SparkDataFrame-method spark.gaussianMixture -predict,GaussianMixtureModel-method spark.gaussianMixture -predict,GBTClassificationModel-method spark.gbt -predict,GBTRegressionModel-method spark.gbt -predict,GeneralizedLinearRegressionModel-method spark.glm -predict,IsotonicRegressionModel,SparkDataFrame-method spark.isoreg -predict,IsotonicRegressionModel-method spark.isoreg -predict,KMeansModel-method spark.kmeans -predict,LinearSVCModel,SparkDataFrame-method spark.svmLinear -predict,LinearSVCModel-method spark.svmLinear -predict,LogisticRegressionModel,SparkDataFrame-method spark.logit -predict,LogisticRegressionModel-method spark.logit -predict,MultilayerPerceptronClassificationModel-method spark.mlp -predict,NaiveBayesModel-method spark.naiveBayes -predict,RandomForestClassificationModel-method spark.randomForest -predict,RandomForestRegressionModel-method spark.randomForest -print.jobj print.jobj -print.structField print.structField -print.structType print.structType -print.summary.DecisionTreeClassificationModel spark.decisionTree -print.summary.DecisionTreeRegressionModel spark.decisionTree -print.summary.GBTClassificationModel spark.gbt -print.summary.GBTRegressionModel spark.gbt -print.summary.GeneralizedLinearRegressionModel spark.glm -print.summary.KSTest spark.kstest -print.summary.RandomForestClassificationModel spark.randomForest -print.summary.RandomForestRegressionModel spark.randomForest -printSchema printSchema -printSchema,SparkDataFrame-method printSchema -quarter column_datetime_functions -quarter,Column-method column_datetime_functions -queryName queryName -queryName,StreamingQuery-method queryName -rand column_nonaggregate_functions -rand,missing-method column_nonaggregate_functions -rand,numeric-method column_nonaggregate_functions -randn column_nonaggregate_functions -randn,missing-method column_nonaggregate_functions -randn,numeric-method column_nonaggregate_functions -RandomForestClassificationModel-class RandomForestClassificationModel-class -RandomForestRegressionModel-class RandomForestRegressionModel-class -randomSplit randomSplit -randomSplit,SparkDataFrame,numeric-method randomSplit -rangeBetween rangeBetween -rangeBetween,WindowSpec,numeric,numeric-method rangeBetween -rank column_window_functions -rank,ANY-method column_window_functions -rank,missing-method column_window_functions -rbind rbind -rbind,SparkDataFrame-method rbind -read.df read.df -read.df.default read.df -read.jdbc read.jdbc -read.json read.json -read.json.default read.json -read.ml read.ml -read.orc read.orc -read.parquet read.parquet -read.parquet.default read.parquet -read.stream read.stream -read.text read.text -read.text.default read.text -recoverPartitions recoverPartitions -refreshByPath refreshByPath -refreshTable refreshTable -regexp_extract column_string_functions -regexp_extract,Column,character,numeric-method column_string_functions -regexp_replace column_string_functions -regexp_replace,Column,character,character-method column_string_functions -registerTempTable registerTempTable-deprecated -registerTempTable,SparkDataFrame,character-method registerTempTable-deprecated -rename rename -rename,SparkDataFrame-method rename -repartition repartition -repartition,SparkDataFrame-method repartition -repartitionByRange repartitionByRange -repartitionByRange,SparkDataFrame-method repartitionByRange -repeat_string 
column_string_functions -repeat_string,Column,numeric-method column_string_functions -repeat_string,Column-method column_string_functions -reverse column_collection_functions -reverse,Column-method column_collection_functions -rint column_math_functions -rint,Column-method column_math_functions -rlike columnfunctions -rollup rollup -rollup,SparkDataFrame-method rollup -round column_math_functions -round,Column-method column_math_functions -rowsBetween rowsBetween -rowsBetween,WindowSpec,numeric,numeric-method rowsBetween -row_number column_window_functions -row_number,missing-method column_window_functions -rpad column_string_functions -rpad,Column,numeric,character-method column_string_functions -rtrim column_string_functions -rtrim,Column,character-method column_string_functions -rtrim,Column,missing-method column_string_functions -sample sample -sample,SparkDataFrame-method sample -sampleBy sampleBy -sampleBy,SparkDataFrame,character,list,numeric-method sampleBy -sample_frac sample -sample_frac,SparkDataFrame-method sample -saveAsParquetFile write.parquet -saveAsParquetFile,SparkDataFrame,character-method write.parquet -saveAsTable saveAsTable -saveAsTable,SparkDataFrame,character-method saveAsTable -saveDF write.df -saveDF,SparkDataFrame,character-method write.df -schema schema -schema,SparkDataFrame-method schema -sd column_aggregate_functions -sd,Column-method column_aggregate_functions -second column_datetime_functions -second,Column-method column_datetime_functions -select select -select,SparkDataFrame,character-method select -select,SparkDataFrame,Column-method select -select,SparkDataFrame,list-method select -selectExpr selectExpr -selectExpr,SparkDataFrame,character-method selectExpr -setCheckpointDir setCheckpointDir -setCurrentDatabase setCurrentDatabase -setJobDescription setJobDescription -setJobGroup setJobGroup -setJobGroup.default setJobGroup -setLocalProperty setLocalProperty -setLogLevel setLogLevel -sha1 column_misc_functions -sha1,Column-method column_misc_functions -sha2 column_misc_functions -sha2,Column,numeric-method column_misc_functions -shiftLeft column_math_functions -shiftLeft,Column,numeric-method column_math_functions -shiftRight column_math_functions -shiftRight,Column,numeric-method column_math_functions -shiftRightUnsigned column_math_functions -shiftRightUnsigned,Column,numeric-method column_math_functions -show show -show,Column-method show -show,GroupedData-method show -show,SparkDataFrame-method show -show,StreamingQuery-method show -show,WindowSpec-method show -showDF showDF -showDF,SparkDataFrame-method showDF -shuffle column_collection_functions -shuffle,Column-method column_collection_functions -sign column_math_functions -sign,Column-method column_math_functions -signum column_math_functions -signum,Column-method column_math_functions -sin column_math_functions -sin,Column-method column_math_functions -sinh column_math_functions -sinh,Column-method column_math_functions -size column_collection_functions -size,Column-method column_collection_functions -skewness column_aggregate_functions -skewness,Column-method column_aggregate_functions -slice column_collection_functions -slice,Column-method column_collection_functions -sort_array column_collection_functions -sort_array,Column-method column_collection_functions -soundex column_string_functions -soundex,Column-method column_string_functions -spark.addFile spark.addFile -spark.als spark.als -spark.als,SparkDataFrame-method spark.als -spark.associationRules spark.fpGrowth 
-spark.associationRules,FPGrowthModel-method spark.fpGrowth -spark.bisectingKmeans spark.bisectingKmeans -spark.bisectingKmeans,SparkDataFrame,formula-method spark.bisectingKmeans -spark.decisionTree spark.decisionTree -spark.decisionTree,SparkDataFrame,formula-method spark.decisionTree -spark.fpGrowth spark.fpGrowth -spark.fpGrowth,SparkDataFrame-method spark.fpGrowth -spark.freqItemsets spark.fpGrowth -spark.freqItemsets,FPGrowthModel-method spark.fpGrowth -spark.gaussianMixture spark.gaussianMixture -spark.gaussianMixture,SparkDataFrame,formula-method spark.gaussianMixture -spark.gbt spark.gbt -spark.gbt,SparkDataFrame,formula-method spark.gbt -spark.getSparkFiles spark.getSparkFiles -spark.getSparkFilesRootDirectory spark.getSparkFilesRootDirectory -spark.glm spark.glm -spark.glm,SparkDataFrame,formula-method spark.glm -spark.isoreg spark.isoreg -spark.isoreg,SparkDataFrame,formula-method spark.isoreg -spark.kmeans spark.kmeans -spark.kmeans,SparkDataFrame,formula-method spark.kmeans -spark.kstest spark.kstest -spark.kstest,SparkDataFrame-method spark.kstest -spark.lapply spark.lapply -spark.lda spark.lda -spark.lda,SparkDataFrame-method spark.lda -spark.logit spark.logit -spark.logit,SparkDataFrame,formula-method spark.logit -spark.mlp spark.mlp -spark.mlp,SparkDataFrame,formula-method spark.mlp -spark.naiveBayes spark.naiveBayes -spark.naiveBayes,SparkDataFrame,formula-method spark.naiveBayes -spark.perplexity spark.lda -spark.perplexity,LDAModel,SparkDataFrame-method spark.lda -spark.perplexity,LDAModel-method spark.lda -spark.posterior spark.lda -spark.posterior,LDAModel,SparkDataFrame-method spark.lda -spark.randomForest spark.randomForest -spark.randomForest,SparkDataFrame,formula-method spark.randomForest -spark.survreg spark.survreg -spark.survreg,SparkDataFrame,formula-method spark.survreg -spark.svmLinear spark.svmLinear -spark.svmLinear,SparkDataFrame,formula-method spark.svmLinear -SparkDataFrame-class SparkDataFrame -sparkR.callJMethod sparkR.callJMethod -sparkR.callJStatic sparkR.callJStatic -sparkR.conf sparkR.conf -sparkR.init sparkR.init-deprecated -sparkR.newJObject sparkR.newJObject -sparkR.session sparkR.session -sparkR.session.stop sparkR.session.stop -sparkR.stop sparkR.session.stop -sparkR.uiWebUrl sparkR.uiWebUrl -sparkR.version sparkR.version -sparkRHive.init sparkRHive.init-deprecated -sparkRSQL.init sparkRSQL.init-deprecated -spark_partition_id column_nonaggregate_functions -spark_partition_id,missing-method column_nonaggregate_functions -split_string column_string_functions -split_string,Column,character-method column_string_functions -split_string,Column-method column_string_functions -sql sql -sql.default sql -sqrt column_math_functions -sqrt,Column-method column_math_functions -startsWith startsWith -startsWith,Column-method startsWith -status status -status,StreamingQuery-method status -stddev column_aggregate_functions -stddev,Column-method column_aggregate_functions -stddev_pop column_aggregate_functions -stddev_pop,Column-method column_aggregate_functions -stddev_samp column_aggregate_functions -stddev_samp,Column-method column_aggregate_functions -stopQuery stopQuery -stopQuery,StreamingQuery-method stopQuery -storageLevel storageLevel -storageLevel,SparkDataFrame-method storageLevel -str str -str,SparkDataFrame-method str -StreamingQuery-class StreamingQuery -struct column_nonaggregate_functions -struct,characterOrColumn-method column_nonaggregate_functions -structField structField -structField.character structField -structField.jobj structField 
-structType structType -structType.character structType -structType.jobj structType -structType.structField structType -subset subset -subset,SparkDataFrame-method subset -substr substr -substr,Column-method substr -substring_index column_string_functions -substring_index,Column,character,numeric-method column_string_functions -sum column_aggregate_functions -sum,Column-method column_aggregate_functions -sumDistinct column_aggregate_functions -sumDistinct,Column-method column_aggregate_functions -summarize summarize -summarize,GroupedData-method summarize -summarize,SparkDataFrame-method summarize -summary summary -summary,AFTSurvivalRegressionModel-method spark.survreg -summary,ALSModel-method spark.als -summary,BisectingKMeansModel-method spark.bisectingKmeans -summary,DecisionTreeClassificationModel-method spark.decisionTree -summary,DecisionTreeRegressionModel-method spark.decisionTree -summary,GaussianMixtureModel-method spark.gaussianMixture -summary,GBTClassificationModel-method spark.gbt -summary,GBTRegressionModel-method spark.gbt -summary,GeneralizedLinearRegressionModel-method spark.glm -summary,IsotonicRegressionModel-method spark.isoreg -summary,KMeansModel-method spark.kmeans -summary,KSTest-method spark.kstest -summary,LDAModel-method spark.lda -summary,LinearSVCModel-method spark.svmLinear -summary,LogisticRegressionModel-method spark.logit -summary,MultilayerPerceptronClassificationModel-method spark.mlp -summary,NaiveBayesModel-method spark.naiveBayes -summary,RandomForestClassificationModel-method spark.randomForest -summary,RandomForestRegressionModel-method spark.randomForest -summary,SparkDataFrame-method summary -tableNames tableNames -tableNames.default tableNames -tables tables -tables.default tables -tableToDF tableToDF -take take -take,SparkDataFrame,numeric-method take -tan column_math_functions -tan,Column-method column_math_functions -tanh column_math_functions -tanh,Column-method column_math_functions -toDegrees column_math_functions -toDegrees,Column-method column_math_functions -toJSON toJSON -toJSON,SparkDataFrame-method toJSON -toRadians column_math_functions -toRadians,Column-method column_math_functions -to_date column_datetime_functions -to_date,Column,character-method column_datetime_functions -to_date,Column,missing-method column_datetime_functions -to_json column_collection_functions -to_json,Column-method column_collection_functions -to_timestamp column_datetime_functions -to_timestamp,Column,character-method column_datetime_functions -to_timestamp,Column,missing-method column_datetime_functions -to_utc_timestamp column_datetime_diff_functions -to_utc_timestamp,Column,character-method column_datetime_diff_functions -transform mutate -transform,SparkDataFrame-method mutate -translate column_string_functions -translate,Column,character,character-method column_string_functions -trim column_string_functions -trim,Column,character-method column_string_functions -trim,Column,missing-method column_string_functions -trunc column_datetime_functions -trunc,Column-method column_datetime_functions -unbase64 column_string_functions -unbase64,Column-method column_string_functions -uncacheTable uncacheTable -uncacheTable.default uncacheTable -unhex column_math_functions -unhex,Column-method column_math_functions -union union -union,SparkDataFrame,SparkDataFrame-method union -unionAll union -unionAll,SparkDataFrame,SparkDataFrame-method union -unionByName unionByName -unionByName,SparkDataFrame,SparkDataFrame-method unionByName -unique distinct 
-unique,SparkDataFrame-method distinct -unix_timestamp column_datetime_functions -unix_timestamp,Column,character-method column_datetime_functions -unix_timestamp,Column,missing-method column_datetime_functions -unix_timestamp,missing,missing-method column_datetime_functions -unpersist unpersist -unpersist,SparkDataFrame-method unpersist -upper column_string_functions -upper,Column-method column_string_functions -var column_aggregate_functions -var,Column-method column_aggregate_functions -variance column_aggregate_functions -variance,Column-method column_aggregate_functions -var_pop column_aggregate_functions -var_pop,Column-method column_aggregate_functions -var_samp column_aggregate_functions -var_samp,Column-method column_aggregate_functions -weekofyear column_datetime_functions -weekofyear,Column-method column_datetime_functions -when column_nonaggregate_functions -when,Column-method column_nonaggregate_functions -where filter -where,SparkDataFrame,characterOrColumn-method filter -window column_datetime_functions -window,Column-method column_datetime_functions -windowOrderBy windowOrderBy -windowOrderBy,character-method windowOrderBy -windowOrderBy,Column-method windowOrderBy -windowPartitionBy windowPartitionBy -windowPartitionBy,character-method windowPartitionBy -windowPartitionBy,Column-method windowPartitionBy -WindowSpec-class WindowSpec -with with -with,SparkDataFrame-method with -withColumn withColumn -withColumn,SparkDataFrame,character-method withColumn -withColumnRenamed rename -withColumnRenamed,SparkDataFrame,character,character-method rename -withWatermark withWatermark -withWatermark,SparkDataFrame,character,character-method withWatermark -write.df write.df -write.df,SparkDataFrame-method write.df -write.jdbc write.jdbc -write.jdbc,SparkDataFrame,character,character-method write.jdbc -write.json write.json -write.json,SparkDataFrame,character-method write.json -write.ml write.ml -write.ml,AFTSurvivalRegressionModel,character-method spark.survreg -write.ml,ALSModel,character-method spark.als -write.ml,BisectingKMeansModel,character-method spark.bisectingKmeans -write.ml,DecisionTreeClassificationModel,character-method spark.decisionTree -write.ml,DecisionTreeRegressionModel,character-method spark.decisionTree -write.ml,FPGrowthModel,character-method spark.fpGrowth -write.ml,GaussianMixtureModel,character-method spark.gaussianMixture -write.ml,GBTClassificationModel,character-method spark.gbt -write.ml,GBTRegressionModel,character-method spark.gbt -write.ml,GeneralizedLinearRegressionModel,character-method spark.glm -write.ml,IsotonicRegressionModel,character-method spark.isoreg -write.ml,KMeansModel,character-method spark.kmeans -write.ml,LDAModel,character-method spark.lda -write.ml,LinearSVCModel,character-method spark.svmLinear -write.ml,LogisticRegressionModel,character-method spark.logit -write.ml,MultilayerPerceptronClassificationModel,character-method spark.mlp -write.ml,NaiveBayesModel,character-method spark.naiveBayes -write.ml,RandomForestClassificationModel,character-method spark.randomForest -write.ml,RandomForestRegressionModel,character-method spark.randomForest -write.orc write.orc -write.orc,SparkDataFrame,character-method write.orc -write.parquet write.parquet -write.parquet,SparkDataFrame,character-method write.parquet -write.stream write.stream -write.stream,SparkDataFrame-method write.stream -write.text write.text -write.text,SparkDataFrame,character-method write.text -year column_datetime_functions -year,Column-method column_datetime_functions -[ 
subset
-[,SparkDataFrame-method subset
-[[ subset
-[[,SparkDataFrame,numericOrcharacter-method subset
-[[<- subset
-[[<-,SparkDataFrame,numericOrcharacter-method subset
diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/SparkR.rdb b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/SparkR.rdb
deleted file mode 100644
index b199ea1..0000000
Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/SparkR.rdb and /dev/null differ
diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/SparkR.rdx b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/SparkR.rdx
deleted file mode 100644
index 9c64802..0000000
Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/SparkR.rdx and /dev/null differ
diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/aliases.rds b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/aliases.rds
deleted file mode 100644
index 44be2a4..0000000
Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/aliases.rds and /dev/null differ
diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/paths.rds b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/paths.rds
deleted file mode 100644
index b271a7f..0000000
Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/help/paths.rds and /dev/null differ
diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/html/00Index.html b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/html/00Index.html
deleted file mode 100644
index 36a351d..0000000
--- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/html/00Index.html
+++ /dev/null
@@ -1,1949 +0,0 @@
-R: R Front End for 'Apache Spark'
-Documentation for package ‘SparkR’ version 2.4.3
-Help Pages
-- A -
absMath functions for Column operations
abs-methodMath functions for Column operations
acosMath functions for Column operations
acos-methodMath functions for Column operations
add_monthsDate time arithmetic functions for Column operations
add_months-methodDate time arithmetic functions for Column operations
AFTSurvivalRegressionModel-classS4 class that represents a AFTSurvivalRegressionModel
aggsummarize
agg-methodsummarize
aliasalias
alias-methodalias
ALSModel-classS4 class that represents an ALSModel
approxCountDistinctAggregate functions for Column operations
approxCountDistinct-methodAggregate functions for Column operations
approxQuantileCalculates the approximate quantiles of numerical columns of a SparkDataFrame
approxQuantile-methodCalculates the approximate quantiles of numerical columns of a SparkDataFrame
arrangeArrange Rows by Variables
arrange-methodArrange Rows by Variables
arrays_overlapCollection functions for Column operations
arrays_overlap-methodCollection functions for Column operations
arrays_zipCollection functions for Column operations
arrays_zip-methodCollection functions for Column operations
array_containsCollection functions for Column operations
array_contains-methodCollection functions for Column operations
array_distinctCollection functions for Column operations
array_distinct-methodCollection functions for Column operations
array_exceptCollection functions for Column operations
array_except-methodCollection functions for Column operations
array_intersectCollection functions for Column operations
array_intersect-methodCollection functions for Column operations
array_joinCollection functions for Column operations
array_join-methodCollection functions for Column operations
array_maxCollection functions for Column operations
array_max-methodCollection functions for Column operations
array_minCollection functions for Column operations
array_min-methodCollection functions for Column operations
array_positionCollection functions for Column operations
array_position-methodCollection functions for Column operations
array_removeCollection functions for Column operations
array_remove-methodCollection functions for Column operations
array_repeatCollection functions for Column operations
array_repeat-methodCollection functions for Column operations
array_sortCollection functions for Column operations
array_sort-methodCollection functions for Column operations
array_unionCollection functions for Column operations
array_union-methodCollection functions for Column operations
as.data.frameDownload data from a SparkDataFrame into a R data.frame
as.data.frame-methodDownload data from a SparkDataFrame into a R data.frame
as.DataFrameCreate a SparkDataFrame
as.DataFrame.defaultCreate a SparkDataFrame
ascA set of operations working with SparkDataFrame columns
asciiString functions for Column operations
ascii-methodString functions for Column operations
asinMath functions for Column operations
asin-methodMath functions for Column operations
associationRules-methodFP-growth
atanMath functions for Column operations
atan-methodMath functions for Column operations
atan2Math functions for Column operations
atan2-methodMath functions for Column operations
attachAttach SparkDataFrame to R search path
attach-methodAttach SparkDataFrame to R search path
avgavg
avg-methodavg
awaitTerminationawaitTermination
awaitTermination-methodawaitTermination
- -

-- B --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
base64String functions for Column operations
base64-methodString functions for Column operations
betweenbetween
between-methodbetween
binMath functions for Column operations
bin-methodMath functions for Column operations
BisectingKMeansModel-classS4 class that represents a BisectingKMeansModel
bitwiseNOTNon-aggregate functions for Column operations
bitwiseNOT-methodNon-aggregate functions for Column operations
broadcastbroadcast
broadcast-methodbroadcast
broundMath functions for Column operations
bround-methodMath functions for Column operations
- -

-- C --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cacheCache
cache-methodCache
cacheTableCache Table
cacheTable.defaultCache Table
cancelJobGroupCancel active jobs for the specified group
cancelJobGroup.defaultCancel active jobs for the specified group
castCasts the column to a different data type.
cast-methodCasts the column to a different data type.
cbrtMath functions for Column operations
cbrt-methodMath functions for Column operations
ceilMath functions for Column operations
ceil-methodMath functions for Column operations
ceilingMath functions for Column operations
ceiling-methodMath functions for Column operations
checkpointcheckpoint
checkpoint-methodcheckpoint
clearCacheClear Cache
clearCache.defaultClear Cache
clearJobGroupClear current job group ID and its description
clearJobGroup.defaultClear current job group ID and its description
coalesceCoalesce
coalesce-methodCoalesce
coalesce-methodNon-aggregate functions for Column operations
collectCollects all the elements of a SparkDataFrame and coerces them into an R data.frame.
collect-methodCollects all the elements of a SparkDataFrame and coerces them into an R data.frame.
collect_listAggregate functions for Column operations
collect_list-methodAggregate functions for Column operations
collect_setAggregate functions for Column operations
collect_set-methodAggregate functions for Column operations
colnamesColumn Names of SparkDataFrame
colnames-methodColumn Names of SparkDataFrame
colnames<-Column Names of SparkDataFrame
colnames<--methodColumn Names of SparkDataFrame
coltypescoltypes
coltypes-methodcoltypes
coltypes<-coltypes
coltypes<--methodcoltypes
columnS4 class that represents a SparkDataFrame column
Column-classS4 class that represents a SparkDataFrame column
column-methodS4 class that represents a SparkDataFrame column
columnfunctionsA set of operations working with SparkDataFrame columns
columnsColumn Names of SparkDataFrame
columns-methodColumn Names of SparkDataFrame
column_aggregate_functionsAggregate functions for Column operations
column_collection_functionsCollection functions for Column operations
column_datetime_diff_functionsDate time arithmetic functions for Column operations
column_datetime_functionsDate time functions for Column operations
column_math_functionsMath functions for Column operations
column_misc_functionsMiscellaneous functions for Column operations
column_nonaggregate_functionsNon-aggregate functions for Column operations
column_string_functionsString functions for Column operations
column_window_functionsWindow functions for Column operations
concatCollection functions for Column operations
concat-methodCollection functions for Column operations
concat_wsString functions for Column operations
concat_ws-methodString functions for Column operations
containsA set of operations working with SparkDataFrame columns
convMath functions for Column operations
conv-methodMath functions for Column operations
corrcorr
corr-methodcorr
cosMath functions for Column operations
cos-methodMath functions for Column operations
coshMath functions for Column operations
cosh-methodMath functions for Column operations
countCount
count-methodCount
count-methodReturns the number of rows in a SparkDataFrame
countDistinctAggregate functions for Column operations
countDistinct-methodAggregate functions for Column operations
covcov
cov-methodcov
covar_popcov
covar_pop-methodcov
covar_sampcov
covar_samp-methodcov
crc32Miscellaneous functions for Column operations
crc32-methodMiscellaneous functions for Column operations
createDataFrameCreate a SparkDataFrame
createDataFrame.defaultCreate a SparkDataFrame
createExternalTable(Deprecated) Create an external table
createExternalTable.default(Deprecated) Create an external table
createOrReplaceTempViewCreates a temporary view using the given name.
createOrReplaceTempView-methodCreates a temporary view using the given name.
createTableCreates a table based on the dataset in a data source
create_arrayNon-aggregate functions for Column operations
create_array-methodNon-aggregate functions for Column operations
create_mapNon-aggregate functions for Column operations
create_map-methodNon-aggregate functions for Column operations
crossJoinCrossJoin
crossJoin-methodCrossJoin
crosstabComputes a pair-wise frequency table of the given columns
crosstab-methodComputes a pair-wise frequency table of the given columns
cubecube
cube-methodcube
cume_distWindow functions for Column operations
cume_dist-methodWindow functions for Column operations
currentDatabaseReturns the current default database
current_dateDate time functions for Column operations
current_date-methodDate time functions for Column operations
current_timestampDate time functions for Column operations
current_timestamp-methodDate time functions for Column operations
- -

-- D --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
dapplydapply
dapply-methoddapply
dapplyCollectdapplyCollect
dapplyCollect-methoddapplyCollect
datediffDate time arithmetic functions for Column operations
datediff-methodDate time arithmetic functions for Column operations
date_addDate time arithmetic functions for Column operations
date_add-methodDate time arithmetic functions for Column operations
date_formatDate time arithmetic functions for Column operations
date_format-methodDate time arithmetic functions for Column operations
date_subDate time arithmetic functions for Column operations
date_sub-methodDate time arithmetic functions for Column operations
date_truncDate time functions for Column operations
date_trunc-methodDate time functions for Column operations
dayofmonthDate time functions for Column operations
dayofmonth-methodDate time functions for Column operations
dayofweekDate time functions for Column operations
dayofweek-methodDate time functions for Column operations
dayofyearDate time functions for Column operations
dayofyear-methodDate time functions for Column operations
DecisionTreeClassificationModel-classS4 class that represents a DecisionTreeClassificationModel
DecisionTreeRegressionModel-classS4 class that represents a DecisionTreeRegressionModel
decodeString functions for Column operations
decode-methodString functions for Column operations
dense_rankWindow functions for Column operations
dense_rank-methodWindow functions for Column operations
descA set of operations working with SparkDataFrame columns
describedescribe
describe-methoddescribe
dimReturns the dimensions of SparkDataFrame
dim-methodReturns the dimensions of SparkDataFrame
distinctDistinct
distinct-methodDistinct
dropdrop
drop-methoddrop
dropDuplicatesdropDuplicates
dropDuplicates-methoddropDuplicates
dropnaA set of SparkDataFrame functions working with NA values
dropna-methodA set of SparkDataFrame functions working with NA values
dropTempTable(Deprecated) Drop Temporary Table
dropTempTable.default(Deprecated) Drop Temporary Table
dropTempViewDrops the temporary view with the given view name in the catalog.
dtypesDataTypes
dtypes-methodDataTypes
- -

-- E --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
element_atCollection functions for Column operations
element_at-methodCollection functions for Column operations
encodeString functions for Column operations
encode-methodString functions for Column operations
endsWithendsWith
endsWith-methodendsWith
exceptexcept
except-methodexcept
exceptAllexceptAll
exceptAll-methodexceptAll
expMath functions for Column operations
exp-methodMath functions for Column operations
explainExplain
explain-methodExplain
explodeCollection functions for Column operations
explode-methodCollection functions for Column operations
explode_outerCollection functions for Column operations
explode_outer-methodCollection functions for Column operations
expm1Math functions for Column operations
expm1-methodMath functions for Column operations
exprNon-aggregate functions for Column operations
expr-methodNon-aggregate functions for Column operations
- -

-- F --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
factorialMath functions for Column operations
factorial-methodMath functions for Column operations
fillnaA set of SparkDataFrame functions working with NA values
fillna-methodA set of SparkDataFrame functions working with NA values
filterFilter
filter-methodFilter
firstReturn the first row of a SparkDataFrame
first-methodReturn the first row of a SparkDataFrame
fittedGet fitted result from a k-means model
fitted-methodGet fitted result from a k-means model
fitted-methodBisecting K-Means Clustering Model
flattenCollection functions for Column operations
flatten-methodCollection functions for Column operations
floorMath functions for Column operations
floor-methodMath functions for Column operations
format_numberString functions for Column operations
format_number-methodString functions for Column operations
format_stringString functions for Column operations
format_string-methodString functions for Column operations
FPGrowthModel-classS4 class that represents a FPGrowthModel
freqItemsFinding frequent items for columns, possibly with false positives
freqItems-methodFinding frequent items for columns, possibly with false positives
freqItemsets-methodFP-growth
from_jsonCollection functions for Column operations
from_json-methodCollection functions for Column operations
from_unixtimeDate time functions for Column operations
from_unixtime-methodDate time functions for Column operations
from_utc_timestampDate time arithmetic functions for Column operations
from_utc_timestamp-methodDate time arithmetic functions for Column operations
- -

-- G --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
gapplygapply
gapply-methodgapply
gapplyCollectgapplyCollect
gapplyCollect-methodgapplyCollect
GaussianMixtureModel-classS4 class that represents a GaussianMixtureModel
GBTClassificationModel-classS4 class that represents a GBTClassificationModel
GBTRegressionModel-classS4 class that represents a GBTRegressionModel
GeneralizedLinearRegressionModel-classS4 class that represents a generalized linear model
getFieldA set of operations working with SparkDataFrame columns
getItemA set of operations working with SparkDataFrame columns
getLocalPropertyGet a local property set in this thread, or 'NULL' if it is missing. See 'setLocalProperty'.
getNumPartitionsgetNumPartitions
getNumPartitions-methodgetNumPartitions
glmGeneralized Linear Models (R-compliant)
glm-methodGeneralized Linear Models (R-compliant)
greatestNon-aggregate functions for Column operations
greatest-methodNon-aggregate functions for Column operations
groupByGroupBy
groupBy-methodGroupBy
groupedDataS4 class that represents a GroupedData
GroupedData-classS4 class that represents a GroupedData
grouping_bitAggregate functions for Column operations
grouping_bit-methodAggregate functions for Column operations
grouping_idAggregate functions for Column operations
grouping_id-methodAggregate functions for Column operations
group_byGroupBy
group_by-methodGroupBy
- -

-- H --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
hashMiscellaneous functions for Column operations
hash-methodMiscellaneous functions for Column operations
hashCodeCompute the hashCode of an object
headHead
head-methodHead
hexMath functions for Column operations
hex-methodMath functions for Column operations
hinthint
hint-methodhint
histogramCompute histogram statistics for given column
histogram-methodCompute histogram statistics for given column
hourDate time functions for Column operations
hour-methodDate time functions for Column operations
hypotMath functions for Column operations
hypot-methodMath functions for Column operations
- -

-- I --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ifelseNon-aggregate functions for Column operations
ifelse-methodNon-aggregate functions for Column operations
initcapString functions for Column operations
initcap-methodString functions for Column operations
input_file_nameNon-aggregate functions for Column operations
input_file_name-methodNon-aggregate functions for Column operations
insertIntoinsertInto
insertInto-methodinsertInto
install.sparkDownload and Install Apache Spark to a Local Directory
instrString functions for Column operations
instr-methodString functions for Column operations
intersectIntersect
intersect-methodIntersect
intersectAllintersectAll
intersectAll-methodintersectAll
is.nanNon-aggregate functions for Column operations
is.nan-methodNon-aggregate functions for Column operations
isActiveisActive
isActive-methodisActive
isLocalisLocal
isLocal-methodisLocal
isNaNA set of operations working with SparkDataFrame columns
isnanNon-aggregate functions for Column operations
isnan-methodNon-aggregate functions for Column operations
isNotNullA set of operations working with SparkDataFrame columns
isNullA set of operations working with SparkDataFrame columns
IsotonicRegressionModel-classS4 class that represents an IsotonicRegressionModel
isStreamingisStreaming
isStreaming-methodisStreaming
- -

-- J --

- - - - - - - - - - -
joinJoin
join-methodJoin
jsonFileCreate a SparkDataFrame from a JSON file.
jsonFile.defaultCreate a SparkDataFrame from a JSON file.
- -

-- K --

- - - - - - - - - - -
KMeansModel-classS4 class that represents a KMeansModel
KSTest-classS4 class that represents an KSTest
kurtosisAggregate functions for Column operations
kurtosis-methodAggregate functions for Column operations
- -

-- L --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
lagWindow functions for Column operations
lag-methodWindow functions for Column operations
lastlast
last-methodlast
lastProgresslastProgress
lastProgress-methodlastProgress
last_dayDate time functions for Column operations
last_day-methodDate time functions for Column operations
LDAModel-classS4 class that represents an LDAModel
leadWindow functions for Column operations
lead-methodWindow functions for Column operations
leastNon-aggregate functions for Column operations
least-methodNon-aggregate functions for Column operations
lengthString functions for Column operations
length-methodString functions for Column operations
levenshteinString functions for Column operations
levenshtein-methodString functions for Column operations
likeA set of operations working with SparkDataFrame columns
limitLimit
limit-methodLimit
LinearSVCModel-classS4 class that represents an LinearSVCModel
listColumnsReturns a list of columns for the given table/view in the specified database
listDatabasesReturns a list of databases available
listFunctionsReturns a list of functions registered in the specified database
listTablesReturns a list of tables or views in the specified database
litNon-aggregate functions for Column operations
lit-methodNon-aggregate functions for Column operations
loadDFLoad a SparkDataFrame
loadDF.defaultLoad a SparkDataFrame
localCheckpointlocalCheckpoint
localCheckpoint-methodlocalCheckpoint
locateString functions for Column operations
locate-methodString functions for Column operations
logMath functions for Column operations
log-methodMath functions for Column operations
log10Math functions for Column operations
log10-methodMath functions for Column operations
log1pMath functions for Column operations
log1p-methodMath functions for Column operations
log2Math functions for Column operations
log2-methodMath functions for Column operations
LogisticRegressionModel-classS4 class that represents an LogisticRegressionModel
lowerString functions for Column operations
lower-methodString functions for Column operations
lpadString functions for Column operations
lpad-methodString functions for Column operations
ltrimString functions for Column operations
ltrim-methodString functions for Column operations
- -

-- M --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
map_from_arraysCollection functions for Column operations
map_from_arrays-methodCollection functions for Column operations
map_keysCollection functions for Column operations
map_keys-methodCollection functions for Column operations
map_valuesCollection functions for Column operations
map_values-methodCollection functions for Column operations
maxAggregate functions for Column operations
max-methodAggregate functions for Column operations
md5Miscellaneous functions for Column operations
md5-methodMiscellaneous functions for Column operations
meanAggregate functions for Column operations
mean-methodAggregate functions for Column operations
mergeMerges two data frames
merge-methodMerges two data frames
minAggregate functions for Column operations
min-methodAggregate functions for Column operations
minuteDate time functions for Column operations
minute-methodDate time functions for Column operations
monotonically_increasing_idNon-aggregate functions for Column operations
monotonically_increasing_id-methodNon-aggregate functions for Column operations
monthDate time functions for Column operations
month-methodDate time functions for Column operations
months_betweenDate time arithmetic functions for Column operations
months_between-methodDate time arithmetic functions for Column operations
MultilayerPerceptronClassificationModel-classS4 class that represents a MultilayerPerceptronClassificationModel
mutateMutate
mutate-methodMutate
- -

-- N --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nCount
n-methodCount
na.omitA set of SparkDataFrame functions working with NA values
na.omit-methodA set of SparkDataFrame functions working with NA values
NaiveBayesModel-classS4 class that represents a NaiveBayesModel
namesColumn Names of SparkDataFrame
names-methodColumn Names of SparkDataFrame
names<-Column Names of SparkDataFrame
names<--methodColumn Names of SparkDataFrame
nanvlNon-aggregate functions for Column operations
nanvl-methodNon-aggregate functions for Column operations
ncolReturns the number of columns in a SparkDataFrame
ncol-methodReturns the number of columns in a SparkDataFrame
negateNon-aggregate functions for Column operations
negate-methodNon-aggregate functions for Column operations
next_dayDate time arithmetic functions for Column operations
next_day-methodDate time arithmetic functions for Column operations
not!
not-method!
nrowReturns the number of rows in a SparkDataFrame
nrow-methodReturns the number of rows in a SparkDataFrame
ntileWindow functions for Column operations
ntile-methodWindow functions for Column operations
n_distinctAggregate functions for Column operations
n_distinct-methodAggregate functions for Column operations
- -

-- O --

- - - - - - - - - - - - - - - - -
orderByOrdering Columns in a WindowSpec
orderBy-methodArrange Rows by Variables
orderBy-methodOrdering Columns in a WindowSpec
otherwiseotherwise
otherwise-methodotherwise
overover
over-methodover
- -

-- P --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
parquetFileCreate a SparkDataFrame from a Parquet file.
parquetFile.defaultCreate a SparkDataFrame from a Parquet file.
partitionBypartitionBy
partitionBy-methodpartitionBy
percent_rankWindow functions for Column operations
percent_rank-methodWindow functions for Column operations
persistPersist
persist-methodPersist
pivotPivot a column of the GroupedData and perform the specified aggregation.
pivot-methodPivot a column of the GroupedData and perform the specified aggregation.
pmodMath functions for Column operations
pmod-methodMath functions for Column operations
posexplodeCollection functions for Column operations
posexplode-methodCollection functions for Column operations
posexplode_outerCollection functions for Column operations
posexplode_outer-methodCollection functions for Column operations
predictMakes predictions from a MLlib model
predict-methodAlternating Least Squares (ALS) for Collaborative Filtering
predict-methodBisecting K-Means Clustering Model
predict-methodDecision Tree Model for Regression and Classification
predict-methodFP-growth
predict-methodMultivariate Gaussian Mixture Model (GMM)
predict-methodGradient Boosted Tree Model for Regression and Classification
predict-methodGeneralized Linear Models
predict-methodIsotonic Regression Model
predict-methodK-Means Clustering Model
predict-methodLogistic Regression Model
predict-methodMultilayer Perceptron Classification Model
predict-methodNaive Bayes Models
predict-methodRandom Forest Model for Regression and Classification
predict-methodAccelerated Failure Time (AFT) Survival Regression Model
predict-methodLinear SVM Model
print.jobjPrint a JVM object reference.
print.structFieldPrint a Spark StructField.
print.structTypePrint a Spark StructType.
print.summary.DecisionTreeClassificationModelDecision Tree Model for Regression and Classification
print.summary.DecisionTreeRegressionModelDecision Tree Model for Regression and Classification
print.summary.GBTClassificationModelGradient Boosted Tree Model for Regression and Classification
print.summary.GBTRegressionModelGradient Boosted Tree Model for Regression and Classification
print.summary.GeneralizedLinearRegressionModelGeneralized Linear Models
print.summary.KSTest(One-Sample) Kolmogorov-Smirnov Test
print.summary.RandomForestClassificationModelRandom Forest Model for Regression and Classification
print.summary.RandomForestRegressionModelRandom Forest Model for Regression and Classification
printSchemaPrint Schema of a SparkDataFrame
printSchema-methodPrint Schema of a SparkDataFrame
- -

-- Q --

- - - - - - - - - - -
quarterDate time functions for Column operations
quarter-methodDate time functions for Column operations
queryNamequeryName
queryName-methodqueryName
- -

-- R --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
randNon-aggregate functions for Column operations
rand-methodNon-aggregate functions for Column operations
randnNon-aggregate functions for Column operations
randn-methodNon-aggregate functions for Column operations
RandomForestClassificationModel-classS4 class that represents a RandomForestClassificationModel
RandomForestRegressionModel-classS4 class that represents a RandomForestRegressionModel
randomSplitrandomSplit
randomSplit-methodrandomSplit
rangeBetweenrangeBetween
rangeBetween-methodrangeBetween
rankWindow functions for Column operations
rank-methodWindow functions for Column operations
rbindUnion two or more SparkDataFrames
rbind-methodUnion two or more SparkDataFrames
read.dfLoad a SparkDataFrame
read.df.defaultLoad a SparkDataFrame
read.jdbcCreate a SparkDataFrame representing the database table accessible via JDBC URL
read.jsonCreate a SparkDataFrame from a JSON file.
read.json.defaultCreate a SparkDataFrame from a JSON file.
read.mlLoad a fitted MLlib model from the input path.
read.orcCreate a SparkDataFrame from an ORC file.
read.parquetCreate a SparkDataFrame from a Parquet file.
read.parquet.defaultCreate a SparkDataFrame from a Parquet file.
read.streamLoad a streaming SparkDataFrame
read.textCreate a SparkDataFrame from a text file.
read.text.defaultCreate a SparkDataFrame from a text file.
recoverPartitionsRecovers all the partitions in the directory of a table and update the catalog
refreshByPathInvalidates and refreshes all the cached data and metadata for SparkDataFrame containing path
refreshTableInvalidates and refreshes all the cached data and metadata of the given table
regexp_extractString functions for Column operations
regexp_extract-methodString functions for Column operations
regexp_replaceString functions for Column operations
regexp_replace-methodString functions for Column operations
registerTempTable(Deprecated) Register Temporary Table
registerTempTable-method(Deprecated) Register Temporary Table
renamerename
rename-methodrename
repartitionRepartition
repartition-methodRepartition
repartitionByRangeRepartition by range
repartitionByRange-methodRepartition by range
repeat_stringString functions for Column operations
repeat_string-methodString functions for Column operations
reverseCollection functions for Column operations
reverse-methodCollection functions for Column operations
rintMath functions for Column operations
rint-methodMath functions for Column operations
rlikeA set of operations working with SparkDataFrame columns
rolluprollup
rollup-methodrollup
roundMath functions for Column operations
round-methodMath functions for Column operations
rowsBetweenrowsBetween
rowsBetween-methodrowsBetween
row_numberWindow functions for Column operations
row_number-methodWindow functions for Column operations
rpadString functions for Column operations
rpad-methodString functions for Column operations
rtrimString functions for Column operations
rtrim-methodString functions for Column operations
- -

-- S --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sampleSample
sample-methodSample
sampleByReturns a stratified sample without replacement
sampleBy-methodReturns a stratified sample without replacement
sample_fracSample
sample_frac-methodSample
saveAsParquetFileSave the contents of SparkDataFrame as a Parquet file, preserving the schema.
saveAsParquetFile-methodSave the contents of SparkDataFrame as a Parquet file, preserving the schema.
saveAsTableSave the contents of the SparkDataFrame to a data source as a table
saveAsTable-methodSave the contents of the SparkDataFrame to a data source as a table
saveDFSave the contents of SparkDataFrame to a data source.
saveDF-methodSave the contents of SparkDataFrame to a data source.
schemaGet schema object
schema-methodGet schema object
sdAggregate functions for Column operations
sd-methodAggregate functions for Column operations
secondDate time functions for Column operations
second-methodDate time functions for Column operations
selectSelect
select-methodSelect
selectExprSelectExpr
selectExpr-methodSelectExpr
setCheckpointDirSet checkpoint directory
setCurrentDatabaseSets the current default database
setJobDescriptionSet a human readable description of the current job.
setJobGroupAssigns a group ID to all the jobs started by this thread until the group ID is set to a different value or cleared.
setJobGroup.defaultAssigns a group ID to all the jobs started by this thread until the group ID is set to a different value or cleared.
setLocalPropertySet a local property that affects jobs submitted from this thread, such as the Spark fair scheduler pool.
setLogLevelSet new log level
sha1Miscellaneous functions for Column operations
sha1-methodMiscellaneous functions for Column operations
sha2Miscellaneous functions for Column operations
sha2-methodMiscellaneous functions for Column operations
shiftLeftMath functions for Column operations
shiftLeft-methodMath functions for Column operations
shiftRightMath functions for Column operations
shiftRight-methodMath functions for Column operations
shiftRightUnsignedMath functions for Column operations
shiftRightUnsigned-methodMath functions for Column operations
showshow
show-methodshow
showDFshowDF
showDF-methodshowDF
shuffleCollection functions for Column operations
shuffle-methodCollection functions for Column operations
signMath functions for Column operations
sign-methodMath functions for Column operations
signumMath functions for Column operations
signum-methodMath functions for Column operations
sinMath functions for Column operations
sin-methodMath functions for Column operations
sinhMath functions for Column operations
sinh-methodMath functions for Column operations
sizeCollection functions for Column operations
size-methodCollection functions for Column operations
skewnessAggregate functions for Column operations
skewness-methodAggregate functions for Column operations
sliceCollection functions for Column operations
slice-methodCollection functions for Column operations
sort_arrayCollection functions for Column operations
sort_array-methodCollection functions for Column operations
soundexString functions for Column operations
soundex-methodString functions for Column operations
spark.addFileAdd a file or directory to be downloaded with this Spark job on every node.
spark.alsAlternating Least Squares (ALS) for Collaborative Filtering
spark.als-methodAlternating Least Squares (ALS) for Collaborative Filtering
spark.associationRulesFP-growth
spark.associationRules-methodFP-growth
spark.bisectingKmeansBisecting K-Means Clustering Model
spark.bisectingKmeans-methodBisecting K-Means Clustering Model
spark.decisionTreeDecision Tree Model for Regression and Classification
spark.decisionTree-methodDecision Tree Model for Regression and Classification
spark.fpGrowthFP-growth
spark.fpGrowth-methodFP-growth
spark.freqItemsetsFP-growth
spark.freqItemsets-methodFP-growth
spark.gaussianMixtureMultivariate Gaussian Mixture Model (GMM)
spark.gaussianMixture-methodMultivariate Gaussian Mixture Model (GMM)
spark.gbtGradient Boosted Tree Model for Regression and Classification
spark.gbt-methodGradient Boosted Tree Model for Regression and Classification
spark.getSparkFilesGet the absolute path of a file added through spark.addFile.
spark.getSparkFilesRootDirectoryGet the root directory that contains files added through spark.addFile.
spark.glmGeneralized Linear Models
spark.glm-methodGeneralized Linear Models
spark.isoregIsotonic Regression Model
spark.isoreg-methodIsotonic Regression Model
spark.kmeansK-Means Clustering Model
spark.kmeans-methodK-Means Clustering Model
spark.kstest(One-Sample) Kolmogorov-Smirnov Test
spark.kstest-method(One-Sample) Kolmogorov-Smirnov Test
spark.lapplyRun a function over a list of elements, distributing the computations with Spark
spark.ldaLatent Dirichlet Allocation
spark.lda-methodLatent Dirichlet Allocation
spark.logitLogistic Regression Model
spark.logit-methodLogistic Regression Model
spark.mlpMultilayer Perceptron Classification Model
spark.mlp-methodMultilayer Perceptron Classification Model
spark.naiveBayesNaive Bayes Models
spark.naiveBayes-methodNaive Bayes Models
spark.perplexityLatent Dirichlet Allocation
spark.perplexity-methodLatent Dirichlet Allocation
spark.posteriorLatent Dirichlet Allocation
spark.posterior-methodLatent Dirichlet Allocation
spark.randomForestRandom Forest Model for Regression and Classification
spark.randomForest-methodRandom Forest Model for Regression and Classification
spark.survregAccelerated Failure Time (AFT) Survival Regression Model
spark.survreg-methodAccelerated Failure Time (AFT) Survival Regression Model
spark.svmLinearLinear SVM Model
spark.svmLinear-methodLinear SVM Model
SparkDataFrame-classS4 class that represents a SparkDataFrame
sparkR.callJMethodCall Java Methods
sparkR.callJStaticCall Static Java Methods
sparkR.confGet Runtime Config from the current active SparkSession
sparkR.init(Deprecated) Initialize a new Spark Context
sparkR.newJObjectCreate Java Objects
sparkR.sessionGet the existing SparkSession or initialize a new SparkSession.
sparkR.session.stopStop the Spark Session and Spark Context
sparkR.stopStop the Spark Session and Spark Context
sparkR.uiWebUrlGet the URL of the SparkUI instance for the current active SparkSession
sparkR.versionGet version of Spark on which this application is running
sparkRHive.init(Deprecated) Initialize a new HiveContext
sparkRSQL.init(Deprecated) Initialize a new SQLContext
spark_partition_idNon-aggregate functions for Column operations
spark_partition_id-methodNon-aggregate functions for Column operations
split_stringString functions for Column operations
split_string-methodString functions for Column operations
sqlSQL Query
sql.defaultSQL Query
sqrtMath functions for Column operations
sqrt-methodMath functions for Column operations
startsWithstartsWith
startsWith-methodstartsWith
statusstatus
status-methodstatus
stddevAggregate functions for Column operations
stddev-methodAggregate functions for Column operations
stddev_popAggregate functions for Column operations
stddev_pop-methodAggregate functions for Column operations
stddev_sampAggregate functions for Column operations
stddev_samp-methodAggregate functions for Column operations
stopQuerystopQuery
stopQuery-methodstopQuery
storageLevelStorageLevel
storageLevel-methodStorageLevel
strCompactly display the structure of a dataset
str-methodCompactly display the structure of a dataset
StreamingQuery-classS4 class that represents a StreamingQuery
structNon-aggregate functions for Column operations
struct-methodNon-aggregate functions for Column operations
structFieldstructField
structField.characterstructField
structField.jobjstructField
structTypestructType
structType.characterstructType
structType.jobjstructType
structType.structFieldstructType
subsetSubset
subset-methodSubset
substrsubstr
substr-methodsubstr
substring_indexString functions for Column operations
substring_index-methodString functions for Column operations
sumAggregate functions for Column operations
sum-methodAggregate functions for Column operations
sumDistinctAggregate functions for Column operations
sumDistinct-methodAggregate functions for Column operations
summarizesummarize
summarize-methodsummarize
summarysummary
summary-methodAlternating Least Squares (ALS) for Collaborative Filtering
summary-methodBisecting K-Means Clustering Model
summary-methodDecision Tree Model for Regression and Classification
summary-methodMultivariate Gaussian Mixture Model (GMM)
summary-methodGradient Boosted Tree Model for Regression and Classification
summary-methodGeneralized Linear Models
summary-methodIsotonic Regression Model
summary-methodK-Means Clustering Model
summary-method(One-Sample) Kolmogorov-Smirnov Test
summary-methodLatent Dirichlet Allocation
summary-methodLogistic Regression Model
summary-methodMultilayer Perceptron Classification Model
summary-methodNaive Bayes Models
summary-methodRandom Forest Model for Regression and Classification
summary-methodAccelerated Failure Time (AFT) Survival Regression Model
summary-methodLinear SVM Model
summary-methodsummary
- -

-- T --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
tableNamesTable Names
tableNames.defaultTable Names
tablesTables
tables.defaultTables
tableToDFCreate a SparkDataFrame from a SparkSQL table or view
takeTake the first NUM rows of a SparkDataFrame and return the results as a R data.frame
take-methodTake the first NUM rows of a SparkDataFrame and return the results as a R data.frame
tanMath functions for Column operations
tan-methodMath functions for Column operations
tanhMath functions for Column operations
tanh-methodMath functions for Column operations
toDegreesMath functions for Column operations
toDegrees-methodMath functions for Column operations
toJSONtoJSON
toJSON-methodtoJSON
toRadiansMath functions for Column operations
toRadians-methodMath functions for Column operations
to_dateDate time functions for Column operations
to_date-methodDate time functions for Column operations
to_jsonCollection functions for Column operations
to_json-methodCollection functions for Column operations
to_timestampDate time functions for Column operations
to_timestamp-methodDate time functions for Column operations
to_utc_timestampDate time arithmetic functions for Column operations
to_utc_timestamp-methodDate time arithmetic functions for Column operations
transformMutate
transform-methodMutate
translateString functions for Column operations
translate-methodString functions for Column operations
trimString functions for Column operations
trim-methodString functions for Column operations
truncDate time functions for Column operations
trunc-methodDate time functions for Column operations
- -

-- U --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
unbase64String functions for Column operations
unbase64-methodString functions for Column operations
uncacheTableUncache Table
uncacheTable.defaultUncache Table
unhexMath functions for Column operations
unhex-methodMath functions for Column operations
unionReturn a new SparkDataFrame containing the union of rows
union-methodReturn a new SparkDataFrame containing the union of rows
unionAllReturn a new SparkDataFrame containing the union of rows
unionAll-methodReturn a new SparkDataFrame containing the union of rows
unionByNameReturn a new SparkDataFrame containing the union of rows, matched by column names
unionByName-methodReturn a new SparkDataFrame containing the union of rows, matched by column names
uniqueDistinct
unique-methodDistinct
unix_timestampDate time functions for Column operations
unix_timestamp-methodDate time functions for Column operations
unpersistUnpersist
unpersist-methodUnpersist
upperString functions for Column operations
upper-methodString functions for Column operations
- -

-- V --

- - - - - - - - - - - - - - - - - - -
varAggregate functions for Column operations
var-methodAggregate functions for Column operations
varianceAggregate functions for Column operations
variance-methodAggregate functions for Column operations
var_popAggregate functions for Column operations
var_pop-methodAggregate functions for Column operations
var_sampAggregate functions for Column operations
var_samp-methodAggregate functions for Column operations
- -

-- W --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
weekofyearDate time functions for Column operations
weekofyear-methodDate time functions for Column operations
whenNon-aggregate functions for Column operations
when-methodNon-aggregate functions for Column operations
whereFilter
where-methodFilter
windowDate time functions for Column operations
window-methodDate time functions for Column operations
windowOrderBywindowOrderBy
windowOrderBy-methodwindowOrderBy
windowPartitionBywindowPartitionBy
windowPartitionBy-methodwindowPartitionBy
WindowSpec-classS4 class that represents a WindowSpec
withEvaluate a R expression in an environment constructed from a SparkDataFrame
with-methodEvaluate a R expression in an environment constructed from a SparkDataFrame
withColumnWithColumn
withColumn-methodWithColumn
withColumnRenamedrename
withColumnRenamed-methodrename
withWatermarkwithWatermark
withWatermark-methodwithWatermark
write.dfSave the contents of SparkDataFrame to a data source.
write.df-methodSave the contents of SparkDataFrame to a data source.
write.jdbcSave the content of SparkDataFrame to an external database table via JDBC.
write.jdbc-methodSave the content of SparkDataFrame to an external database table via JDBC.
write.jsonSave the contents of SparkDataFrame as a JSON file
write.json-methodSave the contents of SparkDataFrame as a JSON file
write.mlSaves the MLlib model to the input path
write.ml-methodAlternating Least Squares (ALS) for Collaborative Filtering
write.ml-methodBisecting K-Means Clustering Model
write.ml-methodDecision Tree Model for Regression and Classification
write.ml-methodFP-growth
write.ml-methodMultivariate Gaussian Mixture Model (GMM)
write.ml-methodGradient Boosted Tree Model for Regression and Classification
write.ml-methodGeneralized Linear Models
write.ml-methodIsotonic Regression Model
write.ml-methodK-Means Clustering Model
write.ml-methodLatent Dirichlet Allocation
write.ml-methodLogistic Regression Model
write.ml-methodMultilayer Perceptron Classification Model
write.ml-methodNaive Bayes Models
write.ml-methodRandom Forest Model for Regression and Classification
write.ml-methodAccelerated Failure Time (AFT) Survival Regression Model
write.ml-methodLinear SVM Model
write.orcSave the contents of SparkDataFrame as an ORC file, preserving the schema.
write.orc-methodSave the contents of SparkDataFrame as an ORC file, preserving the schema.
write.parquetSave the contents of SparkDataFrame as a Parquet file, preserving the schema.
write.parquet-methodSave the contents of SparkDataFrame as a Parquet file, preserving the schema.
write.streamWrite the streaming SparkDataFrame to a data source.
write.stream-methodWrite the streaming SparkDataFrame to a data source.
write.textSave the content of SparkDataFrame in a text file at the specified path.
write.text-methodSave the content of SparkDataFrame in a text file at the specified path.
- -

-- Y --

- - - - - - -
yearDate time functions for Column operations
year-methodDate time functions for Column operations
- -

-- misc --

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
!-method!
$Select
$-methodSelect
$<-Select
$<--methodSelect
%<=>%%<=>%
%<=>%-method%<=>%
%in%Match a column with given values.
%in%-methodMatch a column with given values.
[Subset
[-methodSubset
[[Subset
[[-methodSubset
[[<-Subset
[[<--methodSubset
- diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/html/R.css b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/html/R.css deleted file mode 100644 index f10f5ea..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/html/R.css +++ /dev/null @@ -1,97 +0,0 @@ -body { - background: white; - color: black; -} - -a:link { - background: white; - color: blue; -} - -a:visited { - background: white; - color: rgb(50%, 0%, 50%); -} - -h1 { - background: white; - color: rgb(55%, 55%, 55%); - font-family: monospace; - font-size: x-large; - text-align: center; -} - -h2 { - background: white; - color: rgb(40%, 40%, 40%); - font-family: monospace; - font-size: large; - text-align: center; -} - -h3 { - background: white; - color: rgb(40%, 40%, 40%); - font-family: monospace; - font-size: large; -} - -h4 { - background: white; - color: rgb(40%, 40%, 40%); - font-family: monospace; - font-style: italic; - font-size: large; -} - -h5 { - background: white; - color: rgb(40%, 40%, 40%); - font-family: monospace; -} - -h6 { - background: white; - color: rgb(40%, 40%, 40%); - font-family: monospace; - font-style: italic; -} - -img.toplogo { - width: 4em; - vertical-align: middle; -} - -img.arrow { - width: 30px; - height: 30px; - border: 0; -} - -span.acronym { - font-size: small; -} - -span.env { - font-family: monospace; -} - -span.file { - font-family: monospace; -} - -span.option{ - font-family: monospace; -} - -span.pkg { - font-weight: bold; -} - -span.samp{ - font-family: monospace; -} - -div.vignettes a:hover { - background: rgb(85%, 85%, 85%); -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/profile/general.R b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/profile/general.R deleted file mode 100644 index 8c75c19..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/profile/general.R +++ /dev/null @@ -1,23 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -.First <- function() { - packageDir <- Sys.getenv("SPARKR_PACKAGE_DIR") - dirs <- strsplit(packageDir, ",")[[1]] - .libPaths(c(dirs, .libPaths())) - Sys.setenv(NOAWT = 1) -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/profile/shell.R b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/profile/shell.R deleted file mode 100644 index 8a8111a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/profile/shell.R +++ /dev/null @@ -1,47 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -.First <- function() { - home <- Sys.getenv("SPARK_HOME") - .libPaths(c(file.path(home, "R", "lib"), .libPaths())) - Sys.setenv(NOAWT = 1) - - # Make sure SparkR package is the last loaded one - old <- getOption("defaultPackages") - options(defaultPackages = c(old, "SparkR")) - - spark <- SparkR::sparkR.session() - assign("spark", spark, envir = .GlobalEnv) - sc <- SparkR:::callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", spark) - assign("sc", sc, envir = .GlobalEnv) - sparkVer <- SparkR:::callJMethod(sc, "version") - cat("\n Welcome to") - cat("\n") - cat(" ____ __", "\n") - cat(" / __/__ ___ _____/ /__", "\n") - cat(" _\\ \\/ _ \\/ _ `/ __/ '_/", "\n") - cat(" /___/ .__/\\_,_/_/ /_/\\_\\") - if (nchar(sparkVer) == 0) { - cat("\n") - } else { - cat(" version ", sparkVer, "\n") - } - cat(" /_/", "\n") - cat("\n") - - cat("\n SparkSession available as 'spark'.\n") -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/tests/testthat/test_basic.R b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/tests/testthat/test_basic.R deleted file mode 100644 index 80df3d8..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/tests/testthat/test_basic.R +++ /dev/null @@ -1,100 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -context("basic tests for CRAN") - -test_that("create DataFrame from list or data.frame", { - tryCatch(checkJavaVersion(), - error = function(e) { skip("error on Java check") }, - warning = function(e) { skip("warning on Java check") }) - - sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, - sparkConfig = sparkRTestConfig) - - i <- 4 - df <- createDataFrame(data.frame(dummy = 1:i)) - expect_equal(count(df), i) - - l <- list(list(a = 1, b = 2), list(a = 3, b = 4)) - df <- createDataFrame(l) - expect_equal(columns(df), c("a", "b")) - - a <- 1:3 - b <- c("a", "b", "c") - ldf <- data.frame(a, b) - df <- createDataFrame(ldf) - expect_equal(columns(df), c("a", "b")) - expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) - expect_equal(count(df), 3) - ldf2 <- collect(df) - expect_equal(ldf$a, ldf2$a) - - mtcarsdf <- createDataFrame(mtcars) - expect_equivalent(collect(mtcarsdf), mtcars) - - bytes <- as.raw(c(1, 2, 3)) - df <- createDataFrame(list(list(bytes))) - expect_equal(collect(df)[[1]][[1]], bytes) - - sparkR.session.stop() -}) - -test_that("spark.glm and predict", { - tryCatch(checkJavaVersion(), - error = function(e) { skip("error on Java check") }, - warning = function(e) { skip("warning on Java check") }) - - sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE, - sparkConfig = sparkRTestConfig) - - training <- suppressWarnings(createDataFrame(iris)) - # gaussian family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - - # Gamma family - x <- runif(100, -1, 1) - y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10) - df <- as.DataFrame(as.data.frame(list(x = x, y = y))) - model <- glm(y ~ x, family = Gamma, df) - out <- capture.output(print(summary(model))) - expect_true(any(grepl("Dispersion parameter for gamma family", out))) - - # tweedie family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = "tweedie", var.power = 1.2, link.power = 0.0) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - - # manual calculation of the R predicted values to avoid dependence on statmod - #' library(statmod) - #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, - #' family = tweedie(var.power = 1.2, link.power = 0.0)) - #' print(coef(rModel)) - - rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174) - rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, - data = iris) %*% rCoef)) - expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals) - - sparkR.session.stop() -}) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/worker/daemon.R b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/worker/daemon.R deleted file mode 100644 index fb9db63..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/worker/daemon.R +++ /dev/null @@ -1,102 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Worker daemon - -rLibDir <- Sys.getenv("SPARKR_RLIBDIR") -connectionTimeout <- as.integer(Sys.getenv("SPARKR_BACKEND_CONNECTION_TIMEOUT", "6000")) -dirs <- strsplit(rLibDir, ",")[[1]] -script <- file.path(dirs[[1]], "SparkR", "worker", "worker.R") - -# preload SparkR package, speedup worker -.libPaths(c(dirs, .libPaths())) -suppressPackageStartupMessages(library(SparkR)) - -port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) -inputCon <- socketConnection( - port = port, open = "wb", blocking = TRUE, timeout = connectionTimeout) - -SparkR:::doServerAuth(inputCon, Sys.getenv("SPARKR_WORKER_SECRET")) - -# Waits indefinitely for a socket connecion by default. -selectTimeout <- NULL - -while (TRUE) { - ready <- socketSelect(list(inputCon), timeout = selectTimeout) - - # Note that the children should be terminated in the parent. If each child terminates - # itself, it appears that the resource is not released properly, that causes an unexpected - # termination of this daemon due to, for example, running out of file descriptors - # (see SPARK-21093). Therefore, the current implementation tries to retrieve children - # that are exited (but not terminated) and then sends a kill signal to terminate them properly - # in the parent. - # - # There are two paths that it attempts to send a signal to terminate the children in the parent. - # - # 1. Every second if any socket connection is not available and if there are child workers - # running. - # 2. Right after a socket connection is available. - # - # In other words, the parent attempts to send the signal to the children every second if - # any worker is running or right before launching other worker children from the following - # new socket connection. - - # The process IDs of exited children are returned below. - children <- parallel:::selectChildren(timeout = 0) - - if (is.integer(children)) { - lapply(children, function(child) { - # This should be the PIDs of exited children. Otherwise, this returns raw bytes if any data - # was sent from this child. In this case, we discard it. - pid <- parallel:::readChild(child) - if (is.integer(pid)) { - # This checks if the data from this child is the same pid of this selected child. - if (child == pid) { - # If so, we terminate this child. - tools::pskill(child, tools::SIGUSR1) - } - } - }) - } else if (is.null(children)) { - # If it is NULL, there are no children. Waits indefinitely for a socket connecion. - selectTimeout <- NULL - } - - if (ready) { - port <- SparkR:::readInt(inputCon) - # There is a small chance that it could be interrupted by signal, retry one time - if (length(port) == 0) { - port <- SparkR:::readInt(inputCon) - if (length(port) == 0) { - cat("quitting daemon\n") - quit(save = "no") - } - } - p <- parallel:::mcfork() - if (inherits(p, "masterProcess")) { - # Reach here because this is a child process. 
- close(inputCon) - Sys.setenv(SPARKR_WORKER_PORT = port) - try(source(script)) - # Note that this mcexit does not fully terminate this child. - parallel:::mcexit(0L) - } else { - # Forking succeeded and we need to check if they finished their jobs every second. - selectTimeout <- 1 - } - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/worker/worker.R b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/worker/worker.R deleted file mode 100644 index c2adf61..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/SparkR/worker/worker.R +++ /dev/null @@ -1,267 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Worker class - -# Get current system time -currentTimeSecs <- function() { - as.numeric(Sys.time()) -} - -# Get elapsed time -elapsedSecs <- function() { - proc.time()[3] -} - -compute <- function(mode, partition, serializer, deserializer, key, - colNames, computeFunc, inputData) { - if (mode > 0) { - if (deserializer == "row") { - # Transform the list of rows into a data.frame - # Note that the optional argument stringsAsFactors for rbind is - # available since R 3.2.4. So we set the global option here. 
- oldOpt <- getOption("stringsAsFactors") - options(stringsAsFactors = FALSE) - - # Handle binary data types - if ("raw" %in% sapply(inputData[[1]], class)) { - inputData <- SparkR:::rbindRaws(inputData) - } else { - inputData <- do.call(rbind.data.frame, inputData) - } - - options(stringsAsFactors = oldOpt) - - names(inputData) <- colNames - } else { - # Check to see if inputData is a valid data.frame - stopifnot(deserializer == "byte") - stopifnot(class(inputData) == "data.frame") - } - - if (mode == 2) { - output <- computeFunc(key, inputData) - } else { - output <- computeFunc(inputData) - } - if (serializer == "row") { - # Transform the result data.frame back to a list of rows - output <- split(output, seq(nrow(output))) - } else { - # Serialize the output to a byte array - stopifnot(serializer == "byte") - } - } else { - output <- computeFunc(partition, inputData) - } - return(output) -} - -outputResult <- function(serializer, output, outputCon) { - if (serializer == "byte") { - SparkR:::writeRawSerialize(outputCon, output) - } else if (serializer == "row") { - SparkR:::writeRowSerialize(outputCon, output) - } else { - # write lines one-by-one with flag - lapply(output, function(line) SparkR:::writeString(outputCon, line)) - } -} - -# Constants -specialLengths <- list(END_OF_STERAM = 0L, TIMING_DATA = -1L) - -# Timing R process boot -bootTime <- currentTimeSecs() -bootElap <- elapsedSecs() - -rLibDir <- Sys.getenv("SPARKR_RLIBDIR") -connectionTimeout <- as.integer(Sys.getenv("SPARKR_BACKEND_CONNECTION_TIMEOUT", "6000")) -dirs <- strsplit(rLibDir, ",")[[1]] -# Set libPaths to include SparkR package as loadNamespace needs this -# TODO: Figure out if we can avoid this by not loading any objects that require -# SparkR namespace -.libPaths(c(dirs, .libPaths())) -suppressPackageStartupMessages(library(SparkR)) - -port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT")) -inputCon <- socketConnection( - port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout) -SparkR:::doServerAuth(inputCon, Sys.getenv("SPARKR_WORKER_SECRET")) - -outputCon <- socketConnection( - port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout) -SparkR:::doServerAuth(outputCon, Sys.getenv("SPARKR_WORKER_SECRET")) - -# read the index of the current partition inside the RDD -partition <- SparkR:::readInt(inputCon) - -deserializer <- SparkR:::readString(inputCon) -serializer <- SparkR:::readString(inputCon) - -# Include packages as required -packageNames <- unserialize(SparkR:::readRaw(inputCon)) -for (pkg in packageNames) { - suppressPackageStartupMessages(library(as.character(pkg), character.only = TRUE)) -} - -# read function dependencies -funcLen <- SparkR:::readInt(inputCon) -computeFunc <- unserialize(SparkR:::readRawLen(inputCon, funcLen)) -env <- environment(computeFunc) -parent.env(env) <- .GlobalEnv # Attach under global environment. - -# Timing init envs for computing -initElap <- elapsedSecs() - -# Read and set broadcast variables -numBroadcastVars <- SparkR:::readInt(inputCon) -if (numBroadcastVars > 0) { - for (bcast in seq(1:numBroadcastVars)) { - bcastId <- SparkR:::readInt(inputCon) - value <- unserialize(SparkR:::readRaw(inputCon)) - SparkR:::setBroadcastValue(bcastId, value) - } -} - -# Timing broadcast -broadcastElap <- elapsedSecs() -# Initial input timing -inputElap <- broadcastElap - -# If -1: read as normal RDD; if >= 0, treat as pairwise RDD and treat the int -# as number of partitions to create. 
-numPartitions <- SparkR:::readInt(inputCon) - -# 0 - RDD mode, 1 - dapply mode, 2 - gapply mode -mode <- SparkR:::readInt(inputCon) - -if (mode > 0) { - colNames <- SparkR:::readObject(inputCon) -} - -isEmpty <- SparkR:::readInt(inputCon) -computeInputElapsDiff <- 0 -outputComputeElapsDiff <- 0 - -if (isEmpty != 0) { - if (numPartitions == -1) { - if (deserializer == "byte") { - # Now read as many characters as described in funcLen - data <- SparkR:::readDeserialize(inputCon) - } else if (deserializer == "string") { - data <- as.list(readLines(inputCon)) - } else if (deserializer == "row" && mode == 2) { - dataWithKeys <- SparkR:::readMultipleObjectsWithKeys(inputCon) - keys <- dataWithKeys$keys - data <- dataWithKeys$data - } else if (deserializer == "row") { - data <- SparkR:::readMultipleObjects(inputCon) - } - - # Timing reading input data for execution - inputElap <- elapsedSecs() - if (mode > 0) { - if (mode == 1) { - output <- compute(mode, partition, serializer, deserializer, NULL, - colNames, computeFunc, data) - } else { - # gapply mode - for (i in 1:length(data)) { - # Timing reading input data for execution - inputElap <- elapsedSecs() - output <- compute(mode, partition, serializer, deserializer, keys[[i]], - colNames, computeFunc, data[[i]]) - computeElap <- elapsedSecs() - outputResult(serializer, output, outputCon) - outputElap <- elapsedSecs() - computeInputElapsDiff <- computeInputElapsDiff + (computeElap - inputElap) - outputComputeElapsDiff <- outputComputeElapsDiff + (outputElap - computeElap) - } - } - } else { - output <- compute(mode, partition, serializer, deserializer, NULL, - colNames, computeFunc, data) - } - if (mode != 2) { - # Not a gapply mode - computeElap <- elapsedSecs() - outputResult(serializer, output, outputCon) - outputElap <- elapsedSecs() - computeInputElapsDiff <- computeElap - inputElap - outputComputeElapsDiff <- outputElap - computeElap - } - } else { - if (deserializer == "byte") { - # Now read as many characters as described in funcLen - data <- SparkR:::readDeserialize(inputCon) - } else if (deserializer == "string") { - data <- readLines(inputCon) - } else if (deserializer == "row") { - data <- SparkR:::readMultipleObjects(inputCon) - } - # Timing reading input data for execution - inputElap <- elapsedSecs() - - res <- new.env() - - # Step 1: hash the data to an environment - hashTupleToEnvir <- function(tuple) { - # NOTE: execFunction is the hash function here - hashVal <- computeFunc(tuple[[1]]) - bucket <- as.character(hashVal %% numPartitions) - acc <- res[[bucket]] - # Create a new accumulator - if (is.null(acc)) { - acc <- SparkR:::initAccumulator() - } - SparkR:::addItemToAccumulator(acc, tuple) - res[[bucket]] <- acc - } - invisible(lapply(data, hashTupleToEnvir)) - # Timing computing - computeElap <- elapsedSecs() - - # Step 2: write out all of the environment as key-value pairs. 
- for (name in ls(res)) { - SparkR:::writeInt(outputCon, 2L) - SparkR:::writeInt(outputCon, as.integer(name)) - # Truncate the accumulator list to the number of elements we have - length(res[[name]]$data) <- res[[name]]$counter - SparkR:::writeRawSerialize(outputCon, res[[name]]$data) - } - # Timing output - outputElap <- elapsedSecs() - computeInputElapsDiff <- computeElap - inputElap - outputComputeElapsDiff <- outputElap - computeElap - } -} - -# Report timing -SparkR:::writeInt(outputCon, specialLengths$TIMING_DATA) -SparkR:::writeDouble(outputCon, bootTime) -SparkR:::writeDouble(outputCon, initElap - bootElap) # init -SparkR:::writeDouble(outputCon, broadcastElap - initElap) # broadcast -SparkR:::writeDouble(outputCon, inputElap - broadcastElap) # input -SparkR:::writeDouble(outputCon, computeInputElapsDiff) # compute -SparkR:::writeDouble(outputCon, outputComputeElapsDiff) # output - -# End of output -SparkR:::writeInt(outputCon, specialLengths$END_OF_STERAM) - -close(outputCon) -close(inputCon) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/sparkr.zip b/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/sparkr.zip deleted file mode 100644 index 9e54521..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/R/lib/sparkr.zip and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/README.md b/scripts/spark-2.4.3-bin-hadoop2.7/README.md deleted file mode 100644 index fd8c7f6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/README.md +++ /dev/null @@ -1,105 +0,0 @@ -# Apache Spark - -Spark is a fast and general cluster computing system for Big Data. It provides -high-level APIs in Scala, Java, Python, and R, and an optimized engine that -supports general computation graphs for data analysis. It also supports a -rich set of higher-level tools including Spark SQL for SQL and DataFrames, -MLlib for machine learning, GraphX for graph processing, -and Spark Streaming for stream processing. - - - - -## Online Documentation - -You can find the latest Spark documentation, including a programming -guide, on the [project web page](http://spark.apache.org/documentation.html). -This README file only contains basic setup instructions. - -## Building Spark - -Spark is built using [Apache Maven](http://maven.apache.org/). -To build Spark and its example programs, run: - - build/mvn -DskipTests clean package - -(You do not need to do this if you downloaded a pre-built package.) - -You can build Spark using more than one thread by using the -T option with Maven, see ["Parallel builds in Maven 3"](https://cwiki.apache.org/confluence/display/MAVEN/Parallel+builds+in+Maven+3). -More detailed documentation is available from the project site, at -["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html). - -For general development tips, including info on developing Spark using an IDE, see ["Useful Developer Tools"](http://spark.apache.org/developer-tools.html). - -## Interactive Scala Shell - -The easiest way to start using Spark is through the Scala shell: - - ./bin/spark-shell - -Try the following command, which should return 1000: - - scala> sc.parallelize(1 to 1000).count() - -## Interactive Python Shell - -Alternatively, if you prefer Python, you can use the Python shell: - - ./bin/pyspark - -And run the following command, which should also return 1000: - - >>> sc.parallelize(range(1000)).count() - -## Example Programs - -Spark also comes with several sample programs in the `examples` directory. -To run one of them, use `./bin/run-example [params]`. 
For example: - - ./bin/run-example SparkPi - -will run the Pi example locally. - -You can set the MASTER environment variable when running examples to submit -examples to a cluster. This can be a mesos:// or spark:// URL, -"yarn" to run on YARN, and "local" to run -locally with one thread, or "local[N]" to run locally with N threads. You -can also use an abbreviated class name if the class is in the `examples` -package. For instance: - - MASTER=spark://host:7077 ./bin/run-example SparkPi - -Many of the example programs print usage help if no params are given. - -## Running Tests - -Testing first requires [building Spark](#building-spark). Once Spark is built, tests -can be run using: - - ./dev/run-tests - -Please see the guidance on how to -[run tests for a module, or individual tests](http://spark.apache.org/developer-tools.html#individual-tests). - -There is also a Kubernetes integration test, see resource-managers/kubernetes/integration-tests/README.md - -## A Note About Hadoop Versions - -Spark uses the Hadoop core library to talk to HDFS and other Hadoop-supported -storage systems. Because the protocols have changed in different versions of -Hadoop, you must build Spark against the same version that your cluster runs. - -Please refer to the build documentation at -["Specifying the Hadoop Version and Enabling YARN"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version-and-enabling-yarn) -for detailed guidance on building for a particular distribution of Hadoop, including -building for particular Hive and Hive Thriftserver distributions. - -## Configuration - -Please refer to the [Configuration Guide](http://spark.apache.org/docs/latest/configuration.html) -in the online documentation for an overview on how to configure Spark. - -## Contributing - -Please review the [Contribution to Spark guide](http://spark.apache.org/contributing.html) -for information on how to get started contributing to the project. diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/RELEASE b/scripts/spark-2.4.3-bin-hadoop2.7/RELEASE deleted file mode 100644 index e33cf4f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/RELEASE +++ /dev/null @@ -1,2 +0,0 @@ -Spark 2.4.3 built for Hadoop 2.7.3 -Build flags: -B -Pmesos -Pyarn -Pkubernetes -Pflume -Psparkr -Pkafka-0-8 -Phadoop-2.7 -Phive -Phive-thriftserver -DzincPort=3036 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/beeline b/scripts/spark-2.4.3-bin-hadoop2.7/bin/beeline deleted file mode 100755 index 0585346..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/beeline +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# -# Shell script for starting BeeLine - -# Enter posix mode for bash -set -o posix - -# Figure out if SPARK_HOME is set -if [ -z "${SPARK_HOME}" ]; then - source "$(dirname "$0")"/find-spark-home -fi - -CLASS="org.apache.hive.beeline.BeeLine" -exec "${SPARK_HOME}/bin/spark-class" $CLASS "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/beeline.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/beeline.cmd deleted file mode 100644 index 902d143..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/beeline.cmd +++ /dev/null @@ -1,22 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem The outermost quotes are used to prevent Windows command line parse error -rem when there are some quotes in parameters, see SPARK-21877. -cmd /V /E /C ""%~dp0spark-class.cmd" org.apache.hive.beeline.BeeLine %*" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/docker-image-tool.sh b/scripts/spark-2.4.3-bin-hadoop2.7/bin/docker-image-tool.sh deleted file mode 100755 index 5e8eaff..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/docker-image-tool.sh +++ /dev/null @@ -1,183 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This script builds and pushes docker images when run from a release of Spark -# with Kubernetes support. - -function error { - echo "$@" 1>&2 - exit 1 -} - -if [ -z "${SPARK_HOME}" ]; then - SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi -. "${SPARK_HOME}/bin/load-spark-env.sh" - -function image_ref { - local image="$1" - local add_repo="${2:-1}" - if [ $add_repo = 1 ] && [ -n "$REPO" ]; then - image="$REPO/$image" - fi - if [ -n "$TAG" ]; then - image="$image:$TAG" - fi - echo "$image" -} - -function build { - local BUILD_ARGS - local IMG_PATH - - if [ ! -f "$SPARK_HOME/RELEASE" ]; then - # Set image build arguments accordingly if this is a source repo and not a distribution archive. 
- IMG_PATH=resource-managers/kubernetes/docker/src/main/dockerfiles - BUILD_ARGS=( - ${BUILD_PARAMS} - --build-arg - img_path=$IMG_PATH - --build-arg - spark_jars=assembly/target/scala-$SPARK_SCALA_VERSION/jars - --build-arg - k8s_tests=resource-managers/kubernetes/integration-tests/tests - ) - else - # Not passed as an argument to docker, but used to validate the Spark directory. - IMG_PATH="kubernetes/dockerfiles" - BUILD_ARGS=(${BUILD_PARAMS}) - fi - - if [ ! -d "$IMG_PATH" ]; then - error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark." - fi - local BINDING_BUILD_ARGS=( - ${BUILD_PARAMS} - --build-arg - base_img=$(image_ref spark) - ) - local BASEDOCKERFILE=${BASEDOCKERFILE:-"$IMG_PATH/spark/Dockerfile"} - local PYDOCKERFILE=${PYDOCKERFILE:-"$IMG_PATH/spark/bindings/python/Dockerfile"} - local RDOCKERFILE=${RDOCKERFILE:-"$IMG_PATH/spark/bindings/R/Dockerfile"} - - docker build $NOCACHEARG "${BUILD_ARGS[@]}" \ - -t $(image_ref spark) \ - -f "$BASEDOCKERFILE" . - - docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \ - -t $(image_ref spark-py) \ - -f "$PYDOCKERFILE" . - - docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \ - -t $(image_ref spark-r) \ - -f "$RDOCKERFILE" . -} - -function push { - docker push "$(image_ref spark)" - docker push "$(image_ref spark-py)" - docker push "$(image_ref spark-r)" -} - -function usage { - cat </dev/null; then - error "Cannot find minikube." - fi - eval $(minikube docker-env) - ;; - esac -done - -case "${@: -1}" in - build) - build - ;; - push) - if [ -z "$REPO" ]; then - usage - exit 1 - fi - push - ;; - *) - usage - exit 1 - ;; -esac diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/find-spark-home b/scripts/spark-2.4.3-bin-hadoop2.7/bin/find-spark-home deleted file mode 100755 index 617dbaa..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/find-spark-home +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Attempts to find a proper value for SPARK_HOME. Should be included using "source" directive. - -FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "$(dirname "$0")"; pwd)/find_spark_home.py" - -# Short circuit if the user already has this set. -if [ ! -z "${SPARK_HOME}" ]; then - exit 0 -elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then - # If we are not in the same directory as find_spark_home.py we are not pip installed so we don't - # need to search the different Python directories for a Spark installation. - # Note only that, if the user has pip installed PySpark but is directly calling pyspark-shell or - # spark-submit in another directory we want to use that version of PySpark rather than the - # pip installed version of PySpark. 
- export SPARK_HOME="$(cd "$(dirname "$0")"/..; pwd)" -else - # We are pip installed, use the Python script to resolve a reasonable SPARK_HOME - # Default to standard python interpreter unless told otherwise - if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then - PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}" - fi - export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT") -fi diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/find-spark-home.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/find-spark-home.cmd deleted file mode 100644 index 6f5009c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/find-spark-home.cmd +++ /dev/null @@ -1,60 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem Path to Python script finding SPARK_HOME -set FIND_SPARK_HOME_PYTHON_SCRIPT=%~dp0find_spark_home.py - -rem Default to standard python interpreter unless told otherwise -set PYTHON_RUNNER=python -rem If PYSPARK_DRIVER_PYTHON is set, it overwrites the python version -if not "x%PYSPARK_DRIVER_PYTHON%"=="x" ( - set PYTHON_RUNNER=%PYSPARK_DRIVER_PYTHON% -) -rem If PYSPARK_PYTHON is set, it overwrites the python version -if not "x%PYSPARK_PYTHON%"=="x" ( - set PYTHON_RUNNER=%PYSPARK_PYTHON% -) - -rem If there is python installed, trying to use the root dir as SPARK_HOME -where %PYTHON_RUNNER% > nul 2>&1 -if %ERRORLEVEL% neq 0 ( - if not exist %PYTHON_RUNNER% ( - if "x%SPARK_HOME%"=="x" ( - echo Missing Python executable '%PYTHON_RUNNER%', defaulting to '%~dp0..' for SPARK_HOME ^ -environment variable. Please install Python or specify the correct Python executable in ^ -PYSPARK_DRIVER_PYTHON or PYSPARK_PYTHON environment variable to detect SPARK_HOME safely. - set SPARK_HOME=%~dp0.. - ) - ) -) - -rem Only attempt to find SPARK_HOME if it is not set. -if "x%SPARK_HOME%"=="x" ( - if not exist "%FIND_SPARK_HOME_PYTHON_SCRIPT%" ( - rem If we are not in the same directory as find_spark_home.py we are not pip installed so we don't - rem need to search the different Python directories for a Spark installation. - rem Note only that, if the user has pip installed PySpark but is directly calling pyspark-shell or - rem spark-submit in another directory we want to use that version of PySpark rather than the - rem pip installed version of PySpark. - set SPARK_HOME=%~dp0.. 
- ) else ( - rem We are pip installed, use the Python script to resolve a reasonable SPARK_HOME - for /f "delims=" %%i in ('%PYTHON_RUNNER% %FIND_SPARK_HOME_PYTHON_SCRIPT%') do set SPARK_HOME=%%i - ) -) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/load-spark-env.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/load-spark-env.cmd deleted file mode 100644 index 46c7a93..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/load-spark-env.cmd +++ /dev/null @@ -1,57 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem This script loads spark-env.cmd if it exists, and ensures it is only loaded once. -rem spark-env.cmd is loaded from SPARK_CONF_DIR if set, or within the current directory's -rem conf\ subdirectory. - -if [%SPARK_ENV_LOADED%] == [] ( - set SPARK_ENV_LOADED=1 - - if [%SPARK_CONF_DIR%] == [] ( - set SPARK_CONF_DIR=%~dp0..\conf - ) - - call :LoadSparkEnv -) - -rem Setting SPARK_SCALA_VERSION if not already set. - -set ASSEMBLY_DIR2="%SPARK_HOME%\assembly\target\scala-2.11" -set ASSEMBLY_DIR1="%SPARK_HOME%\assembly\target\scala-2.12" - -if [%SPARK_SCALA_VERSION%] == [] ( - - if exist %ASSEMBLY_DIR2% if exist %ASSEMBLY_DIR1% ( - echo "Presence of build for multiple Scala versions detected." - echo "Either clean one of them or, set SPARK_SCALA_VERSION in spark-env.cmd." - exit 1 - ) - if exist %ASSEMBLY_DIR2% ( - set SPARK_SCALA_VERSION=2.11 - ) else ( - set SPARK_SCALA_VERSION=2.12 - ) -) -exit /b 0 - -:LoadSparkEnv -if exist "%SPARK_CONF_DIR%\spark-env.cmd" ( - call "%SPARK_CONF_DIR%\spark-env.cmd" -) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/load-spark-env.sh b/scripts/spark-2.4.3-bin-hadoop2.7/bin/load-spark-env.sh deleted file mode 100644 index 0b5006d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/load-spark-env.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This script loads spark-env.sh if it exists, and ensures it is only loaded once. 
-# spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's -# conf/ subdirectory. - -# Figure out where Spark is installed -if [ -z "${SPARK_HOME}" ]; then - source "$(dirname "$0")"/find-spark-home -fi - -if [ -z "$SPARK_ENV_LOADED" ]; then - export SPARK_ENV_LOADED=1 - - export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}"/conf}" - - if [ -f "${SPARK_CONF_DIR}/spark-env.sh" ]; then - # Promote all variable declarations to environment (exported) variables - set -a - . "${SPARK_CONF_DIR}/spark-env.sh" - set +a - fi -fi - -# Setting SPARK_SCALA_VERSION if not already set. - -if [ -z "$SPARK_SCALA_VERSION" ]; then - - ASSEMBLY_DIR2="${SPARK_HOME}/assembly/target/scala-2.11" - ASSEMBLY_DIR1="${SPARK_HOME}/assembly/target/scala-2.12" - - if [[ -d "$ASSEMBLY_DIR2" && -d "$ASSEMBLY_DIR1" ]]; then - echo -e "Presence of build for multiple Scala versions detected." 1>&2 - echo -e 'Either clean one of them or, export SPARK_SCALA_VERSION in spark-env.sh.' 1>&2 - exit 1 - fi - - if [ -d "$ASSEMBLY_DIR2" ]; then - export SPARK_SCALA_VERSION="2.11" - else - export SPARK_SCALA_VERSION="2.12" - fi -fi diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/pyspark b/scripts/spark-2.4.3-bin-hadoop2.7/bin/pyspark deleted file mode 100755 index 5d5affb..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/pyspark +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -if [ -z "${SPARK_HOME}" ]; then - source "$(dirname "$0")"/find-spark-home -fi - -source "${SPARK_HOME}"/bin/load-spark-env.sh -export _SPARK_CMD_USAGE="Usage: ./bin/pyspark [options]" - -# In Spark 2.0, IPYTHON and IPYTHON_OPTS are removed and pyspark fails to launch if either option -# is set in the user's environment. Instead, users should set PYSPARK_DRIVER_PYTHON=ipython -# to use IPython and set PYSPARK_DRIVER_PYTHON_OPTS to pass options when starting the Python driver -# (e.g. PYSPARK_DRIVER_PYTHON_OPTS='notebook'). This supports full customization of the IPython -# and executor Python executables. - -# Fail noisily if removed options are set -if [[ -n "$IPYTHON" || -n "$IPYTHON_OPTS" ]]; then - echo "Error in pyspark startup:" - echo "IPYTHON and IPYTHON_OPTS are removed in Spark 2.0+. Remove these from the environment and set PYSPARK_DRIVER_PYTHON and PYSPARK_DRIVER_PYTHON_OPTS instead." - exit 1 -fi - -# Default to standard python interpreter unless told otherwise -if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then - PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}" -fi - -WORKS_WITH_IPYTHON=$(python -c 'import sys; print(sys.version_info >= (2, 7, 0))') - -# Determine the Python executable to use for the executors: -if [[ -z "$PYSPARK_PYTHON" ]]; then - if [[ $PYSPARK_DRIVER_PYTHON == *ipython* && ! 
$WORKS_WITH_IPYTHON ]]; then - echo "IPython requires Python 2.7+; please install python2.7 or set PYSPARK_PYTHON" 1>&2 - exit 1 - else - PYSPARK_PYTHON=python - fi -fi -export PYSPARK_PYTHON - -# Add the PySpark classes to the Python path: -export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" -export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:$PYTHONPATH" - -# Load the PySpark shell.py script when ./pyspark is used interactively: -export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" -export PYTHONSTARTUP="${SPARK_HOME}/python/pyspark/shell.py" - -# For pyspark tests -if [[ -n "$SPARK_TESTING" ]]; then - unset YARN_CONF_DIR - unset HADOOP_CONF_DIR - export PYTHONHASHSEED=0 - exec "$PYSPARK_DRIVER_PYTHON" -m "$@" - exit -fi - -export PYSPARK_DRIVER_PYTHON -export PYSPARK_DRIVER_PYTHON_OPTS -exec "${SPARK_HOME}"/bin/spark-submit pyspark-shell-main --name "PySparkShell" "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/pyspark.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/pyspark.cmd deleted file mode 100644 index 7d1b752..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/pyspark.cmd +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem This is the entry point for running PySpark. To avoid polluting the -rem environment, it just launches a new cmd to do the real work. - -rem The outermost quotes are used to prevent Windows command line parse error -rem when there are some quotes in parameters, see SPARK-21877. -cmd /V /E /C ""%~dp0pyspark2.cmd" %*" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/pyspark2.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/pyspark2.cmd deleted file mode 100644 index b678ed6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/pyspark2.cmd +++ /dev/null @@ -1,38 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. 
-rem - -rem Figure out where the Spark framework is installed -call "%~dp0find-spark-home.cmd" - -call "%SPARK_HOME%\bin\load-spark-env.cmd" -set _SPARK_CMD_USAGE=Usage: bin\pyspark.cmd [options] - -rem Figure out which Python to use. -if "x%PYSPARK_DRIVER_PYTHON%"=="x" ( - set PYSPARK_DRIVER_PYTHON=python - if not [%PYSPARK_PYTHON%] == [] set PYSPARK_DRIVER_PYTHON=%PYSPARK_PYTHON% -) - -set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% -set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.7-src.zip;%PYTHONPATH% - -set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% -set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py - -call "%SPARK_HOME%\bin\spark-submit2.cmd" pyspark-shell-main --name "PySparkShell" %* diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/run-example b/scripts/spark-2.4.3-bin-hadoop2.7/bin/run-example deleted file mode 100755 index 4ba5399..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/run-example +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -if [ -z "${SPARK_HOME}" ]; then - source "$(dirname "$0")"/find-spark-home -fi - -export _SPARK_CMD_USAGE="Usage: ./bin/run-example [options] example-class [example args]" -exec "${SPARK_HOME}"/bin/spark-submit run-example "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/run-example.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/run-example.cmd deleted file mode 100644 index 02b7423..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/run-example.cmd +++ /dev/null @@ -1,27 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem Figure out where the Spark framework is installed -call "%~dp0find-spark-home.cmd" - -set _SPARK_CMD_USAGE=Usage: .\bin\run-example [options] example-class [example args] - -rem The outermost quotes are used to prevent Windows command line parse error -rem when there are some quotes in parameters, see SPARK-21877. 
-cmd /V /E /C ""%~dp0spark-submit.cmd" run-example %*" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-class b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-class deleted file mode 100755 index 65d3b96..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-class +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -if [ -z "${SPARK_HOME}" ]; then - source "$(dirname "$0")"/find-spark-home -fi - -. "${SPARK_HOME}"/bin/load-spark-env.sh - -# Find the java binary -if [ -n "${JAVA_HOME}" ]; then - RUNNER="${JAVA_HOME}/bin/java" -else - if [ "$(command -v java)" ]; then - RUNNER="java" - else - echo "JAVA_HOME is not set" >&2 - exit 1 - fi -fi - -# Find Spark jars. -if [ -d "${SPARK_HOME}/jars" ]; then - SPARK_JARS_DIR="${SPARK_HOME}/jars" -else - SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars" -fi - -if [ ! -d "$SPARK_JARS_DIR" ] && [ -z "$SPARK_TESTING$SPARK_SQL_TESTING" ]; then - echo "Failed to find Spark jars directory ($SPARK_JARS_DIR)." 1>&2 - echo "You need to build Spark with the target \"package\" before running this program." 1>&2 - exit 1 -else - LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*" -fi - -# Add the launcher build dir to the classpath if requested. -if [ -n "$SPARK_PREPEND_CLASSES" ]; then - LAUNCH_CLASSPATH="${SPARK_HOME}/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH" -fi - -# For tests -if [[ -n "$SPARK_TESTING" ]]; then - unset YARN_CONF_DIR - unset HADOOP_CONF_DIR -fi - -# The launcher library will print arguments separated by a NULL character, to allow arguments with -# characters that would be otherwise interpreted by the shell. Read that in a while loop, populating -# an array that will be used to exec the final command. -# -# The exit code of the launcher is appended to the output, so the parent shell removes it from the -# command array and checks the value to see if the launcher succeeded. -build_command() { - "$RUNNER" -Xmx128m -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@" - printf "%d\0" $? -} - -# Turn off posix mode since it does not allow process substitution -set +o posix -CMD=() -while IFS= read -d '' -r ARG; do - CMD+=("$ARG") -done < <(build_command "$@") - -COUNT=${#CMD[@]} -LAST=$((COUNT - 1)) -LAUNCHER_EXIT_CODE=${CMD[$LAST]} - -# Certain JVM failures result in errors being printed to stdout (instead of stderr), which causes -# the code that parses the output of the launcher to get confused. In those cases, check if the -# exit code is an integer, and if it's not, handle it as a special error case. -if ! 
[[ $LAUNCHER_EXIT_CODE =~ ^[0-9]+$ ]]; then - echo "${CMD[@]}" | head -n-1 1>&2 - exit 1 -fi - -if [ $LAUNCHER_EXIT_CODE != 0 ]; then - exit $LAUNCHER_EXIT_CODE -fi - -CMD=("${CMD[@]:0:$LAST}") -exec "${CMD[@]}" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-class.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-class.cmd deleted file mode 100644 index 4a7a92e..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-class.cmd +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem This is the entry point for running a Spark class. To avoid polluting -rem the environment, it just launches a new cmd to do the real work. - -rem The outermost quotes are used to prevent Windows command line parse error -rem when there are some quotes in parameters, see SPARK-21877. -cmd /V /E /C ""%~dp0spark-class2.cmd" %*" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-class2.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-class2.cmd deleted file mode 100644 index 400fbd7..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-class2.cmd +++ /dev/null @@ -1,72 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem Figure out where the Spark framework is installed -call "%~dp0find-spark-home.cmd" - -call "%SPARK_HOME%\bin\load-spark-env.cmd" - -rem Test that an argument was given -if "x%1"=="x" ( - echo Usage: spark-class ^ [^] - exit /b 1 -) - -rem Find Spark jars. -if exist "%SPARK_HOME%\jars" ( - set SPARK_JARS_DIR="%SPARK_HOME%\jars" -) else ( - set SPARK_JARS_DIR="%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%\jars" -) - -if not exist "%SPARK_JARS_DIR%"\ ( - echo Failed to find Spark jars directory. - echo You need to build Spark before running this program. - exit /b 1 -) - -set LAUNCH_CLASSPATH=%SPARK_JARS_DIR%\* - -rem Add the launcher build dir to the classpath if requested. 
-if not "x%SPARK_PREPEND_CLASSES%"=="x" ( - set LAUNCH_CLASSPATH="%SPARK_HOME%\launcher\target\scala-%SPARK_SCALA_VERSION%\classes;%LAUNCH_CLASSPATH%" -) - -rem Figure out where java is. -set RUNNER=java -if not "x%JAVA_HOME%"=="x" ( - set RUNNER=%JAVA_HOME%\bin\java -) else ( - where /q "%RUNNER%" - if ERRORLEVEL 1 ( - echo Java not found and JAVA_HOME environment variable is not set. - echo Install Java and set JAVA_HOME to point to the Java installation directory. - exit /b 1 - ) -) - -rem The launcher library prints the command to be executed in a single line suitable for being -rem executed by the batch interpreter. So read all the output of the launcher into a variable. -set LAUNCHER_OUTPUT=%temp%\spark-class-launcher-output-%RANDOM%.txt -"%RUNNER%" -Xmx128m -cp "%LAUNCH_CLASSPATH%" org.apache.spark.launcher.Main %* > %LAUNCHER_OUTPUT% -for /f "tokens=*" %%i in (%LAUNCHER_OUTPUT%) do ( - set SPARK_CMD=%%i -) -del %LAUNCHER_OUTPUT% -%SPARK_CMD% diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-shell b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-shell deleted file mode 100755 index e920137..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-shell +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Shell script for starting the Spark Shell REPL - -cygwin=false -case "$(uname)" in - CYGWIN*) cygwin=true;; -esac - -# Enter posix mode for bash -set -o posix - -if [ -z "${SPARK_HOME}" ]; then - source "$(dirname "$0")"/find-spark-home -fi - -export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options] - -Scala REPL options: - -I preload , enforcing line-by-line interpretation" - -# SPARK-4161: scala does not assume use of the java classpath, -# so we need to add the "-Dscala.usejavacp=true" flag manually. We -# do this specifically for the Spark shell because the scala REPL -# has its own class loader, and any additional classpath specified -# through spark.driver.extraClassPath is not automatically propagated. -SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Dscala.usejavacp=true" - -function main() { - if $cygwin; then - # Workaround for issue involving JLine and Cygwin - # (see http://sourceforge.net/p/jline/bugs/40/). - # If you're using the Mintty terminal emulator in Cygwin, may need to set the - # "Backspace sends ^H" setting in "Keys" section of the Mintty options - # (see https://github.com/sbt/sbt/issues/562). 
- stty -icanon min 1 -echo > /dev/null 2>&1 - export SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Djline.terminal=unix" - "${SPARK_HOME}"/bin/spark-submit --class org.apache.spark.repl.Main --name "Spark shell" "$@" - stty icanon echo > /dev/null 2>&1 - else - export SPARK_SUBMIT_OPTS - "${SPARK_HOME}"/bin/spark-submit --class org.apache.spark.repl.Main --name "Spark shell" "$@" - fi -} - -# Copy restore-TTY-on-exit functions from Scala script so spark-shell exits properly even in -# binary distribution of Spark where Scala is not installed -exit_status=127 -saved_stty="" - -# restore stty settings (echo in particular) -function restoreSttySettings() { - stty $saved_stty - saved_stty="" -} - -function onExit() { - if [[ "$saved_stty" != "" ]]; then - restoreSttySettings - fi - exit $exit_status -} - -# to reenable echo if we are interrupted before completing. -trap onExit INT - -# save terminal settings -saved_stty=$(stty -g 2>/dev/null) -# clear on error so we don't later try to restore them -if [[ ! $? ]]; then - saved_stty="" -fi - -main "$@" - -# record the exit status lest it be overwritten: -# then reenable echo and propagate the code. -exit_status=$? -onExit - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-shell.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-shell.cmd deleted file mode 100644 index 2a47eb0..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-shell.cmd +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem This is the entry point for running Spark shell. To avoid polluting the -rem environment, it just launches a new cmd to do the real work. - -rem The outermost quotes are used to prevent Windows command line parse error -rem when there are some quotes in parameters, see SPARK-21877. -cmd /V /E /C ""%~dp0spark-shell2.cmd" %*" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-shell2.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-shell2.cmd deleted file mode 100644 index 1b290bd..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-shell2.cmd +++ /dev/null @@ -1,43 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. 
You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem Figure out where the Spark framework is installed -call "%~dp0find-spark-home.cmd" - -set LF=^ - - -rem two empty lines are required -set _SPARK_CMD_USAGE=Usage: .\bin\spark-shell.cmd [options]^%LF%%LF%^%LF%%LF%^ -Scala REPL options:^%LF%%LF%^ - -I ^ preload ^, enforcing line-by-line interpretation - -rem SPARK-4161: scala does not assume use of the java classpath, -rem so we need to add the "-Dscala.usejavacp=true" flag manually. We -rem do this specifically for the Spark shell because the scala REPL -rem has its own class loader, and any additional classpath specified -rem through spark.driver.extraClassPath is not automatically propagated. -if "x%SPARK_SUBMIT_OPTS%"=="x" ( - set SPARK_SUBMIT_OPTS=-Dscala.usejavacp=true - goto run_shell -) -set SPARK_SUBMIT_OPTS="%SPARK_SUBMIT_OPTS% -Dscala.usejavacp=true" - -:run_shell -"%SPARK_HOME%\bin\spark-submit2.cmd" --class org.apache.spark.repl.Main --name "Spark shell" %* diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-sql b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-sql deleted file mode 100755 index b08b944..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-sql +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -if [ -z "${SPARK_HOME}" ]; then - source "$(dirname "$0")"/find-spark-home -fi - -export _SPARK_CMD_USAGE="Usage: ./bin/spark-sql [options] [cli option]" -exec "${SPARK_HOME}"/bin/spark-submit --class org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-sql.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-sql.cmd deleted file mode 100644 index a5763a1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-sql.cmd +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. 
You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem This is the entry point for running SparkSQL. To avoid polluting the -rem environment, it just launches a new cmd to do the real work. - -rem The outermost quotes are used to prevent Windows command line parse error -rem when there are some quotes in parameters, see SPARK-21877. -cmd /V /E /C ""%~dp0spark-sql2.cmd" %*" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-sql2.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-sql2.cmd deleted file mode 100644 index baf6877..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-sql2.cmd +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem Figure out where the Spark framework is installed -call "%~dp0find-spark-home.cmd" - -set _SPARK_CMD_USAGE=Usage: .\bin\spark-sql [options] [cli option] - -call "%SPARK_HOME%\bin\spark-submit2.cmd" --class org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver %* diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-submit b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-submit deleted file mode 100755 index 4e9d361..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-submit +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -if [ -z "${SPARK_HOME}" ]; then - source "$(dirname "$0")"/find-spark-home -fi - -# disable randomized hash for string in Python 3.3+ -export PYTHONHASHSEED=0 - -exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-submit.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-submit.cmd deleted file mode 100644 index 64bdea2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-submit.cmd +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem This is the entry point for running Spark submit. To avoid polluting the -rem environment, it just launches a new cmd to do the real work. - -rem The outermost quotes are used to prevent Windows command line parse error -rem when there are some quotes in parameters, see SPARK-21877. -cmd /V /E /C ""%~dp0spark-submit2.cmd" %*" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-submit2.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-submit2.cmd deleted file mode 100644 index 58d6ed3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/spark-submit2.cmd +++ /dev/null @@ -1,27 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem This is the entry point for running Spark submit. To avoid polluting the -rem environment, it just launches a new cmd to do the real work. - -rem disable randomized hash for string in Python 3.3+ -set PYTHONHASHSEED=0 - -set CLASS=org.apache.spark.deploy.SparkSubmit -"%~dp0spark-class2.cmd" %CLASS% %* diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/sparkR b/scripts/spark-2.4.3-bin-hadoop2.7/bin/sparkR deleted file mode 100755 index 29ab10d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/sparkR +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -if [ -z "${SPARK_HOME}" ]; then - source "$(dirname "$0")"/find-spark-home -fi - -source "${SPARK_HOME}"/bin/load-spark-env.sh -export _SPARK_CMD_USAGE="Usage: ./bin/sparkR [options]" -exec "${SPARK_HOME}"/bin/spark-submit sparkr-shell-main "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/sparkR.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/sparkR.cmd deleted file mode 100644 index d870924..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/sparkR.cmd +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. -rem - -rem This is the entry point for running SparkR. To avoid polluting the -rem environment, it just launches a new cmd to do the real work. - -rem The outermost quotes are used to prevent Windows command line parse error -rem when there are some quotes in parameters, see SPARK-21877. -cmd /V /E /C ""%~dp0sparkR2.cmd" %*" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/bin/sparkR2.cmd b/scripts/spark-2.4.3-bin-hadoop2.7/bin/sparkR2.cmd deleted file mode 100644 index 28f1a8a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/bin/sparkR2.cmd +++ /dev/null @@ -1,25 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one or more -rem contributor license agreements. See the NOTICE file distributed with -rem this work for additional information regarding copyright ownership. -rem The ASF licenses this file to You under the Apache License, Version 2.0 -rem (the "License"); you may not use this file except in compliance with -rem the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, software -rem distributed under the License is distributed on an "AS IS" BASIS, -rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -rem See the License for the specific language governing permissions and -rem limitations under the License. 
-rem - -rem Figure out where the Spark framework is installed -call "%~dp0find-spark-home.cmd" - -call "%SPARK_HOME%\bin\load-spark-env.cmd" -set _SPARK_CMD_USAGE=Usage: .\bin\sparkR [options] -call "%SPARK_HOME%\bin\spark-submit2.cmd" sparkr-shell-main %* diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/conf/docker.properties.template b/scripts/spark-2.4.3-bin-hadoop2.7/conf/docker.properties.template deleted file mode 100644 index 2ecb4f1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/conf/docker.properties.template +++ /dev/null @@ -1,20 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -spark.mesos.executor.docker.image: -spark.mesos.executor.docker.volumes: /usr/local/lib:/host/usr/local/lib:ro -spark.mesos.executor.home: /opt/spark diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml.template b/scripts/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml.template deleted file mode 100644 index 385b2e7..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml.template +++ /dev/null @@ -1,31 +0,0 @@ -<?xml version="1.0"?> - -<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> - -<allocations> - <pool name="production"> - <schedulingMode>FAIR</schedulingMode> - <weight>1</weight> - <minShare>2</minShare> - </pool> - <pool name="test"> - <schedulingMode>FIFO</schedulingMode> - <weight>2</weight> - <minShare>3</minShare> - </pool> -</allocations> diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/conf/log4j.properties.template b/scripts/spark-2.4.3-bin-hadoop2.7/conf/log4j.properties.template deleted file mode 100644 index ec1aa18..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/conf/log4j.properties.template +++ /dev/null @@ -1,40 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Set everything to be logged to the console -log4j.rootCategory=INFO, console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell log level to WARN. When running the spark-shell, the -# log level for this class is used to overwrite the root logger's log level, so that -# the user can have different defaults for the shell and regular Spark apps.
-log4j.logger.org.apache.spark.repl.Main=WARN - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.spark_project.jetty=WARN -log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.parquet=ERROR -log4j.logger.parquet=ERROR - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL -log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/conf/metrics.properties.template b/scripts/spark-2.4.3-bin-hadoop2.7/conf/metrics.properties.template deleted file mode 100644 index 4c008a1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/conf/metrics.properties.template +++ /dev/null @@ -1,182 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# syntax: [instance].sink|source.[name].[options]=[value] - -# This file configures Spark's internal metrics system. The metrics system is -# divided into instances which correspond to internal components. -# Each instance can be configured to report its metrics to one or more sinks. -# Accepted values for [instance] are "master", "worker", "executor", "driver", -# and "applications". A wildcard "*" can be used as an instance name, in -# which case all instances will inherit the supplied property. -# -# Within an instance, a "source" specifies a particular set of grouped metrics. -# there are two kinds of sources: -# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will -# collect a Spark component's internal state. Each instance is paired with a -# Spark source that is added automatically. -# 2. Common sources, like JvmSource, which will collect low level state. -# These can be added through configuration options and are then loaded -# using reflection. -# -# A "sink" specifies where metrics are delivered to. Each instance can be -# assigned one or more sinks. -# -# The sink|source field specifies whether the property relates to a sink or -# source. -# -# The [name] field specifies the name of source or sink. -# -# The [options] field is the specific property of this source or sink. The -# source or sink is responsible for parsing this property. -# -# Notes: -# 1. To add a new sink, set the "class" option to a fully qualified class -# name (see examples below). -# 2. Some sinks involve a polling period. The minimum allowed polling period -# is 1 second. -# 3. Wildcard properties can be overridden by more specific properties. -# For example, master.sink.console.period takes precedence over -# *.sink.console.period. 
-# 4. A metrics specific configuration -# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be -# added to Java properties using -Dspark.metrics.conf=xxx if you want to -# customize metrics system. You can also put the file in ${SPARK_HOME}/conf -# and it will be loaded automatically. -# 5. The MetricsServlet sink is added by default as a sink in the master, -# worker and driver, and you can send HTTP requests to the "/metrics/json" -# endpoint to get a snapshot of all the registered metrics in JSON format. -# For master, requests to the "/metrics/master/json" and -# "/metrics/applications/json" endpoints can be sent separately to get -# metrics snapshots of the master instance and applications. This -# MetricsServlet does not have to be configured. - -## List of available common sources and their properties. - -# org.apache.spark.metrics.source.JvmSource -# Note: Currently, JvmSource is the only available common source. -# It can be added to an instance by setting the "class" option to its -# fully qualified class name (see examples below). - -## List of available sinks and their properties. - -# org.apache.spark.metrics.sink.ConsoleSink -# Name: Default: Description: -# period 10 Poll period -# unit seconds Unit of the poll period - -# org.apache.spark.metrics.sink.CSVSink -# Name: Default: Description: -# period 10 Poll period -# unit seconds Unit of the poll period -# directory /tmp Where to store CSV files - -# org.apache.spark.metrics.sink.GangliaSink -# Name: Default: Description: -# host NONE Hostname or multicast group of the Ganglia server, -# must be set -# port NONE Port of the Ganglia server(s), must be set -# period 10 Poll period -# unit seconds Unit of the poll period -# ttl 1 TTL of messages sent by Ganglia -# dmax 0 Lifetime in seconds of metrics (0 never expired) -# mode multicast Ganglia network mode ('unicast' or 'multicast') - -# org.apache.spark.metrics.sink.JmxSink - -# org.apache.spark.metrics.sink.MetricsServlet -# Name: Default: Description: -# path VARIES* Path prefix from the web server root -# sample false Whether to show entire set of samples for histograms -# ('false' or 'true') -# -# * Default path is /metrics/json for all instances except the master. 
The -# master has two paths: -# /metrics/applications/json # App information -# /metrics/master/json # Master information - -# org.apache.spark.metrics.sink.GraphiteSink -# Name: Default: Description: -# host NONE Hostname of the Graphite server, must be set -# port NONE Port of the Graphite server, must be set -# period 10 Poll period -# unit seconds Unit of the poll period -# prefix EMPTY STRING Prefix to prepend to every metric's name -# protocol tcp Protocol ("tcp" or "udp") to use - -# org.apache.spark.metrics.sink.StatsdSink -# Name: Default: Description: -# host 127.0.0.1 Hostname or IP of StatsD server -# port 8125 Port of StatsD server -# period 10 Poll period -# unit seconds Units of poll period -# prefix EMPTY STRING Prefix to prepend to metric name - -## Examples -# Enable JmxSink for all instances by class name -#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink - -# Enable ConsoleSink for all instances by class name -#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink - -# Enable StatsdSink for all instances by class name -#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink -#*.sink.statsd.prefix=spark - -# Polling period for the ConsoleSink -#*.sink.console.period=10 -# Unit of the polling period for the ConsoleSink -#*.sink.console.unit=seconds - -# Polling period for the ConsoleSink specific for the master instance -#master.sink.console.period=15 -# Unit of the polling period for the ConsoleSink specific for the master -# instance -#master.sink.console.unit=seconds - -# Enable CsvSink for all instances by class name -#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink - -# Polling period for the CsvSink -#*.sink.csv.period=1 -# Unit of the polling period for the CsvSink -#*.sink.csv.unit=minutes - -# Polling directory for CsvSink -#*.sink.csv.directory=/tmp/ - -# Polling period for the CsvSink specific for the worker instance -#worker.sink.csv.period=10 -# Unit of the polling period for the CsvSink specific for the worker instance -#worker.sink.csv.unit=minutes - -# Enable Slf4jSink for all instances by class name -#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink - -# Polling period for the Slf4JSink -#*.sink.slf4j.period=1 -# Unit of the polling period for the Slf4jSink -#*.sink.slf4j.unit=minutes - -# Enable JvmSource for instance master, worker, driver and executor -#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource - -#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/conf/slaves.template b/scripts/spark-2.4.3-bin-hadoop2.7/conf/slaves.template deleted file mode 100644 index be42a63..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/conf/slaves.template +++ /dev/null @@ -1,19 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# A Spark Worker will be started on each of the machines listed below. -localhost \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/conf/spark-defaults.conf.template b/scripts/spark-2.4.3-bin-hadoop2.7/conf/spark-defaults.conf.template deleted file mode 100644 index 19cba6e..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/conf/spark-defaults.conf.template +++ /dev/null @@ -1,27 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Default system properties included when running spark-submit. -# This is useful for setting default environmental settings. - -# Example: -# spark.master spark://master:7077 -# spark.eventLog.enabled true -# spark.eventLog.dir hdfs://namenode:8021/directory -# spark.serializer org.apache.spark.serializer.KryoSerializer -# spark.driver.memory 5g -# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/conf/spark-env.sh.template b/scripts/spark-2.4.3-bin-hadoop2.7/conf/spark-env.sh.template deleted file mode 100755 index bc92c78..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/conf/spark-env.sh.template +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This file is sourced when running various Spark programs. -# Copy it as spark-env.sh and edit that to configure Spark for your site. 
- -# Options read when launching programs locally with -# ./bin/run-example or ./bin/spark-submit -# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files -# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node -# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program - -# Options read by executors and drivers running inside the cluster -# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node -# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program -# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data -# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos - -# Options read in YARN client/cluster mode -# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) -# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files -# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN -# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). -# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) -# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) - -# Options for the daemons used in the standalone deploy mode -# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname -# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master -# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") -# - SPARK_WORKER_CORES, to set the number of cores to use on this machine -# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) -# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker -# - SPARK_WORKER_DIR, to set the working directory of worker processes -# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") -# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). -# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") -# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") -# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") -# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons -# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers - -# Generic options for the daemons used in the standalone deploy mode -# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) -# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) -# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) -# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) -# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) -# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. -# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. -# You might get better performance to enable these options if using native BLAS (see SPARK-21305). 
-# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL -# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/graphx/followers.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/graphx/followers.txt deleted file mode 100644 index 7bb8e90..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/graphx/followers.txt +++ /dev/null @@ -1,8 +0,0 @@ -2 1 -4 1 -1 2 -6 3 -7 3 -7 6 -6 7 -3 7 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/graphx/users.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/graphx/users.txt deleted file mode 100644 index 982d19d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/graphx/users.txt +++ /dev/null @@ -1,7 +0,0 @@ -1,BarackObama,Barack Obama -2,ladygaga,Goddess of Love -3,jeresig,John Resig -4,justinbieber,Justin Bieber -6,matei_zaharia,Matei Zaharia -7,odersky,Martin Odersky -8,anonsys diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/als/sample_movielens_ratings.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/als/sample_movielens_ratings.txt deleted file mode 100644 index 0889142..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/als/sample_movielens_ratings.txt +++ /dev/null @@ -1,1501 +0,0 @@ -0::2::3::1424380312 -0::3::1::1424380312 -0::5::2::1424380312 -0::9::4::1424380312 -0::11::1::1424380312 -0::12::2::1424380312 -0::15::1::1424380312 -0::17::1::1424380312 -0::19::1::1424380312 -0::21::1::1424380312 -0::23::1::1424380312 -0::26::3::1424380312 -0::27::1::1424380312 -0::28::1::1424380312 -0::29::1::1424380312 -0::30::1::1424380312 -0::31::1::1424380312 -0::34::1::1424380312 -0::37::1::1424380312 -0::41::2::1424380312 -0::44::1::1424380312 -0::45::2::1424380312 -0::46::1::1424380312 -0::47::1::1424380312 -0::48::1::1424380312 -0::50::1::1424380312 -0::51::1::1424380312 -0::54::1::1424380312 -0::55::1::1424380312 -0::59::2::1424380312 -0::61::2::1424380312 -0::64::1::1424380312 -0::67::1::1424380312 -0::68::1::1424380312 -0::69::1::1424380312 -0::71::1::1424380312 -0::72::1::1424380312 -0::77::2::1424380312 -0::79::1::1424380312 -0::83::1::1424380312 -0::87::1::1424380312 -0::89::2::1424380312 -0::91::3::1424380312 -0::92::4::1424380312 -0::94::1::1424380312 -0::95::2::1424380312 -0::96::1::1424380312 -0::98::1::1424380312 -0::99::1::1424380312 -1::2::2::1424380312 -1::3::1::1424380312 -1::4::2::1424380312 -1::6::1::1424380312 -1::9::3::1424380312 -1::12::1::1424380312 -1::13::1::1424380312 -1::14::1::1424380312 -1::16::1::1424380312 -1::19::1::1424380312 -1::21::3::1424380312 -1::27::1::1424380312 -1::28::3::1424380312 -1::33::1::1424380312 -1::36::2::1424380312 -1::37::1::1424380312 -1::40::1::1424380312 -1::41::2::1424380312 -1::43::1::1424380312 -1::44::1::1424380312 -1::47::1::1424380312 -1::50::1::1424380312 -1::54::1::1424380312 -1::56::2::1424380312 -1::57::1::1424380312 -1::58::1::1424380312 -1::60::1::1424380312 -1::62::4::1424380312 -1::63::1::1424380312 -1::67::1::1424380312 -1::68::4::1424380312 -1::70::2::1424380312 -1::72::1::1424380312 -1::73::1::1424380312 -1::74::2::1424380312 -1::76::1::1424380312 -1::77::3::1424380312 -1::78::1::1424380312 -1::81::1::1424380312 -1::82::1::1424380312 -1::85::3::1424380312 -1::86::2::1424380312 -1::88::2::1424380312 -1::91::1::1424380312 -1::92::2::1424380312 -1::93::1::1424380312 -1::94::2::1424380312 -1::96::1::1424380312 -1::97::1::1424380312 -2::4::3::1424380312 -2::6::1::1424380312 -2::8::5::1424380312 -2::9::1::1424380312 -2::10::1::1424380312 -2::12::3::1424380312 -2::13::1::1424380312 -2::15::2::1424380312 
-2::18::2::1424380312 -2::19::4::1424380312 -2::22::1::1424380312 -2::26::1::1424380312 -2::28::1::1424380312 -2::34::4::1424380312 -2::35::1::1424380312 -2::37::5::1424380312 -2::38::1::1424380312 -2::39::5::1424380312 -2::40::4::1424380312 -2::47::1::1424380312 -2::50::1::1424380312 -2::52::2::1424380312 -2::54::1::1424380312 -2::55::1::1424380312 -2::57::2::1424380312 -2::58::2::1424380312 -2::59::1::1424380312 -2::61::1::1424380312 -2::62::1::1424380312 -2::64::1::1424380312 -2::65::1::1424380312 -2::66::3::1424380312 -2::68::1::1424380312 -2::71::3::1424380312 -2::76::1::1424380312 -2::77::1::1424380312 -2::78::1::1424380312 -2::80::1::1424380312 -2::83::5::1424380312 -2::85::1::1424380312 -2::87::2::1424380312 -2::88::1::1424380312 -2::89::4::1424380312 -2::90::1::1424380312 -2::92::4::1424380312 -2::93::5::1424380312 -3::0::1::1424380312 -3::1::1::1424380312 -3::2::1::1424380312 -3::7::3::1424380312 -3::8::3::1424380312 -3::9::1::1424380312 -3::14::1::1424380312 -3::15::1::1424380312 -3::16::1::1424380312 -3::18::4::1424380312 -3::19::1::1424380312 -3::24::3::1424380312 -3::26::1::1424380312 -3::29::3::1424380312 -3::33::1::1424380312 -3::34::3::1424380312 -3::35::1::1424380312 -3::36::3::1424380312 -3::37::1::1424380312 -3::38::2::1424380312 -3::43::1::1424380312 -3::44::1::1424380312 -3::46::1::1424380312 -3::47::1::1424380312 -3::51::5::1424380312 -3::52::3::1424380312 -3::56::1::1424380312 -3::58::1::1424380312 -3::60::3::1424380312 -3::62::1::1424380312 -3::65::2::1424380312 -3::66::1::1424380312 -3::67::1::1424380312 -3::68::2::1424380312 -3::70::1::1424380312 -3::72::2::1424380312 -3::76::3::1424380312 -3::79::3::1424380312 -3::80::4::1424380312 -3::81::1::1424380312 -3::83::1::1424380312 -3::84::1::1424380312 -3::86::1::1424380312 -3::87::2::1424380312 -3::88::4::1424380312 -3::89::1::1424380312 -3::91::1::1424380312 -3::94::3::1424380312 -4::1::1::1424380312 -4::6::1::1424380312 -4::8::1::1424380312 -4::9::1::1424380312 -4::10::1::1424380312 -4::11::1::1424380312 -4::12::1::1424380312 -4::13::1::1424380312 -4::14::2::1424380312 -4::15::1::1424380312 -4::17::1::1424380312 -4::20::1::1424380312 -4::22::1::1424380312 -4::23::1::1424380312 -4::24::1::1424380312 -4::29::4::1424380312 -4::30::1::1424380312 -4::31::1::1424380312 -4::34::1::1424380312 -4::35::1::1424380312 -4::36::1::1424380312 -4::39::2::1424380312 -4::40::3::1424380312 -4::41::4::1424380312 -4::43::2::1424380312 -4::44::1::1424380312 -4::45::1::1424380312 -4::46::1::1424380312 -4::47::1::1424380312 -4::49::2::1424380312 -4::50::1::1424380312 -4::51::1::1424380312 -4::52::4::1424380312 -4::54::1::1424380312 -4::55::1::1424380312 -4::60::3::1424380312 -4::61::1::1424380312 -4::62::4::1424380312 -4::63::3::1424380312 -4::65::1::1424380312 -4::67::2::1424380312 -4::69::1::1424380312 -4::70::4::1424380312 -4::71::1::1424380312 -4::73::1::1424380312 -4::78::1::1424380312 -4::84::1::1424380312 -4::85::1::1424380312 -4::87::3::1424380312 -4::88::3::1424380312 -4::89::2::1424380312 -4::96::1::1424380312 -4::97::1::1424380312 -4::98::1::1424380312 -4::99::1::1424380312 -5::0::1::1424380312 -5::1::1::1424380312 -5::4::1::1424380312 -5::5::1::1424380312 -5::8::1::1424380312 -5::9::3::1424380312 -5::10::2::1424380312 -5::13::3::1424380312 -5::15::1::1424380312 -5::19::1::1424380312 -5::20::3::1424380312 -5::21::2::1424380312 -5::23::3::1424380312 -5::27::1::1424380312 -5::28::1::1424380312 -5::29::1::1424380312 -5::31::1::1424380312 -5::36::3::1424380312 -5::38::2::1424380312 -5::39::1::1424380312 -5::42::1::1424380312 
-5::48::3::1424380312 -5::49::4::1424380312 -5::50::3::1424380312 -5::51::1::1424380312 -5::52::1::1424380312 -5::54::1::1424380312 -5::55::5::1424380312 -5::56::3::1424380312 -5::58::1::1424380312 -5::60::1::1424380312 -5::61::1::1424380312 -5::64::3::1424380312 -5::65::2::1424380312 -5::68::4::1424380312 -5::70::1::1424380312 -5::71::1::1424380312 -5::72::1::1424380312 -5::74::1::1424380312 -5::79::1::1424380312 -5::81::2::1424380312 -5::84::1::1424380312 -5::85::1::1424380312 -5::86::1::1424380312 -5::88::1::1424380312 -5::90::4::1424380312 -5::91::2::1424380312 -5::95::2::1424380312 -5::99::1::1424380312 -6::0::1::1424380312 -6::1::1::1424380312 -6::2::3::1424380312 -6::5::1::1424380312 -6::6::1::1424380312 -6::9::1::1424380312 -6::10::1::1424380312 -6::15::2::1424380312 -6::16::2::1424380312 -6::17::1::1424380312 -6::18::1::1424380312 -6::20::1::1424380312 -6::21::1::1424380312 -6::22::1::1424380312 -6::24::1::1424380312 -6::25::5::1424380312 -6::26::1::1424380312 -6::28::1::1424380312 -6::30::1::1424380312 -6::33::1::1424380312 -6::38::1::1424380312 -6::39::1::1424380312 -6::43::4::1424380312 -6::44::1::1424380312 -6::45::1::1424380312 -6::48::1::1424380312 -6::49::1::1424380312 -6::50::1::1424380312 -6::53::1::1424380312 -6::54::1::1424380312 -6::55::1::1424380312 -6::56::1::1424380312 -6::58::4::1424380312 -6::59::1::1424380312 -6::60::1::1424380312 -6::61::3::1424380312 -6::63::3::1424380312 -6::66::1::1424380312 -6::67::3::1424380312 -6::68::1::1424380312 -6::69::1::1424380312 -6::71::2::1424380312 -6::73::1::1424380312 -6::75::1::1424380312 -6::77::1::1424380312 -6::79::1::1424380312 -6::81::1::1424380312 -6::84::1::1424380312 -6::85::3::1424380312 -6::86::1::1424380312 -6::87::1::1424380312 -6::88::1::1424380312 -6::89::1::1424380312 -6::91::2::1424380312 -6::94::1::1424380312 -6::95::2::1424380312 -6::96::1::1424380312 -7::1::1::1424380312 -7::2::2::1424380312 -7::3::1::1424380312 -7::4::1::1424380312 -7::7::1::1424380312 -7::10::1::1424380312 -7::11::2::1424380312 -7::14::2::1424380312 -7::15::1::1424380312 -7::16::1::1424380312 -7::18::1::1424380312 -7::21::1::1424380312 -7::22::1::1424380312 -7::23::1::1424380312 -7::25::5::1424380312 -7::26::1::1424380312 -7::29::4::1424380312 -7::30::1::1424380312 -7::31::3::1424380312 -7::32::1::1424380312 -7::33::1::1424380312 -7::35::1::1424380312 -7::37::2::1424380312 -7::39::3::1424380312 -7::40::2::1424380312 -7::42::2::1424380312 -7::44::1::1424380312 -7::45::2::1424380312 -7::47::4::1424380312 -7::48::1::1424380312 -7::49::1::1424380312 -7::53::1::1424380312 -7::54::1::1424380312 -7::55::1::1424380312 -7::56::1::1424380312 -7::59::1::1424380312 -7::61::2::1424380312 -7::62::3::1424380312 -7::63::2::1424380312 -7::66::1::1424380312 -7::67::3::1424380312 -7::74::1::1424380312 -7::75::1::1424380312 -7::76::3::1424380312 -7::77::1::1424380312 -7::81::1::1424380312 -7::82::1::1424380312 -7::84::2::1424380312 -7::85::4::1424380312 -7::86::1::1424380312 -7::92::2::1424380312 -7::96::1::1424380312 -7::97::1::1424380312 -7::98::1::1424380312 -8::0::1::1424380312 -8::2::4::1424380312 -8::3::2::1424380312 -8::4::2::1424380312 -8::5::1::1424380312 -8::7::1::1424380312 -8::9::1::1424380312 -8::11::1::1424380312 -8::15::1::1424380312 -8::18::1::1424380312 -8::19::1::1424380312 -8::21::1::1424380312 -8::29::5::1424380312 -8::31::3::1424380312 -8::33::1::1424380312 -8::35::1::1424380312 -8::36::1::1424380312 -8::40::2::1424380312 -8::44::1::1424380312 -8::45::1::1424380312 -8::50::1::1424380312 -8::51::1::1424380312 -8::52::5::1424380312 
-8::53::5::1424380312 -8::54::1::1424380312 -8::55::1::1424380312 -8::56::1::1424380312 -8::58::4::1424380312 -8::60::3::1424380312 -8::62::4::1424380312 -8::64::1::1424380312 -8::67::3::1424380312 -8::69::1::1424380312 -8::71::1::1424380312 -8::72::3::1424380312 -8::77::3::1424380312 -8::78::1::1424380312 -8::79::1::1424380312 -8::83::1::1424380312 -8::85::5::1424380312 -8::86::1::1424380312 -8::88::1::1424380312 -8::90::1::1424380312 -8::92::2::1424380312 -8::95::4::1424380312 -8::96::3::1424380312 -8::97::1::1424380312 -8::98::1::1424380312 -8::99::1::1424380312 -9::2::3::1424380312 -9::3::1::1424380312 -9::4::1::1424380312 -9::5::1::1424380312 -9::6::1::1424380312 -9::7::5::1424380312 -9::9::1::1424380312 -9::12::1::1424380312 -9::14::3::1424380312 -9::15::1::1424380312 -9::19::1::1424380312 -9::21::1::1424380312 -9::22::1::1424380312 -9::24::1::1424380312 -9::25::1::1424380312 -9::26::1::1424380312 -9::30::3::1424380312 -9::32::4::1424380312 -9::35::2::1424380312 -9::36::2::1424380312 -9::37::2::1424380312 -9::38::1::1424380312 -9::39::1::1424380312 -9::43::3::1424380312 -9::49::5::1424380312 -9::50::3::1424380312 -9::53::1::1424380312 -9::54::1::1424380312 -9::58::1::1424380312 -9::59::1::1424380312 -9::60::1::1424380312 -9::61::1::1424380312 -9::63::3::1424380312 -9::64::3::1424380312 -9::68::1::1424380312 -9::69::1::1424380312 -9::70::3::1424380312 -9::71::1::1424380312 -9::73::2::1424380312 -9::75::1::1424380312 -9::77::2::1424380312 -9::81::2::1424380312 -9::82::1::1424380312 -9::83::1::1424380312 -9::84::1::1424380312 -9::86::1::1424380312 -9::87::4::1424380312 -9::88::1::1424380312 -9::90::3::1424380312 -9::94::2::1424380312 -9::95::3::1424380312 -9::97::2::1424380312 -9::98::1::1424380312 -10::0::3::1424380312 -10::2::4::1424380312 -10::4::3::1424380312 -10::7::1::1424380312 -10::8::1::1424380312 -10::10::1::1424380312 -10::13::2::1424380312 -10::14::1::1424380312 -10::16::2::1424380312 -10::17::1::1424380312 -10::18::1::1424380312 -10::21::1::1424380312 -10::22::1::1424380312 -10::24::1::1424380312 -10::25::3::1424380312 -10::28::1::1424380312 -10::35::1::1424380312 -10::36::1::1424380312 -10::37::1::1424380312 -10::38::1::1424380312 -10::39::1::1424380312 -10::40::4::1424380312 -10::41::2::1424380312 -10::42::3::1424380312 -10::43::1::1424380312 -10::49::3::1424380312 -10::50::1::1424380312 -10::51::1::1424380312 -10::52::1::1424380312 -10::55::2::1424380312 -10::56::1::1424380312 -10::58::1::1424380312 -10::63::1::1424380312 -10::66::1::1424380312 -10::67::2::1424380312 -10::68::1::1424380312 -10::75::1::1424380312 -10::77::1::1424380312 -10::79::1::1424380312 -10::86::1::1424380312 -10::89::3::1424380312 -10::90::1::1424380312 -10::97::1::1424380312 -10::98::1::1424380312 -11::0::1::1424380312 -11::6::2::1424380312 -11::9::1::1424380312 -11::10::1::1424380312 -11::11::1::1424380312 -11::12::1::1424380312 -11::13::4::1424380312 -11::16::1::1424380312 -11::18::5::1424380312 -11::19::4::1424380312 -11::20::1::1424380312 -11::21::1::1424380312 -11::22::1::1424380312 -11::23::5::1424380312 -11::25::1::1424380312 -11::27::5::1424380312 -11::30::5::1424380312 -11::32::5::1424380312 -11::35::3::1424380312 -11::36::2::1424380312 -11::37::2::1424380312 -11::38::4::1424380312 -11::39::1::1424380312 -11::40::1::1424380312 -11::41::1::1424380312 -11::43::2::1424380312 -11::45::1::1424380312 -11::47::1::1424380312 -11::48::5::1424380312 -11::50::4::1424380312 -11::51::3::1424380312 -11::59::1::1424380312 -11::61::1::1424380312 -11::62::1::1424380312 -11::64::1::1424380312 
-11::66::4::1424380312 -11::67::1::1424380312 -11::69::5::1424380312 -11::70::1::1424380312 -11::71::3::1424380312 -11::72::3::1424380312 -11::75::3::1424380312 -11::76::1::1424380312 -11::77::1::1424380312 -11::78::1::1424380312 -11::79::5::1424380312 -11::80::3::1424380312 -11::81::4::1424380312 -11::82::1::1424380312 -11::86::1::1424380312 -11::88::1::1424380312 -11::89::1::1424380312 -11::90::4::1424380312 -11::94::2::1424380312 -11::97::3::1424380312 -11::99::1::1424380312 -12::2::1::1424380312 -12::4::1::1424380312 -12::6::1::1424380312 -12::7::3::1424380312 -12::8::1::1424380312 -12::14::1::1424380312 -12::15::2::1424380312 -12::16::4::1424380312 -12::17::5::1424380312 -12::18::2::1424380312 -12::21::1::1424380312 -12::22::2::1424380312 -12::23::3::1424380312 -12::24::1::1424380312 -12::25::1::1424380312 -12::27::5::1424380312 -12::30::2::1424380312 -12::31::4::1424380312 -12::35::5::1424380312 -12::38::1::1424380312 -12::41::1::1424380312 -12::44::2::1424380312 -12::45::1::1424380312 -12::50::4::1424380312 -12::51::1::1424380312 -12::52::1::1424380312 -12::53::1::1424380312 -12::54::1::1424380312 -12::56::2::1424380312 -12::57::1::1424380312 -12::60::1::1424380312 -12::63::1::1424380312 -12::64::5::1424380312 -12::66::3::1424380312 -12::67::1::1424380312 -12::70::1::1424380312 -12::72::1::1424380312 -12::74::1::1424380312 -12::75::1::1424380312 -12::77::1::1424380312 -12::78::1::1424380312 -12::79::3::1424380312 -12::82::2::1424380312 -12::83::1::1424380312 -12::84::1::1424380312 -12::85::1::1424380312 -12::86::1::1424380312 -12::87::1::1424380312 -12::88::1::1424380312 -12::91::3::1424380312 -12::92::1::1424380312 -12::94::4::1424380312 -12::95::2::1424380312 -12::96::1::1424380312 -12::98::2::1424380312 -13::0::1::1424380312 -13::3::1::1424380312 -13::4::2::1424380312 -13::5::1::1424380312 -13::6::1::1424380312 -13::12::1::1424380312 -13::14::2::1424380312 -13::15::1::1424380312 -13::17::1::1424380312 -13::18::3::1424380312 -13::20::1::1424380312 -13::21::1::1424380312 -13::22::1::1424380312 -13::26::1::1424380312 -13::27::1::1424380312 -13::29::3::1424380312 -13::31::1::1424380312 -13::33::1::1424380312 -13::40::2::1424380312 -13::43::2::1424380312 -13::44::1::1424380312 -13::45::1::1424380312 -13::49::1::1424380312 -13::51::1::1424380312 -13::52::2::1424380312 -13::53::3::1424380312 -13::54::1::1424380312 -13::62::1::1424380312 -13::63::2::1424380312 -13::64::1::1424380312 -13::68::1::1424380312 -13::71::1::1424380312 -13::72::3::1424380312 -13::73::1::1424380312 -13::74::3::1424380312 -13::77::2::1424380312 -13::78::1::1424380312 -13::79::2::1424380312 -13::83::3::1424380312 -13::85::1::1424380312 -13::86::1::1424380312 -13::87::2::1424380312 -13::88::2::1424380312 -13::90::1::1424380312 -13::93::4::1424380312 -13::94::1::1424380312 -13::98::1::1424380312 -13::99::1::1424380312 -14::1::1::1424380312 -14::3::3::1424380312 -14::4::1::1424380312 -14::5::1::1424380312 -14::6::1::1424380312 -14::7::1::1424380312 -14::9::1::1424380312 -14::10::1::1424380312 -14::11::1::1424380312 -14::12::1::1424380312 -14::13::1::1424380312 -14::14::3::1424380312 -14::15::1::1424380312 -14::16::1::1424380312 -14::17::1::1424380312 -14::20::1::1424380312 -14::21::1::1424380312 -14::24::1::1424380312 -14::25::2::1424380312 -14::27::1::1424380312 -14::28::1::1424380312 -14::29::5::1424380312 -14::31::3::1424380312 -14::34::1::1424380312 -14::36::1::1424380312 -14::37::2::1424380312 -14::39::2::1424380312 -14::40::1::1424380312 -14::44::1::1424380312 -14::45::1::1424380312 -14::47::3::1424380312 
-14::48::1::1424380312 -14::49::1::1424380312 -14::51::1::1424380312 -14::52::5::1424380312 -14::53::3::1424380312 -14::54::1::1424380312 -14::55::1::1424380312 -14::56::1::1424380312 -14::62::4::1424380312 -14::63::5::1424380312 -14::67::3::1424380312 -14::68::1::1424380312 -14::69::3::1424380312 -14::71::1::1424380312 -14::72::4::1424380312 -14::73::1::1424380312 -14::76::5::1424380312 -14::79::1::1424380312 -14::82::1::1424380312 -14::83::1::1424380312 -14::88::1::1424380312 -14::93::3::1424380312 -14::94::1::1424380312 -14::95::2::1424380312 -14::96::4::1424380312 -14::98::1::1424380312 -15::0::1::1424380312 -15::1::4::1424380312 -15::2::1::1424380312 -15::5::2::1424380312 -15::6::1::1424380312 -15::7::1::1424380312 -15::13::1::1424380312 -15::14::1::1424380312 -15::15::1::1424380312 -15::17::2::1424380312 -15::19::2::1424380312 -15::22::2::1424380312 -15::23::2::1424380312 -15::25::1::1424380312 -15::26::3::1424380312 -15::27::1::1424380312 -15::28::2::1424380312 -15::29::1::1424380312 -15::32::1::1424380312 -15::33::2::1424380312 -15::34::1::1424380312 -15::35::2::1424380312 -15::36::1::1424380312 -15::37::1::1424380312 -15::39::1::1424380312 -15::42::1::1424380312 -15::46::5::1424380312 -15::48::2::1424380312 -15::50::2::1424380312 -15::51::1::1424380312 -15::52::1::1424380312 -15::58::1::1424380312 -15::62::1::1424380312 -15::64::3::1424380312 -15::65::2::1424380312 -15::72::1::1424380312 -15::73::1::1424380312 -15::74::1::1424380312 -15::79::1::1424380312 -15::80::1::1424380312 -15::81::1::1424380312 -15::82::2::1424380312 -15::85::1::1424380312 -15::87::1::1424380312 -15::91::2::1424380312 -15::96::1::1424380312 -15::97::1::1424380312 -15::98::3::1424380312 -16::2::1::1424380312 -16::5::3::1424380312 -16::6::2::1424380312 -16::7::1::1424380312 -16::9::1::1424380312 -16::12::1::1424380312 -16::14::1::1424380312 -16::15::1::1424380312 -16::19::1::1424380312 -16::21::2::1424380312 -16::29::4::1424380312 -16::30::2::1424380312 -16::32::1::1424380312 -16::34::1::1424380312 -16::36::1::1424380312 -16::38::1::1424380312 -16::46::1::1424380312 -16::47::3::1424380312 -16::48::1::1424380312 -16::49::1::1424380312 -16::50::1::1424380312 -16::51::5::1424380312 -16::54::5::1424380312 -16::55::1::1424380312 -16::56::2::1424380312 -16::57::1::1424380312 -16::60::1::1424380312 -16::63::2::1424380312 -16::65::1::1424380312 -16::67::1::1424380312 -16::72::1::1424380312 -16::74::1::1424380312 -16::80::1::1424380312 -16::81::1::1424380312 -16::82::1::1424380312 -16::85::5::1424380312 -16::86::1::1424380312 -16::90::5::1424380312 -16::91::1::1424380312 -16::93::1::1424380312 -16::94::3::1424380312 -16::95::2::1424380312 -16::96::3::1424380312 -16::98::3::1424380312 -16::99::1::1424380312 -17::2::1::1424380312 -17::3::1::1424380312 -17::6::1::1424380312 -17::10::4::1424380312 -17::11::1::1424380312 -17::13::2::1424380312 -17::17::5::1424380312 -17::19::1::1424380312 -17::20::5::1424380312 -17::22::4::1424380312 -17::28::1::1424380312 -17::29::1::1424380312 -17::33::1::1424380312 -17::34::1::1424380312 -17::35::2::1424380312 -17::37::1::1424380312 -17::38::1::1424380312 -17::45::1::1424380312 -17::46::5::1424380312 -17::47::1::1424380312 -17::49::3::1424380312 -17::51::1::1424380312 -17::55::5::1424380312 -17::56::3::1424380312 -17::57::1::1424380312 -17::58::1::1424380312 -17::59::1::1424380312 -17::60::1::1424380312 -17::63::1::1424380312 -17::66::1::1424380312 -17::68::4::1424380312 -17::69::1::1424380312 -17::70::1::1424380312 -17::72::1::1424380312 -17::73::3::1424380312 -17::78::1::1424380312 
-17::79::1::1424380312 -17::82::2::1424380312 -17::84::1::1424380312 -17::90::5::1424380312 -17::91::3::1424380312 -17::92::1::1424380312 -17::93::1::1424380312 -17::94::4::1424380312 -17::95::2::1424380312 -17::97::1::1424380312 -18::1::1::1424380312 -18::4::3::1424380312 -18::5::2::1424380312 -18::6::1::1424380312 -18::7::1::1424380312 -18::10::1::1424380312 -18::11::4::1424380312 -18::12::2::1424380312 -18::13::1::1424380312 -18::15::1::1424380312 -18::18::1::1424380312 -18::20::1::1424380312 -18::21::2::1424380312 -18::22::1::1424380312 -18::23::2::1424380312 -18::25::1::1424380312 -18::26::1::1424380312 -18::27::1::1424380312 -18::28::5::1424380312 -18::29::1::1424380312 -18::31::1::1424380312 -18::32::1::1424380312 -18::36::1::1424380312 -18::38::5::1424380312 -18::39::5::1424380312 -18::40::1::1424380312 -18::42::1::1424380312 -18::43::1::1424380312 -18::44::4::1424380312 -18::46::1::1424380312 -18::47::1::1424380312 -18::48::1::1424380312 -18::51::2::1424380312 -18::55::1::1424380312 -18::56::1::1424380312 -18::57::1::1424380312 -18::62::1::1424380312 -18::63::1::1424380312 -18::66::3::1424380312 -18::67::1::1424380312 -18::70::1::1424380312 -18::75::1::1424380312 -18::76::3::1424380312 -18::77::1::1424380312 -18::80::3::1424380312 -18::81::3::1424380312 -18::82::1::1424380312 -18::83::5::1424380312 -18::84::1::1424380312 -18::97::1::1424380312 -18::98::1::1424380312 -18::99::2::1424380312 -19::0::1::1424380312 -19::1::1::1424380312 -19::2::1::1424380312 -19::4::1::1424380312 -19::6::2::1424380312 -19::11::1::1424380312 -19::12::1::1424380312 -19::14::1::1424380312 -19::23::1::1424380312 -19::26::1::1424380312 -19::31::1::1424380312 -19::32::4::1424380312 -19::33::1::1424380312 -19::34::1::1424380312 -19::37::1::1424380312 -19::38::1::1424380312 -19::41::1::1424380312 -19::43::1::1424380312 -19::45::1::1424380312 -19::48::1::1424380312 -19::49::1::1424380312 -19::50::2::1424380312 -19::53::2::1424380312 -19::54::3::1424380312 -19::55::1::1424380312 -19::56::2::1424380312 -19::58::1::1424380312 -19::61::1::1424380312 -19::62::1::1424380312 -19::63::1::1424380312 -19::64::1::1424380312 -19::65::1::1424380312 -19::69::2::1424380312 -19::72::1::1424380312 -19::74::3::1424380312 -19::76::1::1424380312 -19::78::1::1424380312 -19::79::1::1424380312 -19::81::1::1424380312 -19::82::1::1424380312 -19::84::1::1424380312 -19::86::1::1424380312 -19::87::2::1424380312 -19::90::4::1424380312 -19::93::1::1424380312 -19::94::4::1424380312 -19::95::2::1424380312 -19::96::1::1424380312 -19::98::4::1424380312 -20::0::1::1424380312 -20::1::1::1424380312 -20::2::2::1424380312 -20::4::2::1424380312 -20::6::1::1424380312 -20::8::1::1424380312 -20::12::1::1424380312 -20::21::2::1424380312 -20::22::5::1424380312 -20::24::2::1424380312 -20::25::1::1424380312 -20::26::1::1424380312 -20::29::2::1424380312 -20::30::2::1424380312 -20::32::2::1424380312 -20::39::1::1424380312 -20::40::1::1424380312 -20::41::2::1424380312 -20::45::2::1424380312 -20::48::1::1424380312 -20::50::1::1424380312 -20::51::3::1424380312 -20::53::3::1424380312 -20::55::1::1424380312 -20::57::2::1424380312 -20::60::1::1424380312 -20::61::1::1424380312 -20::64::1::1424380312 -20::66::1::1424380312 -20::70::2::1424380312 -20::72::1::1424380312 -20::73::2::1424380312 -20::75::4::1424380312 -20::76::1::1424380312 -20::77::4::1424380312 -20::78::1::1424380312 -20::79::1::1424380312 -20::84::2::1424380312 -20::85::2::1424380312 -20::88::3::1424380312 -20::89::1::1424380312 -20::90::3::1424380312 -20::91::1::1424380312 -20::92::2::1424380312 
-20::93::1::1424380312 -20::94::4::1424380312 -20::97::1::1424380312 -21::0::1::1424380312 -21::2::4::1424380312 -21::3::1::1424380312 -21::7::2::1424380312 -21::11::1::1424380312 -21::12::1::1424380312 -21::13::1::1424380312 -21::14::3::1424380312 -21::17::1::1424380312 -21::19::1::1424380312 -21::20::1::1424380312 -21::21::1::1424380312 -21::22::1::1424380312 -21::23::1::1424380312 -21::24::1::1424380312 -21::27::1::1424380312 -21::29::5::1424380312 -21::30::2::1424380312 -21::38::1::1424380312 -21::40::2::1424380312 -21::43::3::1424380312 -21::44::1::1424380312 -21::45::1::1424380312 -21::46::1::1424380312 -21::48::1::1424380312 -21::51::1::1424380312 -21::53::5::1424380312 -21::54::1::1424380312 -21::55::1::1424380312 -21::56::1::1424380312 -21::58::3::1424380312 -21::59::3::1424380312 -21::64::1::1424380312 -21::66::1::1424380312 -21::68::1::1424380312 -21::71::1::1424380312 -21::73::1::1424380312 -21::74::4::1424380312 -21::80::1::1424380312 -21::81::1::1424380312 -21::83::1::1424380312 -21::84::1::1424380312 -21::85::3::1424380312 -21::87::4::1424380312 -21::89::2::1424380312 -21::92::2::1424380312 -21::96::3::1424380312 -21::99::1::1424380312 -22::0::1::1424380312 -22::3::2::1424380312 -22::5::2::1424380312 -22::6::2::1424380312 -22::9::1::1424380312 -22::10::1::1424380312 -22::11::1::1424380312 -22::13::1::1424380312 -22::14::1::1424380312 -22::16::1::1424380312 -22::18::3::1424380312 -22::19::1::1424380312 -22::22::5::1424380312 -22::25::1::1424380312 -22::26::1::1424380312 -22::29::3::1424380312 -22::30::5::1424380312 -22::32::4::1424380312 -22::33::1::1424380312 -22::35::1::1424380312 -22::36::3::1424380312 -22::37::1::1424380312 -22::40::1::1424380312 -22::41::3::1424380312 -22::44::1::1424380312 -22::45::2::1424380312 -22::48::1::1424380312 -22::51::5::1424380312 -22::55::1::1424380312 -22::56::2::1424380312 -22::60::3::1424380312 -22::61::1::1424380312 -22::62::4::1424380312 -22::63::1::1424380312 -22::65::1::1424380312 -22::66::1::1424380312 -22::68::4::1424380312 -22::69::4::1424380312 -22::70::3::1424380312 -22::71::1::1424380312 -22::74::5::1424380312 -22::75::5::1424380312 -22::78::1::1424380312 -22::80::3::1424380312 -22::81::1::1424380312 -22::82::1::1424380312 -22::84::1::1424380312 -22::86::1::1424380312 -22::87::3::1424380312 -22::88::5::1424380312 -22::90::2::1424380312 -22::92::3::1424380312 -22::95::2::1424380312 -22::96::2::1424380312 -22::98::4::1424380312 -22::99::1::1424380312 -23::0::1::1424380312 -23::2::1::1424380312 -23::4::1::1424380312 -23::6::2::1424380312 -23::10::4::1424380312 -23::12::1::1424380312 -23::13::4::1424380312 -23::14::1::1424380312 -23::15::1::1424380312 -23::18::4::1424380312 -23::22::2::1424380312 -23::23::4::1424380312 -23::24::1::1424380312 -23::25::1::1424380312 -23::26::1::1424380312 -23::27::5::1424380312 -23::28::1::1424380312 -23::29::1::1424380312 -23::30::4::1424380312 -23::32::5::1424380312 -23::33::2::1424380312 -23::36::3::1424380312 -23::37::1::1424380312 -23::38::1::1424380312 -23::39::1::1424380312 -23::43::1::1424380312 -23::48::5::1424380312 -23::49::5::1424380312 -23::50::4::1424380312 -23::53::1::1424380312 -23::55::5::1424380312 -23::57::1::1424380312 -23::59::1::1424380312 -23::60::1::1424380312 -23::61::1::1424380312 -23::64::4::1424380312 -23::65::5::1424380312 -23::66::2::1424380312 -23::67::1::1424380312 -23::68::3::1424380312 -23::69::1::1424380312 -23::72::1::1424380312 -23::73::3::1424380312 -23::77::1::1424380312 -23::82::2::1424380312 -23::83::1::1424380312 -23::84::1::1424380312 -23::85::1::1424380312 
-23::87::3::1424380312 -23::88::1::1424380312 -23::95::2::1424380312 -23::97::1::1424380312 -24::4::1::1424380312 -24::6::3::1424380312 -24::7::1::1424380312 -24::10::2::1424380312 -24::12::1::1424380312 -24::15::1::1424380312 -24::19::1::1424380312 -24::24::1::1424380312 -24::27::3::1424380312 -24::30::5::1424380312 -24::31::1::1424380312 -24::32::3::1424380312 -24::33::1::1424380312 -24::37::1::1424380312 -24::39::1::1424380312 -24::40::1::1424380312 -24::42::1::1424380312 -24::43::3::1424380312 -24::45::2::1424380312 -24::46::1::1424380312 -24::47::1::1424380312 -24::48::1::1424380312 -24::49::1::1424380312 -24::50::1::1424380312 -24::52::5::1424380312 -24::57::1::1424380312 -24::59::4::1424380312 -24::63::4::1424380312 -24::65::1::1424380312 -24::66::1::1424380312 -24::67::1::1424380312 -24::68::3::1424380312 -24::69::5::1424380312 -24::71::1::1424380312 -24::72::4::1424380312 -24::77::4::1424380312 -24::78::1::1424380312 -24::80::1::1424380312 -24::82::1::1424380312 -24::84::1::1424380312 -24::86::1::1424380312 -24::87::1::1424380312 -24::88::2::1424380312 -24::89::1::1424380312 -24::90::5::1424380312 -24::91::1::1424380312 -24::92::1::1424380312 -24::94::2::1424380312 -24::95::1::1424380312 -24::96::5::1424380312 -24::98::1::1424380312 -24::99::1::1424380312 -25::1::3::1424380312 -25::2::1::1424380312 -25::7::1::1424380312 -25::9::1::1424380312 -25::12::3::1424380312 -25::16::3::1424380312 -25::17::1::1424380312 -25::18::1::1424380312 -25::20::1::1424380312 -25::22::1::1424380312 -25::23::1::1424380312 -25::26::2::1424380312 -25::29::1::1424380312 -25::30::1::1424380312 -25::31::2::1424380312 -25::33::4::1424380312 -25::34::3::1424380312 -25::35::2::1424380312 -25::36::1::1424380312 -25::37::1::1424380312 -25::40::1::1424380312 -25::41::1::1424380312 -25::43::1::1424380312 -25::47::4::1424380312 -25::50::1::1424380312 -25::51::1::1424380312 -25::53::1::1424380312 -25::56::1::1424380312 -25::58::2::1424380312 -25::64::2::1424380312 -25::67::2::1424380312 -25::68::1::1424380312 -25::70::1::1424380312 -25::71::4::1424380312 -25::73::1::1424380312 -25::74::1::1424380312 -25::76::1::1424380312 -25::79::1::1424380312 -25::82::1::1424380312 -25::84::2::1424380312 -25::85::1::1424380312 -25::91::3::1424380312 -25::92::1::1424380312 -25::94::1::1424380312 -25::95::1::1424380312 -25::97::2::1424380312 -26::0::1::1424380312 -26::1::1::1424380312 -26::2::1::1424380312 -26::3::1::1424380312 -26::4::4::1424380312 -26::5::2::1424380312 -26::6::3::1424380312 -26::7::5::1424380312 -26::13::3::1424380312 -26::14::1::1424380312 -26::16::1::1424380312 -26::18::3::1424380312 -26::20::1::1424380312 -26::21::3::1424380312 -26::22::5::1424380312 -26::23::5::1424380312 -26::24::5::1424380312 -26::27::1::1424380312 -26::31::1::1424380312 -26::35::1::1424380312 -26::36::4::1424380312 -26::40::1::1424380312 -26::44::1::1424380312 -26::45::2::1424380312 -26::47::1::1424380312 -26::48::1::1424380312 -26::49::3::1424380312 -26::50::2::1424380312 -26::52::1::1424380312 -26::54::4::1424380312 -26::55::1::1424380312 -26::57::3::1424380312 -26::58::1::1424380312 -26::61::1::1424380312 -26::62::2::1424380312 -26::66::1::1424380312 -26::68::4::1424380312 -26::71::1::1424380312 -26::73::4::1424380312 -26::76::1::1424380312 -26::81::3::1424380312 -26::85::1::1424380312 -26::86::3::1424380312 -26::88::5::1424380312 -26::91::1::1424380312 -26::94::5::1424380312 -26::95::1::1424380312 -26::96::1::1424380312 -26::97::1::1424380312 -27::0::1::1424380312 -27::9::1::1424380312 -27::10::1::1424380312 -27::18::4::1424380312 
-27::19::3::1424380312 -27::20::1::1424380312 -27::22::2::1424380312 -27::24::2::1424380312 -27::25::1::1424380312 -27::27::3::1424380312 -27::28::1::1424380312 -27::29::1::1424380312 -27::31::1::1424380312 -27::33::3::1424380312 -27::40::1::1424380312 -27::42::1::1424380312 -27::43::1::1424380312 -27::44::3::1424380312 -27::45::1::1424380312 -27::51::3::1424380312 -27::52::1::1424380312 -27::55::3::1424380312 -27::57::1::1424380312 -27::59::1::1424380312 -27::60::1::1424380312 -27::61::1::1424380312 -27::64::1::1424380312 -27::66::3::1424380312 -27::68::1::1424380312 -27::70::1::1424380312 -27::71::2::1424380312 -27::72::1::1424380312 -27::75::3::1424380312 -27::78::1::1424380312 -27::80::3::1424380312 -27::82::1::1424380312 -27::83::3::1424380312 -27::86::1::1424380312 -27::87::2::1424380312 -27::90::1::1424380312 -27::91::1::1424380312 -27::92::1::1424380312 -27::93::1::1424380312 -27::94::2::1424380312 -27::95::1::1424380312 -27::98::1::1424380312 -28::0::3::1424380312 -28::1::1::1424380312 -28::2::4::1424380312 -28::3::1::1424380312 -28::6::1::1424380312 -28::7::1::1424380312 -28::12::5::1424380312 -28::13::2::1424380312 -28::14::1::1424380312 -28::15::1::1424380312 -28::17::1::1424380312 -28::19::3::1424380312 -28::20::1::1424380312 -28::23::3::1424380312 -28::24::3::1424380312 -28::27::1::1424380312 -28::29::1::1424380312 -28::33::1::1424380312 -28::34::1::1424380312 -28::36::1::1424380312 -28::38::2::1424380312 -28::39::2::1424380312 -28::44::1::1424380312 -28::45::1::1424380312 -28::49::4::1424380312 -28::50::1::1424380312 -28::52::1::1424380312 -28::54::1::1424380312 -28::56::1::1424380312 -28::57::3::1424380312 -28::58::1::1424380312 -28::59::1::1424380312 -28::60::1::1424380312 -28::62::3::1424380312 -28::63::1::1424380312 -28::65::1::1424380312 -28::75::1::1424380312 -28::78::1::1424380312 -28::81::5::1424380312 -28::82::4::1424380312 -28::83::1::1424380312 -28::85::1::1424380312 -28::88::2::1424380312 -28::89::4::1424380312 -28::90::1::1424380312 -28::92::5::1424380312 -28::94::1::1424380312 -28::95::2::1424380312 -28::98::1::1424380312 -28::99::1::1424380312 -29::3::1::1424380312 -29::4::1::1424380312 -29::5::1::1424380312 -29::7::2::1424380312 -29::9::1::1424380312 -29::10::3::1424380312 -29::11::1::1424380312 -29::13::3::1424380312 -29::14::1::1424380312 -29::15::1::1424380312 -29::17::3::1424380312 -29::19::3::1424380312 -29::22::3::1424380312 -29::23::4::1424380312 -29::25::1::1424380312 -29::29::1::1424380312 -29::31::1::1424380312 -29::32::4::1424380312 -29::33::2::1424380312 -29::36::2::1424380312 -29::38::3::1424380312 -29::39::1::1424380312 -29::42::1::1424380312 -29::46::5::1424380312 -29::49::3::1424380312 -29::51::2::1424380312 -29::59::1::1424380312 -29::61::1::1424380312 -29::62::1::1424380312 -29::67::1::1424380312 -29::68::3::1424380312 -29::69::1::1424380312 -29::70::1::1424380312 -29::74::1::1424380312 -29::75::1::1424380312 -29::79::2::1424380312 -29::80::1::1424380312 -29::81::2::1424380312 -29::83::1::1424380312 -29::85::1::1424380312 -29::86::1::1424380312 -29::90::4::1424380312 -29::93::1::1424380312 -29::94::4::1424380312 -29::97::1::1424380312 -29::99::1::1424380312 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/als/test.data b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/als/test.data deleted file mode 100644 index e476cc2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/als/test.data +++ /dev/null @@ -1,16 +0,0 @@ -1,1,5.0 -1,2,1.0 -1,3,5.0 -1,4,1.0 -2,1,5.0 -2,2,1.0 -2,3,5.0 -2,4,1.0 -3,1,1.0 -3,2,5.0 -3,3,1.0 -3,4,5.0 
-4,1,1.0 -4,2,5.0 -4,3,1.0 -4,4,5.0 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/gmm_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/gmm_data.txt deleted file mode 100644 index 934ee4a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/gmm_data.txt +++ /dev/null @@ -1,2000 +0,0 @@ - 2.59470454e+00 2.12298217e+00 - 1.15807024e+00 -1.46498723e-01 - 2.46206638e+00 6.19556894e-01 - -5.54845070e-01 -7.24700066e-01 - -3.23111426e+00 -1.42579084e+00 - 3.02978115e+00 7.87121753e-01 - 1.97365907e+00 1.15914704e+00 - -6.44852101e+00 -3.18154314e+00 - 1.30963349e+00 1.62866434e-01 - 4.26482541e+00 2.15547996e+00 - 3.79927257e+00 1.50572445e+00 - 4.17452609e-01 -6.74032760e-01 - 4.21117627e-01 4.45590255e-01 - -2.80425571e+00 -7.77150554e-01 - 2.55928797e+00 7.03954218e-01 - 1.32554059e+00 -9.46663152e-01 - -3.39691439e+00 -1.49005743e+00 - -2.26542270e-01 3.60052515e-02 - 1.04994198e+00 5.29825685e-01 - -1.51566882e+00 -1.86264432e-01 - -3.27928172e-01 -7.60859110e-01 - -3.18054866e-01 3.97719805e-01 - 1.65579418e-01 -3.47232033e-01 - 6.47162333e-01 4.96059961e-02 - -2.80776647e-01 4.79418757e-01 - 7.45069752e-01 1.20790281e-01 - 2.13604102e-01 1.59542555e-01 - -3.08860224e+00 -1.43259870e+00 - 8.97066497e-01 1.10206801e+00 - -2.23918874e-01 -1.07267267e+00 - 2.51525708e+00 2.84761973e-01 - 9.98052532e-01 1.08333783e+00 - 1.76705588e+00 8.18866778e-01 - 5.31555163e-02 -1.90111151e-01 - -2.17405059e+00 7.21854582e-02 - -2.13772505e+00 -3.62010387e-01 - 2.95974057e+00 1.31602381e+00 - 2.74053561e+00 1.61781757e+00 - 6.68135448e-01 2.86586009e-01 - 2.82323739e+00 1.74437257e+00 - 8.11540288e-01 5.50744478e-01 - 4.10050897e-01 5.10668402e-03 - 9.58626136e-01 -3.49633680e-01 - 4.66599798e+00 1.49964894e+00 - 4.94507794e-01 2.58928077e-01 - -2.36029742e+00 -1.61042909e+00 - -4.99306804e-01 -8.04984769e-01 - 1.07448510e+00 9.39605828e-01 - -1.80448949e+00 -1.05983264e+00 - -3.22353821e-01 1.73612093e-01 - 1.85418702e+00 1.15640643e+00 - 6.93794163e-01 6.59993560e-01 - 1.99399102e+00 1.44547123e+00 - 3.38866124e+00 1.23379290e+00 - -4.24067720e+00 -1.22264282e+00 - 6.03230201e-02 2.95232729e-01 - -3.59341813e+00 -7.17453726e-01 - 4.87447372e-01 -2.00733911e-01 - 1.20149195e+00 4.07880197e-01 - -2.13331464e+00 -4.58518077e-01 - -3.84091083e+00 -1.71553950e+00 - -5.37279250e-01 2.64822629e-02 - -2.10155227e+00 -1.32558103e+00 - -1.71318897e+00 -7.12098563e-01 - -1.46280695e+00 -1.84868337e-01 - -3.59785325e+00 -1.54832434e+00 - -5.77528081e-01 -5.78580857e-01 - 3.14734283e-01 5.80184639e-01 - -2.71164714e+00 -1.19379432e+00 - 1.09634489e+00 7.20143887e-01 - -3.05527722e+00 -1.47774064e+00 - 6.71753586e-01 7.61350020e-01 - 3.98294144e+00 1.54166484e+00 - -3.37220384e+00 -2.21332064e+00 - 1.81222914e+00 7.41212752e-01 - 2.71458282e-01 1.36329078e-01 - -3.97815359e-01 1.16766886e-01 - -1.70192814e+00 -9.75851571e-01 - -3.46803804e+00 -1.09965988e+00 - -1.69649627e+00 -5.76045801e-01 - -1.02485636e-01 -8.81841246e-01 - -3.24194667e-02 2.55429276e-01 - -2.75343168e+00 -1.51366320e+00 - -2.78676702e+00 -5.22360489e-01 - 1.70483164e+00 1.19769805e+00 - 4.92022579e-01 3.24944706e-01 - 2.48768464e+00 1.00055363e+00 - 4.48786400e-01 7.63902870e-01 - 2.93862696e+00 1.73809968e+00 - -3.55019305e+00 -1.97875558e+00 - 1.74270784e+00 6.90229224e-01 - 5.13391994e-01 4.58374016e-01 - 1.78379499e+00 9.08026381e-01 - 1.75814147e+00 7.41449784e-01 - -2.30687792e-01 3.91009729e-01 - 3.92271353e+00 1.44006290e+00 - 2.93361679e-01 -4.99886375e-03 - 2.47902690e-01 -7.49542503e-01 - -3.97675355e-01 
1.36824887e-01 - 3.56535953e+00 1.15181329e+00 - 3.22425301e+00 1.28702383e+00 - -2.94192478e-01 -2.42382557e-01 - 8.02068864e-01 -1.51671475e-01 - 8.54133530e-01 -4.89514885e-02 - -1.64316316e-01 -5.34642346e-01 - -6.08485405e-01 -2.10332352e-01 - -2.18940059e+00 -1.07024952e+00 - -1.71586960e+00 -2.83333492e-02 - 1.70200448e-01 -3.28031178e-01 - -1.97210346e+00 -5.39948532e-01 - 2.19500160e+00 1.05697170e+00 - -1.76239935e+00 -1.09377438e+00 - 1.68314744e+00 6.86491164e-01 - -2.99852288e+00 -1.46619067e+00 - -2.23769560e+00 -9.15008355e-01 - 9.46887516e-01 5.58410503e-01 - 5.02153123e-01 1.63851235e-01 - -9.70297062e-01 3.14625374e-01 - -1.29405593e+00 -8.20994131e-01 - 2.72516079e+00 7.85839947e-01 - 1.45788024e+00 3.37487353e-01 - -4.36292749e-01 -5.42150480e-01 - 2.21304711e+00 1.25254042e+00 - -1.20810271e-01 4.79632898e-01 - -3.30884511e+00 -1.50607586e+00 - -6.55882455e+00 -1.94231256e+00 - -3.17033630e+00 -9.94678930e-01 - 1.42043617e+00 7.28808957e-01 - -1.57546099e+00 -1.10320497e+00 - -3.22748754e+00 -1.64174579e+00 - 2.96776017e-03 -3.16191512e-02 - -2.25986054e+00 -6.13123197e-01 - 2.49434243e+00 7.73069183e-01 - 9.08494049e-01 -1.53926853e-01 - -2.80559090e+00 -1.37474221e+00 - 4.75224286e-01 2.53153674e-01 - 4.37644006e+00 8.49116998e-01 - 2.27282959e+00 6.16568202e-01 - 1.16006880e+00 1.65832798e-01 - -1.67163193e+00 -1.22555386e+00 - -1.38231118e+00 -7.29575504e-01 - -3.49922750e+00 -2.26446675e+00 - -3.73780110e-01 -1.90657869e-01 - 1.68627679e+00 1.05662987e+00 - -3.28891792e+00 -1.11080334e+00 - -2.59815798e+00 -1.51410198e+00 - -2.61203309e+00 -6.00143552e-01 - 6.58964943e-01 4.47216094e-01 - -2.26711381e+00 -7.26512923e-01 - -5.31429009e-02 -1.97925341e-02 - 3.19749807e+00 9.20425476e-01 - -1.37595787e+00 -6.58062732e-01 - 8.09900278e-01 -3.84286160e-01 - -5.07741280e+00 -1.97683808e+00 - -2.99764250e+00 -1.50753777e+00 - -9.87671815e-01 -4.63255889e-01 - 1.65390765e+00 6.73806615e-02 - 5.51252659e+00 2.69842267e+00 - -2.23724309e+00 -4.77624004e-01 - 4.99726228e+00 1.74690949e+00 - 1.75859162e-01 -1.49350995e-01 - 4.13382789e+00 1.31735161e+00 - 2.69058117e+00 4.87656923e-01 - 1.07180318e+00 1.01426954e+00 - 3.37216869e+00 1.05955377e+00 - -2.95006781e+00 -1.57048303e+00 - -2.46401648e+00 -8.37056374e-01 - 1.19012962e-01 7.54702770e-01 - 3.34142539e+00 4.81938295e-01 - 2.92643913e+00 1.04301050e+00 - 2.89697751e+00 1.37551442e+00 - -1.03094242e+00 2.20903962e-01 - -5.13914589e+00 -2.23355387e+00 - -8.81680780e-01 1.83590000e-01 - 2.82334775e+00 1.26650464e+00 - -2.81042540e-01 -3.26370240e-01 - 2.97995487e+00 8.34569452e-01 - -1.39857135e+00 -1.15798385e+00 - 4.27186506e+00 9.04253702e-01 - 6.98684517e-01 7.91167305e-01 - 3.52233095e+00 1.29976473e+00 - 2.21448029e+00 2.73213379e-01 - -3.13505683e-01 -1.20593774e-01 - 3.70571571e+00 1.06220876e+00 - 9.83881041e-01 5.67713803e-01 - -2.17897705e+00 2.52925205e-01 - 1.38734039e+00 4.61287066e-01 - -1.41181602e+00 -1.67248955e-02 - -1.69974639e+00 -7.17812071e-01 - -2.01005793e-01 -7.49662056e-01 - 1.69016336e+00 3.24687979e-01 - -2.03250179e+00 -2.76108460e-01 - 3.68776848e-01 4.12536941e-01 - 7.66238259e-01 -1.84750637e-01 - -2.73989147e-01 -1.72817250e-01 - -2.18623745e+00 -2.10906798e-01 - -1.39795625e-01 3.26066094e-02 - -2.73826912e-01 -6.67586097e-02 - -1.57880654e+00 -4.99395900e-01 - 4.55950908e+00 2.29410489e+00 - -7.36479631e-01 -1.57861857e-01 - 1.92082888e+00 1.05843391e+00 - 4.29192810e+00 1.38127810e+00 - 1.61852879e+00 1.95871986e-01 - -1.95027403e+00 -5.22448168e-01 - -1.67446281e+00 -9.41497162e-01 - 
6.07097859e-01 3.44178029e-01 - -3.44004683e+00 -1.49258461e+00 - 2.72114752e+00 6.00728991e-01 - 8.80685522e-01 -2.53243336e-01 - 1.39254928e+00 3.42988512e-01 - 1.14194836e-01 -8.57945694e-02 - -1.49387332e+00 -7.60860481e-01 - -1.98053285e+00 -4.86039865e-01 - 3.56008568e+00 1.08438692e+00 - 2.27833961e-01 1.09441881e+00 - -1.16716710e+00 -6.54778242e-01 - 2.02156613e+00 5.42075758e-01 - 1.08429178e+00 -7.67420693e-01 - 6.63058455e-01 4.61680991e-01 - -1.06201537e+00 1.38862846e-01 - 3.08701875e+00 8.32580273e-01 - -4.96558108e-01 -2.47031257e-01 - 7.95109987e-01 7.59314147e-02 - -3.39903524e-01 8.71565566e-03 - 8.68351357e-01 4.78358641e-01 - 1.48750819e+00 7.63257420e-01 - -4.51224101e-01 -4.44056898e-01 - -3.02734750e-01 -2.98487961e-01 - 5.46846609e-01 7.02377629e-01 - 1.65129778e+00 3.74008231e-01 - -7.43336512e-01 3.95723531e-01 - -5.88446605e-01 -6.47520211e-01 - 3.58613167e+00 1.95024937e+00 - 3.11718883e+00 8.37984715e-01 - 1.80919244e+00 9.62644986e-01 - 5.43856371e-02 -5.86297543e-01 - -1.95186766e+00 -1.02624212e-01 - 8.95628057e-01 5.91812281e-01 - 4.97691627e-02 5.31137156e-01 - -1.07633113e+00 -2.47392788e-01 - -1.17257986e+00 -8.68528265e-01 - -8.19227665e-02 5.80579434e-03 - -2.86409787e-01 1.95812924e-01 - 1.10582671e+00 7.42853240e-01 - 4.06429774e+00 1.06557476e+00 - -3.42521792e+00 -7.74327139e-01 - 1.28468671e+00 6.20431661e-01 - 6.01201008e-01 -1.16799728e-01 - -1.85058727e-01 -3.76235293e-01 - 5.44083324e+00 2.98490868e+00 - 2.69273070e+00 7.83901153e-01 - 1.88938036e-01 -4.83222152e-01 - 1.05667256e+00 -2.57003165e-01 - 2.99711662e-01 -4.33131912e-01 - 7.73689216e-02 -1.78738364e-01 - 9.58326279e-01 6.38325706e-01 - -3.97727049e-01 2.27314759e-01 - 3.36098175e+00 1.12165237e+00 - 1.77804871e+00 6.46961933e-01 - -2.86945546e+00 -1.00395518e+00 - 3.03494815e+00 7.51814612e-01 - -1.43658194e+00 -3.55432244e-01 - -3.08455105e+00 -1.51535106e+00 - -1.55841975e+00 3.93454820e-02 - 7.96073412e-01 -3.11036969e-01 - -9.84125401e-01 -1.02064649e+00 - -7.75688143e+00 -3.65219926e+00 - 1.53816429e+00 7.65926670e-01 - -4.92712738e-01 2.32244240e-02 - -1.93166919e+00 -1.07701304e+00 - 2.03029875e-02 -7.54055699e-01 - 2.52177489e+00 1.01544979e+00 - 3.65109048e-01 -9.48328494e-01 - -1.28849143e-01 2.51947174e-01 - -1.02428075e+00 -9.37767116e-01 - -3.04179748e+00 -9.97926994e-01 - -2.51986980e+00 -1.69117413e+00 - -1.24900838e+00 -4.16179917e-01 - 2.77943992e+00 1.22842327e+00 - -4.37434557e+00 -1.70182693e+00 - -1.60019319e+00 -4.18345639e-01 - -1.67613646e+00 -9.44087262e-01 - -9.00843245e-01 8.26378089e-02 - 3.29770621e-01 -9.07870444e-01 - -2.84650535e+00 -9.00155396e-01 - 1.57111705e+00 7.07432268e-01 - 1.24948552e+00 1.04812849e-01 - 1.81440558e+00 9.53545082e-01 - -1.74915794e+00 -1.04606288e+00 - 1.20593269e+00 -1.12607147e-02 - 1.36004919e-01 -1.09828044e+00 - 2.57480693e-01 3.34941541e-01 - 7.78775385e-01 -5.32494732e-01 - -1.79155126e+00 -6.29994129e-01 - -1.75706839e+00 -8.35100126e-01 - 4.29512012e-01 7.81426910e-02 - 3.08349370e-01 -1.27359861e-01 - 1.05560329e+00 4.55150640e-01 - 1.95662574e+00 1.17593217e+00 - 8.77376632e-01 6.57866662e-01 - 7.71311255e-01 9.15134334e-02 - -6.36978275e+00 -2.55874241e+00 - -2.98335339e+00 -1.59567024e+00 - -3.67104587e-01 1.85315291e-01 - 1.95347407e+00 -7.15503113e-02 - 8.45556363e-01 6.51256415e-02 - 9.42868521e-01 3.56647624e-01 - 2.99321875e+00 1.07505254e+00 - -2.91030538e-01 -3.77637183e-01 - 1.62870918e+00 3.37563671e-01 - 2.05773173e-01 3.43337416e-01 - -8.40879199e-01 -1.35600767e-01 - 1.38101624e+00 5.99253495e-01 - 
-6.93715607e+00 -2.63580662e+00 - -1.04423404e+00 -8.32865050e-01 - 1.33448476e+00 1.04863475e+00 - 6.01675207e-01 1.98585194e-01 - 2.31233993e+00 7.98628331e-01 - 1.85201313e-01 -1.76070247e+00 - 1.92006354e+00 8.45737582e-01 - 1.06320415e+00 2.93426068e-01 - -1.20360141e+00 -1.00301288e+00 - 1.95926629e+00 6.26643532e-01 - 6.04483978e-02 5.72643059e-01 - -1.04568563e+00 -5.91021496e-01 - 2.62300678e+00 9.50997831e-01 - -4.04610275e-01 3.73150879e-01 - 2.26371902e+00 8.73627529e-01 - 2.12545313e+00 7.90640352e-01 - 7.72181917e-03 1.65718952e-02 - 1.00422340e-01 -2.05562936e-01 - -1.22989802e+00 -1.01841681e-01 - 3.09064082e+00 1.04288010e+00 - 5.18274167e+00 1.34749259e+00 - -8.32075153e-01 -1.97592029e-01 - 3.84126764e-02 5.58171345e-01 - 4.99560727e-01 -4.26154438e-02 - 4.79071151e+00 2.19728942e+00 - -2.78437968e+00 -1.17812590e+00 - -2.22804226e+00 -4.31174255e-01 - 8.50762292e-01 -1.06445261e-01 - 1.10812830e+00 -2.59118812e-01 - -2.91450155e-01 6.42802679e-01 - -1.38631532e-01 -5.88585623e-01 - -5.04120983e-01 -2.17094915e-01 - 3.41410820e+00 1.67897767e+00 - -2.23697326e+00 -6.62735244e-01 - -3.55961064e-01 -1.27647226e-01 - -3.55568274e+00 -2.49011369e+00 - -8.77586408e-01 -9.38268065e-03 - 1.52382384e-01 -5.62155760e-01 - 1.55885574e-01 1.07617069e-01 - -8.37129973e-01 -5.22259081e-01 - -2.92741750e+00 -1.35049428e+00 - -3.54670781e-01 5.69205952e-02 - 2.21030255e+00 1.34689986e+00 - 1.60787722e+00 5.75984706e-01 - 1.32294221e+00 5.31577509e-01 - 7.05672928e-01 3.34241244e-01 - 1.41406179e+00 1.15783408e+00 - -6.92172228e-01 -2.84817896e-01 - 3.28358655e-01 -2.66910083e-01 - 1.68013644e-01 -4.28016549e-02 - 2.07365974e+00 7.76496211e-01 - -3.92974907e-01 2.46796730e-01 - -5.76078636e-01 3.25676963e-01 - -1.82547204e-01 -5.06410543e-01 - 3.04754906e+00 1.16174496e+00 - -3.01090632e+00 -1.09195183e+00 - -1.44659696e+00 -6.87838682e-01 - 2.11395861e+00 9.10495785e-01 - 1.40962871e+00 1.13568678e+00 - -1.66653234e-01 -2.10012503e-01 - 3.17456029e+00 9.74502922e-01 - 2.15944820e+00 8.62807189e-01 - -3.45418719e+00 -1.33647548e+00 - -3.41357732e+00 -8.47048920e-01 - -3.06702448e-01 -6.64280634e-01 - -2.86930714e-01 -1.35268264e-01 - -3.15835557e+00 -5.43439253e-01 - 2.49541440e-01 -4.71733570e-01 - 2.71933912e+00 4.13308399e-01 - -2.43787038e+00 -1.08050547e+00 - -4.90234490e-01 -6.64069865e-01 - 8.99524451e-02 5.76180541e-01 - 5.00500404e+00 2.12125521e+00 - -1.73107940e-01 -2.28506575e-02 - 5.44938858e-01 -1.29523352e-01 - 5.13526842e+00 1.68785993e+00 - 1.70228304e+00 1.02601138e+00 - 3.58957507e+00 1.54396196e+00 - 1.85615738e+00 4.92916197e-01 - 2.55772147e+00 7.88438908e-01 - -1.57008279e+00 -4.17377300e-01 - -1.42548604e+00 -3.63684860e-01 - -8.52026118e-01 2.72052686e-01 - -5.10563077e+00 -2.35665994e+00 - -2.95517031e+00 -1.84945297e+00 - -2.91947959e+00 -1.66016784e+00 - -4.21462387e+00 -1.41131535e+00 - 6.59901121e-01 4.87156314e-01 - -9.75352532e-01 -4.50231285e-01 - -5.94084444e-01 -1.16922670e+00 - 7.50554615e-01 -9.83692552e-01 - 1.07054926e+00 2.77143030e-01 - -3.88079578e-01 -4.17737309e-02 - -9.59373733e-01 -8.85454886e-01 - -7.53560665e-02 -5.16223870e-02 - 9.84108158e-01 -5.89290700e-02 - 1.87272961e-01 -4.34238391e-01 - 6.86509981e-01 -3.15116460e-01 - -1.07762538e+00 6.58984161e-02 - 6.09266592e-01 6.91808473e-02 - -8.30529954e-01 -7.00454791e-01 - -9.13179464e-01 -6.31712891e-01 - 7.68744851e-01 1.09840676e+00 - -1.07606690e+00 -8.78390282e-01 - -1.71038184e+00 -5.73606033e-01 - 8.75982765e-01 3.66343143e-01 - -7.04919009e-01 -8.49182590e-01 - -1.00274668e+00 
-7.99573611e-01 - -1.05562848e+00 -5.84060076e-01 - 4.03490015e+00 1.28679206e+00 - -3.53484804e+00 -1.71381255e+00 - 2.31527363e-01 1.04179397e-01 - -3.58592392e-02 3.74895739e-01 - 3.92253428e+00 1.81852726e+00 - -7.27384249e-01 -6.45605128e-01 - 4.65678097e+00 2.41379899e+00 - 1.16750534e+00 7.60718205e-01 - 1.15677059e+00 7.96225550e-01 - -1.42920261e+00 -4.66946295e-01 - 3.71148192e+00 1.88060191e+00 - 2.44052407e+00 3.84472199e-01 - -1.64535035e+00 -8.94530036e-01 - -3.69608753e+00 -1.36402754e+00 - 2.24419208e+00 9.69744889e-01 - 2.54822427e+00 1.22613039e+00 - 3.77484909e-01 -5.98521878e-01 - -3.61521175e+00 -1.11123912e+00 - 3.28113127e+00 1.52551775e+00 - -3.51030902e+00 -1.53913980e+00 - -2.44874505e+00 -6.30246005e-01 - -3.42516153e-01 -5.07352665e-01 - 1.09110502e+00 6.36821628e-01 - -2.49434967e+00 -8.02827146e-01 - 1.41763139e+00 -3.46591820e-01 - 1.61108619e+00 5.93871102e-01 - 3.97371717e+00 1.35552499e+00 - -1.33437177e+00 -2.83908670e-01 - -1.41606483e+00 -1.76402601e-01 - 2.23945322e-01 -1.77157065e-01 - 2.60271569e+00 2.40778251e-01 - -2.82213895e-02 1.98255474e-01 - 4.20727940e+00 1.31490863e+00 - 3.36944889e+00 1.57566635e+00 - 3.53049396e+00 1.73579350e+00 - -1.29170202e+00 -1.64196290e+00 - 9.27295604e-01 9.98808036e-01 - 1.75321843e-01 -2.83267817e-01 - -2.19069578e+00 -1.12814358e+00 - 1.66606031e+00 7.68006933e-01 - -7.13826035e-01 5.20881684e-02 - -3.43821888e+00 -2.36137021e+00 - -5.93210310e-01 1.21843813e-01 - -4.09800822e+00 -1.39893953e+00 - 2.74110954e+00 1.52728606e+00 - 1.72652512e+00 -1.25435113e-01 - 1.97722357e+00 6.40667481e-01 - 4.18635780e-01 3.57018509e-01 - -1.78303569e+00 -2.11864764e-01 - -3.52809366e+00 -2.58794450e-01 - -4.72407090e+00 -1.63870734e+00 - 1.73917807e+00 8.73251829e-01 - 4.37979356e-01 8.49210569e-01 - 3.93791881e+00 1.76269490e+00 - 2.79065411e+00 1.04019042e+00 - -8.47426142e-01 -3.40136892e-01 - -4.24389181e+00 -1.80253120e+00 - -1.86675870e+00 -7.64558265e-01 - 9.46212675e-01 -7.77681445e-02 - -2.82448462e+00 -1.33592449e+00 - -2.57938567e+00 -1.56554690e+00 - -2.71615767e+00 -6.27667233e-01 - -1.55999166e+00 -5.81013466e-01 - -4.24696864e-01 -7.44673250e-01 - 1.67592970e+00 7.68164292e-01 - 8.48455216e-01 -6.05681126e-01 - 6.12575454e+00 1.65607584e+00 - 1.38207327e+00 2.39261863e-01 - 3.13364450e+00 1.17154698e+00 - 1.71694858e+00 1.26744905e+00 - -1.61746367e+00 -8.80098073e-01 - -8.52196756e-01 -9.27299728e-01 - -1.51562462e-01 -8.36552490e-02 - -7.04792753e-01 -1.24726713e-02 - -3.35265757e+00 -1.82176312e+00 - 3.32173170e-01 -1.33405580e-01 - 4.95841013e-01 4.58292712e-01 - 1.57713955e+00 7.79272991e-01 - 2.09743109e+00 9.23542557e-01 - 3.90450311e-03 -8.42873164e-01 - 2.59519038e+00 7.56479591e-01 - -5.77643976e-01 -2.36401904e-01 - -5.22310654e-01 1.34187830e-01 - -2.22096086e+00 -7.75507719e-01 - 1.35907831e+00 7.80197510e-01 - 3.80355868e+00 1.16983476e+00 - 3.82746596e+00 1.31417718e+00 - 3.30451183e+00 1.55398159e+00 - -3.42917814e-01 -8.62281222e-02 - -2.59093020e+00 -9.29883526e-01 - 1.40928562e+00 1.08398346e+00 - 1.54400137e-01 3.35881092e-01 - 1.59171586e+00 1.18855802e+00 - -5.25164002e-01 -1.03104220e-01 - 2.20067959e+00 1.37074713e+00 - 6.97860830e-01 6.27718548e-01 - -4.59743507e-01 1.36061163e-01 - -1.04691963e-01 -2.16271727e-01 - -1.08905573e+00 -5.95510769e-01 - -1.00826983e+00 -5.38509162e-02 - -3.16402719e+00 -1.33414216e+00 - 1.47870874e-01 1.75234619e-01 - -2.57078234e-01 7.03316889e-02 - 1.81073945e+00 4.26901462e-01 - 2.65476530e+00 6.74217273e-01 - 1.27539811e+00 6.22914081e-01 - 
-3.76750499e-01 -1.20629449e+00 - 1.00177595e+00 -1.40660091e-01 - -2.98919265e+00 -1.65145013e+00 - -2.21557682e+00 -8.11123452e-01 - -3.22635378e+00 -1.65639056e+00 - -2.72868553e+00 -1.02812087e+00 - 1.26042797e+00 8.49005248e-01 - -9.38318534e-01 -9.87588651e-01 - 3.38013194e-01 -1.00237461e-01 - 1.91175691e+00 8.48716369e-01 - 4.30244344e-01 6.05539915e-02 - 2.21783435e+00 3.03268204e-01 - 1.78019576e+00 1.27377108e+00 - 1.59733274e+00 4.40674687e-02 - 3.97428484e+00 2.20881566e+00 - -2.41108677e+00 -6.01410418e-01 - -2.50796499e+00 -5.71169866e-01 - -3.71957427e+00 -1.38195726e+00 - -1.57992670e+00 1.32068593e-01 - -1.35278851e+00 -6.39349270e-01 - 1.23075932e+00 2.40445409e-01 - 1.35606530e+00 4.33180078e-01 - 9.60968518e-02 2.26734255e-01 - 6.22975063e-01 5.03431915e-02 - -1.47624851e+00 -3.60568238e-01 - -2.49337808e+00 -1.15083052e+00 - 2.15717792e+00 1.03071559e+00 - -3.07814376e-02 1.38700314e-02 - 4.52049499e-02 -4.86409775e-01 - 2.58231061e+00 1.14327809e-01 - 1.10999138e+00 -5.18568405e-01 - -2.19426443e-01 -5.37505538e-01 - -4.44740298e-01 6.78099955e-01 - 4.03379080e+00 1.49825720e+00 - -5.13182408e-01 -4.90201950e-01 - -6.90139716e-01 1.63875126e-01 - -8.17281461e-01 2.32155064e-01 - -2.92357619e-01 -8.02573544e-01 - -1.80769841e+00 -7.58907326e-01 - 2.16981590e+00 1.06728873e+00 - 1.98995203e-01 -6.84176682e-02 - -2.39546753e+00 -2.92873789e-01 - -4.24251021e+00 -1.46255564e+00 - -5.01411291e-01 -5.95712813e-03 - 2.68085809e+00 1.42883780e+00 - -4.13289873e+00 -1.62729388e+00 - 1.87957843e+00 3.63341638e-01 - -1.15270744e+00 -3.03563774e-01 - -4.43994248e+00 -2.97323905e+00 - -7.17067733e-01 -7.08349542e-01 - -3.28870393e+00 -1.19263863e+00 - -7.55325944e-01 -5.12703329e-01 - -2.07291938e+00 -2.65025085e-01 - -7.50073814e-01 -1.70771041e-01 - -8.77381404e-01 -5.47417325e-01 - -5.33725862e-01 5.15837119e-01 - 8.45056431e-01 2.82125560e-01 - -1.59598637e+00 -1.38743235e+00 - 1.41362902e+00 1.06407789e+00 - 1.02584504e+00 -3.68219466e-01 - -1.04644488e+00 -1.48769392e-01 - 2.66990191e+00 8.57633492e-01 - -1.84251857e+00 -9.82430175e-01 - 9.71404204e-01 -2.81934209e-01 - -2.50177989e+00 -9.21260335e-01 - -1.31060074e+00 -5.84488113e-01 - -2.12129400e-01 -3.06244708e-02 - -5.28933882e+00 -2.50663129e+00 - 1.90220541e+00 1.08662918e+00 - -3.99366086e-02 -6.87178973e-01 - -4.93417342e-01 4.37354182e-01 - 2.13494486e+00 1.37679569e+00 - 2.18396765e+00 5.81023868e-01 - -3.07866587e+00 -1.45384974e+00 - 6.10894119e-01 -4.17050124e-01 - -1.88766952e+00 -8.86160058e-01 - 3.34527253e+00 1.78571260e+00 - 6.87769059e-01 -5.01157336e-01 - 2.60470837e+00 1.45853560e+00 - -6.49315691e-01 -9.16112805e-01 - -1.29817687e+00 -2.15924339e-01 - -1.20100409e-03 -4.03137422e-01 - -1.36471594e+00 -6.93266356e-01 - 1.38682062e+00 7.15131598e-01 - 2.47830103e+00 1.24862305e+00 - -2.78288147e+00 -1.03329235e+00 - -7.33443403e-01 -6.11041652e-01 - -4.12745671e-01 -5.96133390e-02 - -2.58632336e+00 -4.51557058e-01 - -1.16570367e+00 -1.27065510e+00 - 2.76187104e+00 2.21895451e-01 - -3.80443767e+00 -1.66319902e+00 - 9.84658633e-01 6.81475569e-01 - 9.33814584e-01 -4.89335563e-02 - -4.63427997e-01 1.72989539e-01 - 1.82401546e+00 3.60164021e-01 - -5.36521077e-01 -8.08691351e-01 - -1.37367030e+00 -1.02126160e+00 - -3.70310682e+00 -1.19840844e+00 - -1.51894242e+00 -3.89510223e-01 - -3.67347940e-01 -3.25540516e-02 - -1.00988595e+00 1.82802194e-01 - 2.01622795e+00 7.86367901e-01 - 1.02440231e+00 8.79780360e-01 - -3.05971480e+00 -8.40901527e-01 - 2.73909457e+00 1.20558628e+00 - 2.39559056e+00 1.10786694e+00 - 
1.65471544e+00 7.33824651e-01 - 2.18546787e+00 6.41168955e-01 - 1.47152266e+00 3.91839132e-01 - 1.45811155e+00 5.21820495e-01 - -4.27531469e-02 -3.52343068e-03 - -9.54948010e-01 -1.52313876e-01 - 7.57151215e-01 -5.68728854e-03 - -8.46205751e-01 -7.54580229e-01 - 4.14493548e+00 1.45532780e+00 - 4.58688968e-01 -4.54012803e-02 - -1.49295381e+00 -4.57471758e-01 - 1.80020351e+00 8.13724973e-01 - -5.82727738e+00 -2.18269581e+00 - -2.09017809e+00 -1.18305177e+00 - -2.31628303e+00 -7.21600235e-01 - -8.09679091e-01 -1.49101752e-01 - 8.88005605e-01 8.57940857e-01 - -1.44148219e+00 -3.10926299e-01 - 3.68828186e-01 -3.08848059e-01 - -6.63267389e-01 -8.58950139e-02 - -1.14702569e+00 -6.32147854e-01 - -1.51741715e+00 -8.53330564e-01 - -1.33903718e+00 -1.45875547e-01 - 4.12485387e+00 1.85620435e+00 - -2.42353639e+00 -2.92669850e-01 - 1.88708583e+00 9.35984730e-01 - 2.15585179e+00 6.30469051e-01 - -1.13627973e-01 -1.62554045e-01 - 2.04540494e+00 1.36599834e+00 - 2.81591381e+00 1.60897941e+00 - 3.02736260e-02 3.83255815e-03 - 7.97634013e-02 -2.82035099e-01 - -3.24607473e-01 -5.30065956e-01 - -3.91862894e+00 -1.94083334e+00 - 1.56360901e+00 7.93882743e-01 - -1.03905772e+00 6.25590229e-01 - 2.54746492e+00 1.64233560e+00 - -4.80774423e-01 -8.92298032e-02 - 9.06979990e-02 1.05020427e+00 - -2.47521290e+00 -1.78275982e-01 - -3.91871729e-01 3.80285423e-01 - 1.00658382e+00 4.58947483e-01 - 4.68102941e-01 1.02992741e+00 - 4.44242568e-01 2.89870239e-01 - 3.29684452e+00 1.44677474e+00 - -2.24983007e+00 -9.65574499e-01 - -3.54453926e-01 -3.99020325e-01 - -3.87429665e+00 -1.90079739e+00 - 2.02656674e+00 1.12444894e+00 - 3.77011621e+00 1.43200852e+00 - 1.61259275e+00 4.65417399e-01 - 2.28725434e+00 6.79181395e-01 - 2.75421009e+00 2.27327345e+00 - -2.40894409e+00 -1.03926359e+00 - 1.52996651e-01 -2.73373046e-02 - -2.63218977e+00 -7.22802821e-01 - 2.77688169e+00 1.15310186e+00 - 1.18832341e+00 4.73457165e-01 - -2.35536326e+00 -1.08034554e+00 - -5.84221627e-01 1.03505984e-02 - 2.96730300e+00 1.33478306e+00 - -8.61947692e-01 6.09137051e-02 - 8.22343921e-01 -8.14155286e-02 - 1.75809015e+00 1.07921470e+00 - 1.19501279e+00 1.05309972e+00 - -1.75901792e+00 9.75320161e-02 - 1.64398635e+00 9.54384323e-01 - -2.21878052e-01 -3.64847144e-01 - -2.03128968e+00 -8.57866419e-01 - 1.86750633e+00 7.08524487e-01 - 8.03972976e-01 3.47404314e-01 - 3.41203749e+00 1.39810900e+00 - 4.22397681e-01 -6.41440488e-01 - -4.88493360e+00 -1.58967816e+00 - -1.67649284e-01 -1.08485915e-01 - 2.11489023e+00 1.50506158e+00 - -1.81639929e+00 -3.85542192e-01 - 2.24044819e-01 -1.45100577e-01 - -3.39262411e+00 -1.44394324e+00 - 1.68706599e+00 2.29199618e-01 - -1.94093257e+00 -1.65975814e-01 - 8.28143367e-01 5.92109281e-01 - -8.29587998e-01 -9.57130831e-01 - -1.50011401e+00 -8.36802092e-01 - 2.40770449e+00 9.32820177e-01 - 7.41391309e-02 3.12878473e-01 - 1.87745264e-01 6.19231425e-01 - 9.57622692e-01 -2.20640033e-01 - 3.18479243e+00 1.02986233e+00 - 2.43133846e+00 8.41302677e-01 - -7.09963834e-01 1.99718943e-01 - -2.88253498e-01 -3.62772094e-01 - 5.14052574e+00 1.79304595e+00 - -3.27930993e+00 -1.29177973e+00 - -1.16723536e+00 1.29519656e-01 - 1.04801056e+00 3.41508300e-01 - -3.99256195e+00 -2.51176471e+00 - -7.62824318e-01 -6.84242153e-01 - 2.71524986e-02 5.35157164e-02 - 3.26430102e+00 1.34887262e+00 - -1.72357766e+00 -4.94524388e-01 - -3.81149536e+00 -1.28121944e+00 - 3.36919354e+00 1.10672075e+00 - -3.14841757e+00 -7.10713767e-01 - -3.16463676e+00 -7.58558435e-01 - -2.44745969e+00 -1.08816514e+00 - 2.79173264e-01 -2.19652051e-02 - 4.15309883e-01 
6.07502790e-01 - -9.51007417e-01 -5.83976336e-01 - -1.47929839e+00 -8.39850409e-01 - 2.38335703e+00 6.16055149e-01 - -7.47749031e-01 -5.56164928e-01 - -3.65643622e-01 -5.06684411e-01 - -1.76634163e+00 -7.86382097e-01 - 6.76372222e-01 -3.06592181e-01 - -1.33505058e+00 -1.18301441e-01 - 3.59660179e+00 2.00424178e+00 - -7.88912762e-02 8.71956146e-02 - 1.22656397e+00 1.18149583e+00 - 4.24919729e+00 1.20082355e+00 - 2.94607456e+00 1.00676505e+00 - 7.46061275e-02 4.41761753e-02 - -2.47738025e-02 1.92737701e-01 - -2.20509316e-01 -3.79163193e-01 - -3.50222190e-01 3.58727299e-01 - -3.64788014e+00 -1.36107312e+00 - 3.56062799e+00 9.27032742e-01 - 1.04317289e+00 6.08035970e-01 - 4.06718718e-01 3.00628051e-01 - 4.33158086e+00 2.25860714e+00 - 2.13917145e-01 -1.72757967e-01 - -1.40637998e+00 -1.14119465e+00 - 3.61554872e+00 1.87797348e+00 - 1.01726871e+00 5.70255097e-01 - -7.04902551e-01 2.16444147e-01 - -2.51492186e+00 -8.52997369e-01 - 1.85097530e+00 1.15124496e+00 - -8.67569714e-01 -3.05682432e-01 - 8.07550858e-01 5.88901608e-01 - 1.85186755e-01 -1.94589367e-01 - -1.23378238e+00 -7.84128347e-01 - -1.22713161e+00 -4.21218235e-01 - 2.97751165e-01 2.81055275e-01 - 4.77703554e+00 1.66265524e+00 - 2.51549669e+00 7.49980674e-01 - 2.76510822e-01 1.40456909e-01 - 1.98740905e+00 -1.79608212e-01 - 9.35429145e-01 8.44344180e-01 - -1.20854492e+00 -5.00598453e-01 - 2.29936219e+00 8.10236668e-01 - 6.92555544e-01 -2.65891331e-01 - -1.58050994e+00 2.31237821e-01 - -1.50864880e+00 -9.49661690e-01 - -1.27689206e+00 -7.18260016e-01 - -3.12517127e+00 -1.75587113e+00 - 8.16062912e-02 -6.56551804e-01 - -5.02479939e-01 -4.67162543e-01 - -5.47435788e+00 -2.47799576e+00 - 1.95872901e-02 5.80874076e-01 - -1.59064958e+00 -6.34554756e-01 - -3.77521478e+00 -1.74301790e+00 - 5.89628224e-01 8.55736553e-01 - -1.81903543e+00 -7.50011008e-01 - 1.38557775e+00 3.71490991e-01 - 9.70032652e-01 -7.11356016e-01 - 2.63539625e-01 -4.20994771e-01 - 2.12154222e+00 8.19081400e-01 - -6.56977937e-01 -1.37810098e-01 - 8.91309581e-01 2.77864361e-01 - -7.43693195e-01 -1.46293770e-01 - 2.24447769e+00 4.00911438e-01 - -2.25169262e-01 2.04148801e-02 - 1.68744684e+00 9.47573007e-01 - 2.73086373e-01 3.30877195e-01 - 5.54294414e+00 2.14198009e+00 - -8.49238733e-01 3.65603298e-02 - 2.39685712e+00 1.17951039e+00 - -2.58230528e+00 -5.52116673e-01 - 2.79785277e+00 2.88833717e-01 - -1.96576188e-01 1.11652123e+00 - -4.69383301e-01 1.96496282e-01 - -1.95011845e+00 -6.15235169e-01 - 1.03379890e-02 2.33701239e-01 - 4.18933607e-01 2.77939814e-01 - -1.18473337e+00 -4.10051126e-01 - -7.61499744e-01 -1.43658094e+00 - -1.65586092e+00 -3.41615303e-01 - -5.58523700e-02 -5.21837080e-01 - -2.40331088e+00 -2.64521583e-01 - 2.24925206e+00 6.79843335e-02 - 1.46360479e+00 1.04271443e+00 - -3.09255443e+00 -1.82548953e+00 - 2.11325841e+00 1.14996627e+00 - -8.70657797e-01 1.02461839e-01 - -5.71056521e-01 9.71232588e-02 - -3.37870752e+00 -1.54091877e+00 - 1.03907189e+00 -1.35661392e-01 - 8.40057486e-01 6.12172413e-02 - -1.30998234e+00 -1.34077226e+00 - 7.53744974e-01 1.49447350e-01 - 9.13995056e-01 -1.81227962e-01 - 2.28386229e-01 3.74498520e-01 - 2.54829151e-01 -2.88802704e-01 - 1.61709009e+00 2.09319193e-01 - -1.12579380e+00 -5.95955338e-01 - -2.69610726e+00 -2.76222736e-01 - -2.63773329e+00 -7.84491970e-01 - -2.62167427e+00 -1.54792874e+00 - -4.80639856e-01 -1.30582102e-01 - -1.26130891e+00 -8.86841840e-01 - -1.24951950e+00 -1.18182622e+00 - -1.40107574e+00 -9.13695575e-01 - 4.99872179e-01 4.69014702e-01 - -2.03550193e-02 -1.48859738e-01 - -1.50189069e+00 -2.97714278e-02 - 
-2.07846113e+00 -7.29937809e-01 - -5.50576792e-01 -7.03151525e-01 - -3.88069238e+00 -1.63215295e+00 - 2.97032988e+00 6.43571144e-01 - -1.85999273e-01 1.18107620e+00 - 1.79249709e+00 6.65356160e-01 - 2.68842472e+00 1.35703255e+00 - 1.07675417e+00 1.39845588e-01 - 8.01226349e-01 2.11392275e-01 - 9.64329379e-01 3.96146195e-01 - -8.22529511e-01 1.96080831e-01 - 1.92481841e+00 4.62985744e-01 - 3.69756927e-01 3.77135799e-01 - 1.19807835e+00 8.87715050e-01 - -1.01363587e+00 -2.48151636e-01 - 8.53071010e-01 4.96887868e-01 - -3.41120553e+00 -1.35401843e+00 - -2.64787381e+00 -1.08690563e+00 - -1.11416759e+00 -4.43848915e-01 - 1.46242648e+00 6.17106076e-02 - -7.52968881e-01 -9.20972209e-01 - -1.22492228e+00 -5.40327617e-01 - 1.08001827e+00 5.29593785e-01 - -2.58706464e-01 1.13022085e-01 - -4.27394011e-01 1.17864354e-02 - -3.20728413e+00 -1.71224737e-01 - 1.71398530e+00 8.68885893e-01 - 2.12067866e+00 1.45092772e+00 - 4.32782616e-01 -3.34117769e-01 - 7.80084374e-01 -1.35100217e-01 - -2.05547729e+00 -4.70217750e-01 - 2.38379736e+00 1.09186058e+00 - -2.80825477e+00 -1.03320187e+00 - 2.63434576e+00 1.15671733e+00 - -1.60936214e+00 1.91843035e-01 - -5.02298769e+00 -2.32820708e+00 - 1.90349195e+00 1.45215416e+00 - 3.00232888e-01 3.24412586e-01 - -2.46503943e+00 -1.19550010e+00 - 1.06304233e+00 2.20136246e-01 - -2.99101388e+00 -1.58299318e+00 - 2.30071719e+00 1.12881362e+00 - -2.37587247e+00 -8.08298336e-01 - 7.27006308e-01 3.80828984e-01 - 2.61199061e+00 1.56473491e+00 - 8.33936357e-01 -1.42189425e-01 - 3.13291605e+00 1.77771210e+00 - 2.21917371e+00 5.68427075e-01 - 2.38867649e+00 9.06637262e-01 - -6.92959466e+00 -3.57682881e+00 - 2.57904824e+00 5.93959108e-01 - 2.71452670e+00 1.34436199e+00 - 4.39988761e+00 2.13124672e+00 - 5.71783077e-01 5.08346173e-01 - -3.65399429e+00 -1.18192861e+00 - 4.46176453e-01 3.75685594e-02 - -2.97501495e+00 -1.69459236e+00 - 1.60855728e+00 9.20930014e-01 - -1.44270290e+00 -1.93922306e-01 - 1.67624229e+00 1.66233866e+00 - -1.42579598e+00 -1.44990145e-01 - 1.19923176e+00 4.58490278e-01 - -9.00068460e-01 5.09701825e-02 - -1.69391694e+00 -7.60070300e-01 - -1.36576440e+00 -5.24244256e-01 - -1.03016748e+00 -3.44625878e-01 - 2.40519313e+00 1.09947587e+00 - 1.50365433e+00 1.06464802e+00 - -1.07609727e+00 -3.68897187e-01 - 2.44969069e+00 1.28486192e+00 - -1.25610307e+00 -1.14644789e+00 - 2.05962899e+00 4.31162369e-01 - -7.15886908e-01 -6.11587804e-02 - -6.92354119e-01 -7.85019920e-01 - -1.63016508e+00 -5.96944975e-01 - 1.90352536e+00 1.28197457e+00 - -4.01535243e+00 -1.81934488e+00 - -1.07534435e+00 -2.10544784e-01 - 3.25500866e-01 7.69603661e-01 - 2.18443365e+00 6.59773335e-01 - 8.80856790e-01 6.39505913e-01 - -2.23956372e-01 -4.65940132e-01 - -1.06766519e+00 -5.38388505e-03 - 7.25556863e-01 -2.91123488e-01 - -4.69451411e-01 7.89182650e-02 - 2.58146587e+00 1.29653243e+00 - 1.53747468e-01 7.69239075e-01 - -4.61152262e-01 -4.04151413e-01 - 1.48183517e+00 8.10079506e-01 - -1.83402614e+00 -1.36939322e+00 - 1.49315501e+00 7.95225425e-01 - 1.41922346e+00 1.05582774e-01 - 1.57473493e-01 9.70795657e-01 - -2.67603254e+00 -7.48562280e-01 - -8.49156216e-01 -6.05762529e-03 - 1.12944274e+00 3.67741591e-01 - 1.94228071e-01 5.28188141e-01 - -3.65610158e-01 4.05851838e-01 - -1.98839111e+00 -1.38452764e+00 - 2.73765752e+00 8.24150530e-01 - 7.63728641e-01 3.51617707e-01 - 5.78307267e+00 1.68103612e+00 - 2.27547227e+00 3.60876164e-01 - -3.50681697e+00 -1.74429984e+00 - 4.01241184e+00 1.26227829e+00 - 2.44946343e+00 9.06119057e-01 - -2.96638941e+00 -9.01532322e-01 - 1.11267643e+00 -3.43333381e-01 - 
-6.61868994e-01 -3.44666391e-01 - -8.34917179e-01 5.69478372e-01 - -1.91888454e+00 -3.03791075e-01 - 1.50397636e+00 8.31961240e-01 - 6.12260198e+00 2.16851807e+00 - 1.34093127e+00 8.86649385e-01 - 1.48748519e+00 8.26273697e-01 - 7.62243068e-01 2.64841396e-01 - -2.17604986e+00 -3.54219958e-01 - 2.64708640e-01 -4.38136718e-02 - 1.44725372e+00 1.18499914e-01 - -6.71259446e-01 -1.19526851e-01 - 2.40134595e-01 -8.90042323e-02 - -3.57238199e+00 -1.23166201e+00 - -3.77626645e+00 -1.19533443e+00 - -3.81101035e-01 -4.94160532e-01 - -3.02758757e+00 -1.18436066e+00 - 2.59116298e-01 1.38023047e+00 - 4.17900116e+00 1.12065959e+00 - 1.54598848e+00 2.89806755e-01 - 1.00656475e+00 1.76974511e-01 - -4.15730234e-01 -6.22681694e-01 - -6.00903565e-01 -1.43256959e-01 - -6.03652508e-01 -5.09936379e-01 - -1.94096658e+00 -9.48789544e-01 - -1.74464105e+00 -8.50491590e-01 - 1.17652544e+00 1.88118317e+00 - 2.35507776e+00 1.44000205e+00 - 2.63067924e+00 1.06692988e+00 - 2.88805386e+00 1.23924715e+00 - 8.27595008e-01 5.75364692e-01 - 3.91384216e-01 9.72781920e-02 - -1.03866816e+00 -1.37567768e+00 - -1.34777969e+00 -8.40266025e-02 - -4.12904508e+00 -1.67618340e+00 - 1.27918111e+00 3.52085961e-01 - 4.15361174e-01 6.28896189e-01 - -7.00539496e-01 4.80447955e-02 - -1.62332639e+00 -5.98236485e-01 - 1.45957300e+00 1.00305154e+00 - -3.06875603e+00 -1.25897545e+00 - -1.94708176e+00 4.85143006e-01 - 3.55744156e+00 -1.07468822e+00 - 1.21602223e+00 1.28768827e-01 - 1.89093098e+00 -4.70835659e-01 - -6.55759125e+00 2.70114082e+00 - 8.96843535e-01 -3.98115252e-01 - 4.13450429e+00 -2.32069236e+00 - 2.37764218e+00 -1.09098890e+00 - -1.11388901e+00 6.27083097e-01 - -6.34116929e-01 4.62816387e-01 - 2.90203079e+00 -1.33589143e+00 - 3.17457598e+00 -5.13575945e-01 - -1.76362299e+00 5.71820693e-01 - 1.66103362e+00 -8.99466249e-01 - -2.53947433e+00 8.40084780e-01 - 4.36631397e-01 7.24234261e-02 - -1.87589394e+00 5.08529113e-01 - 4.49563965e+00 -9.43365992e-01 - 1.78876299e+00 -1.27076149e+00 - -1.16269107e-01 -4.55078316e-01 - 1.92966079e+00 -8.05371385e-01 - 2.20632583e+00 -9.00919345e-01 - 1.52387824e+00 -4.82391996e-01 - 8.04004564e-01 -2.73650595e-01 - -7.75326067e-01 1.07469566e+00 - 1.83226282e+00 -4.52173344e-01 - 1.25079758e-01 -3.52895417e-02 - -9.90957437e-01 8.55993130e-01 - 1.71623322e+00 -7.08691667e-01 - -2.86175924e+00 6.75160955e-01 - -8.40817853e-01 -1.00361809e-01 - 1.33393000e+00 -4.65788123e-01 - 5.29394114e-01 -5.44881619e-02 - -8.07435599e-01 8.27353370e-01 - -4.33165824e+00 1.97299638e+00 - 1.26452422e+00 -8.34070486e-01 - 1.45996394e-02 2.97736043e-01 - -1.64489287e+00 6.72839598e-01 - -5.74234578e+00 3.20975117e+00 - 2.13841341e-02 3.64514015e-01 - 6.68084924e+00 -2.27464254e+00 - -3.22881590e+00 8.01879324e-01 - 3.02534313e-01 -4.56222796e-01 - -5.84520734e+00 1.95678162e+00 - 2.81515232e+00 -1.72101318e+00 - -2.39620908e-01 2.69145522e-01 - -7.41669691e-01 -2.30283281e-01 - -2.15682714e+00 3.45313021e-01 - 1.23475788e+00 -7.32276553e-01 - -1.71816113e-01 1.20419560e-02 - 1.89174235e+00 2.27435901e-01 - -3.64511114e-01 1.72260361e-02 - -3.24143860e+00 6.50125817e-01 - -2.25707409e+00 5.66970751e-01 - 1.03901456e+00 -1.00588433e+00 - -5.09159710e+00 1.58736109e+00 - 1.45534075e+00 -5.83787452e-01 - 4.28879587e+00 -1.58006866e+00 - 8.52384427e-01 -1.11042299e+00 - 4.51431615e+00 -2.63844265e+00 - -4.33042648e+00 1.86497078e+00 - -2.13568046e+00 5.82559743e-01 - -4.42568887e+00 1.26131214e+00 - 3.15821315e+00 -1.61515905e+00 - -3.14125204e+00 8.49604386e-01 - 6.54152300e-01 -2.04624711e-01 - -3.73374317e-01 
9.94187820e-02 - -3.96177282e+00 1.27245623e+00 - 9.59825199e-01 -1.15547861e+00 - 3.56902055e+00 -1.46591091e+00 - 1.55433633e-02 6.93544345e-01 - 1.15684646e+00 -4.99836352e-01 - 3.11824573e+00 -4.75900506e-01 - -8.61706369e-01 -3.50774059e-01 - 9.89057391e-01 -7.16878802e-01 - -4.94787870e+00 2.09137481e+00 - 1.37777347e+00 -1.34946349e+00 - -1.13161577e+00 8.05114754e-01 - 8.12020675e-01 -1.04849421e+00 - 4.73783881e+00 -2.26718812e+00 - 8.99579366e-01 -8.89764451e-02 - 4.78524868e+00 -2.25795843e+00 - 1.75164590e+00 -1.73822209e-01 - 1.30204590e+00 -7.26724717e-01 - -7.26526403e-01 -5.23925361e-02 - 2.01255351e+00 -1.69965366e+00 - 9.87852740e-01 -4.63577220e-01 - 2.45957762e+00 -1.29278962e+00 - -3.13817948e+00 1.64433038e+00 - -1.76302159e+00 9.62784302e-01 - -1.91106331e+00 5.81460008e-01 - -3.30883001e+00 1.30378978e+00 - 5.54376450e-01 3.78814272e-01 - 1.09982111e+00 -1.47969612e+00 - -2.61300705e-02 -1.42573464e-01 - -2.22096157e+00 7.75684440e-01 - 1.70319323e+00 -2.89738444e-01 - -1.43223842e+00 6.39284281e-01 - 2.34360959e-01 -1.64379268e-01 - -2.67147991e+00 9.46548086e-01 - 1.51131425e+00 -4.91594395e-01 - -2.48446856e+00 1.01286123e+00 - 1.50534658e-01 -2.94620246e-01 - -1.66966792e+00 1.67755508e+00 - -1.50094241e+00 3.30163095e-01 - 2.27681194e+00 -1.08064317e+00 - 2.05122965e+00 -1.15165939e+00 - -4.23509309e-01 -6.56906167e-02 - 1.80084023e+00 -1.07228556e+00 - -2.65769521e+00 1.18023206e+00 - 2.02852676e+00 -8.06793574e-02 - -4.49544185e+00 2.68200163e+00 - -7.50043216e-01 1.17079331e+00 - 6.80060893e-02 3.99055351e-01 - -3.83634635e+00 1.38406887e+00 - 3.24858545e-01 -9.25273218e-02 - -2.19895100e+00 1.47819500e+00 - -3.61569522e-01 -1.03188739e-01 - 1.12180375e-01 -9.52696354e-02 - -1.31477803e+00 1.79900570e-01 - 2.39573628e+00 -6.09739269e-01 - -1.00135700e+00 6.02837296e-01 - -4.11994589e+00 2.49599192e+00 - -1.54196236e-01 -4.84921951e-01 - 5.92569908e-01 -1.87310359e-01 - 3.85407741e+00 -1.50979925e+00 - 5.17802528e+00 -2.26032607e+00 - -1.37018916e+00 1.87111822e-01 - 8.46682996e-01 -3.56676331e-01 - -1.17559949e+00 5.29057734e-02 - -5.56475671e-02 6.79049243e-02 - 1.07851745e+00 -5.14535101e-01 - -2.71622446e+00 1.00151846e+00 - -1.08477208e+00 8.81391054e-01 - 5.50755824e-01 -5.20577727e-02 - 4.70885495e+00 -2.04220397e+00 - -1.87375336e-01 -6.16962830e-02 - 3.52097100e-01 2.21163550e-01 - 7.07929984e-01 -1.75827590e-01 - -1.22149219e+00 1.83084346e-01 - 2.58247412e+00 -6.15914898e-01 - -6.01206182e-01 -2.29832987e-01 - 9.83360449e-01 -3.75870060e-01 - -3.20027685e+00 1.35467480e+00 - 1.79178978e+00 -1.38531981e+00 - -3.30376867e-01 -1.16250192e-01 - -1.89053055e+00 5.68463567e-01 - -4.20604849e+00 1.65429681e+00 - -1.01185529e+00 1.92801240e-01 - -6.18819882e-01 5.42206996e-01 - -5.08091672e+00 2.61598591e+00 - -2.62570344e+00 2.51590658e+00 - 3.05577906e+00 -1.49090609e+00 - 2.77609677e+00 -1.37681378e+00 - -7.93515301e-02 4.28072744e-01 - -2.08359471e+00 8.94334295e-01 - 2.20163801e+00 4.01127167e-02 - -1.18145785e-01 -2.06822464e-01 - -2.74788298e-01 2.96250607e-01 - 1.59613555e+00 -3.87246203e-01 - -3.82971472e-01 -3.39716093e-02 - -4.20311307e-02 3.88529510e-01 - 1.52128574e+00 -9.33138876e-01 - -9.06584458e-01 -2.75016094e-02 - 3.56216834e+00 -9.99384622e-01 - 2.11964220e+00 -9.98749118e-02 - 4.01203480e+00 -2.03032745e+00 - -1.24171557e+00 1.97596725e-01 - -1.57230455e+00 4.14126609e-01 - -1.85484741e+00 5.40041563e-01 - 1.76329831e+00 -6.95967734e-01 - -2.29439232e-01 5.08669245e-01 - -5.45124276e+00 2.26907549e+00 - -5.71364288e-02 5.04476476e-01 - 
3.12468018e+00 -1.46358879e+00 - 8.20017359e-01 6.51949028e-01 - -1.33977500e+00 2.83634232e-04 - -1.83311685e+00 1.23947117e+00 - 6.31205922e-01 1.19792164e-02 - -2.21967834e+00 6.94056232e-01 - -1.41693842e+00 9.93526233e-01 - -7.58885703e-01 6.78547347e-01 - 3.60239086e+00 -1.08644935e+00 - 6.72217073e-02 3.00036011e-02 - -3.42680958e-01 -3.48049352e-01 - 1.87546079e+00 -4.78018246e-01 - 7.00485821e-01 -3.52905383e-01 - -8.54580948e-01 8.17330861e-01 - 8.19123706e-01 -5.73927281e-01 - 2.70855639e-01 -3.08940052e-01 - -1.05059952e+00 3.27873168e-01 - 1.08282999e+00 4.84559349e-02 - -7.89899220e-01 1.22291138e+00 - -2.87939816e+00 7.17403497e-01 - -2.08429452e+00 8.87409226e-01 - 1.58409232e+00 -4.74123532e-01 - 1.26882735e+00 1.59162510e-01 - -2.53782993e+00 6.18253491e-01 - -8.92757445e-01 3.35979011e-01 - 1.31867900e+00 -1.17355054e+00 - 1.14918879e-01 -5.35184038e-01 - -1.70288738e-01 5.35868087e-02 - 4.21355121e-01 5.41848690e-02 - 2.07926943e+00 -5.72538144e-01 - 4.08788970e-01 3.77655777e-01 - -3.39631381e+00 9.84216764e-01 - 2.94170163e+00 -1.83120916e+00 - -7.94798752e-01 7.39889052e-01 - 1.46555463e+00 -4.62275563e-01 - 2.57255955e+00 -1.04671434e+00 - 8.45042540e-01 -1.96952892e-01 - -3.23526646e+00 1.60049846e+00 - 3.21948565e+00 -8.88376674e-01 - 1.43005104e+00 -9.21561086e-01 - 8.82360506e-01 2.98403872e-01 - -8.91168097e-01 1.01319072e+00 - -5.13215241e-01 -2.47182649e-01 - -1.35759444e+00 7.07450608e-02 - -4.04550983e+00 2.23534867e+00 - 1.39348883e+00 3.81637747e-01 - -2.85676418e+00 1.53240862e+00 - -1.37183120e+00 6.37977425e-02 - -3.88195859e+00 1.73887145e+00 - 1.19509776e+00 -6.25013512e-01 - -2.80062734e+00 1.79840585e+00 - 1.96558429e+00 -4.70997234e-01 - 1.93111352e+00 -9.70318441e-01 - 3.57991190e+00 -1.65065116e+00 - 2.12831714e+00 -1.11531708e+00 - -3.95661018e-01 -8.54339904e-02 - -2.41630441e+00 1.65166304e+00 - 7.55412624e-01 -1.53453579e-01 - -1.77043450e+00 1.39928715e+00 - -9.32631260e-01 8.73649199e-01 - 1.53342205e+00 -8.39569765e-01 - -6.29846924e-02 1.25023084e-01 - 3.31509049e+00 -1.10733235e+00 - -2.18957109e+00 3.07376993e-01 - -2.35740747e+00 6.47437564e-01 - -2.22142438e+00 8.47318938e-01 - -6.51401147e-01 3.48398562e-01 - 2.75763095e+00 -1.21390708e+00 - 1.12550484e+00 -5.61412847e-01 - -5.65053161e-01 6.74365205e-02 - 1.68952456e+00 -6.57566096e-01 - 8.95598401e-01 3.96738993e-01 - -1.86537066e+00 9.44129208e-01 - -2.59933294e+00 2.57423247e-01 - -6.59598267e-01 1.91828851e-02 - -2.64506676e+00 8.41783205e-01 - -1.25911802e+00 5.52425066e-01 - -1.39754507e+00 3.73689222e-01 - 5.49550729e-02 1.35071215e+00 - 3.31874811e+00 -1.05682424e+00 - 3.63159604e+00 -1.42864695e+00 - -4.45944617e+00 1.42889446e+00 - 5.87314342e-01 -4.88892988e-01 - -7.26130820e-01 1.51936106e-01 - -1.79246441e+00 6.05888105e-01 - -5.50948207e-01 6.21443081e-01 - -3.17246063e-01 1.77213880e-01 - -2.00098937e+00 1.23799074e+00 - 4.33790961e+00 -1.08490465e+00 - -2.03114114e+00 1.31613237e+00 - -6.29216542e+00 1.92406317e+00 - -1.60265624e+00 8.87947500e-01 - 8.64465062e-01 -8.37416270e-01 - -2.14273937e+00 8.05485900e-01 - -2.36844256e+00 6.17915124e-01 - -1.40429636e+00 6.78296866e-01 - 9.99019988e-01 -5.84297572e-01 - 7.38824546e-01 1.68838678e-01 - 1.45681238e+00 3.04641461e-01 - 2.15914949e+00 -3.43089227e-01 - -1.23895930e+00 1.05339864e-01 - -1.23162264e+00 6.46629863e-01 - 2.28183862e+00 -9.24157063e-01 - -4.29615882e-01 5.69130863e-01 - -1.37449121e+00 -9.12032183e-01 - -7.33890904e-01 -3.91865471e-02 - 8.41400661e-01 -4.76002200e-01 - -1.73349274e-01 -6.84143467e-02 
- 3.16042891e+00 -1.32651856e+00 - -3.78244609e+00 2.38619718e+00 - -3.69634380e+00 2.22368561e+00 - 1.83766344e+00 -1.65675953e+00 - -1.63206002e+00 1.19484469e+00 - 3.68480064e-01 -5.70764494e-01 - 3.61982479e-01 1.04274409e-01 - 2.48863048e+00 -1.13285542e+00 - -2.81896488e+00 9.47958768e-01 - 5.74952901e-01 -2.75959392e-01 - 3.72783275e-01 -3.48937848e-01 - 1.95935716e+00 -1.06750415e+00 - 5.19357531e+00 -2.32070803e+00 - 4.09246149e+00 -1.89976700e+00 - -3.36666087e-01 8.17645057e-02 - 1.85453493e-01 3.76913151e-01 - -3.06458262e+00 1.34106402e+00 - -3.13796566e+00 7.00485099e-01 - 1.42964058e+00 -1.35536932e-01 - -1.23440423e-01 4.60094177e-02 - -2.86753037e+00 -5.21724160e-02 - 2.67113726e+00 -1.83746924e+00 - -1.35335062e+00 1.28238073e+00 - -2.43569899e+00 1.25998539e+00 - 1.26036740e-01 -2.35416844e-01 - -1.35725745e+00 7.37788491e-01 - -3.80897538e-01 3.30757889e-01 - 6.58694434e-01 -1.07566603e+00 - 2.11273640e+00 -9.02260632e-01 - 4.00755057e-01 -2.49229150e-02 - -1.80095812e+00 9.73099742e-01 - -2.68408372e+00 1.63737364e+00 - -2.66079826e+00 7.47289412e-01 - -9.92321439e-02 -1.49331396e-01 - 4.45678251e+00 -1.80352394e+00 - 1.35962915e+00 -1.31554389e+00 - -7.76601417e-01 -9.66173523e-02 - 1.68096348e+00 -6.27235133e-01 - 1.53081227e-01 -3.54216830e-01 - -1.54913095e+00 3.43689269e-01 - 5.29187357e-02 -6.73916964e-01 - -2.06606084e+00 8.34784242e-01 - 1.73701179e+00 -6.06467340e-01 - 1.55856757e+00 -2.58642780e-01 - 1.04349101e+00 -4.43027348e-01 - -1.02397719e+00 1.01308824e+00 - -2.13860204e-01 -4.73347361e-01 - -2.59004955e+00 1.43367853e+00 - 7.98457679e-01 2.18621627e-02 - -1.32974762e+00 4.61802208e-01 - 3.21419359e-01 2.30723316e-02 - 2.87201888e-02 6.24566672e-02 - -1.22261418e+00 6.02340363e-01 - 1.28750335e+00 -3.34839548e-02 - -9.67952623e-01 4.34470505e-01 - 2.02850324e+00 -9.05160255e-01 - -4.13946010e+00 2.33779091e+00 - -4.47508806e-01 3.06440495e-01 - -3.91543394e+00 1.68251022e+00 - -6.45193001e-01 5.29781162e-01 - -2.15518916e-02 5.07278355e-01 - -2.83356868e+00 1.00670227e+00 - 1.82989749e+00 -1.37329222e+00 - -1.09330213e+00 1.08560688e+00 - 1.90533722e+00 -1.28905879e+00 - 2.33986084e+00 2.30642626e-02 - 8.01940220e-01 -1.63986962e+00 - -4.23415165e+00 2.07530423e+00 - 9.33382522e-01 -7.62917211e-01 - -1.84033954e+00 1.07469401e+00 - -2.81938669e+00 1.07342024e+00 - -7.05169988e-01 2.13124943e-01 - 5.09598137e-01 1.32725493e-01 - -2.34558226e+00 8.62383168e-01 - -1.70322072e+00 2.70893796e-01 - 1.23652660e+00 -7.53216034e-02 - 2.84660646e+00 -3.48178304e-02 - 2.50250128e+00 -1.27770855e+00 - -1.00279469e+00 8.77194218e-01 - -4.34674121e-02 -2.12091350e-01 - -5.84151289e-01 1.50382340e-01 - -1.79024013e+00 4.24972808e-01 - -1.23434666e+00 -8.85546570e-02 - 1.36575412e+00 -6.42639880e-01 - -1.98429947e+00 2.27650336e-01 - 2.36253589e+00 -1.51340773e+00 - 8.79157643e-01 6.84142159e-01 - -2.18577755e+00 2.76526200e-01 - -3.55473434e-01 8.29976561e-01 - 1.16442595e+00 -5.97699411e-01 - -7.35528097e-01 2.40318183e-01 - -1.73702631e-01 7.33788663e-02 - -1.40451745e+00 3.24899628e-01 - -2.05434385e+00 5.68123738e-01 - 8.47876642e-01 -5.74224294e-01 - -6.91955602e-01 1.26009087e+00 - 2.56574498e+00 -1.15602581e+00 - 3.93306545e+00 -1.38398209e+00 - -2.73230251e+00 4.89062581e-01 - -1.04315474e+00 6.06335547e-01 - 1.23231431e+00 -4.46675065e-01 - -3.93035285e+00 1.43287651e+00 - -1.02132111e+00 9.58919791e-01 - -1.49425352e+00 1.06456165e+00 - -6.26485337e-01 1.03791402e+00 - -6.61772998e-01 2.63275425e-01 - -1.80940386e+00 5.70767403e-01 - 9.83720450e-01 
-1.39449756e-01 - -2.24619662e+00 9.01044870e-01 - 8.94343014e-01 5.31038678e-02 - 1.95518199e-01 -2.81343295e-01 - -2.30533019e-01 -1.74478106e-01 - -2.01550361e+00 5.55958010e-01 - -4.36281469e+00 1.94374226e+00 - -5.18530457e+00 2.89278357e+00 - 2.67289101e+00 -2.98511449e-01 - -1.53566179e+00 -1.00588944e-01 - -6.09943217e-02 -1.56986047e-01 - -5.22146452e+00 1.66209208e+00 - -3.69777478e+00 2.26154873e+00 - 2.24607181e-01 -4.86934960e-01 - 2.49909450e+00 -1.03033370e+00 - -1.07841120e+00 8.22388054e-01 - -3.20697089e+00 1.09536143e+00 - 3.43524232e+00 -1.47289362e+00 - -5.65784134e-01 4.60365175e-01 - -1.76714734e+00 1.57752346e-01 - -7.77620365e-01 5.60153443e-01 - 6.34399352e-01 -5.22339836e-01 - 2.91011875e+00 -9.72623380e-01 - -1.19286824e+00 6.32370253e-01 - -2.18327609e-01 8.23953181e-01 - 3.42430842e-01 1.37098055e-01 - 1.28658034e+00 -9.11357320e-01 - 2.06914465e+00 -6.67556382e-01 - -6.69451020e-01 -6.38605102e-01 - -2.09312398e+00 1.16743634e+00 - -3.63778357e+00 1.91919157e+00 - 8.74685911e-01 -1.09931208e+00 - -3.91496791e+00 1.00808357e+00 - 1.29621330e+00 -8.32239802e-01 - 9.00222045e-01 -1.31159793e+00 - -1.12242062e+00 1.98517079e-01 - -3.71932852e-01 1.31667093e-01 - -2.23829610e+00 1.26328346e+00 - -2.08365062e+00 9.93385336e-01 - -1.91082720e+00 7.45866855e-01 - 4.38024917e+00 -2.05901118e+00 - -2.28872886e+00 6.85279335e-01 - 1.01274497e-01 -3.26227153e-01 - -5.04447572e-01 -3.18619513e-01 - 1.28537006e+00 -1.04573551e+00 - -7.83175212e-01 1.54791645e-01 - -3.89239175e+00 1.60017929e+00 - -8.87877111e-01 -1.04968005e-01 - 9.32215179e-01 -5.58691113e-01 - -6.44977127e-01 -2.23018375e-01 - 1.10141900e+00 -1.00666432e+00 - 2.92755687e-01 -1.45480350e-01 - 7.73580681e-01 -2.21150567e-01 - -1.40873709e+00 7.61548044e-01 - -8.89031805e-01 -3.48542923e-01 - 4.16844267e-01 -2.39914494e-01 - -4.64265832e-01 7.29581138e-01 - 1.99835179e+00 -7.70542813e-01 - 4.20523191e-02 -2.18783563e-01 - -6.32611758e-01 -3.09926115e-01 - 6.82912198e-02 -8.48327050e-01 - 1.92425229e+00 -1.37876951e+00 - 3.49461782e+00 -1.88354255e+00 - -3.25209026e+00 1.49809395e+00 - 6.59273182e-01 -2.37435654e-01 - -1.15517300e+00 8.46134387e-01 - 1.26756151e+00 -4.58988026e-01 - -3.99178418e+00 2.04153008e+00 - 7.05687841e-01 -6.83433306e-01 - -1.61997342e+00 8.16577004e-01 - -3.89750399e-01 4.29753250e-01 - -2.53026432e-01 4.92861432e-01 - -3.16788324e+00 4.44285524e-01 - -7.86248901e-01 1.12753716e+00 - -3.02351433e+00 1.28419015e+00 - -1.30131355e+00 1.71226678e+00 - -4.08843475e+00 1.62063214e+00 - -3.09209403e+00 1.19958520e+00 - 1.49102271e+00 -1.11834864e+00 - -3.18059348e+00 5.74587042e-01 - 2.06054867e+00 3.25797860e-03 - -3.50999200e+00 2.02412428e+00 - -8.26610023e-01 3.46528211e-01 - 2.00546034e+00 -4.07333110e-01 - -9.69941653e-01 4.80953753e-01 - 4.47925660e+00 -2.33127314e+00 - 2.03845790e+00 -9.90439915e-01 - -1.11349191e+00 4.31183918e-01 - -4.03628396e+00 1.68509679e+00 - -1.48177601e+00 7.74322088e-01 - 3.07369385e+00 -9.57465886e-01 - 2.39011286e+00 -6.44506921e-01 - 2.91561991e+00 -8.78627328e-01 - 1.10212733e+00 -4.21637388e-01 - 5.31985231e-01 -6.17445696e-01 - -6.82340929e-01 -2.93529716e-01 - 1.94290679e+00 -4.64268634e-01 - 1.92262116e+00 -7.93142835e-01 - 4.73762800e+00 -1.63654174e+00 - -3.17848641e+00 8.05791391e-01 - 4.08739432e+00 -1.80816807e+00 - -7.60648826e-01 1.24216138e-01 - -2.24716400e+00 7.90020937e-01 - 1.64284052e+00 -7.18784070e-01 - 1.04410012e-01 -7.11195880e-02 - 2.18268225e+00 -7.01767831e-01 - 2.06218013e+00 -8.70251746e-01 - -1.35266581e+00 7.08456358e-01 
- -1.38157779e+00 5.14401086e-01 - -3.28326008e+00 1.20988399e+00 - 8.85358917e-01 -8.12213495e-01 - -2.34067500e+00 3.67657353e-01 - 3.96878127e+00 -1.66841450e+00 - 1.36518053e+00 -8.33436812e-01 - 5.25771988e-01 -5.06121987e-01 - -2.25948361e+00 1.30663765e+00 - -2.57662070e+00 6.32114628e-01 - -3.43134685e+00 2.38106008e+00 - 2.31571924e+00 -1.56566818e+00 - -2.95397202e+00 1.05661888e+00 - -1.35331242e+00 6.76383411e-01 - 1.40977132e+00 -1.17775938e+00 - 1.52561996e+00 -9.83147176e-01 - 2.26550832e+00 -2.10464123e-02 - 6.23371684e-01 -5.30768122e-01 - -4.42356624e-01 9.72226986e-01 - 2.31517901e+00 -1.08468105e+00 - 1.97236640e+00 -1.42016619e+00 - 3.18618687e+00 -1.45056343e+00 - -2.75880360e+00 5.40254980e-01 - -1.92916581e+00 1.45029864e-01 - 1.90022524e+00 -6.03805754e-01 - -1.05446211e+00 5.74361752e-01 - 1.45990390e+00 -9.28233993e-01 - 5.14960557e+00 -2.07564096e+00 - -7.53104842e-01 1.55876958e-01 - 8.09490983e-02 -8.58886384e-02 - -1.56894969e+00 4.53497227e-01 - 1.36944658e-01 5.60670875e-01 - -5.32635329e-01 4.40309945e-01 - 1.32507853e+00 -5.83670099e-01 - 1.20676031e+00 -8.02296831e-01 - -3.65023422e+00 1.17211368e+00 - 1.53393850e+00 -6.17771312e-01 - -3.99977129e+00 1.71415137e+00 - 5.70705058e-01 -4.60771539e-01 - -2.20608002e+00 1.07866596e+00 - -1.09040244e+00 6.77441076e-01 - -5.09886482e-01 -1.97282128e-01 - -1.58062785e+00 6.18333697e-01 - -1.53295020e+00 4.02168701e-01 - -5.18580598e-01 2.25767177e-01 - 1.59514316e+00 -2.54983617e-01 - -5.91938655e+00 2.68223782e+00 - 2.84200509e+00 -1.04685313e+00 - 1.31298664e+00 -1.16672614e+00 - -2.36660033e+00 1.81359460e+00 - 6.94163290e-02 3.76658816e-01 - 2.33973934e+00 -8.33173023e-01 - -8.24640389e-01 7.83717285e-01 - -1.02888281e+00 1.04680766e+00 - 1.34750745e+00 -5.89568160e-01 - -2.48761231e+00 7.44199284e-01 - -1.04501559e+00 4.72326911e-01 - -3.14610089e+00 1.89843692e+00 - 2.13003416e-01 5.76633620e-01 - -1.69239608e+00 5.66070021e-01 - 1.80491280e+00 -9.31701080e-01 - -6.94362572e-02 6.96026587e-01 - 1.36502578e+00 -6.85599000e-02 - -7.76764337e-01 3.64328661e-01 - -2.67322167e+00 6.80150021e-01 - 1.84338485e+00 -1.18487494e+00 - 2.88009231e+00 -1.25700411e+00 - 1.17114433e+00 -7.69727080e-01 - 2.11576167e+00 2.81502116e-01 - -1.51470088e+00 2.61553540e-01 - 1.18923669e-01 -1.17890202e-01 - 4.48359786e+00 -1.81427466e+00 - -1.27055948e+00 9.92388998e-01 - -8.00276606e-01 9.11326621e-02 - 7.51764024e-01 -1.03676498e-01 - 1.35769348e-01 -2.11470084e-01 - 2.50731332e+00 -1.12418270e+00 - -2.49752781e-01 7.81224033e-02 - -6.23037902e-01 3.16599691e-01 - -3.93772902e+00 1.37195391e+00 - 1.74256361e+00 -1.12363582e+00 - -1.49737281e+00 5.98828310e-01 - 7.75592115e-01 -4.64733802e-01 - -2.26027693e+00 1.36991118e+00 - -1.62849836e+00 7.36899107e-01 - 2.36850751e+00 -9.32126872e-01 - 5.86169745e+00 -2.49342512e+00 - -5.37092226e-01 1.23821274e+00 - 2.80535867e+00 -1.93363302e+00 - -1.77638106e+00 9.10050276e-01 - 3.02692018e+00 -1.60774676e+00 - 1.97833084e+00 -1.50636531e+00 - 9.09168906e-01 -8.83799359e-01 - 2.39769655e+00 -7.56977869e-01 - 1.47283981e+00 -1.06749890e+00 - 2.92060943e-01 -6.07040605e-01 - -2.09278201e+00 7.71858590e-01 - 7.10015905e-01 -5.42768432e-01 - -2.16826169e-01 1.56897896e-01 - 4.56288247e+00 -2.08912680e+00 - -6.63374020e-01 6.67325183e-01 - 1.80564442e+00 -9.76366134e-01 - 3.28720168e+00 -4.66575145e-01 - -1.60463695e-01 -2.58428153e-01 - 1.78590750e+00 -3.96427146e-01 - 2.75950306e+00 -1.82102856e+00 - -1.18234310e+00 6.28073320e-01 - 4.11415835e+00 -2.33551216e+00 - 1.38721004e+00 
-2.77450622e-01 - -2.94903545e+00 1.74813352e+00 - 8.67290400e-01 -6.51667894e-01 - 2.70022274e+00 -8.11832480e-01 - -2.06766146e+00 8.24047249e-01 - 3.90717142e+00 -1.20155758e+00 - -2.95102809e+00 1.36667968e+00 - 6.08815147e+00 -2.60737974e+00 - 2.78576476e+00 -7.86628755e-01 - -3.26258407e+00 1.09302450e+00 - 1.59849422e+00 -1.09705202e+00 - -2.50600710e-01 1.63243175e-01 - -4.90477087e-01 -4.57729572e-01 - -1.24837181e+00 3.22157840e-01 - -2.46341049e+00 1.06517849e+00 - 9.62880751e-01 4.56962496e-01 - 3.99964487e-01 2.07472802e-01 - 6.36657705e-01 -3.46400942e-02 - 4.91231407e-02 -1.40289235e-02 - -4.66683524e-02 -3.72326100e-01 - -5.22049702e-01 -1.70440260e-01 - 5.27062938e-01 -2.32628395e-01 - -2.69440318e+00 1.18914874e+00 - 3.65087539e+00 -1.53427267e+00 - -1.16546364e-01 4.93245392e-02 - 7.55931384e-01 -3.02980139e-01 - 2.06338745e+00 -6.24841225e-01 - 1.31177908e-01 7.29338183e-01 - 1.48021784e+00 -6.39509896e-01 - -5.98656707e-01 2.84525503e-01 - -2.18611080e+00 1.79549812e+00 - -2.91673624e+00 2.15772237e-01 - -8.95591350e-01 7.68250538e-01 - 1.36139762e+00 -1.93845144e-01 - 5.45730414e+00 -2.28114404e+00 - 3.22747247e-01 9.33582332e-01 - -1.46384504e+00 1.12801186e-01 - 4.26728166e-01 -2.33481242e-01 - -1.41327270e+00 8.16103740e-01 - -2.53998067e-01 1.44906646e-01 - -1.32436467e+00 1.87556361e-01 - -3.77313086e+00 1.32896038e+00 - 3.77651731e+00 -1.76548043e+00 - -2.45297093e+00 1.32571926e+00 - -6.55900588e-01 3.56921462e-01 - 9.25558722e-01 -4.51988954e-01 - 1.20732231e+00 -3.02821614e-01 - 3.72660154e-01 -1.89365208e-01 - -1.77090939e+00 9.18087975e-01 - 3.01127567e-01 2.67965829e-01 - -1.76708900e+00 4.62069259e-01 - -2.71812099e+00 1.57233508e+00 - -5.35297633e-01 4.99231535e-01 - 1.50507631e+00 -9.85763646e-01 - 3.00424787e+00 -1.29837562e+00 - -4.99311105e-01 3.91086482e-01 - 1.30125207e+00 -1.26247924e-01 - 4.01699483e-01 -4.46909391e-01 - -1.33635257e+00 5.12068703e-01 - 1.39229757e+00 -9.10974858e-01 - -1.74229508e+00 1.49475978e+00 - -1.21489414e+00 4.04193753e-01 - -3.36537605e-01 -6.74335427e-01 - -2.79186828e-01 8.48314720e-01 - -2.03080140e+00 1.66599815e+00 - -3.53064281e-01 -7.68582906e-04 - -5.30305657e+00 2.91091546e+00 - -1.20049972e+00 8.26578358e-01 - 2.95906989e-01 2.40215920e-01 - -1.42955534e+00 4.63480310e-01 - -1.87856619e+00 8.21459385e-01 - -2.71124720e+00 1.80246843e+00 - -3.06933780e+00 1.22235760e+00 - 5.21935582e-01 -1.27298218e+00 - -1.34175797e+00 7.69018937e-01 - -1.81962785e+00 1.15528991e+00 - -3.99227550e-01 2.93821598e-01 - 1.22533179e+00 -4.73846323e-01 - -2.08068359e-01 -1.75039817e-01 - -2.03068526e+00 1.50370503e+00 - -3.27606113e+00 1.74906330e+00 - -4.37802587e-01 -2.26956048e-01 - -7.69774213e-02 -3.54922468e-01 - 6.47160749e-02 -2.07334721e-01 - -1.37791524e+00 4.43766709e-01 - 3.29846803e+00 -1.04060799e+00 - -3.63704046e+00 1.05800226e+00 - -1.26716116e+00 1.13077353e+00 - 1.98549075e+00 -1.31864807e+00 - 1.85159500e+00 -5.78629560e-01 - -1.55295206e+00 1.23655857e+00 - 6.76026255e-01 9.18824125e-02 - 1.23418960e+00 -4.68162027e-01 - 2.43186642e+00 -9.22422440e-01 - -3.18729701e+00 1.77582673e+00 - -4.02945613e+00 1.14303496e+00 - -1.92694576e-01 1.03301431e-01 - 1.89554730e+00 -4.60128096e-01 - -2.55626581e+00 1.16057084e+00 - 6.89144365e-01 -9.94982900e-01 - -4.44680606e+00 2.19751983e+00 - -3.15196193e+00 1.18762993e+00 - -1.17434977e+00 1.04534656e+00 - 8.58386984e-02 -1.03947487e+00 - 3.33354973e-01 5.54813610e-01 - -9.37631808e-01 3.33450150e-01 - -2.50232471e+00 5.39720635e-01 - 1.03611949e+00 -7.16304095e-02 - 
-2.05556816e-02 -3.28992265e-01 - -2.24176201e+00 1.13077506e+00 - 4.53583688e+00 -1.10710212e+00 - 4.77389762e-01 -8.99445512e-01 - -2.69075551e+00 6.83176866e-01 - -2.21779724e+00 1.16916849e+00 - -1.09669056e+00 2.10044765e-01 - -8.45367920e-01 -8.45951423e-02 - 4.37558941e-01 -6.95904256e-01 - 1.84884195e+00 -1.71205136e-01 - -8.36371957e-01 5.62862478e-01 - 1.27786531e+00 -1.33362147e+00 - 2.90684492e+00 -7.49892184e-01 - -3.38652716e+00 1.51180670e+00 - -1.30945978e+00 7.09261928e-01 - -7.50471924e-01 -5.24637889e-01 - 1.18580718e+00 -9.97943971e-04 - -7.55395645e+00 3.19273590e+00 - 1.72822535e+00 -1.20996962e+00 - 5.67374320e-01 6.19573416e-01 - -2.99163781e+00 1.79721534e+00 - 1.49862187e+00 -6.05631846e-02 - 1.79503506e+00 -4.90419706e-01 - 3.85626054e+00 -1.95396324e+00 - -9.39188410e-01 7.96498057e-01 - 2.91986664e+00 -1.29392724e+00 - -1.54265750e+00 6.40727933e-01 - 1.14919794e+00 1.20834257e-01 - 2.00936817e+00 -1.53728359e+00 - 3.72468420e+00 -1.38704612e+00 - -1.27794802e+00 3.48543179e-01 - 3.63294077e-01 5.70623314e-01 - 1.49381016e+00 -6.04500534e-01 - 2.98912256e+00 -1.72295726e+00 - -1.80833817e+00 2.94907625e-01 - -3.19669622e+00 1.31888700e+00 - 1.45889401e+00 -8.88448639e-01 - -2.80045388e+00 1.01207060e+00 - -4.78379567e+00 1.48646520e+00 - 2.25510003e+00 -7.13372461e-01 - -9.74441433e-02 -2.17766373e-01 - 2.64468496e-01 -3.60842698e-01 - -5.98821713e+00 3.20197892e+00 - 2.67030213e-01 -5.36386416e-01 - 2.24546960e+00 -8.13464649e-01 - -4.89171414e-01 3.86255031e-01 - -7.45713706e-01 6.29800380e-01 - -3.30460503e-01 3.85127284e-01 - -4.19588147e+00 1.52793198e+00 - 5.42078582e-01 -2.61642741e-02 - 4.24938513e-01 -5.72936751e-01 - 2.82717288e+00 -6.75355024e-01 - -1.44741788e+00 5.03578028e-01 - -1.65547573e+00 7.76444277e-01 - 2.20361170e+00 -1.40835680e+00 - -3.69540235e+00 2.32953767e+00 - -1.41909357e-01 2.28989778e-01 - 1.92838879e+00 -8.72525737e-01 - 1.40708100e+00 -6.81849638e-02 - 1.24988112e+00 -1.39470590e-01 - -2.39435855e+00 7.26587655e-01 - 7.03985028e-01 4.85403277e-02 - 4.05214529e+00 -9.16928318e-01 - 3.74198837e-01 -5.04192358e-01 - -8.43374127e-01 2.36064018e-01 - -3.32253349e-01 7.47840055e-01 - -6.03725210e+00 1.95173337e+00 - 4.60829865e+00 -1.51191309e+00 - -1.46247098e+00 1.11140916e+00 - -9.60111157e-01 -1.23189114e-01 - -7.49613187e-01 4.53614129e-01 - -5.77838219e-01 2.07366469e-02 - 8.07652950e-01 -5.16272662e-01 - -6.02556049e-01 5.05318649e-01 - -1.28712445e-01 2.57836512e-01 - -5.27662820e+00 2.11790737e+00 - 5.40819308e+00 -2.15366022e+00 - 9.37742513e-02 -1.60221751e-01 - 4.55902865e+00 -1.24646307e+00 - -9.06582589e-01 1.92928110e-01 - 2.99928996e+00 -8.04301218e-01 - -3.24317381e+00 1.80076061e+00 - 3.20421743e-01 8.76524679e-01 - -5.29606705e-01 -3.16717696e-01 - -1.77264560e+00 7.52686776e-01 - -1.51706824e+00 8.43755103e-01 - 1.52759111e+00 -7.86814243e-01 - 4.74845617e-01 4.21319700e-01 - 6.97829149e-01 -8.15664881e-01 - 3.09564973e+00 -1.06202469e+00 - 2.95320379e+00 -1.98963943e+00 - -4.23033224e+00 1.41013338e+00 - 1.48576206e+00 8.02908511e-02 - 4.52041627e+00 -2.04620399e+00 - 6.58403922e-01 -7.60781799e-01 - 2.10667543e-01 1.15241731e-01 - 1.77702583e+00 -8.10271859e-01 - 2.41277385e+00 -1.46972042e+00 - 1.50685525e+00 -1.99272545e-01 - 7.61665522e-01 -4.11276152e-01 - 1.18352312e+00 -9.59908608e-01 - -3.32031305e-01 8.07500132e-02 - 1.16813118e+00 -1.73095194e-01 - 1.18363346e+00 -5.41565052e-01 - 5.17702179e-01 -7.62442035e-01 - 4.57401006e-01 -1.45951115e-02 - 1.49377115e-01 2.99571605e-01 - 1.40399453e+00 
-1.30160353e+00 - 5.26231567e-01 3.52783752e-01 - -1.91136514e+00 4.24228635e-01 - 1.74156701e+00 -9.92076776e-01 - -4.89323391e+00 2.32483507e+00 - 2.54011209e+00 -8.80366295e-01 - -5.56925706e-01 1.48842026e-01 - -2.35904668e+00 9.60474853e-01 - 1.42216971e+00 -4.67062761e-01 - -1.10809680e+00 7.68684300e-01 - 4.09674726e+00 -1.90795680e+00 - -2.23048923e+00 9.03812542e-01 - 6.57025763e-01 1.36514871e-01 - 2.10944145e+00 -9.78897838e-02 - 1.22552525e+00 -2.50303867e-01 - 2.84620103e-01 -5.30164020e-01 - -2.13562585e+00 1.03503056e+00 - 1.32414902e-01 -8.14190240e-03 - -5.82433561e-01 3.21020292e-01 - -5.06473247e-01 3.11530419e-01 - 1.57162465e+00 -1.20763919e+00 - -1.43155284e+00 -2.51203698e-02 - -1.47093713e+00 -1.39620999e-01 - -2.65765643e+00 1.06091403e+00 - 2.45992927e+00 -5.88815836e-01 - -1.28440162e+00 -1.99377398e-01 - 6.11257504e-01 -3.73577401e-01 - -3.46606103e-01 6.06081290e-01 - 3.76687505e+00 -8.80181424e-01 - -1.03725103e+00 1.45177517e+00 - 2.76659936e+00 -1.09361320e+00 - -3.61311296e+00 9.75032455e-01 - 3.22878655e+00 -9.69497365e-01 - 1.43560379e+00 -5.52524585e-01 - 2.94042153e+00 -1.79747037e+00 - 1.30739580e+00 2.47989248e-01 - -4.05056982e-01 1.22831715e+00 - -2.25827421e+00 2.30604626e-01 - 3.69262926e-01 4.32714650e-02 - -5.52064063e-01 6.07806340e-01 - 7.03325987e+00 -2.17956730e+00 - -2.37823835e-01 -8.28068639e-01 - -4.84279888e-01 5.67765194e-01 - -3.15863410e+00 1.02241617e+00 - -3.39561593e+00 1.36876374e+00 - -2.78482934e+00 6.81641104e-01 - -4.37604334e+00 2.23826340e+00 - -2.54049692e+00 8.22676745e-01 - 3.73264822e+00 -9.93498732e-01 - -3.49536064e+00 1.84771519e+00 - 9.81801604e-01 -5.21278776e-01 - 1.52996831e+00 -1.27386206e+00 - -9.23490293e-01 5.29099482e-01 - -2.76999461e+00 9.24831872e-01 - -3.30029834e-01 -2.49645555e-01 - -1.71156166e+00 5.44940854e-01 - -2.37009487e+00 5.83826982e-01 - -3.03216865e+00 1.04922722e+00 - -2.19539936e+00 1.37558730e+00 - 1.15350207e+00 -6.15318535e-01 - 4.62011792e+00 -2.46714517e+00 - 1.52627952e-02 -1.00618283e-01 - -1.10399342e+00 4.87413533e-01 - 3.55448194e+00 -9.10394190e-01 - -5.21890321e+00 2.44710745e+00 - 1.54289749e+00 -6.54269311e-01 - 2.67935674e+00 -9.92758863e-01 - 1.05801310e+00 2.60054285e-02 - 1.52509097e+00 -4.08768600e-01 - 3.27576917e+00 -1.28769406e+00 - 1.71008412e-01 -2.68739994e-01 - -9.83351344e-04 7.02495897e-02 - -7.60795056e-03 1.61968285e-01 - -1.80620472e+00 4.24934471e-01 - 2.32023297e-02 -2.57284559e-01 - 3.98219478e-01 -4.65361935e-01 - 6.63476988e-01 -3.29823196e-02 - 4.00154707e+00 -1.01792211e+00 - -1.50286870e+00 9.46875359e-01 - -2.22717585e+00 7.50636195e-01 - -3.47381508e-01 -6.51596975e-01 - 2.08076453e+00 -8.22800165e-01 - 2.05099963e+00 -4.00868250e-01 - 3.52576988e-02 -2.54418565e-01 - 1.57342042e+00 -7.62166492e-02 - -1.47019722e+00 3.40861172e-01 - -1.21156090e+00 3.21891246e-01 - 3.79729047e+00 -1.54350764e+00 - 1.26459678e-02 6.99203693e-01 - 1.53974177e-01 4.68643204e-01 - -1.73923561e-01 -1.26229768e-01 - 4.54644993e+00 -2.13951783e+00 - 1.46022547e-01 -4.57084165e-01 - 6.50048037e+00 -2.78872609e+00 - -1.51934912e+00 1.03216768e+00 - -3.06483575e+00 1.81101446e+00 - -2.38212125e+00 9.19559042e-01 - -1.81319611e+00 8.10545112e-01 - 1.70951294e+00 -6.10712680e-01 - 1.67974156e+00 -1.51241453e+00 - -5.94795113e+00 2.56893813e+00 - 3.62633110e-01 -7.46965304e-01 - -2.44042594e+00 8.52761797e-01 - 3.32412550e+00 -1.28439899e+00 - 4.74860766e+00 -1.72821964e+00 - 1.29072541e+00 -8.24872902e-01 - -1.69450702e+00 4.09600876e-01 - 1.29705411e+00 1.22300809e-01 - 
-2.63597613e+00 8.55612913e-01 - 9.28467301e-01 -2.63550114e-02 - 2.44670264e+00 -4.10123002e-01 - 1.06408206e+00 -5.03361942e-01 - 5.12384049e-02 -1.27116595e-02 - -1.06731272e+00 -1.76205029e-01 - -9.45454582e-01 3.74404917e-01 - 2.54343689e+00 -7.13810545e-01 - -2.54460335e+00 1.31590265e+00 - 1.89864233e+00 -3.98436339e-01 - -1.93990133e+00 6.01474630e-01 - -1.35938824e+00 4.00751788e-01 - 2.38567018e+00 -6.13904880e-01 - 2.18748050e-01 2.62631712e-01 - -2.01388788e+00 1.41474031e+00 - 2.74014581e+00 -1.27448105e+00 - -2.13828583e+00 1.13616144e+00 - 5.98730932e+00 -2.53430080e+00 - -1.72872795e+00 1.53702057e+00 - -2.53263962e+00 1.27342410e+00 - 1.34326968e+00 -1.99395088e-01 - 3.83352666e-01 -1.25683065e-01 - -2.35630657e+00 5.54116983e-01 - -1.94900838e+00 5.76270178e-01 - -1.36699108e+00 -3.40904824e-01 - -2.34727346e+00 -1.93054940e-02 - -3.82779777e+00 1.83025664e+00 - -4.31602080e+00 9.21605705e-01 - 5.54098133e-01 2.33991419e-01 - -4.53591188e+00 1.99833353e+00 - -3.92715909e+00 1.83231482e+00 - 3.91344440e-01 -1.11355111e-01 - 3.48576363e+00 -1.41379449e+00 - -1.42858690e+00 3.84532286e-01 - 1.79519859e+00 -9.23486448e-01 - 8.49691242e-01 -1.76551084e-01 - 1.53618138e+00 8.23835015e-02 - 5.91476520e-02 3.88296940e-02 - 1.44837346e+00 -7.24097604e-01 - -6.79008418e-01 4.04078097e-01 - 2.87555510e+00 -9.51825076e-01 - -1.12379101e+00 2.93457714e-01 - 1.45263980e+00 -6.01960544e-01 - -2.55741621e-01 9.26233518e-01 - 3.54570714e+00 -1.41521877e+00 - -1.61542388e+00 6.57844512e-01 - -3.22844269e-01 3.02823546e-01 - 1.03523913e+00 -6.92730711e-01 - 1.11084909e+00 -3.50823642e-01 - 3.41268693e+00 -1.90865862e+00 - 7.67062858e-01 -9.48792160e-01 - -5.49798016e+00 1.71139960e+00 - 1.14865798e+00 -6.12669150e-01 - -2.18256680e+00 7.78634462e-01 - 4.78857389e+00 -2.55555085e+00 - -1.85555569e+00 8.04311615e-01 - -4.22278799e+00 2.01162524e+00 - -1.56556149e+00 1.54353907e+00 - -3.11527864e+00 1.65973526e+00 - 2.66342611e+00 -1.20449402e+00 - 1.57635314e+00 -1.48716308e-01 - -6.35606865e-01 2.59701180e-01 - 1.02431976e+00 -6.76929904e-01 - 1.12973772e+00 1.49473892e-02 - -9.12758116e-01 2.21533933e-01 - -2.98014470e+00 1.71651189e+00 - 2.74016965e+00 -9.47893923e-01 - -3.47830591e+00 1.34941430e+00 - 1.74757562e+00 -3.72503752e-01 - 5.55820383e-01 -6.47992466e-01 - -1.19871928e+00 9.82429151e-01 - -2.53040133e+00 2.10671307e+00 - -1.94085605e+00 1.38938137e+00 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/license.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/license.txt deleted file mode 100644 index 052f302..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/license.txt +++ /dev/null @@ -1,13 +0,0 @@ -The images in the folder "kittens" are under the creative commons CC0 license, or no rights reserved: -https://creativecommons.org/share-your-work/public-domain/cc0/ -The images are taken from: -https://ccsearch.creativecommons.org/image/detail/WZnbJSJ2-dzIDiuUUdto3Q== -https://ccsearch.creativecommons.org/image/detail/_TlKu_rm_QrWlR0zthQTXA== -https://ccsearch.creativecommons.org/image/detail/OPNnHJb6q37rSZ5o_L5JHQ== -https://ccsearch.creativecommons.org/image/detail/B2CVP_j5KjwZm7UAVJ3Hvw== - -The chr30.4.184.jpg and grayscale.jpg images are also under the CC0 license, taken from: -https://ccsearch.creativecommons.org/image/detail/8eO_qqotBfEm2UYxirLntw== - -The image under "multi-channel" directory is under the CC BY-SA 4.0 license cropped from: -https://en.wikipedia.org/wiki/Alpha_compositing#/media/File:Hue_alpha_falloff.png diff --git 
a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/29.5.a_b_EGDP022204.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/29.5.a_b_EGDP022204.jpg deleted file mode 100644 index 435e7df..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/29.5.a_b_EGDP022204.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/54893.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/54893.jpg deleted file mode 100644 index 825630c..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/54893.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/DP153539.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/DP153539.jpg deleted file mode 100644 index 571efe9..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/DP153539.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/DP802813.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/DP802813.jpg deleted file mode 100644 index 2d12359..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/DP802813.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/not-image.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/not-image.txt deleted file mode 100644 index 283e5e9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/kittens/not-image.txt +++ /dev/null @@ -1 +0,0 @@ -not an image diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/license.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/license.txt deleted file mode 100644 index 052f302..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/license.txt +++ /dev/null @@ -1,13 +0,0 @@ -The images in the folder "kittens" are under the creative commons CC0 license, or no rights reserved: -https://creativecommons.org/share-your-work/public-domain/cc0/ -The images are taken from: -https://ccsearch.creativecommons.org/image/detail/WZnbJSJ2-dzIDiuUUdto3Q== -https://ccsearch.creativecommons.org/image/detail/_TlKu_rm_QrWlR0zthQTXA== -https://ccsearch.creativecommons.org/image/detail/OPNnHJb6q37rSZ5o_L5JHQ== -https://ccsearch.creativecommons.org/image/detail/B2CVP_j5KjwZm7UAVJ3Hvw== - -The chr30.4.184.jpg and grayscale.jpg images are also under the CC0 license, taken from: -https://ccsearch.creativecommons.org/image/detail/8eO_qqotBfEm2UYxirLntw== - -The image under "multi-channel" directory is under the CC BY-SA 4.0 license cropped from: -https://en.wikipedia.org/wiki/Alpha_compositing#/media/File:Hue_alpha_falloff.png diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/BGRA.png b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/BGRA.png deleted file mode 100644 index a944c6c..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/BGRA.png and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/BGRA_alpha_60.png b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/BGRA_alpha_60.png deleted file mode 100644 index 913637c..0000000 Binary files 
a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/BGRA_alpha_60.png and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/chr30.4.184.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/chr30.4.184.jpg deleted file mode 100644 index 7068b97..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/chr30.4.184.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/grayscale.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/grayscale.jpg deleted file mode 100644 index 621cdd1..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/origin/multi-channel/grayscale.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-01/29.5.a_b_EGDP022204.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-01/29.5.a_b_EGDP022204.jpg deleted file mode 100644 index 435e7df..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-01/29.5.a_b_EGDP022204.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-01/not-image.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-01/not-image.txt deleted file mode 100644 index 283e5e9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-01/not-image.txt +++ /dev/null @@ -1 +0,0 @@ -not an image diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-02/54893.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-02/54893.jpg deleted file mode 100644 index 825630c..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-02/54893.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-02/DP153539.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-02/DP153539.jpg deleted file mode 100644 index 571efe9..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-02/DP153539.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-02/DP802813.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-02/DP802813.jpg deleted file mode 100644 index 2d12359..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=kittens/date=2018-02/DP802813.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-01/BGRA.png b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-01/BGRA.png deleted file mode 100644 index a944c6c..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-01/BGRA.png and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-01/BGRA_alpha_60.png b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-01/BGRA_alpha_60.png 
deleted file mode 100644 index 913637c..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-01/BGRA_alpha_60.png and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-02/chr30.4.184.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-02/chr30.4.184.jpg deleted file mode 100644 index 7068b97..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-02/chr30.4.184.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-02/grayscale.jpg b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-02/grayscale.jpg deleted file mode 100644 index 621cdd1..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/images/partitioned/cls=multichannel/date=2018-02/grayscale.jpg and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/iris_libsvm.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/iris_libsvm.txt deleted file mode 100644 index db95901..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/iris_libsvm.txt +++ /dev/null @@ -1,150 +0,0 @@ -0.0 1:5.1 2:3.5 3:1.4 4:0.2 -0.0 1:4.9 2:3.0 3:1.4 4:0.2 -0.0 1:4.7 2:3.2 3:1.3 4:0.2 -0.0 1:4.6 2:3.1 3:1.5 4:0.2 -0.0 1:5.0 2:3.6 3:1.4 4:0.2 -0.0 1:5.4 2:3.9 3:1.7 4:0.4 -0.0 1:4.6 2:3.4 3:1.4 4:0.3 -0.0 1:5.0 2:3.4 3:1.5 4:0.2 -0.0 1:4.4 2:2.9 3:1.4 4:0.2 -0.0 1:4.9 2:3.1 3:1.5 4:0.1 -0.0 1:5.4 2:3.7 3:1.5 4:0.2 -0.0 1:4.8 2:3.4 3:1.6 4:0.2 -0.0 1:4.8 2:3.0 3:1.4 4:0.1 -0.0 1:4.3 2:3.0 3:1.1 4:0.1 -0.0 1:5.8 2:4.0 3:1.2 4:0.2 -0.0 1:5.7 2:4.4 3:1.5 4:0.4 -0.0 1:5.4 2:3.9 3:1.3 4:0.4 -0.0 1:5.1 2:3.5 3:1.4 4:0.3 -0.0 1:5.7 2:3.8 3:1.7 4:0.3 -0.0 1:5.1 2:3.8 3:1.5 4:0.3 -0.0 1:5.4 2:3.4 3:1.7 4:0.2 -0.0 1:5.1 2:3.7 3:1.5 4:0.4 -0.0 1:4.6 2:3.6 3:1.0 4:0.2 -0.0 1:5.1 2:3.3 3:1.7 4:0.5 -0.0 1:4.8 2:3.4 3:1.9 4:0.2 -0.0 1:5.0 2:3.0 3:1.6 4:0.2 -0.0 1:5.0 2:3.4 3:1.6 4:0.4 -0.0 1:5.2 2:3.5 3:1.5 4:0.2 -0.0 1:5.2 2:3.4 3:1.4 4:0.2 -0.0 1:4.7 2:3.2 3:1.6 4:0.2 -0.0 1:4.8 2:3.1 3:1.6 4:0.2 -0.0 1:5.4 2:3.4 3:1.5 4:0.4 -0.0 1:5.2 2:4.1 3:1.5 4:0.1 -0.0 1:5.5 2:4.2 3:1.4 4:0.2 -0.0 1:4.9 2:3.1 3:1.5 4:0.1 -0.0 1:5.0 2:3.2 3:1.2 4:0.2 -0.0 1:5.5 2:3.5 3:1.3 4:0.2 -0.0 1:4.9 2:3.1 3:1.5 4:0.1 -0.0 1:4.4 2:3.0 3:1.3 4:0.2 -0.0 1:5.1 2:3.4 3:1.5 4:0.2 -0.0 1:5.0 2:3.5 3:1.3 4:0.3 -0.0 1:4.5 2:2.3 3:1.3 4:0.3 -0.0 1:4.4 2:3.2 3:1.3 4:0.2 -0.0 1:5.0 2:3.5 3:1.6 4:0.6 -0.0 1:5.1 2:3.8 3:1.9 4:0.4 -0.0 1:4.8 2:3.0 3:1.4 4:0.3 -0.0 1:5.1 2:3.8 3:1.6 4:0.2 -0.0 1:4.6 2:3.2 3:1.4 4:0.2 -0.0 1:5.3 2:3.7 3:1.5 4:0.2 -0.0 1:5.0 2:3.3 3:1.4 4:0.2 -1.0 1:7.0 2:3.2 3:4.7 4:1.4 -1.0 1:6.4 2:3.2 3:4.5 4:1.5 -1.0 1:6.9 2:3.1 3:4.9 4:1.5 -1.0 1:5.5 2:2.3 3:4.0 4:1.3 -1.0 1:6.5 2:2.8 3:4.6 4:1.5 -1.0 1:5.7 2:2.8 3:4.5 4:1.3 -1.0 1:6.3 2:3.3 3:4.7 4:1.6 -1.0 1:4.9 2:2.4 3:3.3 4:1.0 -1.0 1:6.6 2:2.9 3:4.6 4:1.3 -1.0 1:5.2 2:2.7 3:3.9 4:1.4 -1.0 1:5.0 2:2.0 3:3.5 4:1.0 -1.0 1:5.9 2:3.0 3:4.2 4:1.5 -1.0 1:6.0 2:2.2 3:4.0 4:1.0 -1.0 1:6.1 2:2.9 3:4.7 4:1.4 -1.0 1:5.6 2:2.9 3:3.6 4:1.3 -1.0 1:6.7 2:3.1 3:4.4 4:1.4 -1.0 1:5.6 2:3.0 3:4.5 4:1.5 -1.0 1:5.8 2:2.7 3:4.1 4:1.0 -1.0 1:6.2 2:2.2 3:4.5 4:1.5 -1.0 1:5.6 2:2.5 3:3.9 4:1.1 -1.0 1:5.9 2:3.2 3:4.8 4:1.8 -1.0 1:6.1 2:2.8 3:4.0 4:1.3 -1.0 1:6.3 2:2.5 3:4.9 4:1.5 -1.0 1:6.1 2:2.8 3:4.7 4:1.2 -1.0 1:6.4 2:2.9 3:4.3 4:1.3 -1.0 1:6.6 2:3.0 3:4.4 4:1.4 -1.0 1:6.8 2:2.8 
3:4.8 4:1.4 -1.0 1:6.7 2:3.0 3:5.0 4:1.7 -1.0 1:6.0 2:2.9 3:4.5 4:1.5 -1.0 1:5.7 2:2.6 3:3.5 4:1.0 -1.0 1:5.5 2:2.4 3:3.8 4:1.1 -1.0 1:5.5 2:2.4 3:3.7 4:1.0 -1.0 1:5.8 2:2.7 3:3.9 4:1.2 -1.0 1:6.0 2:2.7 3:5.1 4:1.6 -1.0 1:5.4 2:3.0 3:4.5 4:1.5 -1.0 1:6.0 2:3.4 3:4.5 4:1.6 -1.0 1:6.7 2:3.1 3:4.7 4:1.5 -1.0 1:6.3 2:2.3 3:4.4 4:1.3 -1.0 1:5.6 2:3.0 3:4.1 4:1.3 -1.0 1:5.5 2:2.5 3:4.0 4:1.3 -1.0 1:5.5 2:2.6 3:4.4 4:1.2 -1.0 1:6.1 2:3.0 3:4.6 4:1.4 -1.0 1:5.8 2:2.6 3:4.0 4:1.2 -1.0 1:5.0 2:2.3 3:3.3 4:1.0 -1.0 1:5.6 2:2.7 3:4.2 4:1.3 -1.0 1:5.7 2:3.0 3:4.2 4:1.2 -1.0 1:5.7 2:2.9 3:4.2 4:1.3 -1.0 1:6.2 2:2.9 3:4.3 4:1.3 -1.0 1:5.1 2:2.5 3:3.0 4:1.1 -1.0 1:5.7 2:2.8 3:4.1 4:1.3 -2.0 1:6.3 2:3.3 3:6.0 4:2.5 -2.0 1:5.8 2:2.7 3:5.1 4:1.9 -2.0 1:7.1 2:3.0 3:5.9 4:2.1 -2.0 1:6.3 2:2.9 3:5.6 4:1.8 -2.0 1:6.5 2:3.0 3:5.8 4:2.2 -2.0 1:7.6 2:3.0 3:6.6 4:2.1 -2.0 1:4.9 2:2.5 3:4.5 4:1.7 -2.0 1:7.3 2:2.9 3:6.3 4:1.8 -2.0 1:6.7 2:2.5 3:5.8 4:1.8 -2.0 1:7.2 2:3.6 3:6.1 4:2.5 -2.0 1:6.5 2:3.2 3:5.1 4:2.0 -2.0 1:6.4 2:2.7 3:5.3 4:1.9 -2.0 1:6.8 2:3.0 3:5.5 4:2.1 -2.0 1:5.7 2:2.5 3:5.0 4:2.0 -2.0 1:5.8 2:2.8 3:5.1 4:2.4 -2.0 1:6.4 2:3.2 3:5.3 4:2.3 -2.0 1:6.5 2:3.0 3:5.5 4:1.8 -2.0 1:7.7 2:3.8 3:6.7 4:2.2 -2.0 1:7.7 2:2.6 3:6.9 4:2.3 -2.0 1:6.0 2:2.2 3:5.0 4:1.5 -2.0 1:6.9 2:3.2 3:5.7 4:2.3 -2.0 1:5.6 2:2.8 3:4.9 4:2.0 -2.0 1:7.7 2:2.8 3:6.7 4:2.0 -2.0 1:6.3 2:2.7 3:4.9 4:1.8 -2.0 1:6.7 2:3.3 3:5.7 4:2.1 -2.0 1:7.2 2:3.2 3:6.0 4:1.8 -2.0 1:6.2 2:2.8 3:4.8 4:1.8 -2.0 1:6.1 2:3.0 3:4.9 4:1.8 -2.0 1:6.4 2:2.8 3:5.6 4:2.1 -2.0 1:7.2 2:3.0 3:5.8 4:1.6 -2.0 1:7.4 2:2.8 3:6.1 4:1.9 -2.0 1:7.9 2:3.8 3:6.4 4:2.0 -2.0 1:6.4 2:2.8 3:5.6 4:2.2 -2.0 1:6.3 2:2.8 3:5.1 4:1.5 -2.0 1:6.1 2:2.6 3:5.6 4:1.4 -2.0 1:7.7 2:3.0 3:6.1 4:2.3 -2.0 1:6.3 2:3.4 3:5.6 4:2.4 -2.0 1:6.4 2:3.1 3:5.5 4:1.8 -2.0 1:6.0 2:3.0 3:4.8 4:1.8 -2.0 1:6.9 2:3.1 3:5.4 4:2.1 -2.0 1:6.7 2:3.1 3:5.6 4:2.4 -2.0 1:6.9 2:3.1 3:5.1 4:2.3 -2.0 1:5.8 2:2.7 3:5.1 4:1.9 -2.0 1:6.8 2:3.2 3:5.9 4:2.3 -2.0 1:6.7 2:3.3 3:5.7 4:2.5 -2.0 1:6.7 2:3.0 3:5.2 4:2.3 -2.0 1:6.3 2:2.5 3:5.0 4:1.9 -2.0 1:6.5 2:3.0 3:5.2 4:2.0 -2.0 1:6.2 2:3.4 3:5.4 4:2.3 -2.0 1:5.9 2:3.0 3:5.1 4:1.8 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/kmeans_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/kmeans_data.txt deleted file mode 100644 index 338664f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/kmeans_data.txt +++ /dev/null @@ -1,6 +0,0 @@ -0.0 0.0 0.0 -0.1 0.1 0.1 -0.2 0.2 0.2 -9.0 9.0 9.0 -9.1 9.1 9.1 -9.2 9.2 9.2 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/pagerank_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/pagerank_data.txt deleted file mode 100644 index 95755ab..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/pagerank_data.txt +++ /dev/null @@ -1,6 +0,0 @@ -1 2 -1 3 -1 4 -2 1 -3 1 -4 1 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/pic_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/pic_data.txt deleted file mode 100644 index fcfef8c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/pic_data.txt +++ /dev/null @@ -1,19 +0,0 @@ -0 1 1.0 -0 2 1.0 -0 3 1.0 -1 2 1.0 -1 3 1.0 -2 3 1.0 -3 4 0.1 -4 5 1.0 -4 15 1.0 -5 6 1.0 -6 7 1.0 -7 8 1.0 -8 9 1.0 -9 10 1.0 -10 11 1.0 -11 12 1.0 -12 13 1.0 -13 14 1.0 -14 15 1.0 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/ridge-data/lpsa.data b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/ridge-data/lpsa.data deleted file mode 100644 index fdd16e3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/ridge-data/lpsa.data +++ /dev/null @@ -1,67 
+0,0 @@ --0.4307829,-1.63735562648104 -2.00621178480549 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 --0.1625189,-1.98898046126935 -0.722008756122123 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 --0.1625189,-1.57881887548545 -2.1887840293994 1.36116336875686 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541 --0.1625189,-2.16691708463163 -0.807993896938655 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -0.3715636,-0.507874475300631 -0.458834049396776 -0.250631301876899 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -0.7654678,-2.03612849966376 -0.933954647105133 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -0.8544153,-0.557312518810673 -0.208756571683607 -0.787896192088153 0.990146852537193 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -1.2669476,-0.929360463147704 -0.0578991819441687 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -1.2669476,-2.28833047634983 -0.0706369432557794 -0.116315079324086 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -1.2669476,0.223498042876113 -1.41471935455355 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341 -1.3480731,0.107785900236813 -1.47221551299731 0.420949810887169 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.687186906466865 -1.446919,0.162180092313795 -1.32557369901905 0.286633588334355 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -1.4701758,-1.49795329918548 -0.263601072284232 0.823898478545609 0.788388310173035 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341 -1.4929041,0.796247055396743 0.0476559407005752 0.286633588334355 -1.02470580167082 -0.522940888712441 0.394013435896129 -1.04215728919298 -0.864466507337306 -1.5581446,-1.62233848461465 -0.843294091975396 -3.07127197548598 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -1.5993876,-0.990720665490831 0.458513517212311 0.823898478545609 1.07379746308195 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -1.6389967,-0.171901281967138 -0.489197399065355 -0.65357996953534 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -1.6956156,-1.60758252338831 -0.590700340358265 -0.65357996953534 -0.619561070667254 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -1.7137979,0.366273918511144 -0.414014962912583 -0.116315079324086 0.232904453212813 -0.522940888712441 0.971228997418125 0.342627053981254 1.26288870310799 -1.8000583,-0.710307384579833 0.211731938156277 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.442797990776478 0.342627053981254 1.61744790484887 -1.8484548,-0.262791728113881 -1.16708345615721 0.420949810887169 0.0846342590816532 -0.522940888712441 0.163172393491611 0.342627053981254 1.97200710658975 -1.8946169,0.899043117369237 -0.590700340358265 0.152317365781542 -1.02470580167082 -0.522940888712441 1.28643254437683 -1.04215728919298 -0.864466507337306 -1.9242487,-0.903451690500615 1.07659722048274 
0.152317365781542 1.28380453408541 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306 -2.008214,-0.0633337899773081 -1.38088970920094 0.958214701098423 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -2.0476928,-1.15393789990757 -0.961853075398404 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306 -2.1575593,0.0620203721138446 0.0657973885499142 1.22684714620405 -0.468824786336838 -0.522940888712441 1.31421001659859 1.72741139715549 -0.332627704725983 -2.1916535,-0.75731027755674 -2.92717970468456 0.018001143228728 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 -2.2137539,1.11226993252773 1.06484916245061 0.555266033439982 0.877691038550889 1.89254797819741 1.43890404648442 0.342627053981254 0.376490698755783 -2.2772673,-0.468768642850639 -1.43754788774533 -1.05652863719378 0.576050411655607 -0.522940888712441 0.0120483832567209 0.342627053981254 -0.687186906466865 -2.2975726,-0.618884859896728 -1.1366360750781 -0.519263746982526 -1.02470580167082 -0.522940888712441 -0.863171185425945 3.11219574032972 1.97200710658975 -2.3272777,-0.651431999123483 0.55329161145762 -0.250631301876899 1.11210019001038 -0.522940888712441 -0.179808625688859 -1.04215728919298 -0.864466507337306 -2.5217206,0.115499102435224 -0.512233676577595 0.286633588334355 1.13650173283446 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.155348103855541 -2.5533438,0.266341329949937 -0.551137885443386 -0.384947524429713 0.354857790686005 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 -2.5687881,1.16902610257751 0.855491905752846 2.03274448152093 1.22628985326088 1.89254797819741 2.02833774827712 3.11219574032972 2.68112551007152 -2.6567569,-0.218972367124187 0.851192298581141 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 0.908329501367106 -2.677591,0.263121415733908 1.4142681068416 0.018001143228728 1.35980653053822 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -2.7180005,-0.0704736333296423 1.52000996595417 0.286633588334355 1.39364261119802 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 -2.7942279,-0.751957286017338 0.316843561689933 -1.99674219506348 0.911736065044475 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -2.8063861,-0.685277652430997 1.28214038482516 0.823898478545609 0.232904453212813 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541 -2.8124102,-0.244991501432929 0.51882005949686 -0.384947524429713 0.823246560137838 -0.522940888712441 -0.863171185425945 0.342627053981254 0.553770299626224 -2.8419982,-0.75731027755674 2.09041984898851 1.22684714620405 1.53428167116843 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -2.8535925,1.20962937075363 -0.242882661178889 1.09253092365124 -1.02470580167082 -0.522940888712441 1.24263233939889 3.11219574032972 2.50384590920108 -2.9204698,0.570886990493502 0.58243883987948 0.555266033439982 1.16006887775962 -0.522940888712441 1.07357183940747 0.342627053981254 1.61744790484887 -2.9626924,0.719758684343624 0.984970304132004 1.09253092365124 1.52137230773457 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.509907305596424 -2.9626924,-1.52406140158064 1.81975700990333 0.689582255992796 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 
-2.9729753,-0.132431544081234 2.68769877553723 1.09253092365124 1.53428167116843 -0.522940888712441 -0.442797990776478 0.342627053981254 -0.687186906466865 -3.0130809,0.436161292804989 -0.0834447307428255 -0.519263746982526 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799 -3.0373539,-0.161195191984091 -0.671900359186746 1.7641120364153 1.13650173283446 -0.522940888712441 -0.863171185425945 0.342627053981254 0.0219314970149 -3.2752562,1.39927182372944 0.513852869452676 0.689582255992796 -1.02470580167082 1.89254797819741 1.49394503405693 0.342627053981254 -0.155348103855541 -3.3375474,1.51967002306341 -0.852203755696565 0.555266033439982 -0.104527297798983 1.89254797819741 1.85927724828569 0.342627053981254 0.908329501367106 -3.3928291,0.560725834706224 1.87867703391426 1.09253092365124 1.39364261119802 -0.522940888712441 0.486423065822545 0.342627053981254 1.26288870310799 -3.4355988,1.00765532502814 1.69426310090641 1.89842825896812 1.53428167116843 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.509907305596424 -3.4578927,1.10152996153577 -0.10927271844907 0.689582255992796 -1.02470580167082 1.89254797819741 1.97630171771485 0.342627053981254 1.61744790484887 -3.5160131,0.100001934217311 -1.30380956369388 0.286633588334355 0.316555063757567 -0.522940888712441 0.28786643052924 0.342627053981254 0.553770299626224 -3.5307626,0.987291634724086 -0.36279314978779 -0.922212414640967 0.232904453212813 -0.522940888712441 1.79270085261407 0.342627053981254 1.26288870310799 -3.5652984,1.07158528137575 0.606453149641961 1.7641120364153 -0.432854616994416 1.89254797819741 0.528504607720369 0.342627053981254 0.199211097885341 -3.5876769,0.180156323255198 0.188987436375017 -0.519263746982526 1.09956763075594 -0.522940888712441 0.708239632330506 0.342627053981254 0.199211097885341 -3.6309855,1.65687973755377 -0.256675483533719 0.018001143228728 -1.02470580167082 1.89254797819741 1.79270085261407 0.342627053981254 1.26288870310799 -3.6800909,0.5720085322365 0.239854450210939 -0.787896192088153 1.0605418233138 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -3.7123518,0.323806133438225 -0.606717660886078 -0.250631301876899 -1.02470580167082 1.89254797819741 0.342907418101747 0.342627053981254 0.199211097885341 -3.9843437,1.23668206715898 2.54220539083611 0.152317365781542 -1.02470580167082 1.89254797819741 1.89037692416194 0.342627053981254 1.26288870310799 -3.993603,0.180156323255198 0.154448192444669 1.62979581386249 0.576050411655607 1.89254797819741 0.708239632330506 0.342627053981254 1.79472750571931 -4.029806,1.60906277046565 1.10378605019827 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 -4.1295508,1.0036214996026 0.113496885050331 -0.384947524429713 0.860016436332751 1.89254797819741 -0.863171185425945 0.342627053981254 -0.332627704725983 -4.3851468,1.25591974271076 0.577607033774471 0.555266033439982 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799 -4.6844434,2.09650591351268 0.625488598331018 -2.66832330782754 -1.02470580167082 1.89254797819741 1.67954222367555 0.342627053981254 0.553770299626224 -5.477509,1.30028987435881 0.338383613253713 0.555266033439982 1.00481276295349 1.89254797819741 1.24263233939889 0.342627053981254 1.97200710658975 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_binary_classification_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_binary_classification_data.txt 
deleted file mode 100644 index 861c70c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_binary_classification_data.txt +++ /dev/null @@ -1,100 +0,0 @@ -0 128:51 129:159 130:253 131:159 132:50 155:48 156:238 157:252 158:252 159:252 160:237 182:54 183:227 184:253 185:252 186:239 187:233 188:252 189:57 190:6 208:10 209:60 210:224 211:252 212:253 213:252 214:202 215:84 216:252 217:253 218:122 236:163 237:252 238:252 239:252 240:253 241:252 242:252 243:96 244:189 245:253 246:167 263:51 264:238 265:253 266:253 267:190 268:114 269:253 270:228 271:47 272:79 273:255 274:168 290:48 291:238 292:252 293:252 294:179 295:12 296:75 297:121 298:21 301:253 302:243 303:50 317:38 318:165 319:253 320:233 321:208 322:84 329:253 330:252 331:165 344:7 345:178 346:252 347:240 348:71 349:19 350:28 357:253 358:252 359:195 372:57 373:252 374:252 375:63 385:253 386:252 387:195 400:198 401:253 402:190 413:255 414:253 415:196 427:76 428:246 429:252 430:112 441:253 442:252 443:148 455:85 456:252 457:230 458:25 467:7 468:135 469:253 470:186 471:12 483:85 484:252 485:223 494:7 495:131 496:252 497:225 498:71 511:85 512:252 513:145 521:48 522:165 523:252 524:173 539:86 540:253 541:225 548:114 549:238 550:253 551:162 567:85 568:252 569:249 570:146 571:48 572:29 573:85 574:178 575:225 576:253 577:223 578:167 579:56 595:85 596:252 597:252 598:252 599:229 600:215 601:252 602:252 603:252 604:196 605:130 623:28 624:199 625:252 626:252 627:253 628:252 629:252 630:233 631:145 652:25 653:128 654:252 655:253 656:252 657:141 658:37 -1 159:124 160:253 161:255 162:63 186:96 187:244 188:251 189:253 190:62 214:127 215:251 216:251 217:253 218:62 241:68 242:236 243:251 244:211 245:31 246:8 268:60 269:228 270:251 271:251 272:94 296:155 297:253 298:253 299:189 323:20 324:253 325:251 326:235 327:66 350:32 351:205 352:253 353:251 354:126 378:104 379:251 380:253 381:184 382:15 405:80 406:240 407:251 408:193 409:23 432:32 433:253 434:253 435:253 436:159 460:151 461:251 462:251 463:251 464:39 487:48 488:221 489:251 490:251 491:172 515:234 516:251 517:251 518:196 519:12 543:253 544:251 545:251 546:89 570:159 571:255 572:253 573:253 574:31 597:48 598:228 599:253 600:247 601:140 602:8 625:64 626:251 627:253 628:220 653:64 654:251 655:253 656:220 681:24 682:193 683:253 684:220 -1 125:145 126:255 127:211 128:31 152:32 153:237 154:253 155:252 156:71 180:11 181:175 182:253 183:252 184:71 209:144 210:253 211:252 212:71 236:16 237:191 238:253 239:252 240:71 264:26 265:221 266:253 267:252 268:124 269:31 293:125 294:253 295:252 296:252 297:108 322:253 323:252 324:252 325:108 350:255 351:253 352:253 353:108 378:253 379:252 380:252 381:108 406:253 407:252 408:252 409:108 434:253 435:252 436:252 437:108 462:255 463:253 464:253 465:170 490:253 491:252 492:252 493:252 494:42 518:149 519:252 520:252 521:252 522:144 546:109 547:252 548:252 549:252 550:144 575:218 576:253 577:253 578:255 579:35 603:175 604:252 605:252 606:253 607:35 631:73 632:252 633:252 634:253 635:35 659:31 660:211 661:252 662:253 663:35 -1 153:5 154:63 155:197 181:20 182:254 183:230 184:24 209:20 210:254 211:254 212:48 237:20 238:254 239:255 240:48 265:20 266:254 267:254 268:57 293:20 294:254 295:254 296:108 321:16 322:239 323:254 324:143 350:178 351:254 352:143 378:178 379:254 380:143 406:178 407:254 408:162 434:178 435:254 436:240 462:113 463:254 464:240 490:83 491:254 492:245 493:31 518:79 519:254 520:246 521:38 547:214 548:254 549:150 575:144 576:241 577:8 603:144 604:240 605:2 631:144 632:254 633:82 659:230 660:247 661:40 687:168 688:209 689:31 -1 152:1 153:168 154:242 
155:28 180:10 181:228 182:254 183:100 209:190 210:254 211:122 237:83 238:254 239:162 265:29 266:254 267:248 268:25 293:29 294:255 295:254 296:103 321:29 322:254 323:254 324:109 349:29 350:254 351:254 352:109 377:29 378:254 379:254 380:109 405:29 406:255 407:254 408:109 433:29 434:254 435:254 436:109 461:29 462:254 463:254 464:63 489:29 490:254 491:254 492:28 517:29 518:254 519:254 520:28 545:29 546:254 547:254 548:35 573:29 574:254 575:254 576:109 601:6 602:212 603:254 604:109 630:203 631:254 632:178 658:155 659:254 660:190 686:32 687:199 688:104 -0 130:64 131:253 132:255 133:63 157:96 158:205 159:251 160:253 161:205 162:111 163:4 184:96 185:189 186:251 187:251 188:253 189:251 190:251 191:31 209:16 210:64 211:223 212:244 213:251 214:251 215:211 216:213 217:251 218:251 219:31 236:80 237:181 238:251 239:253 240:251 241:251 242:251 243:94 244:96 245:251 246:251 247:31 263:92 264:253 265:253 266:253 267:255 268:253 269:253 270:253 271:95 272:96 273:253 274:253 275:31 290:92 291:236 292:251 293:243 294:220 295:233 296:251 297:251 298:243 299:82 300:96 301:251 302:251 303:31 317:80 318:253 319:251 320:251 321:188 323:96 324:251 325:251 326:109 328:96 329:251 330:251 331:31 344:96 345:240 346:253 347:243 348:188 349:42 351:96 352:204 353:109 354:4 356:12 357:197 358:251 359:31 372:221 373:251 374:253 375:121 379:36 380:23 385:190 386:251 387:31 399:48 400:234 401:253 413:191 414:253 415:31 426:44 427:221 428:251 429:251 440:12 441:197 442:251 443:31 454:190 455:251 456:251 457:251 468:96 469:251 470:251 471:31 482:190 483:251 484:251 485:113 495:40 496:234 497:251 498:219 499:23 510:190 511:251 512:251 513:94 522:40 523:217 524:253 525:231 526:47 538:191 539:253 540:253 541:253 548:12 549:174 550:253 551:253 552:219 553:39 566:67 567:236 568:251 569:251 570:191 571:190 572:111 573:72 574:190 575:191 576:197 577:251 578:243 579:121 580:39 595:63 596:236 597:251 598:253 599:251 600:251 601:251 602:251 603:253 604:251 605:188 606:94 624:27 625:129 626:253 627:251 628:251 629:251 630:251 631:229 632:168 633:15 654:95 655:212 656:251 657:211 658:94 659:59 -1 159:121 160:254 161:136 186:13 187:230 188:253 189:248 190:99 213:4 214:118 215:253 216:253 217:225 218:42 241:61 242:253 243:253 244:253 245:74 268:32 269:206 270:253 271:253 272:186 273:9 296:211 297:253 298:253 299:239 300:69 324:254 325:253 326:253 327:133 351:142 352:255 353:253 354:186 355:8 378:149 379:229 380:254 381:207 382:21 405:54 406:229 407:253 408:254 409:105 433:152 434:254 435:254 436:213 437:26 460:112 461:251 462:253 463:253 464:26 487:29 488:212 489:253 490:250 491:149 514:36 515:214 516:253 517:253 518:137 542:75 543:253 544:253 545:253 546:59 570:93 571:253 572:253 573:189 574:17 598:224 599:253 600:253 601:84 625:43 626:235 627:253 628:126 629:1 653:99 654:248 655:253 656:119 682:225 683:235 684:49 -1 100:166 101:222 102:55 128:197 129:254 130:218 131:5 155:29 156:249 157:254 158:254 159:9 183:45 184:254 185:254 186:174 187:2 210:4 211:164 212:254 213:254 214:85 238:146 239:254 240:254 241:254 242:85 265:101 266:245 267:254 268:254 269:254 270:85 292:97 293:248 294:254 295:204 296:254 297:254 298:85 315:12 316:59 317:98 318:151 319:237 320:254 321:254 322:109 323:35 324:254 325:254 326:85 343:41 344:216 345:254 346:254 347:239 348:153 349:37 350:4 351:32 352:254 353:254 354:85 372:7 373:44 374:44 375:30 379:32 380:254 381:254 382:96 407:19 408:230 409:254 410:174 436:197 437:254 438:110 464:197 465:254 466:85 492:197 493:253 494:63 515:37 516:54 517:54 518:45 519:26 520:84 521:221 522:84 523:21 524:31 525:162 526:78 540:6 
541:41 542:141 543:244 544:254 545:254 546:248 547:236 548:254 549:254 550:254 551:233 552:239 553:254 554:138 567:23 568:167 569:254 570:254 571:254 572:254 573:229 574:228 575:185 576:138 577:138 578:138 579:138 580:138 581:138 582:44 595:113 596:254 597:254 598:254 599:179 600:64 601:5 623:32 624:209 625:183 626:97 -0 155:53 156:255 157:253 158:253 159:253 160:124 183:180 184:253 185:251 186:251 187:251 188:251 189:145 190:62 209:32 210:217 211:241 212:253 213:251 214:251 215:251 216:251 217:253 218:107 237:37 238:251 239:251 240:253 241:251 242:251 243:251 244:251 245:253 246:107 265:166 266:251 267:251 268:253 269:251 270:96 271:148 272:251 273:253 274:107 291:73 292:253 293:253 294:253 295:253 296:130 299:110 300:253 301:255 302:108 319:73 320:251 321:251 322:251 323:251 327:109 328:251 329:253 330:107 347:202 348:251 349:251 350:251 351:225 354:6 355:129 356:251 357:253 358:107 375:150 376:251 377:251 378:251 379:71 382:115 383:251 384:251 385:253 386:107 403:253 404:251 405:251 406:173 407:20 410:217 411:251 412:251 413:253 414:107 430:182 431:255 432:253 433:216 438:218 439:253 440:253 441:182 457:63 458:221 459:253 460:251 461:215 465:84 466:236 467:251 468:251 469:77 485:109 486:251 487:253 488:251 489:215 492:11 493:160 494:251 495:251 496:96 513:109 514:251 515:253 516:251 517:137 520:150 521:251 522:251 523:251 524:71 541:109 542:251 543:253 544:251 545:35 547:130 548:253 549:251 550:251 551:173 552:20 569:110 570:253 571:255 572:253 573:98 574:150 575:253 576:255 577:253 578:164 597:109 598:251 599:253 600:251 601:251 602:251 603:251 604:253 605:251 606:35 625:93 626:241 627:253 628:251 629:251 630:251 631:251 632:216 633:112 634:5 654:103 655:253 656:251 657:251 658:251 659:251 683:124 684:251 685:225 686:71 687:71 -0 128:73 129:253 130:227 131:73 132:21 156:73 157:251 158:251 159:251 160:174 182:16 183:166 184:228 185:251 186:251 187:251 188:122 210:62 211:220 212:253 213:251 214:251 215:251 216:251 217:79 238:79 239:231 240:253 241:251 242:251 243:251 244:251 245:232 246:77 264:145 265:253 266:253 267:253 268:255 269:253 270:253 271:253 272:253 273:255 274:108 292:144 293:251 294:251 295:251 296:253 297:168 298:107 299:169 300:251 301:253 302:189 303:20 318:27 319:89 320:236 321:251 322:235 323:215 324:164 325:15 326:6 327:129 328:251 329:253 330:251 331:35 345:47 346:211 347:253 348:251 349:251 350:142 354:37 355:251 356:251 357:253 358:251 359:35 373:109 374:251 375:253 376:251 377:251 378:142 382:11 383:148 384:251 385:253 386:251 387:164 400:11 401:150 402:253 403:255 404:211 405:25 410:11 411:150 412:253 413:255 414:211 415:25 428:140 429:251 430:251 431:253 432:107 438:37 439:251 440:251 441:211 442:46 456:190 457:251 458:251 459:253 460:128 461:5 466:37 467:251 468:251 469:51 484:115 485:251 486:251 487:253 488:188 489:20 492:32 493:109 494:129 495:251 496:173 497:103 512:217 513:251 514:251 515:201 516:30 520:73 521:251 522:251 523:251 524:71 540:166 541:253 542:253 543:255 544:149 545:73 546:150 547:253 548:255 549:253 550:253 551:143 568:140 569:251 570:251 571:253 572:251 573:251 574:251 575:251 576:253 577:251 578:230 579:61 596:190 597:251 598:251 599:253 600:251 601:251 602:251 603:251 604:242 605:215 606:55 624:21 625:189 626:251 627:253 628:251 629:251 630:251 631:173 632:103 653:31 654:200 655:253 656:251 657:96 658:71 659:20 -1 155:178 156:255 157:105 182:6 183:188 184:253 185:216 186:14 210:14 211:202 212:253 213:253 214:23 238:12 239:199 240:253 241:128 242:6 266:42 267:253 268:253 269:158 294:42 295:253 296:253 297:158 322:155 323:253 324:253 325:158 
350:160 351:253 352:253 353:147 378:160 379:253 380:253 381:41 405:17 406:225 407:253 408:235 409:31 433:24 434:253 435:253 436:176 461:24 462:253 463:253 464:176 489:24 490:253 491:253 492:176 517:24 518:253 519:253 520:176 545:24 546:253 547:253 548:162 573:46 574:253 575:253 576:59 601:142 602:253 603:253 604:59 629:142 630:253 631:253 632:59 657:142 658:253 659:202 660:8 685:87 686:253 687:139 -0 154:46 155:105 156:254 157:254 158:254 159:254 160:255 161:239 162:41 180:37 181:118 182:222 183:254 184:253 185:253 186:253 187:253 188:253 189:253 190:211 191:54 207:14 208:200 209:253 210:253 211:254 212:253 213:253 214:253 215:253 216:253 217:253 218:253 219:116 233:16 234:160 235:236 236:253 237:253 238:253 239:254 240:253 241:253 242:246 243:229 244:253 245:253 246:253 247:116 261:99 262:253 263:253 264:253 265:253 266:253 267:254 268:253 269:253 270:213 271:99 272:253 273:253 274:253 275:116 288:25 289:194 290:253 291:253 292:253 293:253 294:131 295:97 296:169 297:253 298:93 299:99 300:253 301:253 302:253 303:116 316:206 317:253 318:253 319:251 320:233 321:127 322:9 324:18 325:38 326:3 327:15 328:171 329:253 330:253 331:116 343:55 344:240 345:253 346:253 347:233 355:31 356:186 357:253 358:253 359:116 371:176 372:253 373:253 374:253 375:127 383:99 384:253 385:253 386:253 387:116 399:176 400:253 401:253 402:131 403:9 411:99 412:253 413:253 414:253 415:116 426:119 427:254 428:254 429:232 430:75 440:158 441:254 442:254 443:117 454:118 455:253 456:253 457:154 468:156 469:253 470:253 471:116 482:118 483:253 484:253 485:154 496:156 497:253 498:253 499:116 509:46 510:222 511:253 512:253 513:154 522:7 523:116 524:246 525:253 526:180 527:9 538:118 539:253 540:253 541:154 550:116 551:253 552:253 553:253 554:174 566:118 567:253 568:253 569:154 577:110 578:246 579:253 580:253 581:240 582:67 594:118 595:253 596:253 597:238 598:215 599:49 600:20 601:20 602:20 603:66 604:215 605:241 606:253 607:245 608:233 609:64 622:82 623:229 624:253 625:253 626:253 627:253 628:253 629:253 630:253 631:254 632:253 633:253 634:240 635:107 651:176 652:253 653:253 654:253 655:253 656:253 657:253 658:253 659:254 660:253 661:253 662:108 679:40 680:239 681:253 682:253 683:253 684:253 685:253 686:253 687:254 688:161 689:57 690:4 -0 152:56 153:105 154:220 155:254 156:63 178:18 179:166 180:233 181:253 182:253 183:253 184:236 185:209 186:209 187:209 188:77 189:18 206:84 207:253 208:253 209:253 210:253 211:253 212:254 213:253 214:253 215:253 216:253 217:172 218:8 233:57 234:238 235:253 236:253 237:253 238:253 239:253 240:254 241:253 242:253 243:253 244:253 245:253 246:119 260:14 261:238 262:253 263:253 264:253 265:253 266:253 267:253 268:179 269:196 270:253 271:253 272:253 273:253 274:238 275:12 288:33 289:253 290:253 291:253 292:253 293:253 294:248 295:134 297:18 298:83 299:237 300:253 301:253 302:253 303:14 316:164 317:253 318:253 319:253 320:253 321:253 322:128 327:57 328:119 329:214 330:253 331:94 343:57 344:248 345:253 346:253 347:253 348:126 349:14 350:4 357:179 358:253 359:248 360:56 371:175 372:253 373:253 374:240 375:190 376:28 385:179 386:253 387:253 388:173 399:209 400:253 401:253 402:178 413:92 414:253 415:253 416:208 427:211 428:254 429:254 430:179 442:135 443:255 444:209 455:209 456:253 457:253 458:90 470:134 471:253 472:208 483:209 484:253 485:253 486:178 497:2 498:142 499:253 500:208 511:209 512:253 513:253 514:214 515:35 525:30 526:253 527:253 528:208 539:165 540:253 541:253 542:253 543:215 544:36 553:163 554:253 555:253 556:164 567:18 568:172 569:253 570:253 571:253 572:214 573:127 574:7 580:72 581:232 582:253 
583:171 584:17 596:8 597:182 598:253 599:253 600:253 601:253 602:162 603:56 607:64 608:240 609:253 610:253 611:14 625:7 626:173 627:253 628:253 629:253 630:253 631:245 632:241 633:239 634:239 635:246 636:253 637:225 638:14 639:1 654:18 655:59 656:138 657:224 658:253 659:253 660:254 661:253 662:253 663:253 664:240 665:96 685:37 686:104 687:192 688:255 689:253 690:253 691:182 692:73 -1 130:7 131:176 132:254 133:224 158:51 159:253 160:253 161:223 185:4 186:170 187:253 188:253 189:214 213:131 214:253 215:253 216:217 217:39 241:209 242:253 243:253 244:134 268:75 269:240 270:253 271:239 272:26 296:184 297:253 298:245 299:63 323:142 324:255 325:253 326:185 350:62 351:229 352:254 353:242 354:73 377:54 378:229 379:253 380:254 381:105 405:152 406:254 407:254 408:213 409:26 432:32 433:243 434:253 435:253 436:115 459:2 460:142 461:253 462:253 463:155 487:30 488:253 489:253 490:232 491:55 515:75 516:253 517:253 518:164 542:72 543:232 544:253 545:189 546:17 570:224 571:253 572:253 573:163 597:43 598:235 599:253 600:253 601:195 602:21 625:28 626:231 627:253 628:253 629:184 630:14 654:225 655:253 656:253 657:75 -0 155:21 156:176 157:253 158:253 159:124 182:105 183:176 184:251 185:251 186:251 187:251 188:105 208:58 209:217 210:241 211:253 212:251 213:251 214:251 215:251 216:243 217:113 218:5 235:63 236:231 237:251 238:251 239:253 240:251 241:251 242:251 243:251 244:253 245:251 246:113 263:144 264:251 265:251 266:251 267:253 268:251 269:251 270:251 271:251 272:253 273:251 274:215 290:125 291:253 292:253 293:253 294:253 295:255 296:253 297:253 298:253 299:253 300:255 301:253 302:227 303:42 318:253 319:251 320:251 321:251 322:251 323:253 324:251 325:251 326:251 327:251 328:253 329:251 330:251 331:142 345:27 346:253 347:251 348:251 349:235 350:241 351:253 352:251 353:246 354:137 355:35 356:98 357:251 358:251 359:236 360:61 372:47 373:211 374:253 375:251 376:235 377:82 378:103 379:253 380:251 381:137 384:73 385:251 386:251 387:251 388:71 399:27 400:211 401:251 402:253 403:251 404:86 407:72 408:71 409:10 412:73 413:251 414:251 415:173 416:20 427:89 428:253 429:253 430:255 431:253 432:35 440:73 441:253 442:253 443:253 444:72 454:84 455:236 456:251 457:251 458:253 459:251 460:138 468:73 469:251 470:251 471:251 472:71 481:63 482:236 483:251 484:251 485:251 486:227 487:251 488:246 489:138 490:11 494:16 495:37 496:228 497:251 498:246 499:137 500:10 509:73 510:251 511:251 512:251 513:173 514:42 515:142 516:142 517:142 518:41 522:109 523:251 524:253 525:251 526:137 537:73 538:251 539:251 540:173 541:20 549:27 550:211 551:251 552:253 553:147 554:10 565:73 566:253 567:253 568:143 575:21 576:176 577:253 578:253 579:253 593:73 594:251 595:251 596:205 597:144 603:176 604:251 605:251 606:188 607:107 621:62 622:236 623:251 624:251 625:251 626:218 627:217 628:217 629:217 630:217 631:253 632:230 633:189 634:20 650:83 651:158 652:251 653:251 654:253 655:251 656:251 657:251 658:251 659:253 660:107 679:37 680:251 681:251 682:253 683:251 684:251 685:251 686:122 687:72 688:30 -1 151:68 152:45 153:131 154:131 155:131 156:101 157:68 158:92 159:44 187:19 188:170 211:29 212:112 213:89 215:40 216:222 239:120 240:254 241:251 242:127 243:40 244:222 267:197 268:254 269:254 270:91 271:40 272:222 294:64 295:247 296:254 297:236 298:50 299:40 300:107 322:184 323:254 324:254 325:91 327:6 328:14 350:203 351:254 352:254 353:71 377:23 378:218 379:254 380:254 381:71 405:113 406:254 407:255 408:239 409:53 433:210 434:254 435:254 436:195 460:62 461:242 462:254 463:241 464:88 468:28 488:86 489:254 490:254 491:189 495:28 496:104 516:106 517:254 518:254 
[Raw data omitted: this portion of the diff adds a large sample dataset in sparse LIBSVM format. Each record is a label (0 or 1) followed by "index:value" pairs, with feature indices up to roughly 700 and values in the 0–255 range, apparently flattened grayscale digit images. The per-record line breaks and "+" diff markers were lost in extraction, so the raw pixel dumps are not reproduced here.]
519:253 520:116 546:94 547:254 548:179 549:11 574:93 575:253 576:246 577:101 602:145 603:253 604:255 605:92 630:93 631:253 632:246 633:59 658:93 659:253 660:74 -0 127:46 128:105 129:254 130:254 131:224 132:59 133:59 134:9 155:196 156:254 157:253 158:253 159:253 160:253 161:253 162:128 182:96 183:235 184:254 185:253 186:253 187:253 188:253 189:253 190:247 191:122 208:4 209:101 210:244 211:253 212:254 213:234 214:241 215:253 216:253 217:253 218:253 219:186 220:18 236:96 237:253 238:253 239:253 240:232 241:83 242:109 243:170 244:253 245:253 246:253 247:253 248:116 264:215 265:253 266:253 267:253 268:196 271:40 272:253 273:253 274:253 275:253 276:116 290:8 291:141 292:247 293:253 294:253 295:237 296:29 299:6 300:38 301:171 302:253 303:253 304:116 317:13 318:146 319:253 320:253 321:253 322:253 323:57 329:156 330:253 331:253 332:116 345:40 346:253 347:253 348:253 349:253 350:178 351:27 357:156 358:253 359:253 360:116 372:136 373:204 374:253 375:253 376:253 377:192 378:27 385:156 386:253 387:253 388:116 399:28 400:195 401:254 402:254 403:254 404:250 405:135 412:99 413:255 414:254 415:254 416:117 427:118 428:253 429:253 430:253 431:253 432:142 439:19 440:170 441:253 442:253 443:216 444:62 454:42 455:212 456:253 457:253 458:253 459:253 460:38 466:124 467:188 468:253 469:253 470:253 471:174 482:59 483:253 484:253 485:253 486:237 487:93 488:3 491:31 492:40 493:130 494:247 495:253 496:253 497:253 498:204 499:13 510:59 511:253 512:253 513:253 514:154 518:54 519:218 520:254 521:253 522:253 523:253 524:253 525:253 526:38 538:59 539:253 540:253 541:253 542:215 543:156 544:156 545:156 546:209 547:253 548:255 549:253 550:253 551:253 552:192 553:97 554:15 566:55 567:242 568:253 569:253 570:253 571:253 572:253 573:253 574:253 575:253 576:254 577:253 578:253 579:204 580:23 595:118 596:253 597:253 598:253 599:253 600:253 601:253 602:253 603:253 604:254 605:216 606:174 607:13 623:54 624:116 625:243 626:253 627:253 628:253 629:253 630:253 631:146 632:117 633:62 653:53 654:132 655:253 656:253 657:192 658:57 659:13 -1 125:42 126:232 127:254 128:58 153:86 154:253 155:253 156:58 181:86 182:253 183:253 184:58 209:206 210:253 211:253 212:58 237:215 238:253 239:253 240:58 265:215 266:253 267:253 268:58 293:215 294:253 295:253 296:58 321:215 322:253 323:253 324:58 349:215 350:253 351:253 352:58 377:215 378:253 379:253 380:58 405:215 406:253 407:253 408:58 433:188 434:253 435:253 436:85 461:86 462:253 463:253 464:200 465:12 489:29 490:223 491:253 492:253 493:151 518:209 519:253 520:253 521:194 546:128 547:253 548:253 549:200 550:8 574:32 575:213 576:253 577:253 578:152 579:6 603:32 604:221 605:253 606:253 607:153 608:5 632:90 633:215 634:253 635:253 636:151 661:59 662:253 663:253 664:84 -1 156:60 157:229 158:38 184:187 185:254 186:78 211:121 212:252 213:254 214:78 239:197 240:254 241:206 242:6 267:197 268:254 269:202 294:27 295:218 296:233 297:62 322:117 323:254 324:195 350:203 351:254 352:195 377:64 378:244 379:254 380:195 405:79 406:254 407:255 408:161 433:79 434:254 435:254 436:65 461:79 462:254 463:241 464:52 489:79 490:254 491:189 517:79 518:254 519:189 545:79 546:254 547:189 573:79 574:254 575:189 601:79 602:254 603:194 604:5 629:35 630:219 631:254 632:72 658:34 659:223 660:195 687:129 688:195 -1 101:11 102:150 103:72 129:37 130:251 131:71 157:63 158:251 159:71 185:217 186:251 187:71 213:217 214:251 215:71 240:145 241:253 242:253 243:72 267:42 268:206 269:251 270:251 271:71 295:99 296:251 297:251 298:251 299:71 323:253 324:251 325:251 326:251 327:71 350:130 351:253 352:251 353:251 354:251 355:71 377:110 378:253 
379:255 380:253 381:253 382:253 383:72 405:109 406:251 407:253 408:251 409:251 410:188 411:30 433:109 434:251 435:253 436:251 437:246 438:123 460:16 461:170 462:251 463:253 464:251 465:215 488:37 489:251 490:251 491:253 492:251 493:86 516:218 517:253 518:253 519:255 520:253 521:35 543:84 544:236 545:251 546:251 547:253 548:168 549:15 571:144 572:251 573:251 574:251 575:190 576:15 599:144 600:251 601:251 602:251 603:180 626:53 627:221 628:251 629:251 630:251 631:180 -0 127:45 128:254 129:254 130:254 131:148 132:24 133:9 154:43 155:254 156:252 157:252 158:252 159:252 160:252 161:121 162:13 181:58 182:237 183:254 184:252 185:252 186:252 187:252 188:252 189:252 190:68 208:69 209:224 210:252 211:254 212:252 213:252 214:252 215:252 216:252 217:252 218:135 219:17 235:75 236:216 237:252 238:252 239:254 240:231 241:168 242:252 243:252 244:252 245:252 246:252 247:45 262:77 263:212 264:252 265:252 266:252 267:242 268:93 269:32 270:114 271:177 272:252 273:252 274:252 275:158 276:12 289:75 290:212 291:252 292:252 293:252 294:252 295:231 299:116 300:252 301:252 302:252 303:252 304:21 316:69 317:216 318:252 319:252 320:252 321:252 322:252 323:62 327:116 328:252 329:252 330:252 331:252 332:21 344:93 345:252 346:252 347:252 348:252 349:252 350:62 355:21 356:158 357:252 358:252 359:252 360:21 371:64 372:239 373:252 374:252 375:252 376:252 377:252 378:21 384:139 385:252 386:252 387:252 388:21 398:5 399:87 400:254 401:254 402:254 403:254 404:237 405:41 411:11 412:150 413:254 414:254 415:254 416:22 425:5 426:85 427:252 428:252 429:252 430:252 431:222 432:55 439:116 440:252 441:252 442:252 443:214 444:18 453:24 454:252 455:252 456:252 457:252 458:252 459:91 466:26 467:153 468:252 469:252 470:252 471:45 481:24 482:252 483:252 484:252 485:252 486:252 487:91 492:18 493:93 494:151 495:252 496:252 497:252 498:184 499:28 509:24 510:252 511:252 512:252 513:252 514:252 515:164 516:116 517:116 518:116 519:117 520:141 521:252 522:252 523:252 524:252 525:252 526:68 537:24 538:252 539:252 540:252 541:252 542:252 543:252 544:252 545:252 546:252 547:254 548:252 549:252 550:252 551:252 552:252 553:163 554:31 565:9 566:121 567:252 568:252 569:252 570:252 571:252 572:252 573:252 574:252 575:254 576:252 577:252 578:252 579:178 580:91 581:33 594:13 595:119 596:252 597:252 598:252 599:252 600:252 601:252 602:252 603:254 604:252 605:252 606:184 607:37 623:13 624:121 625:252 626:252 627:252 628:252 629:252 630:252 631:254 632:214 633:45 634:28 652:8 653:21 654:21 655:169 656:252 657:252 658:41 659:22 660:18 -0 125:218 126:253 127:253 128:255 129:149 130:62 151:42 152:144 153:236 154:251 155:251 156:253 157:251 158:236 159:144 160:144 179:99 180:251 181:251 182:251 183:225 184:253 185:251 186:251 187:251 188:251 189:166 190:16 206:79 207:253 208:251 209:251 210:204 211:41 212:143 213:205 214:251 215:251 216:251 217:253 218:169 219:15 233:79 234:231 235:253 236:251 237:225 238:41 241:41 242:226 243:251 244:251 245:253 246:251 247:164 260:37 261:253 262:253 263:255 264:253 265:35 271:79 272:232 273:255 274:253 275:227 276:42 288:140 289:251 290:251 291:253 292:168 293:15 300:77 301:253 302:251 303:251 304:142 315:21 316:221 317:251 318:251 319:164 320:15 329:227 330:251 331:251 332:236 333:61 342:32 343:190 344:251 345:251 346:251 357:73 358:251 359:251 360:251 361:71 370:73 371:251 372:251 373:251 374:251 385:73 386:251 387:251 388:251 389:71 398:73 399:253 400:253 401:253 402:201 413:73 414:253 415:253 416:253 417:72 426:176 427:251 428:251 429:251 430:71 441:73 442:251 443:251 444:251 445:71 454:253 455:251 456:251 457:157 458:10 
469:73 470:251 471:251 472:251 473:71 482:253 483:251 484:251 485:142 497:150 498:251 499:251 500:204 501:41 510:124 511:251 512:251 513:220 514:180 524:130 525:253 526:251 527:225 528:41 538:73 539:253 540:253 541:253 542:253 543:73 544:73 545:10 549:42 550:73 551:150 552:253 553:255 554:253 555:216 566:31 567:189 568:251 569:251 570:251 571:253 572:251 573:159 574:144 575:144 576:145 577:206 578:251 579:251 580:251 581:253 582:168 583:92 595:20 596:195 597:251 598:251 599:253 600:251 601:251 602:251 603:251 604:253 605:251 606:251 607:251 608:225 609:164 610:15 624:21 625:142 626:220 627:253 628:251 629:251 630:251 631:251 632:253 633:251 634:251 635:204 636:41 654:51 655:72 656:174 657:251 658:251 659:251 660:253 661:147 662:71 663:41 -0 127:60 128:96 129:96 130:48 153:16 154:171 155:228 156:253 157:251 158:220 159:51 160:32 181:127 182:251 183:251 184:253 185:251 186:251 187:251 188:251 189:80 207:24 208:182 209:236 210:251 211:211 212:189 213:236 214:251 215:251 216:251 217:242 218:193 234:100 235:194 236:251 237:251 238:211 239:35 241:71 242:173 243:251 244:251 245:253 246:240 247:158 248:19 261:64 262:253 263:255 264:253 265:205 266:19 271:40 272:218 273:255 274:253 275:253 276:91 288:16 289:186 290:251 291:253 292:247 293:110 300:39 301:233 302:251 303:251 304:188 315:16 316:189 317:251 318:251 319:205 320:110 329:48 330:220 331:251 332:220 333:48 343:72 344:251 345:251 346:251 347:158 358:51 359:251 360:251 361:232 371:190 372:251 373:251 374:251 375:59 386:32 387:251 388:251 389:251 398:96 399:253 400:253 401:253 402:95 414:32 415:253 416:253 417:193 426:214 427:251 428:251 429:204 430:23 442:52 443:251 444:251 445:94 454:253 455:251 456:251 457:109 469:48 470:221 471:251 472:219 473:47 482:253 483:251 484:251 485:70 497:234 498:251 499:251 500:188 510:253 511:251 512:251 513:188 523:40 524:158 525:253 526:251 527:172 528:70 539:191 540:253 541:253 542:253 543:96 544:24 549:12 550:174 551:253 552:253 553:255 554:221 567:71 568:251 569:251 570:251 571:253 572:205 573:190 574:190 575:190 576:191 577:197 578:251 579:251 580:231 581:221 582:93 595:16 596:126 597:236 598:251 599:253 600:251 601:251 602:251 603:251 604:253 605:251 606:251 607:140 608:47 625:67 626:188 627:189 628:188 629:188 630:188 631:188 632:189 633:188 634:109 635:4 -0 126:32 127:202 128:255 129:253 130:253 131:175 132:21 152:84 153:144 154:190 155:251 156:253 157:251 158:251 159:251 160:174 176:6 177:37 178:166 179:218 180:236 181:251 182:251 183:251 184:253 185:251 186:251 187:251 188:251 189:156 204:115 205:251 206:251 207:253 208:251 209:251 210:251 211:251 212:253 213:251 214:251 215:251 216:251 217:180 231:105 232:241 233:251 234:251 235:253 236:251 237:251 238:251 239:122 240:72 241:71 242:71 243:148 244:251 245:180 258:73 259:253 260:253 261:253 262:253 263:202 264:253 265:253 266:143 286:31 287:189 288:251 289:251 290:251 291:31 292:189 293:251 294:142 314:63 315:236 316:251 317:251 318:96 320:124 321:246 322:142 330:21 331:166 332:21 342:73 343:251 344:251 345:251 346:71 349:217 350:142 357:32 358:190 359:251 360:142 370:73 371:251 372:251 373:251 374:71 377:217 378:142 385:73 386:251 387:251 388:142 398:73 399:253 400:253 401:253 402:72 405:156 406:103 413:73 414:253 415:253 416:253 417:72 426:73 427:251 428:251 429:251 430:174 441:73 442:251 443:251 444:251 445:71 454:73 455:251 456:251 457:251 458:251 469:73 470:251 471:251 472:251 473:71 482:42 483:205 484:251 485:251 486:251 487:79 497:73 498:251 499:251 500:251 501:71 511:41 512:226 513:251 514:251 515:232 516:77 525:73 526:251 527:251 528:251 
529:71 540:166 541:253 542:253 543:255 544:253 545:227 546:73 547:21 553:125 554:253 555:253 556:143 568:16 569:169 570:251 571:253 572:251 573:251 574:251 575:174 576:105 579:63 580:144 581:253 582:251 583:251 584:142 597:15 598:35 599:253 600:251 601:251 602:251 603:251 604:243 605:217 606:217 607:231 608:251 609:253 610:251 611:220 612:20 627:143 628:142 629:236 630:251 631:251 632:253 633:251 634:251 635:251 636:251 637:253 638:251 639:137 657:61 658:71 659:200 660:253 661:251 662:251 663:251 664:251 665:201 666:71 667:10 -1 130:218 131:170 132:108 157:32 158:227 159:252 160:232 185:129 186:252 187:252 188:252 212:1 213:253 214:252 215:252 216:168 240:144 241:253 242:252 243:236 244:62 268:144 269:253 270:252 271:215 296:144 297:253 298:252 299:112 323:21 324:206 325:253 326:252 327:71 351:99 352:253 353:255 354:119 378:63 379:242 380:252 381:253 382:35 406:94 407:252 408:252 409:154 410:10 433:145 434:237 435:252 436:252 461:255 462:253 463:253 464:108 487:11 488:155 489:253 490:252 491:179 492:15 514:11 515:150 516:252 517:253 518:200 519:20 542:73 543:252 544:252 545:253 546:97 569:47 570:233 571:253 572:253 596:1 597:149 598:252 599:252 600:252 624:1 625:252 626:252 627:246 628:132 652:1 653:169 654:252 655:132 -1 130:116 131:255 132:123 157:29 158:213 159:253 160:122 185:189 186:253 187:253 188:122 213:189 214:253 215:253 216:122 241:189 242:253 243:253 244:122 267:2 268:114 269:243 270:253 271:186 272:19 295:100 296:253 297:253 298:253 299:48 323:172 324:253 325:253 326:253 327:48 351:172 352:253 353:253 354:182 355:19 378:133 379:251 380:253 381:175 382:4 405:107 406:251 407:253 408:253 409:65 432:26 433:194 434:253 435:253 436:214 437:40 459:105 460:205 461:253 462:253 463:125 464:40 487:139 488:253 489:253 490:253 491:81 514:41 515:231 516:253 517:253 518:159 519:16 541:65 542:155 543:253 544:253 545:172 546:4 569:124 570:253 571:253 572:253 573:98 597:124 598:253 599:253 600:214 601:41 624:22 625:207 626:253 627:253 628:139 653:124 654:253 655:162 656:9 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_fpgrowth.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_fpgrowth.txt deleted file mode 100644 index c451583..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_fpgrowth.txt +++ /dev/null @@ -1,6 +0,0 @@ -r z h k p -z y x w v u t s -s x o n r -x z y m t s q e -z -x z y r q t p diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_isotonic_regression_libsvm_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_isotonic_regression_libsvm_data.txt deleted file mode 100644 index f39fe02..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_isotonic_regression_libsvm_data.txt +++ /dev/null @@ -1,100 +0,0 @@ -0.24579296 1:0.01 -0.28505864 1:0.02 -0.31208567 1:0.03 -0.35900051 1:0.04 -0.35747068 1:0.05 -0.16675166 1:0.06 -0.17491076 1:0.07 -0.04181540 1:0.08 -0.04793473 1:0.09 -0.03926568 1:0.10 -0.12952575 1:0.11 -0.00000000 1:0.12 -0.01376849 1:0.13 -0.13105558 1:0.14 -0.08873024 1:0.15 -0.12595614 1:0.16 -0.15247323 1:0.17 -0.25956145 1:0.18 -0.20040796 1:0.19 -0.19581846 1:0.20 -0.15757267 1:0.21 -0.13717491 1:0.22 -0.19020908 1:0.23 -0.19581846 1:0.24 -0.20091790 1:0.25 -0.16879143 1:0.26 -0.18510964 1:0.27 -0.20040796 1:0.28 -0.29576747 1:0.29 -0.43396226 1:0.30 -0.53391127 1:0.31 -0.52116267 1:0.32 -0.48546660 1:0.33 -0.49209587 1:0.34 -0.54156043 1:0.35 -0.59765426 1:0.36 -0.56144824 1:0.37 -0.58592555 1:0.38 -0.52983172 1:0.39 -0.50178480 1:0.40 -0.52626211 1:0.41 -0.58286588 1:0.42 -0.64660887 1:0.43 
-0.68077511 1:0.44 -0.74298827 1:0.45 -0.64864865 1:0.46 -0.67261601 1:0.47 -0.65782764 1:0.48 -0.69811321 1:0.49 -0.63029067 1:0.50 -0.61601224 1:0.51 -0.63233044 1:0.52 -0.65323814 1:0.53 -0.65323814 1:0.54 -0.67363590 1:0.55 -0.67006629 1:0.56 -0.51555329 1:0.57 -0.50892402 1:0.58 -0.33299337 1:0.59 -0.36206017 1:0.60 -0.43090260 1:0.61 -0.45996940 1:0.62 -0.56348802 1:0.63 -0.54920959 1:0.64 -0.48393677 1:0.65 -0.48495665 1:0.66 -0.46965834 1:0.67 -0.45181030 1:0.68 -0.45843957 1:0.69 -0.47118817 1:0.70 -0.51555329 1:0.71 -0.58031617 1:0.72 -0.55481897 1:0.73 -0.56297807 1:0.74 -0.56603774 1:0.75 -0.57929628 1:0.76 -0.64762876 1:0.77 -0.66241713 1:0.78 -0.69301377 1:0.79 -0.65119837 1:0.80 -0.68332483 1:0.81 -0.66598674 1:0.82 -0.73890872 1:0.83 -0.73992861 1:0.84 -0.84242733 1:0.85 -0.91330954 1:0.86 -0.88016318 1:0.87 -0.90719021 1:0.88 -0.93115757 1:0.89 -0.93115757 1:0.90 -0.91942886 1:0.91 -0.92911780 1:0.92 -0.95665477 1:0.93 -0.95002550 1:0.94 -0.96940337 1:0.95 -1.00000000 1:0.96 -0.89801122 1:0.97 -0.90311066 1:0.98 -0.90362060 1:0.99 -0.83477817 1:1.0 \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_kmeans_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_kmeans_data.txt deleted file mode 100644 index 5001377..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_kmeans_data.txt +++ /dev/null @@ -1,6 +0,0 @@ -0 1:0.0 2:0.0 3:0.0 -1 1:0.1 2:0.1 3:0.1 -2 1:0.2 2:0.2 3:0.2 -3 1:9.0 2:9.0 3:9.0 -4 1:9.1 2:9.1 3:9.1 -5 1:9.2 2:9.2 3:9.2 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_lda_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_lda_data.txt deleted file mode 100644 index 2e76702..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_lda_data.txt +++ /dev/null @@ -1,12 +0,0 @@ -1 2 6 0 2 3 1 1 0 0 3 -1 3 0 1 3 0 0 2 0 0 1 -1 4 1 0 0 4 9 0 1 2 0 -2 1 0 3 0 0 5 0 2 3 9 -3 1 1 9 3 0 2 0 0 1 3 -4 2 0 3 4 5 1 1 1 4 0 -2 1 0 3 0 0 5 0 2 2 9 -1 1 1 9 2 1 2 0 0 1 3 -4 4 0 3 4 2 1 3 0 0 0 -2 8 2 0 3 0 2 0 2 7 2 -1 1 1 9 0 2 2 0 0 3 3 -4 1 0 0 4 5 1 3 0 1 0 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_lda_libsvm_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_lda_libsvm_data.txt deleted file mode 100644 index bf118d7..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_lda_libsvm_data.txt +++ /dev/null @@ -1,12 +0,0 @@ -0 1:1 2:2 3:6 4:0 5:2 6:3 7:1 8:1 9:0 10:0 11:3 -1 1:1 2:3 3:0 4:1 5:3 6:0 7:0 8:2 9:0 10:0 11:1 -2 1:1 2:4 3:1 4:0 5:0 6:4 7:9 8:0 9:1 10:2 11:0 -3 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:3 11:9 -4 1:3 2:1 3:1 4:9 5:3 6:0 7:2 8:0 9:0 10:1 11:3 -5 1:4 2:2 3:0 4:3 5:4 6:5 7:1 8:1 9:1 10:4 11:0 -6 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:2 11:9 -7 1:1 2:1 3:1 4:9 5:2 6:1 7:2 8:0 9:0 10:1 11:3 -8 1:4 2:4 3:0 4:3 5:4 6:2 7:1 8:3 9:0 10:0 11:0 -9 1:2 2:8 3:2 4:0 5:3 6:0 7:2 8:0 9:2 10:7 11:2 -10 1:1 2:1 3:1 4:9 5:0 6:2 7:2 8:0 9:0 10:3 11:3 -11 1:4 2:1 3:0 4:0 5:4 6:5 7:1 8:3 9:0 10:1 11:0 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt deleted file mode 100644 index 861c70c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt +++ /dev/null @@ -1,100 +0,0 @@ -0 128:51 129:159 130:253 131:159 132:50 155:48 156:238 157:252 158:252 159:252 160:237 182:54 183:227 184:253 185:252 186:239 187:233 188:252 189:57 190:6 208:10 209:60 210:224 211:252 212:253 213:252 214:202 215:84 216:252 217:253 
218:122 236:163 237:252 238:252 239:252 240:253 241:252 242:252 243:96 244:189 245:253 246:167 263:51 264:238 265:253 266:253 267:190 268:114 269:253 270:228 271:47 272:79 273:255 274:168 290:48 291:238 292:252 293:252 294:179 295:12 296:75 297:121 298:21 301:253 302:243 303:50 317:38 318:165 319:253 320:233 321:208 322:84 329:253 330:252 331:165 344:7 345:178 346:252 347:240 348:71 349:19 350:28 357:253 358:252 359:195 372:57 373:252 374:252 375:63 385:253 386:252 387:195 400:198 401:253 402:190 413:255 414:253 415:196 427:76 428:246 429:252 430:112 441:253 442:252 443:148 455:85 456:252 457:230 458:25 467:7 468:135 469:253 470:186 471:12 483:85 484:252 485:223 494:7 495:131 496:252 497:225 498:71 511:85 512:252 513:145 521:48 522:165 523:252 524:173 539:86 540:253 541:225 548:114 549:238 550:253 551:162 567:85 568:252 569:249 570:146 571:48 572:29 573:85 574:178 575:225 576:253 577:223 578:167 579:56 595:85 596:252 597:252 598:252 599:229 600:215 601:252 602:252 603:252 604:196 605:130 623:28 624:199 625:252 626:252 627:253 628:252 629:252 630:233 631:145 652:25 653:128 654:252 655:253 656:252 657:141 658:37 -1 159:124 160:253 161:255 162:63 186:96 187:244 188:251 189:253 190:62 214:127 215:251 216:251 217:253 218:62 241:68 242:236 243:251 244:211 245:31 246:8 268:60 269:228 270:251 271:251 272:94 296:155 297:253 298:253 299:189 323:20 324:253 325:251 326:235 327:66 350:32 351:205 352:253 353:251 354:126 378:104 379:251 380:253 381:184 382:15 405:80 406:240 407:251 408:193 409:23 432:32 433:253 434:253 435:253 436:159 460:151 461:251 462:251 463:251 464:39 487:48 488:221 489:251 490:251 491:172 515:234 516:251 517:251 518:196 519:12 543:253 544:251 545:251 546:89 570:159 571:255 572:253 573:253 574:31 597:48 598:228 599:253 600:247 601:140 602:8 625:64 626:251 627:253 628:220 653:64 654:251 655:253 656:220 681:24 682:193 683:253 684:220 -1 125:145 126:255 127:211 128:31 152:32 153:237 154:253 155:252 156:71 180:11 181:175 182:253 183:252 184:71 209:144 210:253 211:252 212:71 236:16 237:191 238:253 239:252 240:71 264:26 265:221 266:253 267:252 268:124 269:31 293:125 294:253 295:252 296:252 297:108 322:253 323:252 324:252 325:108 350:255 351:253 352:253 353:108 378:253 379:252 380:252 381:108 406:253 407:252 408:252 409:108 434:253 435:252 436:252 437:108 462:255 463:253 464:253 465:170 490:253 491:252 492:252 493:252 494:42 518:149 519:252 520:252 521:252 522:144 546:109 547:252 548:252 549:252 550:144 575:218 576:253 577:253 578:255 579:35 603:175 604:252 605:252 606:253 607:35 631:73 632:252 633:252 634:253 635:35 659:31 660:211 661:252 662:253 663:35 -1 153:5 154:63 155:197 181:20 182:254 183:230 184:24 209:20 210:254 211:254 212:48 237:20 238:254 239:255 240:48 265:20 266:254 267:254 268:57 293:20 294:254 295:254 296:108 321:16 322:239 323:254 324:143 350:178 351:254 352:143 378:178 379:254 380:143 406:178 407:254 408:162 434:178 435:254 436:240 462:113 463:254 464:240 490:83 491:254 492:245 493:31 518:79 519:254 520:246 521:38 547:214 548:254 549:150 575:144 576:241 577:8 603:144 604:240 605:2 631:144 632:254 633:82 659:230 660:247 661:40 687:168 688:209 689:31 -1 152:1 153:168 154:242 155:28 180:10 181:228 182:254 183:100 209:190 210:254 211:122 237:83 238:254 239:162 265:29 266:254 267:248 268:25 293:29 294:255 295:254 296:103 321:29 322:254 323:254 324:109 349:29 350:254 351:254 352:109 377:29 378:254 379:254 380:109 405:29 406:255 407:254 408:109 433:29 434:254 435:254 436:109 461:29 462:254 463:254 464:63 489:29 490:254 491:254 492:28 517:29 518:254 519:254 520:28 545:29 546:254 
547:254 548:35 573:29 574:254 575:254 576:109 601:6 602:212 603:254 604:109 630:203 631:254 632:178 658:155 659:254 660:190 686:32 687:199 688:104 -0 130:64 131:253 132:255 133:63 157:96 158:205 159:251 160:253 161:205 162:111 163:4 184:96 185:189 186:251 187:251 188:253 189:251 190:251 191:31 209:16 210:64 211:223 212:244 213:251 214:251 215:211 216:213 217:251 218:251 219:31 236:80 237:181 238:251 239:253 240:251 241:251 242:251 243:94 244:96 245:251 246:251 247:31 263:92 264:253 265:253 266:253 267:255 268:253 269:253 270:253 271:95 272:96 273:253 274:253 275:31 290:92 291:236 292:251 293:243 294:220 295:233 296:251 297:251 298:243 299:82 300:96 301:251 302:251 303:31 317:80 318:253 319:251 320:251 321:188 323:96 324:251 325:251 326:109 328:96 329:251 330:251 331:31 344:96 345:240 346:253 347:243 348:188 349:42 351:96 352:204 353:109 354:4 356:12 357:197 358:251 359:31 372:221 373:251 374:253 375:121 379:36 380:23 385:190 386:251 387:31 399:48 400:234 401:253 413:191 414:253 415:31 426:44 427:221 428:251 429:251 440:12 441:197 442:251 443:31 454:190 455:251 456:251 457:251 468:96 469:251 470:251 471:31 482:190 483:251 484:251 485:113 495:40 496:234 497:251 498:219 499:23 510:190 511:251 512:251 513:94 522:40 523:217 524:253 525:231 526:47 538:191 539:253 540:253 541:253 548:12 549:174 550:253 551:253 552:219 553:39 566:67 567:236 568:251 569:251 570:191 571:190 572:111 573:72 574:190 575:191 576:197 577:251 578:243 579:121 580:39 595:63 596:236 597:251 598:253 599:251 600:251 601:251 602:251 603:253 604:251 605:188 606:94 624:27 625:129 626:253 627:251 628:251 629:251 630:251 631:229 632:168 633:15 654:95 655:212 656:251 657:211 658:94 659:59 -1 159:121 160:254 161:136 186:13 187:230 188:253 189:248 190:99 213:4 214:118 215:253 216:253 217:225 218:42 241:61 242:253 243:253 244:253 245:74 268:32 269:206 270:253 271:253 272:186 273:9 296:211 297:253 298:253 299:239 300:69 324:254 325:253 326:253 327:133 351:142 352:255 353:253 354:186 355:8 378:149 379:229 380:254 381:207 382:21 405:54 406:229 407:253 408:254 409:105 433:152 434:254 435:254 436:213 437:26 460:112 461:251 462:253 463:253 464:26 487:29 488:212 489:253 490:250 491:149 514:36 515:214 516:253 517:253 518:137 542:75 543:253 544:253 545:253 546:59 570:93 571:253 572:253 573:189 574:17 598:224 599:253 600:253 601:84 625:43 626:235 627:253 628:126 629:1 653:99 654:248 655:253 656:119 682:225 683:235 684:49 -1 100:166 101:222 102:55 128:197 129:254 130:218 131:5 155:29 156:249 157:254 158:254 159:9 183:45 184:254 185:254 186:174 187:2 210:4 211:164 212:254 213:254 214:85 238:146 239:254 240:254 241:254 242:85 265:101 266:245 267:254 268:254 269:254 270:85 292:97 293:248 294:254 295:204 296:254 297:254 298:85 315:12 316:59 317:98 318:151 319:237 320:254 321:254 322:109 323:35 324:254 325:254 326:85 343:41 344:216 345:254 346:254 347:239 348:153 349:37 350:4 351:32 352:254 353:254 354:85 372:7 373:44 374:44 375:30 379:32 380:254 381:254 382:96 407:19 408:230 409:254 410:174 436:197 437:254 438:110 464:197 465:254 466:85 492:197 493:253 494:63 515:37 516:54 517:54 518:45 519:26 520:84 521:221 522:84 523:21 524:31 525:162 526:78 540:6 541:41 542:141 543:244 544:254 545:254 546:248 547:236 548:254 549:254 550:254 551:233 552:239 553:254 554:138 567:23 568:167 569:254 570:254 571:254 572:254 573:229 574:228 575:185 576:138 577:138 578:138 579:138 580:138 581:138 582:44 595:113 596:254 597:254 598:254 599:179 600:64 601:5 623:32 624:209 625:183 626:97 -0 155:53 156:255 157:253 158:253 159:253 160:124 183:180 184:253 185:251 186:251 
187:251 188:251 189:145 190:62 209:32 210:217 211:241 212:253 213:251 214:251 215:251 216:251 217:253 218:107 237:37 238:251 239:251 240:253 241:251 242:251 243:251 244:251 245:253 246:107 265:166 266:251 267:251 268:253 269:251 270:96 271:148 272:251 273:253 274:107 291:73 292:253 293:253 294:253 295:253 296:130 299:110 300:253 301:255 302:108 319:73 320:251 321:251 322:251 323:251 327:109 328:251 329:253 330:107 347:202 348:251 349:251 350:251 351:225 354:6 355:129 356:251 357:253 358:107 375:150 376:251 377:251 378:251 379:71 382:115 383:251 384:251 385:253 386:107 403:253 404:251 405:251 406:173 407:20 410:217 411:251 412:251 413:253 414:107 430:182 431:255 432:253 433:216 438:218 439:253 440:253 441:182 457:63 458:221 459:253 460:251 461:215 465:84 466:236 467:251 468:251 469:77 485:109 486:251 487:253 488:251 489:215 492:11 493:160 494:251 495:251 496:96 513:109 514:251 515:253 516:251 517:137 520:150 521:251 522:251 523:251 524:71 541:109 542:251 543:253 544:251 545:35 547:130 548:253 549:251 550:251 551:173 552:20 569:110 570:253 571:255 572:253 573:98 574:150 575:253 576:255 577:253 578:164 597:109 598:251 599:253 600:251 601:251 602:251 603:251 604:253 605:251 606:35 625:93 626:241 627:253 628:251 629:251 630:251 631:251 632:216 633:112 634:5 654:103 655:253 656:251 657:251 658:251 659:251 683:124 684:251 685:225 686:71 687:71 -0 128:73 129:253 130:227 131:73 132:21 156:73 157:251 158:251 159:251 160:174 182:16 183:166 184:228 185:251 186:251 187:251 188:122 210:62 211:220 212:253 213:251 214:251 215:251 216:251 217:79 238:79 239:231 240:253 241:251 242:251 243:251 244:251 245:232 246:77 264:145 265:253 266:253 267:253 268:255 269:253 270:253 271:253 272:253 273:255 274:108 292:144 293:251 294:251 295:251 296:253 297:168 298:107 299:169 300:251 301:253 302:189 303:20 318:27 319:89 320:236 321:251 322:235 323:215 324:164 325:15 326:6 327:129 328:251 329:253 330:251 331:35 345:47 346:211 347:253 348:251 349:251 350:142 354:37 355:251 356:251 357:253 358:251 359:35 373:109 374:251 375:253 376:251 377:251 378:142 382:11 383:148 384:251 385:253 386:251 387:164 400:11 401:150 402:253 403:255 404:211 405:25 410:11 411:150 412:253 413:255 414:211 415:25 428:140 429:251 430:251 431:253 432:107 438:37 439:251 440:251 441:211 442:46 456:190 457:251 458:251 459:253 460:128 461:5 466:37 467:251 468:251 469:51 484:115 485:251 486:251 487:253 488:188 489:20 492:32 493:109 494:129 495:251 496:173 497:103 512:217 513:251 514:251 515:201 516:30 520:73 521:251 522:251 523:251 524:71 540:166 541:253 542:253 543:255 544:149 545:73 546:150 547:253 548:255 549:253 550:253 551:143 568:140 569:251 570:251 571:253 572:251 573:251 574:251 575:251 576:253 577:251 578:230 579:61 596:190 597:251 598:251 599:253 600:251 601:251 602:251 603:251 604:242 605:215 606:55 624:21 625:189 626:251 627:253 628:251 629:251 630:251 631:173 632:103 653:31 654:200 655:253 656:251 657:96 658:71 659:20 -1 155:178 156:255 157:105 182:6 183:188 184:253 185:216 186:14 210:14 211:202 212:253 213:253 214:23 238:12 239:199 240:253 241:128 242:6 266:42 267:253 268:253 269:158 294:42 295:253 296:253 297:158 322:155 323:253 324:253 325:158 350:160 351:253 352:253 353:147 378:160 379:253 380:253 381:41 405:17 406:225 407:253 408:235 409:31 433:24 434:253 435:253 436:176 461:24 462:253 463:253 464:176 489:24 490:253 491:253 492:176 517:24 518:253 519:253 520:176 545:24 546:253 547:253 548:162 573:46 574:253 575:253 576:59 601:142 602:253 603:253 604:59 629:142 630:253 631:253 632:59 657:142 658:253 659:202 660:8 685:87 686:253 687:139 
-0 154:46 155:105 156:254 157:254 158:254 159:254 160:255 161:239 162:41 180:37 181:118 182:222 183:254 184:253 185:253 186:253 187:253 188:253 189:253 190:211 191:54 207:14 208:200 209:253 210:253 211:254 212:253 213:253 214:253 215:253 216:253 217:253 218:253 219:116 233:16 234:160 235:236 236:253 237:253 238:253 239:254 240:253 241:253 242:246 243:229 244:253 245:253 246:253 247:116 261:99 262:253 263:253 264:253 265:253 266:253 267:254 268:253 269:253 270:213 271:99 272:253 273:253 274:253 275:116 288:25 289:194 290:253 291:253 292:253 293:253 294:131 295:97 296:169 297:253 298:93 299:99 300:253 301:253 302:253 303:116 316:206 317:253 318:253 319:251 320:233 321:127 322:9 324:18 325:38 326:3 327:15 328:171 329:253 330:253 331:116 343:55 344:240 345:253 346:253 347:233 355:31 356:186 357:253 358:253 359:116 371:176 372:253 373:253 374:253 375:127 383:99 384:253 385:253 386:253 387:116 399:176 400:253 401:253 402:131 403:9 411:99 412:253 413:253 414:253 415:116 426:119 427:254 428:254 429:232 430:75 440:158 441:254 442:254 443:117 454:118 455:253 456:253 457:154 468:156 469:253 470:253 471:116 482:118 483:253 484:253 485:154 496:156 497:253 498:253 499:116 509:46 510:222 511:253 512:253 513:154 522:7 523:116 524:246 525:253 526:180 527:9 538:118 539:253 540:253 541:154 550:116 551:253 552:253 553:253 554:174 566:118 567:253 568:253 569:154 577:110 578:246 579:253 580:253 581:240 582:67 594:118 595:253 596:253 597:238 598:215 599:49 600:20 601:20 602:20 603:66 604:215 605:241 606:253 607:245 608:233 609:64 622:82 623:229 624:253 625:253 626:253 627:253 628:253 629:253 630:253 631:254 632:253 633:253 634:240 635:107 651:176 652:253 653:253 654:253 655:253 656:253 657:253 658:253 659:254 660:253 661:253 662:108 679:40 680:239 681:253 682:253 683:253 684:253 685:253 686:253 687:254 688:161 689:57 690:4 -0 152:56 153:105 154:220 155:254 156:63 178:18 179:166 180:233 181:253 182:253 183:253 184:236 185:209 186:209 187:209 188:77 189:18 206:84 207:253 208:253 209:253 210:253 211:253 212:254 213:253 214:253 215:253 216:253 217:172 218:8 233:57 234:238 235:253 236:253 237:253 238:253 239:253 240:254 241:253 242:253 243:253 244:253 245:253 246:119 260:14 261:238 262:253 263:253 264:253 265:253 266:253 267:253 268:179 269:196 270:253 271:253 272:253 273:253 274:238 275:12 288:33 289:253 290:253 291:253 292:253 293:253 294:248 295:134 297:18 298:83 299:237 300:253 301:253 302:253 303:14 316:164 317:253 318:253 319:253 320:253 321:253 322:128 327:57 328:119 329:214 330:253 331:94 343:57 344:248 345:253 346:253 347:253 348:126 349:14 350:4 357:179 358:253 359:248 360:56 371:175 372:253 373:253 374:240 375:190 376:28 385:179 386:253 387:253 388:173 399:209 400:253 401:253 402:178 413:92 414:253 415:253 416:208 427:211 428:254 429:254 430:179 442:135 443:255 444:209 455:209 456:253 457:253 458:90 470:134 471:253 472:208 483:209 484:253 485:253 486:178 497:2 498:142 499:253 500:208 511:209 512:253 513:253 514:214 515:35 525:30 526:253 527:253 528:208 539:165 540:253 541:253 542:253 543:215 544:36 553:163 554:253 555:253 556:164 567:18 568:172 569:253 570:253 571:253 572:214 573:127 574:7 580:72 581:232 582:253 583:171 584:17 596:8 597:182 598:253 599:253 600:253 601:253 602:162 603:56 607:64 608:240 609:253 610:253 611:14 625:7 626:173 627:253 628:253 629:253 630:253 631:245 632:241 633:239 634:239 635:246 636:253 637:225 638:14 639:1 654:18 655:59 656:138 657:224 658:253 659:253 660:254 661:253 662:253 663:253 664:240 665:96 685:37 686:104 687:192 688:255 689:253 690:253 691:182 692:73 -1 130:7 131:176 
132:254 133:224 158:51 159:253 160:253 161:223 185:4 186:170 187:253 188:253 189:214 213:131 214:253 215:253 216:217 217:39 241:209 242:253 243:253 244:134 268:75 269:240 270:253 271:239 272:26 296:184 297:253 298:245 299:63 323:142 324:255 325:253 326:185 350:62 351:229 352:254 353:242 354:73 377:54 378:229 379:253 380:254 381:105 405:152 406:254 407:254 408:213 409:26 432:32 433:243 434:253 435:253 436:115 459:2 460:142 461:253 462:253 463:155 487:30 488:253 489:253 490:232 491:55 515:75 516:253 517:253 518:164 542:72 543:232 544:253 545:189 546:17 570:224 571:253 572:253 573:163 597:43 598:235 599:253 600:253 601:195 602:21 625:28 626:231 627:253 628:253 629:184 630:14 654:225 655:253 656:253 657:75 -0 155:21 156:176 157:253 158:253 159:124 182:105 183:176 184:251 185:251 186:251 187:251 188:105 208:58 209:217 210:241 211:253 212:251 213:251 214:251 215:251 216:243 217:113 218:5 235:63 236:231 237:251 238:251 239:253 240:251 241:251 242:251 243:251 244:253 245:251 246:113 263:144 264:251 265:251 266:251 267:253 268:251 269:251 270:251 271:251 272:253 273:251 274:215 290:125 291:253 292:253 293:253 294:253 295:255 296:253 297:253 298:253 299:253 300:255 301:253 302:227 303:42 318:253 319:251 320:251 321:251 322:251 323:253 324:251 325:251 326:251 327:251 328:253 329:251 330:251 331:142 345:27 346:253 347:251 348:251 349:235 350:241 351:253 352:251 353:246 354:137 355:35 356:98 357:251 358:251 359:236 360:61 372:47 373:211 374:253 375:251 376:235 377:82 378:103 379:253 380:251 381:137 384:73 385:251 386:251 387:251 388:71 399:27 400:211 401:251 402:253 403:251 404:86 407:72 408:71 409:10 412:73 413:251 414:251 415:173 416:20 427:89 428:253 429:253 430:255 431:253 432:35 440:73 441:253 442:253 443:253 444:72 454:84 455:236 456:251 457:251 458:253 459:251 460:138 468:73 469:251 470:251 471:251 472:71 481:63 482:236 483:251 484:251 485:251 486:227 487:251 488:246 489:138 490:11 494:16 495:37 496:228 497:251 498:246 499:137 500:10 509:73 510:251 511:251 512:251 513:173 514:42 515:142 516:142 517:142 518:41 522:109 523:251 524:253 525:251 526:137 537:73 538:251 539:251 540:173 541:20 549:27 550:211 551:251 552:253 553:147 554:10 565:73 566:253 567:253 568:143 575:21 576:176 577:253 578:253 579:253 593:73 594:251 595:251 596:205 597:144 603:176 604:251 605:251 606:188 607:107 621:62 622:236 623:251 624:251 625:251 626:218 627:217 628:217 629:217 630:217 631:253 632:230 633:189 634:20 650:83 651:158 652:251 653:251 654:253 655:251 656:251 657:251 658:251 659:253 660:107 679:37 680:251 681:251 682:253 683:251 684:251 685:251 686:122 687:72 688:30 -1 151:68 152:45 153:131 154:131 155:131 156:101 157:68 158:92 159:44 187:19 188:170 211:29 212:112 213:89 215:40 216:222 239:120 240:254 241:251 242:127 243:40 244:222 267:197 268:254 269:254 270:91 271:40 272:222 294:64 295:247 296:254 297:236 298:50 299:40 300:107 322:184 323:254 324:254 325:91 327:6 328:14 350:203 351:254 352:254 353:71 377:23 378:218 379:254 380:254 381:71 405:113 406:254 407:255 408:239 409:53 433:210 434:254 435:254 436:195 460:62 461:242 462:254 463:241 464:88 468:28 488:86 489:254 490:254 491:189 495:28 496:104 516:106 517:254 518:254 519:168 523:40 524:91 544:216 545:254 546:245 547:51 551:35 552:80 572:216 573:254 574:102 599:55 600:239 601:254 602:52 627:166 628:254 629:210 630:23 655:223 656:252 657:104 683:223 684:169 -0 125:29 126:170 127:255 128:255 129:141 151:29 152:198 153:255 154:255 155:255 156:226 157:255 158:86 178:141 179:255 180:255 181:170 182:29 184:86 185:255 186:255 187:141 204:29 205:226 206:255 207:198 
208:57 213:226 214:255 215:255 216:226 217:114 231:29 232:255 233:255 234:114 241:141 242:170 243:114 244:255 245:255 246:141 259:226 260:255 261:170 269:29 270:57 273:141 274:255 275:226 286:57 287:255 288:170 302:114 303:255 304:198 314:226 315:255 331:170 332:255 333:57 342:255 343:226 360:255 361:170 370:255 371:170 388:114 389:198 398:255 399:226 416:86 417:255 426:198 427:255 444:86 445:255 454:114 455:255 456:57 472:86 473:255 482:29 483:255 484:226 500:141 501:255 511:170 512:255 513:170 528:226 529:198 539:29 540:226 541:255 542:170 555:29 556:255 557:114 568:29 569:226 570:255 571:141 582:57 583:226 584:226 598:141 599:255 600:255 601:170 602:86 607:29 608:86 609:226 610:255 611:226 612:29 627:86 628:198 629:255 630:255 631:255 632:255 633:255 634:255 635:255 636:255 637:255 638:141 639:29 657:29 658:114 659:170 660:170 661:170 662:170 663:170 664:86 -0 153:203 154:254 155:252 156:252 157:252 158:214 159:51 160:20 180:62 181:221 182:252 183:250 184:250 185:250 186:252 187:250 188:160 189:20 207:62 208:211 209:250 210:252 211:250 212:250 213:250 214:252 215:250 216:250 217:49 234:41 235:221 236:250 237:250 238:252 239:250 240:250 241:250 242:252 243:250 244:128 245:10 262:254 263:252 264:252 265:252 266:254 267:252 268:252 269:252 270:254 271:252 272:252 273:90 290:150 291:190 292:250 293:250 294:252 295:250 296:250 297:169 298:171 299:250 300:250 301:250 302:82 318:31 319:191 320:250 321:250 322:252 323:189 324:100 325:20 326:172 327:250 328:250 329:250 330:80 346:213 347:250 348:250 349:250 350:212 351:29 354:252 355:250 356:250 357:250 374:92 375:252 376:252 377:252 382:51 383:252 384:252 385:252 386:203 401:82 402:252 403:250 404:250 405:169 410:132 411:250 412:250 413:250 414:121 428:92 429:231 430:252 431:250 432:159 433:20 438:252 439:250 440:250 441:250 456:30 457:211 458:252 459:250 460:221 461:40 466:90 467:250 468:250 469:250 470:163 484:31 485:213 486:254 487:232 488:80 494:92 495:252 496:252 497:212 498:163 512:151 513:250 514:252 515:149 522:252 523:250 524:250 525:49 540:60 541:221 542:252 543:210 544:60 550:252 551:250 552:250 553:49 569:202 570:252 571:250 572:221 573:40 576:123 577:202 578:252 579:250 580:250 581:49 596:123 597:243 598:255 599:252 600:252 601:252 602:254 603:252 604:252 605:252 606:254 607:252 608:100 625:121 626:171 627:250 628:250 629:250 630:252 631:250 632:250 633:250 634:252 635:250 636:100 654:20 655:160 656:250 657:250 658:252 659:250 660:250 661:250 662:252 663:189 664:40 683:20 684:170 685:250 686:252 687:250 688:128 689:49 690:49 691:29 -1 98:64 99:191 100:70 125:68 126:243 127:253 128:249 129:63 152:30 153:223 154:253 155:253 156:247 157:41 179:73 180:238 181:253 182:253 183:253 184:242 206:73 207:236 208:253 209:253 210:253 211:253 212:242 234:182 235:253 236:253 237:191 238:247 239:253 240:149 262:141 263:253 264:143 265:86 266:249 267:253 268:122 290:9 291:36 292:7 293:14 294:233 295:253 296:122 322:230 323:253 324:122 350:230 351:253 352:122 378:231 379:255 380:123 406:230 407:253 408:52 433:61 434:245 435:253 461:98 462:253 463:253 468:35 469:12 489:98 490:253 491:253 494:9 495:142 496:233 497:146 517:190 518:253 519:253 520:128 521:7 522:99 523:253 524:253 525:180 544:29 545:230 546:253 547:253 548:252 549:210 550:253 551:253 552:253 553:140 571:28 572:207 573:253 574:253 575:253 576:254 577:253 578:253 579:235 580:70 581:9 599:126 600:253 601:253 602:253 603:253 604:254 605:253 606:168 607:19 627:79 628:253 629:253 630:201 631:190 632:132 633:63 634:5 -1 125:26 126:240 127:72 153:25 154:238 155:208 182:209 183:226 184:14 210:209 
211:254 212:43 238:175 239:254 240:128 266:63 267:254 268:204 294:107 295:254 296:204 322:88 323:254 324:204 350:55 351:254 352:204 378:126 379:254 380:204 406:126 407:254 408:189 434:169 435:254 436:121 462:209 463:254 464:193 490:209 491:254 492:111 517:22 518:235 519:254 520:37 545:137 546:254 547:227 548:16 573:205 574:255 575:185 601:205 602:254 603:125 629:205 630:254 631:125 657:111 658:212 659:43 -0 155:62 156:91 157:213 158:255 159:228 160:91 161:12 182:70 183:230 184:253 185:253 186:253 187:253 188:253 189:152 190:7 210:246 211:253 212:253 213:253 214:253 215:253 216:253 217:253 218:106 237:21 238:247 239:253 240:253 241:253 242:253 243:253 244:253 245:208 246:24 265:156 266:253 267:253 268:253 269:253 270:253 271:253 272:253 273:195 292:88 293:238 294:253 295:253 296:253 297:221 298:253 299:253 300:253 301:195 320:230 321:253 322:253 323:253 324:198 325:40 326:177 327:253 328:253 329:195 346:56 347:156 348:251 349:253 350:189 351:182 352:15 354:86 355:240 356:253 357:210 358:28 374:213 375:253 376:253 377:156 378:3 383:205 384:253 385:253 386:106 401:121 402:252 403:253 404:135 405:3 411:46 412:253 413:253 414:106 428:28 429:212 430:253 431:248 432:23 439:42 440:253 441:253 442:106 456:197 457:253 458:234 459:70 467:42 468:253 469:253 470:106 483:11 484:202 485:253 486:187 495:58 496:253 497:210 498:27 511:107 512:253 513:253 514:40 522:53 523:227 524:253 525:195 539:107 540:253 541:253 542:40 549:47 550:227 551:253 552:231 553:58 567:107 568:253 569:253 570:40 575:5 576:131 577:222 578:253 579:231 580:59 595:14 596:204 597:253 598:226 599:222 600:73 601:58 602:58 603:170 604:253 605:253 606:227 607:58 624:197 625:253 626:253 627:253 628:253 629:253 630:253 631:253 632:253 633:238 634:58 652:33 653:179 654:241 655:253 656:253 657:253 658:253 659:250 660:116 661:14 682:75 683:179 684:253 685:151 686:89 687:86 -1 157:42 158:228 159:253 160:253 185:144 186:251 187:251 188:251 212:89 213:236 214:251 215:235 216:215 239:79 240:253 241:251 242:251 243:142 267:180 268:253 269:251 270:251 271:142 294:32 295:202 296:255 297:253 298:216 322:109 323:251 324:253 325:251 326:112 349:6 350:129 351:251 352:253 353:127 354:5 377:37 378:251 379:251 380:253 381:107 405:166 406:251 407:251 408:201 409:30 432:42 433:228 434:253 435:253 460:144 461:251 462:251 463:147 487:63 488:236 489:251 490:251 491:71 515:150 516:251 517:251 518:204 519:41 543:253 544:251 545:251 546:142 571:255 572:253 573:164 598:105 599:253 600:251 601:35 626:180 627:253 628:251 629:35 654:180 655:253 656:251 657:35 682:180 683:253 684:251 685:35 -1 128:62 129:254 130:213 156:102 157:253 158:252 159:102 160:20 184:102 185:254 186:253 187:254 188:50 212:102 213:253 214:252 215:253 216:50 240:102 241:254 242:253 243:254 244:50 268:142 269:253 270:252 271:253 272:50 295:51 296:253 297:254 298:253 299:224 300:20 323:132 324:252 325:253 326:252 327:162 351:173 352:253 353:254 354:253 355:102 378:82 379:253 380:252 381:253 382:252 383:61 406:203 407:254 408:253 409:254 410:233 433:41 434:243 435:253 436:252 437:253 438:111 461:132 462:253 463:254 464:253 465:203 488:41 489:253 490:252 491:253 492:252 493:40 515:11 516:213 517:254 518:253 519:254 520:151 543:92 544:252 545:253 546:252 547:192 548:50 570:21 571:214 572:253 573:255 574:253 575:41 598:142 599:253 600:252 601:253 602:171 625:113 626:253 627:255 628:253 629:203 630:40 653:30 654:131 655:233 656:111 -0 154:28 155:195 156:254 157:254 158:254 159:254 160:254 161:255 162:61 181:6 182:191 183:253 184:253 185:253 186:253 187:253 188:253 189:253 190:60 208:26 209:190 210:253 
211:253 212:253 213:253 214:240 215:191 216:242 217:253 218:60 235:15 236:187 237:253 238:253 239:253 240:253 241:253 242:200 244:211 245:253 246:60 262:22 263:66 264:253 265:253 266:253 267:253 268:241 269:209 270:44 271:23 272:218 273:253 274:60 290:124 291:253 292:253 293:253 294:253 295:253 296:182 299:131 300:253 301:253 302:60 318:38 319:217 320:253 321:253 322:244 323:111 324:37 327:131 328:253 329:253 330:60 346:124 347:253 348:253 349:253 350:165 354:22 355:182 356:253 357:253 358:60 374:124 375:253 376:253 377:240 378:45 382:53 383:253 384:253 385:249 386:58 401:16 402:168 403:253 404:216 405:45 410:53 411:253 412:253 413:138 429:159 430:253 431:253 432:147 438:53 439:253 440:253 441:138 456:136 457:252 458:253 459:227 460:5 466:53 467:253 468:243 469:101 484:140 485:253 486:253 487:124 494:156 495:253 496:218 511:13 512:164 513:253 514:142 515:5 521:32 522:233 523:253 524:218 539:62 540:253 541:253 542:130 548:37 549:203 550:253 551:253 552:127 567:62 568:253 569:253 570:147 571:36 572:36 573:36 574:36 575:151 576:222 577:253 578:245 579:127 580:8 595:34 596:202 597:253 598:253 599:253 600:253 601:253 602:253 603:253 604:253 605:253 606:200 624:140 625:253 626:253 627:253 628:253 629:253 630:253 631:253 632:248 633:235 634:65 652:87 653:173 654:253 655:253 656:253 657:253 658:253 659:253 660:182 681:14 682:78 683:96 684:253 685:253 686:253 687:137 688:56 -0 123:8 124:76 125:202 126:254 127:255 128:163 129:37 130:2 150:13 151:182 152:253 153:253 154:253 155:253 156:253 157:253 158:23 177:15 178:179 179:253 180:253 181:212 182:91 183:218 184:253 185:253 186:179 187:109 205:105 206:253 207:253 208:160 209:35 210:156 211:253 212:253 213:253 214:253 215:250 216:113 232:19 233:212 234:253 235:253 236:88 237:121 238:253 239:233 240:128 241:91 242:245 243:253 244:248 245:114 260:104 261:253 262:253 263:110 264:2 265:142 266:253 267:90 270:26 271:199 272:253 273:248 274:63 287:1 288:173 289:253 290:253 291:29 293:84 294:228 295:39 299:72 300:251 301:253 302:215 303:29 315:36 316:253 317:253 318:203 319:13 328:82 329:253 330:253 331:170 343:36 344:253 345:253 346:164 356:11 357:198 358:253 359:184 360:6 371:36 372:253 373:253 374:82 385:138 386:253 387:253 388:35 399:128 400:253 401:253 402:47 413:48 414:253 415:253 416:35 427:154 428:253 429:253 430:47 441:48 442:253 443:253 444:35 455:102 456:253 457:253 458:99 469:48 470:253 471:253 472:35 483:36 484:253 485:253 486:164 496:16 497:208 498:253 499:211 500:17 511:32 512:244 513:253 514:175 515:4 524:44 525:253 526:253 527:156 540:171 541:253 542:253 543:29 551:30 552:217 553:253 554:188 555:19 568:171 569:253 570:253 571:59 578:60 579:217 580:253 581:253 582:70 596:78 597:253 598:253 599:231 600:48 604:26 605:128 606:249 607:253 608:244 609:94 610:15 624:8 625:151 626:253 627:253 628:234 629:101 630:121 631:219 632:229 633:253 634:253 635:201 636:80 653:38 654:232 655:253 656:253 657:253 658:253 659:253 660:253 661:253 662:201 663:66 -0 127:68 128:254 129:255 130:254 131:107 153:11 154:176 155:230 156:253 157:253 158:253 159:212 180:28 181:197 182:253 183:253 184:253 185:253 186:253 187:229 188:107 189:14 208:194 209:253 210:253 211:253 212:253 213:253 214:253 215:253 216:253 217:53 235:69 236:241 237:253 238:253 239:253 240:253 241:241 242:186 243:253 244:253 245:195 262:10 263:161 264:253 265:253 266:253 267:246 268:40 269:57 270:231 271:253 272:253 273:195 290:140 291:253 292:253 293:253 294:253 295:154 297:25 298:253 299:253 300:253 301:195 318:213 319:253 320:253 321:253 322:135 323:8 325:3 326:128 327:253 328:253 329:195 345:77 
346:238 347:253 348:253 349:253 350:7 354:116 355:253 356:253 357:195 372:11 373:165 374:253 375:253 376:231 377:70 378:1 382:78 383:237 384:253 385:195 400:33 401:253 402:253 403:253 404:182 411:200 412:253 413:195 428:98 429:253 430:253 431:253 432:24 439:42 440:253 441:195 456:197 457:253 458:253 459:253 460:24 467:163 468:253 469:195 484:197 485:253 486:253 487:189 488:13 494:53 495:227 496:253 497:121 512:197 513:253 514:253 515:114 521:21 522:227 523:253 524:231 525:27 540:197 541:253 542:253 543:114 547:5 548:131 549:143 550:253 551:231 552:59 568:197 569:253 570:253 571:236 572:73 573:58 574:217 575:223 576:253 577:253 578:253 579:174 596:197 597:253 598:253 599:253 600:253 601:253 602:253 603:253 604:253 605:253 606:253 607:48 624:149 625:253 626:253 627:253 628:253 629:253 630:253 631:253 632:253 633:182 634:15 635:3 652:12 653:168 654:253 655:253 656:253 657:253 658:253 659:248 660:89 661:23 -1 157:85 158:255 159:103 160:1 185:205 186:253 187:253 188:30 213:205 214:253 215:253 216:30 240:44 241:233 242:253 243:244 244:27 268:135 269:253 270:253 271:100 296:153 297:253 298:240 299:76 323:12 324:208 325:253 326:166 351:69 352:253 353:253 354:142 378:14 379:110 380:253 381:235 382:33 406:63 407:223 408:235 409:130 434:186 435:253 436:235 437:37 461:17 462:145 463:253 464:231 465:35 489:69 490:220 491:231 492:123 516:18 517:205 518:253 519:176 520:27 543:17 544:125 545:253 546:185 547:39 571:71 572:214 573:231 574:41 599:167 600:253 601:225 602:33 626:72 627:205 628:207 629:14 653:30 654:249 655:233 656:49 681:32 682:253 683:89 -1 126:94 127:132 154:250 155:250 156:4 182:250 183:254 184:95 210:250 211:254 212:95 238:250 239:254 240:95 266:250 267:254 268:95 294:250 295:254 296:95 322:250 323:254 324:95 350:250 351:254 352:95 378:250 379:254 380:95 405:77 406:254 407:250 408:19 433:96 434:254 435:249 461:53 462:253 463:252 464:43 490:250 491:251 492:32 517:85 518:254 519:249 545:96 546:254 547:249 573:83 574:254 575:250 576:14 602:250 603:254 604:95 630:250 631:255 632:95 658:132 659:254 660:95 -1 124:32 125:253 126:31 152:32 153:251 154:149 180:32 181:251 182:188 208:32 209:251 210:188 236:32 237:251 238:228 239:59 264:32 265:253 266:253 267:95 292:28 293:236 294:251 295:114 321:127 322:251 323:251 349:127 350:251 351:251 377:48 378:232 379:251 406:223 407:253 408:159 434:221 435:251 436:158 462:142 463:251 464:158 490:64 491:251 492:242 493:55 518:64 519:251 520:253 521:161 546:64 547:253 548:255 549:221 574:16 575:181 576:253 577:220 603:79 604:253 605:236 606:63 632:213 633:251 634:126 660:96 661:251 662:126 -1 129:39 130:254 131:255 132:254 133:140 157:136 158:253 159:253 160:228 161:67 184:6 185:227 186:253 187:253 188:58 211:29 212:188 213:253 214:253 215:253 216:17 239:95 240:253 241:253 242:253 243:157 244:8 266:3 267:107 268:253 269:253 270:245 271:77 294:29 295:253 296:253 297:240 298:100 322:141 323:253 324:253 325:215 349:129 350:248 351:253 352:253 353:215 377:151 378:253 379:253 380:253 381:144 405:151 406:253 407:253 408:253 409:27 431:3 432:102 433:242 434:253 435:253 436:110 437:3 459:97 460:253 461:253 462:253 463:214 464:55 487:207 488:253 489:253 490:253 491:158 515:67 516:253 517:253 518:253 519:158 543:207 544:253 545:253 546:240 547:88 571:207 572:253 573:253 574:224 598:32 599:217 600:253 601:253 602:224 626:141 627:253 628:253 629:253 630:133 654:36 655:219 656:253 657:140 658:10 -0 123:59 124:55 149:71 150:192 151:254 152:250 153:147 154:17 176:123 177:247 178:253 179:254 180:253 181:253 182:196 183:79 184:176 185:175 186:175 187:124 188:48 203:87 204:247 
[sample dataset contents elided: LibSVM-format records, each a binary 0/1 label followed by sparse index:value features with values in the 0-255 range]
634:251 635:251 636:251 637:253 638:251 639:137 657:61 658:71 659:200 660:253 661:251 662:251 663:251 664:251 665:201 666:71 667:10 -1 130:218 131:170 132:108 157:32 158:227 159:252 160:232 185:129 186:252 187:252 188:252 212:1 213:253 214:252 215:252 216:168 240:144 241:253 242:252 243:236 244:62 268:144 269:253 270:252 271:215 296:144 297:253 298:252 299:112 323:21 324:206 325:253 326:252 327:71 351:99 352:253 353:255 354:119 378:63 379:242 380:252 381:253 382:35 406:94 407:252 408:252 409:154 410:10 433:145 434:237 435:252 436:252 461:255 462:253 463:253 464:108 487:11 488:155 489:253 490:252 491:179 492:15 514:11 515:150 516:252 517:253 518:200 519:20 542:73 543:252 544:252 545:253 546:97 569:47 570:233 571:253 572:253 596:1 597:149 598:252 599:252 600:252 624:1 625:252 626:252 627:246 628:132 652:1 653:169 654:252 655:132 -1 130:116 131:255 132:123 157:29 158:213 159:253 160:122 185:189 186:253 187:253 188:122 213:189 214:253 215:253 216:122 241:189 242:253 243:253 244:122 267:2 268:114 269:243 270:253 271:186 272:19 295:100 296:253 297:253 298:253 299:48 323:172 324:253 325:253 326:253 327:48 351:172 352:253 353:253 354:182 355:19 378:133 379:251 380:253 381:175 382:4 405:107 406:251 407:253 408:253 409:65 432:26 433:194 434:253 435:253 436:214 437:40 459:105 460:205 461:253 462:253 463:125 464:40 487:139 488:253 489:253 490:253 491:81 514:41 515:231 516:253 517:253 518:159 519:16 541:65 542:155 543:253 544:253 545:172 546:4 569:124 570:253 571:253 572:253 573:98 597:124 598:253 599:253 600:214 601:41 624:22 625:207 626:253 627:253 628:139 653:124 654:253 655:162 656:9 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_linear_regression_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_linear_regression_data.txt deleted file mode 100755 index 9aaaa42..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_linear_regression_data.txt +++ /dev/null @@ -1,501 +0,0 @@ --9.490009878824548 1:0.4551273600657362 2:0.36644694351969087 3:-0.38256108933468047 4:-0.4458430198517267 5:0.33109790358914726 6:0.8067445293443565 7:-0.2624341731773887 8:-0.44850386111659524 9:-0.07269284838169332 10:0.5658035575800715 -0.2577820163584905 1:0.8386555657374337 2:-0.1270180511534269 3:0.499812362510895 4:-0.22686625128130267 5:-0.6452430441812433 6:0.18869982177936828 7:-0.5804648622673358 8:0.651931743775642 9:-0.6555641246242951 10:0.17485476357259122 --4.438869807456516 1:0.5025608135349202 2:0.14208069682973434 3:0.16004976900412138 4:0.505019897181302 5:-0.9371635223468384 6:-0.2841601610457427 7:0.6355938616712786 8:-0.1646249064941625 9:0.9480713629917628 10:0.42681251564645817 --19.782762789614537 1:-0.0388509668871313 2:-0.4166870051763918 3:0.8997202693189332 4:0.6409836467726933 5:0.273289095712564 6:-0.26175701211620517 7:-0.2794902492677298 8:-0.1306778297187794 9:-0.08536581111046115 10:-0.05462315824828923 --7.966593841555266 1:-0.06195495876886281 2:0.6546448480299902 3:-0.6979368909424835 4:0.6677324708883314 5:-0.07938725467767771 6:-0.43885601665437957 7:-0.608071585153688 8:-0.6414531182501653 9:0.7313735926547045 10:-0.026818676347611925 --7.896274316726144 1:-0.15805658673794265 2:0.26573958270655806 3:0.3997172901343442 4:-0.3693430998846541 5:0.14324061105995334 6:-0.25797542063247825 7:0.7436291919296774 8:0.6114618853239959 9:0.2324273700703574 10:-0.25128128782199144 --8.464803554195287 1:0.39449745853945895 2:0.817229160415142 3:-0.6077058562362969 4:0.6182496334554788 5:0.2558665508269453 6:-0.07320145794330979 7:-0.38884168866510227 
8:0.07981886851873865 9:0.27022202891277614 10:-0.7474843534024693 -2.1214592666251364 1:-0.005346215048158909 2:-0.9453716674280683 3:-0.9270309666195007 4:-0.032312290091389695 5:0.31010676221964206 6:-0.20846743965751569 7:0.8803449313707621 8:-0.23077831216541722 9:0.29246395759528565 10:0.5409312755478819 -1.0720117616524107 1:0.7880855916368177 2:0.19767407429003536 3:0.9520689432368168 4:-0.845829774129496 5:0.5502413918543512 6:-0.44235539500246457 7:0.7984106594591154 8:-0.2523277127589152 9:-0.1373808897290778 10:-0.3353514432305029 --13.772441561702871 1:-0.3697050572653644 2:-0.11452811582755928 3:-0.807098168238352 4:0.4903066124307711 5:-0.6582805242342049 6:0.6107814398427647 7:-0.7204208094262783 8:-0.8141063661170889 9:-0.9459402662357332 10:0.09666938346350307 --5.082010756207233 1:-0.43560342773870375 2:0.9349906440170221 3:0.8090021580031235 4:-0.3121157071110545 5:-0.9718883630945336 6:0.6191882496201251 7:0.0429886073795116 8:0.670311110015402 9:0.16692329718223786 10:0.37649213869502973 -7.887786536531237 1:0.11276440263810383 2:-0.7684997525607482 3:0.1770172737885798 4:0.7902845707138706 5:0.2529503304079441 6:-0.23483801763662826 7:0.8072501895004851 8:0.6673992021927047 9:-0.4796127376677324 10:0.9244724404994455 -14.323146365332388 1:-0.2049276879687938 2:0.1470694373531216 3:-0.48366999792166787 4:0.643491115907358 5:0.3183669486383729 6:0.22821350958477082 7:-0.023605251086149304 8:-0.2770587742156372 9:0.47596326458377436 10:0.7107229819632654 --20.057482615789212 1:-0.3205057828114841 2:0.51605972926996 3:0.45215640988181516 4:0.01712446974606241 5:0.5508198371849293 6:-0.2478254241316491 7:0.7256483175955235 8:0.39418662792516 9:-0.6797384914236382 10:0.6001217520150142 --0.8995693247765151 1:0.4508991072414843 2:0.589749448443134 3:0.6464818311502738 4:0.7005669004769028 5:0.9699584106930381 6:-0.7417466269908464 7:0.22818964839784495 8:0.08574936236270037 9:-0.6945765138377225 10:0.06915201979238828 --19.16829262296376 1:0.09798746565879424 2:-0.34288007110901964 3:0.440249350802451 4:-0.22440768392359534 5:-0.9695067570891225 6:-0.7942032659310758 7:-0.792286205517398 8:-0.6535487038528798 9:0.7952676470618951 10:-0.1622831617066689 -5.601801561245534 1:0.6949189734965766 2:-0.32697929564739403 3:-0.15359663581829275 4:-0.8951865090520432 5:0.2057889391931318 6:-0.6676656789571533 7:-0.03553655732400762 8:0.14550349954571096 9:0.034600542078191854 10:0.4223352065067103 --3.2256352187273354 1:0.35278245969741096 2:0.7022211035026023 3:0.5686638754605697 4:-0.4202155290448111 5:-0.26102723928249216 6:0.010688215941416779 7:-0.4311544807877927 8:0.9500151672991208 9:0.14380635780710693 10:-0.7549354840975826 -1.5299675726687754 1:-0.13079299081883855 2:0.0983382230287082 3:0.15347083875928424 4:0.45507300685816965 5:0.1921083467305864 6:0.6361110540492223 7:0.7675261182370992 8:-0.2543488202081907 9:0.2927051050236915 10:0.680182444769418 --0.250102447941961 1:-0.8062832278617296 2:0.8266289890474885 3:0.22684501241708888 4:0.1726291966578266 5:-0.6778773666126594 6:0.9993906921393696 7:0.1789490173139363 8:0.5584053824232391 9:0.03495894704368174 10:-0.8505720014852347 -12.792267926563595 1:-0.008461200645088818 2:-0.648273596036564 3:-0.005334477339629995 4:0.3781469006858833 5:0.30565234666790686 6:-0.2822867492866177 7:0.10175120738413801 8:0.5342432888482425 9:0.05146513075475534 10:-0.6459729964194652 -6.082192787194888 1:0.42519013450094767 2:0.09441503345243984 3:-0.07898439043103522 4:-0.32207498048636474 5:-0.9180071861219266 6:0.5951317320731633 
7:0.41000814588717693 8:-0.3926260640533046 9:0.2789036768568971 10:0.13163692286014528 --7.481405271455238 1:0.03324842612749346 2:0.07055844751995122 3:-0.47199515597021113 4:-0.682690342465275 5:0.3983414713797069 6:-0.2136729393256811 7:-0.09066563475481249 8:-0.4640338194317184 9:-0.03513782089224482 10:-0.1711809802758364 -6.739533816100517 1:0.1774546460228057 2:-0.6783644553523549 3:-0.47871398278230504 4:0.02272121490463097 5:-0.5047649289302389 6:0.26479596144873896 7:-0.32045436544054096 8:0.3113047940487379 9:0.6269418147567556 10:0.9710114516962312 -3.780807062175497 1:0.01715676997104909 2:0.8975962429865936 3:-0.46594560920034134 4:0.2873623499953055 5:0.8894362304584083 6:0.17973981232418468 7:0.49105791400707743 8:-0.7359842740294882 9:0.38941133808001127 10:-0.7151884777228046 -4.564039393483412 1:0.07478785545033317 2:-0.8672651994084235 3:0.450599300176334 4:0.35104802298560056 5:0.6797318185095045 6:-0.03891997518827006 7:-0.33208695871398675 8:0.6166574577055226 9:0.5730212324012205 10:-0.4194925751047054 --0.3195679646035633 1:0.054527683864544096 2:-0.15591931640565093 3:0.9266742559542833 4:0.888522581905147 5:0.6576203900699167 6:0.6417770212400336 7:0.7509788029052338 8:-0.3104974571382815 9:0.7234744267051683 10:-0.15869049651427103 -11.290452658023497 1:0.20173310976772196 2:0.8657502566551409 3:0.9325160601080682 4:0.24570884032596263 5:-0.6546108813337841 6:-0.14020032028377583 7:-0.8825687891702743 8:-0.21420166926412865 9:-0.8600275184792756 10:-0.7990574622230739 --4.003499192090455 1:0.8325875503351796 2:-0.5956350140619129 3:0.12598048009007923 4:0.12340188733473134 5:-0.839435659309717 6:-0.16623481818728414 7:0.12028795301041662 8:-0.7994713170657952 9:0.2216721974907896 10:0.8407561415075087 --19.872991038068406 1:-0.9325810772922609 2:-0.6411471147334535 3:0.9949216290375054 4:0.483048267470493 5:-0.8736297429070232 6:-0.36222771685582544 7:0.26397860162786957 8:0.45527588775737704 9:-0.9424989711186325 10:0.6251162293059616 -10.502762149373098 1:-0.2307778924009991 2:0.6977871128979924 3:0.022830408261390822 4:0.6257738824362347 5:0.9770979848265122 6:0.09985730624684575 7:-0.9755858424230182 8:-0.689969833240031 9:-0.7294587311376761 10:0.3496326193951331 --14.328978509075442 1:0.37929821892417404 2:0.8402056881660709 3:-0.1806835799958202 4:0.766314307210441 5:0.865876513623024 6:-0.7113501219432434 7:-0.0932956557986735 8:-0.7042025810921411 9:0.47530696925672267 10:-0.4629102077669889 --16.26143027545273 1:-0.9309578475799722 2:0.7591795880911123 3:0.06296957473213705 4:0.786790093290086 5:-0.9527998391625465 6:-0.08573982501921895 7:-0.3812232026687308 8:-0.6890669703685022 9:0.25415911467755015 10:-0.07664746267502509 -11.772544195529013 1:0.3614756404325046 2:0.14508027508253818 3:0.23042774014795753 4:0.4164348685332022 5:0.4109091750657461 6:0.03853098236933272 7:0.38911994885223145 8:-0.5031309357181766 9:-0.596467768575587 10:0.17884522225228028 -14.697703557439503 1:0.24508864174863 2:0.7576193329655578 3:0.09030511120334461 4:0.9537528991778741 5:-0.7224092160621338 6:-0.34089385162121943 7:0.6924170720838818 8:0.32912306214891784 9:-0.4064624712125904 10:-0.5344662061201593 --13.976130931152703 1:0.5891192531479754 2:0.29862103742464274 3:-0.36153976712796343 4:-0.6552669564323226 5:-0.22672513691161766 6:0.3001336202535376 7:0.34490251346382617 8:0.2072633053920192 9:-0.5659371284058774 10:0.49599636156628835 --14.762758252931127 1:0.31302496164254223 2:-0.6062773982342133 3:-0.9874007658402217 4:-0.6214904627601421 
5:-0.11421073677207683 6:-0.5850843421161205 7:0.1250679146774638 8:-0.7108170726393621 9:-0.6888351241194393 10:0.6077343683084389 --3.300641320608255 1:-0.1407178879203672 2:0.12960233233004925 3:-0.4236196478321872 4:0.7903078296084356 5:-0.8755754953628643 6:-0.2062360260394529 7:-0.045680124889026175 8:0.783182093429277 9:-0.02995737262668463 10:-0.33382351650328435 --15.72351561304857 1:-0.1802575775708093 2:-0.991006951265341 3:-0.9107951763247621 4:0.9069820084047908 5:-0.12691921206803047 6:-0.7087012119383593 7:-0.9179510577925369 8:0.18480349982718325 9:-0.4478459144114004 10:-0.5560585660624608 --22.949825936196074 1:0.4797855980916854 2:0.01997502546020402 3:-0.8827928315487465 4:0.2755107907750989 5:0.015544482147298977 6:0.9652687138748801 7:0.6622667860970648 8:-0.7708138539912186 9:0.17728148663006627 10:0.47818190728952925 -12.092431628826905 1:0.1358843437335564 2:0.03643446587894239 3:-0.31070823939673287 4:0.5283033206569152 5:0.3469111543845367 6:-0.5162518174930761 7:0.24270234207184016 8:0.7352292800096338 9:0.8860322286740037 10:0.6748068653962045 --23.51088409032297 1:-0.4683538422180036 2:0.1469540185936138 3:0.9113612952591796 4:-0.9838482669789823 5:0.4506466371133697 6:0.6456121712599778 7:0.8264783725578371 8:0.562664168655115 9:-0.8299281852090683 10:0.40690300256653256 -5.998186124881712 1:-0.9781302074883151 2:0.32984303335155785 3:0.7303430847899663 4:0.841481297188956 5:0.05580773881989276 6:0.7130788298702062 7:-0.218087116119847 8:-0.9889494995220598 9:0.9182854134226501 10:-0.7501751701020942 -9.852316338642547 1:0.146854160091757 2:-0.3611508707370965 3:0.3517016971654914 4:0.6187697988029395 5:-0.010768583697787548 6:0.5236725885871243 7:0.5945666964145524 8:-0.009180562740628506 9:-0.44474762415618274 10:0.41852743519493685 --5.313930756588526 1:-0.6304209277071555 2:-0.37010359785263813 3:-0.3194739026510125 4:-0.750533359080716 5:0.45500303301733114 6:-0.012727544364283805 7:-0.43941651856862274 8:0.927108876532093 9:-0.24164903158058149 10:0.44134972919002124 --4.2775224863223915 1:-0.35785764991284363 2:0.942797043714243 3:0.4539569191274251 4:-0.6944903010994341 5:-0.08357221983075225 6:0.4433049548665855 7:-0.5488972050023557 8:-0.24014623658145773 9:-0.6178118485382511 10:-0.4575463952834564 --10.57769830424322 1:0.22693864400257335 2:-0.041639691095668674 3:0.9948726461115123 4:-0.7450471554938383 5:-0.1114847126717804 6:-0.27881184842402673 7:0.029766812446276214 8:-0.3727649352432578 9:-0.7791732805568077 10:0.9425576681069683 --0.8430338600258201 1:0.4607090007225536 2:-0.6079961642969514 3:-0.5671626932935381 4:0.12784576080614185 5:-0.30766031989910236 6:-0.21232963505711555 7:0.3310463755850872 8:-0.6807682731528943 9:0.7826634145951483 10:0.0608057623636995 -13.450586257053727 1:-0.2697769964284986 2:0.07743737732312428 3:-0.8459687499864881 4:0.6091901514177853 5:-0.9464815428211699 6:0.15780407422581533 7:-0.28552052619478996 8:-0.27500859181806403 9:-0.7207541548282903 10:0.05215593729084533 -20.358241877831016 1:0.29768927445620164 2:-0.5379390525163252 3:0.6591913001003027 4:0.6635992348010928 5:0.3786594651413009 6:-0.7217135278882543 7:0.9634013908615768 8:0.03961253903778861 9:0.1335121312144949 10:0.7933944303463509 -9.800993960518852 1:0.39896823489212285 2:0.30948413101894023 3:0.08568060094378493 4:-0.7454513450113371 5:0.8054125831421357 6:-0.24464240413169347 7:-0.18294406588625112 8:-0.883455504399858 9:0.2468431033653562 10:-0.708151566382103 --21.432387764165806 1:-0.4785033857256795 2:0.520350718059089 
3:-0.2988515012130126 4:-0.46260150057299754 5:0.5394344995663083 6:0.39320468081626836 7:0.1890560923345248 8:0.13123799325264507 9:0.43613839380760355 10:0.39541998419731494 --4.090570760187878 1:0.3909705814857716 2:0.9830271975811611 3:0.672523651785939 4:0.0035177223850744177 5:0.567082732451311 6:-0.2620454326881394 7:0.46622578556708105 8:0.646246879249865 9:0.4263175536668733 10:0.8982696975276223 -3.7459201216906926 1:-0.9480167656870653 2:-4.888270196095057E-4 3:0.48226844071577646 4:-0.23706663537631645 5:0.22420266627462127 6:0.2981747607694978 7:0.3893425967975348 8:0.6302701381298614 9:-0.21909113816064196 10:0.8371697958140494 -9.767952084958061 1:-0.2300790371078303 2:-0.4457883630748676 3:0.28710853302295325 4:0.7112839743052013 5:-0.8765858382640623 6:-0.6470779468607217 7:0.4369262584371727 8:-0.7175412028407337 9:0.5506733477278882 10:0.5393007189573547 -6.9802839308913365 1:0.21769855012808215 2:0.8653818331675485 3:0.2322943113578111 4:0.3760591265797468 5:0.06554014167292377 6:0.6866096712933549 7:0.866929973115441 8:-0.6462263417217329 9:0.2507247465275353 10:-0.7005877782050307 -16.014720800069103 1:0.6058055248984549 2:0.048517868234337014 3:-0.15744912875924877 4:0.32598079708869365 5:-0.587791997223768 6:-0.4636187312118474 7:0.7771908559246068 8:-0.349403853888719 9:0.229800030145503 10:-0.674614818934488 -8.417571532985823 1:-0.21164946152466801 2:-0.9981936663594053 3:0.8611869575187896 4:0.11100891297254312 5:-0.7406067304729631 6:-0.7613837395522254 7:-0.9617573325708704 8:0.5697426971647488 9:-0.5830879716990833 10:0.5951448538064159 --12.491442077546413 1:-0.19172117564625735 2:-0.12421304883392126 3:0.7095605786791346 4:0.6401582292398038 5:-0.9347790209840108 6:0.6592209285686903 7:0.702282297844389 8:-0.22765902007749528 9:-0.17746922342943816 10:0.7196663432778121 --8.605713514762092 1:0.36490454976480846 2:0.6991204480538957 3:0.6546945560337121 4:-0.032324845758738174 5:0.2453935969836043 6:0.5363119225093116 7:0.6266741350524205 8:-0.2132266305382322 9:-0.308105870487996 10:-0.08219413867616465 --10.35591860037468 1:-0.014204168485027147 2:-0.7077035677144325 3:0.024004217785642767 4:0.818971992516166 5:0.9081305263471056 6:0.808854493237229 7:-0.6474336785461867 8:-0.32559288177031465 9:-0.32850453072496055 10:-0.7035310416695784 -3.605002621628445 1:0.6085817977516599 2:0.8101072412357928 3:0.7697891508923966 4:-0.5738750389864677 5:-0.734314989863889 6:-0.7879014492215499 7:0.6884442838920775 8:-0.46131231930402383 9:-0.7730585954271005 10:-0.7819874019145132 -12.30435312415091 1:0.3283668768730639 2:-0.18316686990068187 3:0.3955614099142126 4:0.8450470350842108 5:0.3958042901611589 6:0.6578475571960676 7:-0.4395488558075096 8:0.15720430113495376 9:-0.5318362828977672 10:0.45585285255232044 -9.020048819638827 1:-0.5986521145193395 2:0.3266542215286443 3:-0.09911773729611917 4:-0.21478254478908676 5:0.6546175049764293 6:-0.1414796368932345 7:0.25802631337510085 8:-0.6773828562539816 9:-0.22038193899258718 10:-0.17465737306657902 -14.854262978981406 1:0.5293763924477841 2:-0.24658868331583683 3:0.8268631648872109 4:0.8969207203400265 5:0.03933229861213983 6:-0.6212951181360529 7:-0.36695460282178205 8:-0.5468014636386027 9:-0.3419492829414976 10:-0.8273314086998671 -5.658665647926016 1:0.9543096383762801 2:0.13230023957687176 3:-0.3071929861496465 4:-0.3646067841449696 5:0.6979929890816723 6:-0.20721664168809228 7:0.6676482547655365 8:0.944757051233543 9:0.024377296173674567 10:-0.9413728609667691 --6.930603551528371 1:0.09198647857985232 
2:-0.3685113649452161 3:-0.2361728930325453 4:0.3674268130607439 5:0.27385598384498344 6:-0.7151900241735676 7:0.3310154476154119 8:-0.24328111897361682 9:0.2511378679668912 10:-0.35825141175578934 -13.361196783041926 1:0.11676665169094824 2:-0.49968608916548307 3:0.9941342810313298 4:-0.17858967215374988 5:0.1993744673440312 6:0.14596837574280297 7:-0.8245495433125194 8:-0.5637934691545672 9:-0.8589185806222286 10:-0.4923216901915597 --3.280508467210429 1:-0.9917770074538397 2:-0.1547651813493751 3:0.621733177563484 4:0.7303326279246298 5:-0.0786900332560696 6:0.9107127797641994 7:0.7104513024299466 8:-0.32858522942354407 9:0.17013652749847386 10:0.27656984316288824 -11.13509519160867 1:0.6874932143640391 2:-0.46610293161038907 3:0.8744681017967024 4:0.40900365224695956 5:-0.49770054448432055 6:-0.0635770754462921 7:-0.5705387648707747 8:-0.577988250149829 9:-0.8099463063934682 10:0.42132700180827354 --11.857350365429426 1:-0.24607974991258308 2:-0.943388538022258 3:0.8679112109377674 4:0.7779951176637694 5:-0.5802336023276593 6:-0.9093352471884992 7:0.29337797938742316 8:0.498519874589175 9:0.3493034812120912 10:-0.07307210651399076 -11.421632138263703 1:0.3911519359353859 2:-0.8154393787235621 3:0.47194271125243237 4:0.14014792298759593 5:-0.3589345913619957 6:0.7887695409762479 7:0.49962792312858895 8:-0.6402670146359797 9:-0.2314041601683119 10:-0.798901341175887 -5.194792012146463 1:0.810279303469398 2:-0.9772756877199589 3:-0.20925958437085557 4:0.8797562461102444 5:0.3211532423260066 6:0.25250279470783754 7:0.14387831263435813 8:-0.021466789385169882 9:0.18909293657271564 10:-0.5981349964027893 -12.242677118499806 1:0.3565715672082048 2:0.7366743237221687 3:0.1922233582434527 4:-0.3551925780624561 5:0.5290849503909634 6:0.7744214641246749 7:0.7277215028580597 8:-0.590440215391044 9:0.7427328184290733 10:-0.6231904162251609 -3.496172341296411 1:0.5028717258135624 2:-0.5838871888624848 3:-0.5540116561110324 4:0.8502487679795261 5:-0.7983061034328727 6:-0.3853123296389005 7:-0.1493800684643869 8:0.6008798629354264 9:-0.32299062155495406 10:-0.5827019502242026 --15.437384793431217 1:0.41994681418237345 2:0.7106426870657483 3:-0.45211033467567696 4:-0.7272406549392239 5:-0.35736594496490737 6:0.4764507578985955 7:-0.5249912641281373 8:0.8562010912051132 9:0.45927621623833637 10:-0.3701817429794385 -5.490036861541498 1:0.8414999442459015 2:0.9273442862476728 3:-0.054654787893199774 4:-0.23126134156257327 5:-0.9155048245317694 6:0.25750538376376975 7:-0.8470916763665326 8:0.9105674676753848 9:0.5026028522378054 10:-0.06650501561108468 --1.074065343287859 1:0.37484830603001607 2:-0.9858854245832975 3:0.007159356555897611 4:0.8172796295244154 5:0.519147377529164 6:0.8211049991970722 7:0.9901658817979146 8:-0.026519560032641998 9:-0.2328762488733862 10:0.43161994187258035 -2.0482082496444622 1:0.24940246021565793 2:0.47248358864259177 3:0.23833814894291105 4:-0.3449172512379757 5:0.7412869866239866 6:0.1351422898741914 7:-0.003784141556894216 8:-0.6321917152754075 9:0.8246267827865776 10:0.5057520480449009 -16.709794859608397 1:-0.5977424405191092 2:-0.13991362149785713 3:0.613487896720806 4:-0.37681525320218157 5:-0.4369592282569783 6:0.4702242879506955 7:0.07498463532645339 8:-0.9942304127133292 9:0.41304209196175257 10:0.6799250665519481 -4.598881854940949 1:-0.41212838137243835 2:0.6737124633791323 3:0.8376369191216593 4:0.2848328781926128 5:-0.17960265353296 6:0.0035488712665472377 7:-0.8355355482928055 8:-0.7439716673142398 9:-0.009043467128117433 10:0.7423272515054122 
-9.566038608555402 1:-0.662329643040616 2:0.4727113884417973 3:-0.15734218732411365 4:-0.3950754785173889 5:0.13837083076070011 6:0.633261314089351 7:0.9929998062307679 8:-0.4639028424346423 9:-0.073992579817449 10:0.3413166410117088 -1.629198477883475 1:-0.2875719791707101 2:0.9395753700232541 3:-0.45090801750966314 4:-0.384528069378699 5:-0.35937736478702753 6:0.9597102694501136 7:-0.6898325123180971 8:-0.11436012866371303 9:-0.5330550575952768 10:0.24688769932037258 --7.374620970147229 1:0.16864051681940984 2:-0.08391828256018252 3:-0.8184503043836224 4:0.5461252511055263 5:0.7264676659099087 6:-0.9654384426822686 7:-0.8537533138667612 8:0.9189716013058653 9:-0.03449322582531389 10:0.5490329745887035 --0.5741704240890674 1:0.9392753294760656 2:-0.5579682000156501 3:-0.8083270703362093 4:-0.7022804026958895 5:-0.30426803430649896 6:0.8211432527140852 7:-0.8101343265051797 8:-0.0945946325760949 9:0.49546915718101814 10:0.5184327698839013 -12.583032451116004 1:0.20496323995364651 2:0.5082017540304999 3:0.2428646053751764 4:0.7101854338863274 5:-0.9619925264660094 6:0.4610134502825909 7:-0.5620669052678122 8:0.6766614078376236 9:-0.7169693435782278 10:-0.14362322382035164 --10.489157123372898 1:-0.7441633083637054 2:0.07069898351187809 3:-0.47119552972566336 4:-0.43970155900871344 5:0.43192289605353973 6:-0.0798550143899397 7:0.2111188135787776 8:0.9101748615761336 9:-0.4079984876629721 10:-0.8101424982394589 --3.811365493249739 1:0.7250263461647963 2:0.22182621035333838 3:-0.12735342714215725 4:0.26222861719040624 5:0.3928174057935714 6:0.817131411734006 7:-0.056109765698795 8:0.7908779197353637 9:-0.06768319505245768 10:0.4107045608924882 --7.604636483513961 1:0.876751634787073 2:0.04037085575852295 3:0.18142385658771398 4:0.38350565074271903 5:-0.30937664332011905 6:-0.9544807672006823 7:0.008643477632712449 8:-0.27676843472226276 9:-0.12938540988602476 10:-0.2929762262661819 --1.9889499615051784 1:-0.4243149295090465 2:0.22578711943818686 3:0.662530786460152 4:0.28592235843136105 5:0.4170345231441832 6:0.9290881132120887 7:0.5332443368002588 8:-0.33248958421809927 9:0.16273139830495942 10:0.6899022585936985 --1.99891354174786 1:-0.1732078452611825 2:0.2361029542296429 3:-0.8455867017505336 4:0.31638672033240867 5:-0.648387667144986 6:-0.7647886103837449 7:0.6910155501192978 8:-0.2665663102538198 9:-0.5980899570876459 10:-0.9165896495676276 -9.74348630903265 1:0.18934450539532244 2:-0.715110505416745 3:-0.453777527810155 4:0.2743741252197758 5:-0.8439310405443103 6:-0.533835190276116 7:-0.5911710854054728 8:0.21026462628920695 9:-0.45849607678093585 10:0.1327074179200407 -20.221961806051706 1:0.624731930687735 2:-0.39914395421723015 3:0.781887900750925 4:0.5442619051596436 5:0.16651193067479153 6:0.9064846121246533 7:-0.3643159594276202 8:-0.5182065337246469 9:-0.6785628247191553 10:0.7111152852903913 -20.456947955410897 1:-0.21923785332346513 2:0.11340668617783778 3:0.7397883986253251 4:-0.11748081084695605 5:0.06314872700777197 6:-0.7124574845946587 7:0.18043581960897104 8:-0.09023925260092103 9:-0.7256417560118238 10:-0.5038088673851804 -12.241006086129564 1:-0.15271598143132215 2:0.9038942665552285 3:-0.6168514099878155 4:-0.12219038322317011 5:0.5402785935596728 6:0.4059744401803913 7:0.258870596734184 8:0.3190881033039108 9:0.2372469007313076 10:0.367188299614863 -3.980473021620311 1:-0.9025895351376971 2:-0.03333947011476446 3:-0.8220776066161464 4:0.449117985679933 5:0.9970519437779266 6:0.27430911004640457 7:0.039081352882204046 8:-0.8621514950929796 9:-0.569587565933642 
10:-0.9118346349929578 --13.420594775890757 1:0.3697979495309094 2:0.07383664120111888 3:0.7199366131785143 4:0.2118625428869032 5:-0.9015976323216077 6:-0.5298395275757712 7:-0.9517419542156635 8:0.39554920787574743 9:-0.3721957439110324 10:-0.4750272836396878 --1.052659359353786 1:0.02106845330888185 2:0.7571245678782959 3:0.8034228830223251 4:0.32968340513846917 5:-0.6510386482911554 6:0.2710115488605187 7:-0.1319580272290235 8:0.932600992666184 9:0.8260461527035414 10:-0.8507648952138052 -9.813440129324034 1:0.41048687946340134 2:0.9384639988086239 3:0.4569555844323441 4:-0.3084729082645552 5:-0.7299010284877061 6:-0.6925012997779212 7:-0.6798013915257548 8:-0.504368104320321 9:-0.6234398059664716 10:0.8633407902005543 --2.8942782378157714 1:0.5546381825677706 2:0.7959405841824887 3:0.584699836289184 4:-0.5726371777829862 5:-0.2827976152663936 6:0.138034013875719 7:-0.2935080791661324 8:-0.5323479091625714 9:0.6837641044797451 10:0.5986680812032501 -8.562937733537664 1:0.14753220510180776 2:-0.31591341855048327 3:-0.748545617199091 4:0.3251888821665734 5:0.8228589483149358 6:0.046659706976506676 7:-0.35049927996132624 8:0.2953170004605874 9:-0.6429374177050204 10:0.4624083116836044 -13.413187970975178 1:-0.7229883396779724 2:0.8876940454894067 3:-0.033794226589695775 4:0.46700071356381523 5:0.1599557295166274 6:-0.8944619785248653 7:-0.1258464584151997 8:-0.8797551785991506 9:-0.14408879184669354 10:0.11020655997336015 --5.491389764900794 1:-0.366507395597937 2:0.630480481240723 3:-0.16600801981741609 4:0.09842042773854076 5:0.30129535029579047 6:0.14102166298628882 7:-0.28131788612036623 8:0.49635295715686234 9:0.0625636989631968 10:-0.41748132718912 --10.29566593602992 1:-0.7898597726154271 2:-0.05425577320946573 3:0.5992645759265662 4:-0.4716868549309716 5:-0.020137302700854676 6:0.6216515277233232 7:-0.7295510954484412 8:-0.41443875567123967 9:-0.610576632050404 10:-0.9515988311377204 -7.084732852050431 1:0.9990215581592679 2:-0.9868954542412269 3:0.49133473382040704 4:0.7697599878561228 5:-0.34668939907967267 6:0.9777705993519483 7:0.4449043102759509 8:0.9812971199646168 9:0.6666598587737487 10:0.14398842572598514 -0.23715467505851734 1:0.21628799185444336 2:-0.4526390568867018 3:0.6558486691929235 4:0.13730688681492142 5:0.23076986155942736 6:0.7020484017619715 7:-0.12077999528458938 8:0.8306084972447003 9:-0.49337323198621563 10:-0.8270028152572872 -1.1552619549601455 1:-0.48202394020369277 2:-0.6274878708695264 3:-0.27623674153600697 4:-0.5312153415813432 5:-0.030820182786174044 6:-0.5893370965577813 7:0.6666315120904487 8:-0.36482991729570036 9:0.6065771813692735 10:0.05831057330788525 --0.20433879835596253 1:-0.4702220250018212 2:0.9123705796362889 3:-0.2045657170490376 4:-0.18922063450309534 5:-0.31431213362503163 6:0.4150130060120387 7:0.34016193625941127 8:0.8391374136299805 9:0.6884250315764333 10:-0.7916408854251566 --9.751622607785082 1:-0.0014232315621649505 2:-0.1284246813729939 3:0.5228953023175369 4:0.9688522449007109 5:-0.7857721219549156 6:-0.7812922263391038 7:-0.5916136652814756 8:0.793988610184206 9:0.7982949061274296 10:-0.592785473963741 --22.837460416919342 1:-0.17363144173810174 2:-0.3340314573781735 3:0.9351424971322297 4:-0.6430601902397572 5:-0.13363305808148818 6:-0.42446359566938585 7:-0.4093070316761178 8:-0.9302259781839204 9:0.47004365892170585 10:-0.6231289889808045 --3.6318714209289436 1:-0.8296410705737971 2:-0.6056572341069668 3:-0.2975417404042737 4:0.07134138175064741 5:-0.8966463747179154 6:-0.4112675899658855 7:0.7908013478009401 
8:0.407396254566472 9:0.9227769302156879 10:0.12418427404473764 --3.8909712376010583 1:-0.6552751548581366 2:-0.5641921108932855 3:-0.6340486345063014 4:-0.5441069121131075 5:0.908720622198947 6:-0.026054643814348077 7:0.03706191653058433 8:-0.6672524338819317 9:0.7958274915288801 10:-0.19029619970124023 --10.600130341909033 1:-0.7457695999520562 2:-0.3739453132549577 3:0.01327423342620393 4:-0.08482897201178563 5:0.84573456086082 6:0.6279927575103963 7:0.014494803555804125 8:0.9420647557771027 9:-0.13484113287285893 10:0.3037405853352888 --12.094351278535258 1:0.9982796018306028 2:0.8354271779265348 3:0.46284321795736116 4:0.07693347919601745 5:-0.4753440408996932 6:-0.47098252868073787 7:0.4810729184846003 8:-0.6136990339205741 9:-0.6715833036640317 10:-0.6247058955319091 -9.936399360181602 1:0.7330323083522969 2:0.47204204993669197 3:0.3850471475752122 4:0.21483460195167958 5:0.3806220122265147 6:0.6336993433402796 7:-0.47987416364572 8:-0.195509010865196 9:-0.6561820282562041 10:-0.45300480439842894 --4.706701061062994 1:-0.847895844561626 2:-0.29946646506145114 3:0.25432868082106497 4:0.1352958872054535 5:-0.8803017974303002 6:-0.3675110562764785 7:0.10888496324899721 8:0.9620737605396772 9:-0.031046632561323895 10:-0.09466883461500908 -5.101614991255809 1:-0.5174248135588373 2:0.14394061894828014 3:0.5613709266711013 4:-0.5678634944326011 5:0.930216209978763 6:-0.6204727890080077 7:0.4133141749872311 8:0.6262685035917408 9:0.03382924477926896 10:-0.15231139191832854 --8.772667465932606 1:-0.2117605577769197 2:-0.4283897136887762 3:0.44686767473401035 4:-0.5507826261358746 5:0.237124956028401 6:0.6413157520982717 7:0.2409214827604571 8:-0.8505503638033114 9:-0.9811997368468401 10:-0.9499963936664035 --11.615775265015627 1:0.8782018665273386 2:-0.9751473570197167 3:0.6307050068521085 4:0.7012721336851997 5:0.21318736263512283 6:0.024885128053773853 7:-0.4580644243558505 8:0.1318650007251434 9:-0.9306090092992167 10:-0.5688746770986652 -19.64829023536192 1:0.14426537998360645 2:0.3557716894181753 3:-0.8577143134654568 4:0.5288643233801469 5:0.9231529738221469 6:0.975999712077738 7:0.24700404691888678 8:0.10206517527052283 9:-0.10041951294847062 10:-0.9412918491876225 -2.7409415438025486 1:-0.7404936009304737 2:-0.9792071376296605 3:-0.49510748520932113 4:0.9538460112904268 5:-0.5075114153141447 6:-0.5890791308058669 7:-0.775366087491284 8:0.4983912525892249 9:-0.2976197956132913 10:0.6791258030468514 --4.394658158733604 1:-0.41628618754613345 2:-0.1472602552309057 3:0.44136102233464025 4:0.011882653940414434 5:-0.6559502840386595 6:-0.4022529016339016 7:0.048402312931387526 8:0.8753776623326166 9:-0.8528247288266961 10:0.6593783978826002 -1.1915739133607073 1:-0.7840827624854878 2:-0.4860418508208426 3:-0.7418773161179972 4:0.129874781837924 5:-0.22631682294184796 6:0.47794208013755024 7:0.5532183426143056 8:0.11879859459306741 9:0.09927630694484524 10:-0.19268618891399636 -2.156192215438919 1:0.44325986644475646 2:-0.6057278708888592 3:0.3943381582091723 4:0.6560336238050575 5:-0.9651308100517204 6:-0.2358219003943678 7:-0.04143043460232465 8:0.8623951169233035 9:-0.4933545255502605 10:0.8990427200454263 --1.1009750789589774 1:-0.4515707618788496 2:-0.745936099912899 3:0.41307003181926794 4:0.6127760492402428 5:0.9250878169732681 6:-0.6778628527469126 7:0.42794190420905753 8:0.4943969797578971 9:0.7762709104958854 10:-0.6932349268610041 -10.04434496594037 1:-0.0995467494040092 2:-0.7766769414838959 3:-0.6608009972582911 4:0.7075788021090594 5:0.5208396359138381 6:-0.09724033794207299 
7:-0.743087245352148 8:0.765372791789753 9:0.3788699859744704 10:-0.6898257995055466 -8.038039859115667 1:-0.5321510657026671 2:0.5571925538006008 3:0.747268102801854 4:0.09079641165917596 5:0.25861122989509266 6:-0.9948187479498878 7:-0.9665136866462685 8:-0.3904629432867681 9:-0.9975425877998279 10:0.32024289816988416 -5.14371929922303 1:-0.4829199170694627 2:-0.5713285263827719 3:-0.43889652467111184 4:0.18478247261988967 5:-0.27374063120041225 6:-0.8069125377696931 7:-0.15497746743367058 8:0.32448521325998714 9:-0.39397735035206227 10:0.08184957956614292 --1.6848276484379352 1:-0.39250489761445895 2:0.02730338852529557 3:0.9916055514435305 4:-0.07571433435055064 5:0.19024527726403728 6:0.6385182319185971 7:0.32480605537471297 8:0.5807543325220577 9:-0.35642510103381153 10:-0.9060482769392468 --11.640549677888826 1:0.03707410390488852 2:0.2527049166981137 3:0.4114872952854447 4:-0.8508977901757795 5:-0.42791544663481895 6:-0.9864047295390463 7:0.6023685964407528 8:0.12018443688097036 9:-0.36816249877130414 10:-0.9583147535652901 -11.672104494601319 1:-0.2416258355340175 2:0.6737553249072334 3:0.9041602191361382 4:-0.2123232797997281 5:-0.008255188002961988 6:-0.5151894064136904 7:-0.7341877977528246 8:0.624625272218277 9:-0.6261434804192929 10:-0.5710586715741532 --2.2960192492344627 1:-0.7457768645184579 2:-0.5954998103421847 3:0.5428846769211537 4:-0.6176587961491775 5:0.46222150678166574 6:0.7852238239427731 7:-0.3614580530629148 8:-0.325840253127059 9:-0.5660596710348922 10:-0.8060263366626401 -5.428302298615722 1:0.8774286357993033 2:-0.23110126319781088 3:0.6264134914476072 4:-0.143015582616014 5:0.350109539755298 6:-0.147747167834422 7:0.05020570422182824 8:-0.5469605849960337 9:0.951112567977048 10:-0.34800121380288185 --17.32672073267595 1:0.31374599099683476 2:-0.36270498808879115 3:0.7456203273799138 4:0.046239858938568856 5:-0.030136501929084014 6:-0.06596637210739509 7:-0.46829487815816484 8:-0.2054839116368734 9:-0.7006480295111763 10:-0.6886047709544985 -7.314490512652487 1:0.8745354279105222 2:-0.9270067504840309 3:0.965218170323435 4:0.12808957052353698 5:-0.5309399625085234 6:-0.5968520990090951 7:-0.667403236513185 8:0.08154410986660832 9:0.33025488397543934 10:0.03406708067839537 -4.687373993408297 1:0.6731426721418288 2:-0.7111023070261273 3:-0.9849054116048603 4:-0.12831346258317322 5:-0.04095946352836921 6:0.6967001556166801 7:0.8479895229743999 8:-0.35600791972899404 9:0.5005979045264868 10:0.6421341979636503 --6.82923852156868 1:-0.04849233571020073 2:-0.8505855619911602 3:0.2927180954190314 4:0.5780268040086791 5:-0.22322207765417268 6:-0.8436513934568071 7:-0.3906240514635124 8:0.7258714963093444 9:-0.21695043530813085 10:0.8049335285918169 --8.24622879369294 1:0.12154833675098842 2:-0.26446415445316673 3:-0.06653791221669247 4:-0.7920694887292259 5:0.6128791496627621 6:-0.6927179137980173 7:-0.24584418172709932 8:0.3557416365779935 9:0.22868636757755234 10:-0.8288196322549064 --5.090863544403131 1:-0.1535668648046895 2:-0.59868738365189 3:-0.8822518703008675 4:-0.1790505106198006 5:0.9467581256591948 6:-0.0661313762905984 7:0.31263046332923694 8:-0.03628894224569357 9:0.8969599435828515 10:-0.05386674051170348 --15.780685032623301 1:-0.2568492063716883 2:0.7740976197426315 3:-0.7829158104387535 4:0.8578846037465748 5:-0.6111039318672586 6:-0.26939268282639306 7:0.3659136640533909 8:-0.8205938562638555 9:-0.24945505706767923 10:-0.935948184861368 --3.9916779937384743 1:0.22925954469403154 2:0.18159238246979537 3:0.05607027262862396 4:-0.3376037702047998 
5:-0.10630000583678934 6:-0.7148277241201622 7:-0.08327294541727137 8:0.6532439360618307 9:0.34352364313237294 10:-0.21028242388807916 -8.798748248458631 1:0.509058184822212 2:-0.17532831457577935 3:-0.6387880909085213 4:-0.966194650702529 5:0.7829797328120436 6:0.5470735549914605 7:-0.38312745239682333 8:-0.8890923931840893 9:0.6823342859396513 10:0.9231260597729121 -14.341273640964873 1:0.6996156678090684 2:0.8612833977834464 3:0.9131301694042417 4:0.5199385192744859 5:-0.32605907950755086 6:-0.9816465962348846 7:-0.5939885763232406 8:-0.7730924566676425 9:0.6367821449954114 10:0.10873812383881054 -9.75855501262469 1:0.2933324921347933 2:-0.4652534314332506 3:-0.2940640558090537 4:0.9883453215038367 5:-0.042460731786114314 6:-0.15438550895912062 7:-0.11182397625560592 8:0.7425954283250873 9:0.5063859049644963 10:0.3012211854180429 -7.695200921242407 1:0.3554353390157281 2:0.08707592690448718 3:-0.10815435665633877 4:0.05524046679762784 5:0.8000157491787581 6:0.3756193347272323 7:-0.18659830666742527 8:-0.08168623764933125 9:-0.2551379303720174 10:0.8560030587463281 -26.903524792043335 1:-0.4672678144441864 2:0.868381965588082 3:-0.04748335609643428 4:-0.0908285508827269 5:-0.22436865911994275 6:-0.953965287326564 7:0.40644848732968164 8:-0.33391575325981115 9:0.008337907338700212 10:-0.45597904754961416 -9.87318781117539 1:0.7310287890171705 2:-0.38300115058116324 3:0.5492682498036086 4:0.552016070316655 5:0.3715022458396897 6:-0.3888040017277252 7:0.21348231125683648 8:0.23219558685722874 9:-0.6271161253492459 10:-0.009137052604519136 -7.6930514050666625 1:0.48603550488592284 2:-0.9218820771919889 3:0.17678612698428053 4:0.5110501870908806 5:0.5817010201164554 6:0.4488707800038747 7:0.4977618637956498 8:0.1683214570038094 9:0.17237242672259323 10:-0.5276084644007359 -3.155413914311745 1:0.04582517188512947 2:-0.9060800653779759 3:0.049786270132956556 4:-0.4236784487542993 5:0.6293910028372613 6:-0.7370237043436467 7:-0.4599678991281728 8:0.5317111095323057 9:0.0029525239228334055 10:0.9294876800738165 --10.18815737519111 1:-0.9023553189306839 2:0.08434165073970856 3:0.7163931103395633 4:0.41749986495957914 5:-0.8190972970472759 6:-0.9996126872234177 7:0.1779075727741255 8:0.18212754689351862 9:0.24628508239298963 10:0.667589863190412 -18.585731475373457 1:-0.8399129036462931 2:-0.10024819268489127 3:-0.5011350892733817 4:-0.7299256348863585 5:-0.9412022985072928 6:-0.245064895931544 7:-0.1032512650854267 8:0.9943863256441088 9:-0.6429371028855466 10:0.062299742931960056 -8.998359297106072 1:-0.16850226855111905 2:0.7262839202089402 3:-0.04876255055071854 4:0.8948164957242868 5:-0.10720585418953132 6:0.2622719447841948 7:0.26433837506661373 8:-0.5143449147399106 9:0.17444585508955002 10:-0.813182163328944 -13.032424230011074 1:0.4014766166181287 2:-0.1710502754125871 3:-0.309850483152607 4:0.255642456909988 5:0.9949117714165621 6:0.12553772251510864 7:0.6412602805648968 8:-0.6225679446416825 9:-0.15867011477056936 10:-0.4970695349036196 --6.931030745953174 1:0.5151452174260762 2:0.10077064818539072 3:0.9515221270405545 4:-0.21551878535257907 5:0.29152528087481366 6:-0.10995497026133605 7:-0.7872786530801681 8:0.9909149980139627 9:-0.6044617953251021 10:0.4135285912642448 -15.538062451207367 1:-0.493569696351595 2:0.7280914440594639 3:-0.5399160539735497 4:0.5688018985826291 5:0.8296550361854862 6:-0.3519274619833537 7:-0.5536583684230114 8:-0.9648774930921231 9:-0.2649670832738824 10:-0.2337289004188019 -9.499729032920945 1:0.22017490770298553 2:0.7693082799289328 3:-0.7645745307823122 
4:-0.4243400515554365 5:-0.7065281515163817 6:-0.9488470141298047 7:-0.7888781431404843 8:-0.38027758953310964 9:0.11329243985448345 10:-0.5636550498916204 --0.6039115764951412 1:0.3128791250125589 2:0.4690308315665288 3:-0.9819748103687955 4:0.28931283693913223 5:-0.6283983933456656 6:-0.10795935596621975 7:0.7785831799196448 8:0.4453768248295542 9:0.4055410615499917 10:-0.581108383985806 -9.682301463907875 1:0.5039970331368235 2:-0.008965105921562966 3:-0.5415225380115054 4:0.4677111860370293 5:-0.3854089758945243 6:-0.8468317339287676 7:-0.29258253017713587 8:0.7361173598968789 9:0.5722561668394952 10:0.8524030171340933 --2.8752191903928064 1:-0.45407356732165205 2:0.6563221064539377 3:-0.8938366926767671 4:0.6028173420234533 5:0.6792881349943096 6:-0.6295604812779405 7:-0.21641416912497213 8:-0.8703620515028858 9:-0.3397362922228042 10:-0.0966947467107604 --28.046018037776633 1:0.9493308195854675 2:0.3285214661535252 3:0.749300278016316 4:-0.006681618268088219 5:0.2936055273341429 6:0.0044706790416966236 7:0.5006172205470896 8:0.38751814960349473 9:0.6069735922707928 10:-0.794612882855285 -2.8752582614589373 1:-0.9443232811926943 2:0.3153126492983107 3:0.6423843271417344 4:-0.09528333043829118 5:-0.2318773828230698 6:0.32597909562645766 7:0.42808555740416065 8:0.2895959316734451 9:-0.5106491076955746 10:-0.2917418155655722 --2.203945173593806 1:-0.13844025039418084 2:-0.024638102806725293 3:0.6114514176076162 4:-0.6939316676972749 5:-0.036549673716341324 6:0.0942395290460385 7:0.7943411369475493 8:0.7025693796408046 9:-0.21822635487138853 10:-0.6045250179827362 --5.070655299509993 1:-0.8035156105848074 2:-0.5344928236067734 3:0.6105404604447127 4:-0.7538635525543969 5:0.9836765037886612 6:-0.5700253195942724 7:0.9232380985458313 8:-0.26374377078100464 9:0.9079431466301682 10:0.8404281771949533 --2.540181413836895 1:0.220453181647285 2:-0.03105792440486077 3:-0.17131282366411926 4:-0.41800060634660485 5:-0.1477564564540963 6:0.055537469469941536 7:-0.8092076926316594 8:-0.29815112444525727 9:-0.20030580647762464 10:0.337865838755971 -19.341342586351033 1:-0.32052868280788616 2:0.954507993011956 3:0.38642226954792824 4:0.9240442034609888 5:-0.46077559741256824 6:-0.7991393493824104 7:0.9396232321156679 8:-0.2486930151964184 9:-0.6256485833035617 10:0.14861843824730103 -0.31398559122529757 1:-0.4684215762946897 2:0.07873308388585198 3:-0.3589594894052015 4:0.14284662079329458 5:-0.8936272055527841 6:0.5647217242826741 7:0.49613233215723507 8:-0.501698787526992 9:-0.46710107378968724 10:0.898517179577361 -12.243117462926584 1:-0.8147610562690222 2:0.21104006948075482 3:0.42405323019132957 4:-0.667965573810795 5:-0.267026607469405 6:0.7949752815579358 7:-0.07664414977654532 8:-0.6023087644686556 9:-0.659375887511856 10:0.459735946423397 --4.623091296763939 1:0.08405646515942733 2:-0.40009448092691446 3:-0.39831245310544094 4:0.8794137836499942 5:-0.04788565812369017 6:-0.06763019434549333 7:0.41324877265674065 8:0.39746868847324146 9:-0.986729367280818 10:0.7001677710291752 --5.782162271139417 1:0.29127970805530157 2:0.6712715787317827 3:0.27575757044478477 4:0.31525054647682804 5:0.6905016168465983 6:-0.5195319089267731 7:-0.06598129860341295 8:-0.5142554034519407 9:-0.11546331150946942 10:-0.2934524891698944 --9.892155927826222 1:-0.7048583334456604 2:-0.46598491327111247 3:-0.02034722477413209 4:-0.663294196316965 5:0.4485329128582778 6:0.4553619594861118 7:0.7154814909138205 8:0.7532937661147989 9:0.020693077287389894 10:-0.23131986644633207 -0.5422764698408844 1:-0.1513298744027669 
2:-0.4591544337339648 3:-0.7192219559850723 4:0.21236658135317632 5:0.12050445497328166 6:-0.42411528242712127 7:-0.15103925528861595 8:0.9032115729799512 9:-0.9228817525021624 10:0.2604090001033641 -4.187800872274017 1:0.3084355607627949 2:0.7029638272178733 3:0.34098344122299573 4:-0.5818421369891376 5:0.1332356708082485 6:0.22671316744441716 7:-0.6750469752494854 8:-0.4065302428716193 9:-0.48213803977370073 10:0.17918596677210186 -4.487701812297124 1:0.8352061350259052 2:0.2757393215770836 3:0.9310504392364667 4:0.519503546762708 5:0.5270245209143005 6:-0.08181154800488488 7:0.5148324302455536 8:-0.6680946101511949 9:0.7574060703813035 10:-0.4721334895419935 --5.150140984417682 1:0.8113709439821006 2:0.21939305063309278 3:0.02109986546311826 4:0.07450107676582762 5:0.723883853128624 6:0.5392035186380486 7:-0.1382740221237464 8:0.9990201540159807 9:0.10429329766137108 10:-0.1365266408862309 --6.544633229269576 1:-0.08278037549320039 2:0.6982730989138761 3:0.9090685953368327 4:0.6754092061339365 5:0.5889199822482736 6:0.020678619551471433 7:0.47605785660672084 8:-0.49926771127869873 9:-0.28380077002944093 10:0.5282319276258469 -7.216836352055753 1:-0.8510680074642156 2:0.42611818262128476 3:0.593607821624947 4:0.5635067468583634 5:0.2121930523769171 6:0.2708063180622071 7:-0.31491113345871735 8:0.005990053407278095 9:0.8985259402559085 10:-0.44549339042232794 -20.874246167942125 1:-0.53010692413621 2:-0.9897084749945524 3:-0.9083978261828305 4:-0.15581655583739495 5:0.9974035542095165 6:0.9894717992956665 7:-0.7287287537245402 8:0.06425127137526943 9:-0.06684164745938337 10:-0.3600621883071937 --6.556192430758147 1:-0.7655958349167471 2:-0.08083170734199419 3:-0.8540636958251198 4:-0.09994429443696973 5:0.1734809016500265 6:-0.29563180244063325 7:0.2158497607364409 8:-0.6071644305523003 9:0.8063426715403785 10:0.47092299197899345 -7.252748885335252 1:-0.36403312429467216 2:0.1237451136826817 3:-0.5756427605741237 4:0.7612833636750866 5:0.9350628314096134 6:-0.012087843264624754 7:-0.03742573515965031 8:-0.05210460803183037 9:-0.5333214800203341 10:-0.013320030179712505 --9.2679651250406 1:-0.5057250557539077 2:-0.41655319851679495 3:0.1897431234740683 4:-0.038318717640150046 5:0.9136495575471062 6:-0.8890525036858237 7:0.40859501498633377 8:-0.8746985847539293 9:-0.005836984002720369 10:0.7838036026237987 --15.732088272239245 1:-0.8546867577633044 2:-0.3003980324850013 3:0.49649883896876834 4:0.710496747220617 5:0.5848510480601048 6:0.5714826756665468 7:0.5487975165953451 8:0.5654333402837335 9:0.863539315599626 10:-0.9699410102494574 --0.20412431312519014 1:0.13323548063028934 2:-0.3030177580658542 3:-0.6358920925969869 4:0.3729380701923921 5:-0.8131818118430312 6:0.11567152703716288 7:-0.3645508535812394 8:-0.5487213252460876 9:0.5605886387366952 10:-0.8400308993051686 -10.445759684895373 1:-0.92707280355555 2:-0.9307772570299944 3:-0.11971873660640964 4:0.5140245291069254 5:0.5751145648836897 6:-0.43850910073502347 7:-0.7872208869913662 8:-0.3087975452145404 9:-0.4645849758749403 10:-0.1563641826381328 -3.349708377102383 1:-0.6334394121009499 2:-0.9008086683014112 3:-0.2678892493467009 4:0.7626514243443427 5:0.6406493676995701 6:0.3669245573649391 7:-0.052050629941784665 8:0.6713394117904852 9:-0.11458974566378233 10:-0.25949626043219576 --23.487440120936512 1:-0.5195354431261132 2:0.8080357948412571 3:0.8498613208566037 4:0.044766977500795946 5:-0.9031972948753286 6:0.284006053218262 7:0.9640004956647206 8:-0.04090127960289358 9:0.44190479952918427 10:-0.7359820144913463 
[LIBSVM-format sample regression data continues: one record per line, each consisting of a real-valued label followed by ten index:value feature pairs, e.g.
-11.827072996392571 1:0.9409739656166973 2:0.17053032210347996 3:-0.5735271206214345 4:0.2713064952443933 5:-0.11725988807909005 6:0.34413389399753047 7:-0.2987734110474076 8:-0.5436538528015331 9:-0.06578668798680076 10:0.7901644743575837]
10:-0.5033643125229932 --5.615143641864686 1:-0.6688289820084299 2:-0.4623159855015393 3:0.012827807007503855 4:-0.44521264878006117 5:-0.5563111031201406 6:-0.6065295981983794 7:0.3806712426786838 8:-0.11317152118817408 9:0.507896127467435 10:-0.8487801189674464 --0.1829397047693725 1:0.09377558075225512 2:0.5774384503027374 3:-0.7104684187448009 4:-0.07285914169135976 5:-0.8797920488335114 6:0.6099615504974201 7:-0.8047440624324915 8:-0.6877856114263066 9:0.5843004021777447 10:0.5190581455348131 -18.479680552020344 1:0.9635517137863321 2:0.9954507816218203 3:0.11959899129360774 4:0.3753283274192787 5:-0.9386713095183621 6:0.0926833703812433 7:0.48003949462701323 8:0.9432769781973132 9:-0.9637036991931129 10:-0.4064407447273508 -1.3850645873427236 1:0.14476184437006356 2:-0.11280617018445871 3:-0.4385084538142101 4:-0.5961619435136434 5:0.419554626795412 6:-0.5047767472761191 7:0.457180284958592 8:-0.9129360314541999 9:-0.6320022059786656 10:-0.44989608519659363 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_movielens_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_movielens_data.txt deleted file mode 100644 index f0eee19..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_movielens_data.txt +++ /dev/null @@ -1,1501 +0,0 @@ -0::2::3 -0::3::1 -0::5::2 -0::9::4 -0::11::1 -0::12::2 -0::15::1 -0::17::1 -0::19::1 -0::21::1 -0::23::1 -0::26::3 -0::27::1 -0::28::1 -0::29::1 -0::30::1 -0::31::1 -0::34::1 -0::37::1 -0::41::2 -0::44::1 -0::45::2 -0::46::1 -0::47::1 -0::48::1 -0::50::1 -0::51::1 -0::54::1 -0::55::1 -0::59::2 -0::61::2 -0::64::1 -0::67::1 -0::68::1 -0::69::1 -0::71::1 -0::72::1 -0::77::2 -0::79::1 -0::83::1 -0::87::1 -0::89::2 -0::91::3 -0::92::4 -0::94::1 -0::95::2 -0::96::1 -0::98::1 -0::99::1 -1::2::2 -1::3::1 -1::4::2 -1::6::1 -1::9::3 -1::12::1 -1::13::1 -1::14::1 -1::16::1 -1::19::1 -1::21::3 -1::27::1 -1::28::3 -1::33::1 -1::36::2 -1::37::1 -1::40::1 -1::41::2 -1::43::1 -1::44::1 -1::47::1 -1::50::1 -1::54::1 -1::56::2 -1::57::1 -1::58::1 -1::60::1 -1::62::4 -1::63::1 -1::67::1 -1::68::4 -1::70::2 -1::72::1 -1::73::1 -1::74::2 -1::76::1 -1::77::3 -1::78::1 -1::81::1 -1::82::1 -1::85::3 -1::86::2 -1::88::2 -1::91::1 -1::92::2 -1::93::1 -1::94::2 -1::96::1 -1::97::1 -2::4::3 -2::6::1 -2::8::5 -2::9::1 -2::10::1 -2::12::3 -2::13::1 -2::15::2 -2::18::2 -2::19::4 -2::22::1 -2::26::1 -2::28::1 -2::34::4 -2::35::1 -2::37::5 -2::38::1 -2::39::5 -2::40::4 -2::47::1 -2::50::1 -2::52::2 -2::54::1 -2::55::1 -2::57::2 -2::58::2 -2::59::1 -2::61::1 -2::62::1 -2::64::1 -2::65::1 -2::66::3 -2::68::1 -2::71::3 -2::76::1 -2::77::1 -2::78::1 -2::80::1 -2::83::5 -2::85::1 -2::87::2 -2::88::1 -2::89::4 -2::90::1 -2::92::4 -2::93::5 -3::0::1 -3::1::1 -3::2::1 -3::7::3 -3::8::3 -3::9::1 -3::14::1 -3::15::1 -3::16::1 -3::18::4 -3::19::1 -3::24::3 -3::26::1 -3::29::3 -3::33::1 -3::34::3 -3::35::1 -3::36::3 -3::37::1 -3::38::2 -3::43::1 -3::44::1 -3::46::1 -3::47::1 -3::51::5 -3::52::3 -3::56::1 -3::58::1 -3::60::3 -3::62::1 -3::65::2 -3::66::1 -3::67::1 -3::68::2 -3::70::1 -3::72::2 -3::76::3 -3::79::3 -3::80::4 -3::81::1 -3::83::1 -3::84::1 -3::86::1 -3::87::2 -3::88::4 -3::89::1 -3::91::1 -3::94::3 -4::1::1 -4::6::1 -4::8::1 -4::9::1 -4::10::1 -4::11::1 -4::12::1 -4::13::1 -4::14::2 -4::15::1 -4::17::1 -4::20::1 -4::22::1 -4::23::1 -4::24::1 -4::29::4 -4::30::1 -4::31::1 -4::34::1 -4::35::1 -4::36::1 -4::39::2 -4::40::3 -4::41::4 -4::43::2 -4::44::1 -4::45::1 -4::46::1 -4::47::1 -4::49::2 -4::50::1 -4::51::1 -4::52::4 -4::54::1 -4::55::1 -4::60::3 
-4::61::1 -4::62::4 -4::63::3 -4::65::1 -4::67::2 -4::69::1 -4::70::4 -4::71::1 -4::73::1 -4::78::1 -4::84::1 -4::85::1 -4::87::3 -4::88::3 -4::89::2 -4::96::1 -4::97::1 -4::98::1 -4::99::1 -5::0::1 -5::1::1 -5::4::1 -5::5::1 -5::8::1 -5::9::3 -5::10::2 -5::13::3 -5::15::1 -5::19::1 -5::20::3 -5::21::2 -5::23::3 -5::27::1 -5::28::1 -5::29::1 -5::31::1 -5::36::3 -5::38::2 -5::39::1 -5::42::1 -5::48::3 -5::49::4 -5::50::3 -5::51::1 -5::52::1 -5::54::1 -5::55::5 -5::56::3 -5::58::1 -5::60::1 -5::61::1 -5::64::3 -5::65::2 -5::68::4 -5::70::1 -5::71::1 -5::72::1 -5::74::1 -5::79::1 -5::81::2 -5::84::1 -5::85::1 -5::86::1 -5::88::1 -5::90::4 -5::91::2 -5::95::2 -5::99::1 -6::0::1 -6::1::1 -6::2::3 -6::5::1 -6::6::1 -6::9::1 -6::10::1 -6::15::2 -6::16::2 -6::17::1 -6::18::1 -6::20::1 -6::21::1 -6::22::1 -6::24::1 -6::25::5 -6::26::1 -6::28::1 -6::30::1 -6::33::1 -6::38::1 -6::39::1 -6::43::4 -6::44::1 -6::45::1 -6::48::1 -6::49::1 -6::50::1 -6::53::1 -6::54::1 -6::55::1 -6::56::1 -6::58::4 -6::59::1 -6::60::1 -6::61::3 -6::63::3 -6::66::1 -6::67::3 -6::68::1 -6::69::1 -6::71::2 -6::73::1 -6::75::1 -6::77::1 -6::79::1 -6::81::1 -6::84::1 -6::85::3 -6::86::1 -6::87::1 -6::88::1 -6::89::1 -6::91::2 -6::94::1 -6::95::2 -6::96::1 -7::1::1 -7::2::2 -7::3::1 -7::4::1 -7::7::1 -7::10::1 -7::11::2 -7::14::2 -7::15::1 -7::16::1 -7::18::1 -7::21::1 -7::22::1 -7::23::1 -7::25::5 -7::26::1 -7::29::4 -7::30::1 -7::31::3 -7::32::1 -7::33::1 -7::35::1 -7::37::2 -7::39::3 -7::40::2 -7::42::2 -7::44::1 -7::45::2 -7::47::4 -7::48::1 -7::49::1 -7::53::1 -7::54::1 -7::55::1 -7::56::1 -7::59::1 -7::61::2 -7::62::3 -7::63::2 -7::66::1 -7::67::3 -7::74::1 -7::75::1 -7::76::3 -7::77::1 -7::81::1 -7::82::1 -7::84::2 -7::85::4 -7::86::1 -7::92::2 -7::96::1 -7::97::1 -7::98::1 -8::0::1 -8::2::4 -8::3::2 -8::4::2 -8::5::1 -8::7::1 -8::9::1 -8::11::1 -8::15::1 -8::18::1 -8::19::1 -8::21::1 -8::29::5 -8::31::3 -8::33::1 -8::35::1 -8::36::1 -8::40::2 -8::44::1 -8::45::1 -8::50::1 -8::51::1 -8::52::5 -8::53::5 -8::54::1 -8::55::1 -8::56::1 -8::58::4 -8::60::3 -8::62::4 -8::64::1 -8::67::3 -8::69::1 -8::71::1 -8::72::3 -8::77::3 -8::78::1 -8::79::1 -8::83::1 -8::85::5 -8::86::1 -8::88::1 -8::90::1 -8::92::2 -8::95::4 -8::96::3 -8::97::1 -8::98::1 -8::99::1 -9::2::3 -9::3::1 -9::4::1 -9::5::1 -9::6::1 -9::7::5 -9::9::1 -9::12::1 -9::14::3 -9::15::1 -9::19::1 -9::21::1 -9::22::1 -9::24::1 -9::25::1 -9::26::1 -9::30::3 -9::32::4 -9::35::2 -9::36::2 -9::37::2 -9::38::1 -9::39::1 -9::43::3 -9::49::5 -9::50::3 -9::53::1 -9::54::1 -9::58::1 -9::59::1 -9::60::1 -9::61::1 -9::63::3 -9::64::3 -9::68::1 -9::69::1 -9::70::3 -9::71::1 -9::73::2 -9::75::1 -9::77::2 -9::81::2 -9::82::1 -9::83::1 -9::84::1 -9::86::1 -9::87::4 -9::88::1 -9::90::3 -9::94::2 -9::95::3 -9::97::2 -9::98::1 -10::0::3 -10::2::4 -10::4::3 -10::7::1 -10::8::1 -10::10::1 -10::13::2 -10::14::1 -10::16::2 -10::17::1 -10::18::1 -10::21::1 -10::22::1 -10::24::1 -10::25::3 -10::28::1 -10::35::1 -10::36::1 -10::37::1 -10::38::1 -10::39::1 -10::40::4 -10::41::2 -10::42::3 -10::43::1 -10::49::3 -10::50::1 -10::51::1 -10::52::1 -10::55::2 -10::56::1 -10::58::1 -10::63::1 -10::66::1 -10::67::2 -10::68::1 -10::75::1 -10::77::1 -10::79::1 -10::86::1 -10::89::3 -10::90::1 -10::97::1 -10::98::1 -11::0::1 -11::6::2 -11::9::1 -11::10::1 -11::11::1 -11::12::1 -11::13::4 -11::16::1 -11::18::5 -11::19::4 -11::20::1 -11::21::1 -11::22::1 -11::23::5 -11::25::1 -11::27::5 -11::30::5 -11::32::5 -11::35::3 -11::36::2 -11::37::2 -11::38::4 -11::39::1 -11::40::1 -11::41::1 -11::43::2 -11::45::1 
-11::47::1 -11::48::5 -11::50::4 -11::51::3 -11::59::1 -11::61::1 -11::62::1 -11::64::1 -11::66::4 -11::67::1 -11::69::5 -11::70::1 -11::71::3 -11::72::3 -11::75::3 -11::76::1 -11::77::1 -11::78::1 -11::79::5 -11::80::3 -11::81::4 -11::82::1 -11::86::1 -11::88::1 -11::89::1 -11::90::4 -11::94::2 -11::97::3 -11::99::1 -12::2::1 -12::4::1 -12::6::1 -12::7::3 -12::8::1 -12::14::1 -12::15::2 -12::16::4 -12::17::5 -12::18::2 -12::21::1 -12::22::2 -12::23::3 -12::24::1 -12::25::1 -12::27::5 -12::30::2 -12::31::4 -12::35::5 -12::38::1 -12::41::1 -12::44::2 -12::45::1 -12::50::4 -12::51::1 -12::52::1 -12::53::1 -12::54::1 -12::56::2 -12::57::1 -12::60::1 -12::63::1 -12::64::5 -12::66::3 -12::67::1 -12::70::1 -12::72::1 -12::74::1 -12::75::1 -12::77::1 -12::78::1 -12::79::3 -12::82::2 -12::83::1 -12::84::1 -12::85::1 -12::86::1 -12::87::1 -12::88::1 -12::91::3 -12::92::1 -12::94::4 -12::95::2 -12::96::1 -12::98::2 -13::0::1 -13::3::1 -13::4::2 -13::5::1 -13::6::1 -13::12::1 -13::14::2 -13::15::1 -13::17::1 -13::18::3 -13::20::1 -13::21::1 -13::22::1 -13::26::1 -13::27::1 -13::29::3 -13::31::1 -13::33::1 -13::40::2 -13::43::2 -13::44::1 -13::45::1 -13::49::1 -13::51::1 -13::52::2 -13::53::3 -13::54::1 -13::62::1 -13::63::2 -13::64::1 -13::68::1 -13::71::1 -13::72::3 -13::73::1 -13::74::3 -13::77::2 -13::78::1 -13::79::2 -13::83::3 -13::85::1 -13::86::1 -13::87::2 -13::88::2 -13::90::1 -13::93::4 -13::94::1 -13::98::1 -13::99::1 -14::1::1 -14::3::3 -14::4::1 -14::5::1 -14::6::1 -14::7::1 -14::9::1 -14::10::1 -14::11::1 -14::12::1 -14::13::1 -14::14::3 -14::15::1 -14::16::1 -14::17::1 -14::20::1 -14::21::1 -14::24::1 -14::25::2 -14::27::1 -14::28::1 -14::29::5 -14::31::3 -14::34::1 -14::36::1 -14::37::2 -14::39::2 -14::40::1 -14::44::1 -14::45::1 -14::47::3 -14::48::1 -14::49::1 -14::51::1 -14::52::5 -14::53::3 -14::54::1 -14::55::1 -14::56::1 -14::62::4 -14::63::5 -14::67::3 -14::68::1 -14::69::3 -14::71::1 -14::72::4 -14::73::1 -14::76::5 -14::79::1 -14::82::1 -14::83::1 -14::88::1 -14::93::3 -14::94::1 -14::95::2 -14::96::4 -14::98::1 -15::0::1 -15::1::4 -15::2::1 -15::5::2 -15::6::1 -15::7::1 -15::13::1 -15::14::1 -15::15::1 -15::17::2 -15::19::2 -15::22::2 -15::23::2 -15::25::1 -15::26::3 -15::27::1 -15::28::2 -15::29::1 -15::32::1 -15::33::2 -15::34::1 -15::35::2 -15::36::1 -15::37::1 -15::39::1 -15::42::1 -15::46::5 -15::48::2 -15::50::2 -15::51::1 -15::52::1 -15::58::1 -15::62::1 -15::64::3 -15::65::2 -15::72::1 -15::73::1 -15::74::1 -15::79::1 -15::80::1 -15::81::1 -15::82::2 -15::85::1 -15::87::1 -15::91::2 -15::96::1 -15::97::1 -15::98::3 -16::2::1 -16::5::3 -16::6::2 -16::7::1 -16::9::1 -16::12::1 -16::14::1 -16::15::1 -16::19::1 -16::21::2 -16::29::4 -16::30::2 -16::32::1 -16::34::1 -16::36::1 -16::38::1 -16::46::1 -16::47::3 -16::48::1 -16::49::1 -16::50::1 -16::51::5 -16::54::5 -16::55::1 -16::56::2 -16::57::1 -16::60::1 -16::63::2 -16::65::1 -16::67::1 -16::72::1 -16::74::1 -16::80::1 -16::81::1 -16::82::1 -16::85::5 -16::86::1 -16::90::5 -16::91::1 -16::93::1 -16::94::3 -16::95::2 -16::96::3 -16::98::3 -16::99::1 -17::2::1 -17::3::1 -17::6::1 -17::10::4 -17::11::1 -17::13::2 -17::17::5 -17::19::1 -17::20::5 -17::22::4 -17::28::1 -17::29::1 -17::33::1 -17::34::1 -17::35::2 -17::37::1 -17::38::1 -17::45::1 -17::46::5 -17::47::1 -17::49::3 -17::51::1 -17::55::5 -17::56::3 -17::57::1 -17::58::1 -17::59::1 -17::60::1 -17::63::1 -17::66::1 -17::68::4 -17::69::1 -17::70::1 -17::72::1 -17::73::3 -17::78::1 -17::79::1 -17::82::2 -17::84::1 -17::90::5 -17::91::3 -17::92::1 -17::93::1 -17::94::4 
-17::95::2 -17::97::1 -18::1::1 -18::4::3 -18::5::2 -18::6::1 -18::7::1 -18::10::1 -18::11::4 -18::12::2 -18::13::1 -18::15::1 -18::18::1 -18::20::1 -18::21::2 -18::22::1 -18::23::2 -18::25::1 -18::26::1 -18::27::1 -18::28::5 -18::29::1 -18::31::1 -18::32::1 -18::36::1 -18::38::5 -18::39::5 -18::40::1 -18::42::1 -18::43::1 -18::44::4 -18::46::1 -18::47::1 -18::48::1 -18::51::2 -18::55::1 -18::56::1 -18::57::1 -18::62::1 -18::63::1 -18::66::3 -18::67::1 -18::70::1 -18::75::1 -18::76::3 -18::77::1 -18::80::3 -18::81::3 -18::82::1 -18::83::5 -18::84::1 -18::97::1 -18::98::1 -18::99::2 -19::0::1 -19::1::1 -19::2::1 -19::4::1 -19::6::2 -19::11::1 -19::12::1 -19::14::1 -19::23::1 -19::26::1 -19::31::1 -19::32::4 -19::33::1 -19::34::1 -19::37::1 -19::38::1 -19::41::1 -19::43::1 -19::45::1 -19::48::1 -19::49::1 -19::50::2 -19::53::2 -19::54::3 -19::55::1 -19::56::2 -19::58::1 -19::61::1 -19::62::1 -19::63::1 -19::64::1 -19::65::1 -19::69::2 -19::72::1 -19::74::3 -19::76::1 -19::78::1 -19::79::1 -19::81::1 -19::82::1 -19::84::1 -19::86::1 -19::87::2 -19::90::4 -19::93::1 -19::94::4 -19::95::2 -19::96::1 -19::98::4 -20::0::1 -20::1::1 -20::2::2 -20::4::2 -20::6::1 -20::8::1 -20::12::1 -20::21::2 -20::22::5 -20::24::2 -20::25::1 -20::26::1 -20::29::2 -20::30::2 -20::32::2 -20::39::1 -20::40::1 -20::41::2 -20::45::2 -20::48::1 -20::50::1 -20::51::3 -20::53::3 -20::55::1 -20::57::2 -20::60::1 -20::61::1 -20::64::1 -20::66::1 -20::70::2 -20::72::1 -20::73::2 -20::75::4 -20::76::1 -20::77::4 -20::78::1 -20::79::1 -20::84::2 -20::85::2 -20::88::3 -20::89::1 -20::90::3 -20::91::1 -20::92::2 -20::93::1 -20::94::4 -20::97::1 -21::0::1 -21::2::4 -21::3::1 -21::7::2 -21::11::1 -21::12::1 -21::13::1 -21::14::3 -21::17::1 -21::19::1 -21::20::1 -21::21::1 -21::22::1 -21::23::1 -21::24::1 -21::27::1 -21::29::5 -21::30::2 -21::38::1 -21::40::2 -21::43::3 -21::44::1 -21::45::1 -21::46::1 -21::48::1 -21::51::1 -21::53::5 -21::54::1 -21::55::1 -21::56::1 -21::58::3 -21::59::3 -21::64::1 -21::66::1 -21::68::1 -21::71::1 -21::73::1 -21::74::4 -21::80::1 -21::81::1 -21::83::1 -21::84::1 -21::85::3 -21::87::4 -21::89::2 -21::92::2 -21::96::3 -21::99::1 -22::0::1 -22::3::2 -22::5::2 -22::6::2 -22::9::1 -22::10::1 -22::11::1 -22::13::1 -22::14::1 -22::16::1 -22::18::3 -22::19::1 -22::22::5 -22::25::1 -22::26::1 -22::29::3 -22::30::5 -22::32::4 -22::33::1 -22::35::1 -22::36::3 -22::37::1 -22::40::1 -22::41::3 -22::44::1 -22::45::2 -22::48::1 -22::51::5 -22::55::1 -22::56::2 -22::60::3 -22::61::1 -22::62::4 -22::63::1 -22::65::1 -22::66::1 -22::68::4 -22::69::4 -22::70::3 -22::71::1 -22::74::5 -22::75::5 -22::78::1 -22::80::3 -22::81::1 -22::82::1 -22::84::1 -22::86::1 -22::87::3 -22::88::5 -22::90::2 -22::92::3 -22::95::2 -22::96::2 -22::98::4 -22::99::1 -23::0::1 -23::2::1 -23::4::1 -23::6::2 -23::10::4 -23::12::1 -23::13::4 -23::14::1 -23::15::1 -23::18::4 -23::22::2 -23::23::4 -23::24::1 -23::25::1 -23::26::1 -23::27::5 -23::28::1 -23::29::1 -23::30::4 -23::32::5 -23::33::2 -23::36::3 -23::37::1 -23::38::1 -23::39::1 -23::43::1 -23::48::5 -23::49::5 -23::50::4 -23::53::1 -23::55::5 -23::57::1 -23::59::1 -23::60::1 -23::61::1 -23::64::4 -23::65::5 -23::66::2 -23::67::1 -23::68::3 -23::69::1 -23::72::1 -23::73::3 -23::77::1 -23::82::2 -23::83::1 -23::84::1 -23::85::1 -23::87::3 -23::88::1 -23::95::2 -23::97::1 -24::4::1 -24::6::3 -24::7::1 -24::10::2 -24::12::1 -24::15::1 -24::19::1 -24::24::1 -24::27::3 -24::30::5 -24::31::1 -24::32::3 -24::33::1 -24::37::1 -24::39::1 -24::40::1 -24::42::1 -24::43::3 -24::45::2 -24::46::1 
-24::47::1 -24::48::1 -24::49::1 -24::50::1 -24::52::5 -24::57::1 -24::59::4 -24::63::4 -24::65::1 -24::66::1 -24::67::1 -24::68::3 -24::69::5 -24::71::1 -24::72::4 -24::77::4 -24::78::1 -24::80::1 -24::82::1 -24::84::1 -24::86::1 -24::87::1 -24::88::2 -24::89::1 -24::90::5 -24::91::1 -24::92::1 -24::94::2 -24::95::1 -24::96::5 -24::98::1 -24::99::1 -25::1::3 -25::2::1 -25::7::1 -25::9::1 -25::12::3 -25::16::3 -25::17::1 -25::18::1 -25::20::1 -25::22::1 -25::23::1 -25::26::2 -25::29::1 -25::30::1 -25::31::2 -25::33::4 -25::34::3 -25::35::2 -25::36::1 -25::37::1 -25::40::1 -25::41::1 -25::43::1 -25::47::4 -25::50::1 -25::51::1 -25::53::1 -25::56::1 -25::58::2 -25::64::2 -25::67::2 -25::68::1 -25::70::1 -25::71::4 -25::73::1 -25::74::1 -25::76::1 -25::79::1 -25::82::1 -25::84::2 -25::85::1 -25::91::3 -25::92::1 -25::94::1 -25::95::1 -25::97::2 -26::0::1 -26::1::1 -26::2::1 -26::3::1 -26::4::4 -26::5::2 -26::6::3 -26::7::5 -26::13::3 -26::14::1 -26::16::1 -26::18::3 -26::20::1 -26::21::3 -26::22::5 -26::23::5 -26::24::5 -26::27::1 -26::31::1 -26::35::1 -26::36::4 -26::40::1 -26::44::1 -26::45::2 -26::47::1 -26::48::1 -26::49::3 -26::50::2 -26::52::1 -26::54::4 -26::55::1 -26::57::3 -26::58::1 -26::61::1 -26::62::2 -26::66::1 -26::68::4 -26::71::1 -26::73::4 -26::76::1 -26::81::3 -26::85::1 -26::86::3 -26::88::5 -26::91::1 -26::94::5 -26::95::1 -26::96::1 -26::97::1 -27::0::1 -27::9::1 -27::10::1 -27::18::4 -27::19::3 -27::20::1 -27::22::2 -27::24::2 -27::25::1 -27::27::3 -27::28::1 -27::29::1 -27::31::1 -27::33::3 -27::40::1 -27::42::1 -27::43::1 -27::44::3 -27::45::1 -27::51::3 -27::52::1 -27::55::3 -27::57::1 -27::59::1 -27::60::1 -27::61::1 -27::64::1 -27::66::3 -27::68::1 -27::70::1 -27::71::2 -27::72::1 -27::75::3 -27::78::1 -27::80::3 -27::82::1 -27::83::3 -27::86::1 -27::87::2 -27::90::1 -27::91::1 -27::92::1 -27::93::1 -27::94::2 -27::95::1 -27::98::1 -28::0::3 -28::1::1 -28::2::4 -28::3::1 -28::6::1 -28::7::1 -28::12::5 -28::13::2 -28::14::1 -28::15::1 -28::17::1 -28::19::3 -28::20::1 -28::23::3 -28::24::3 -28::27::1 -28::29::1 -28::33::1 -28::34::1 -28::36::1 -28::38::2 -28::39::2 -28::44::1 -28::45::1 -28::49::4 -28::50::1 -28::52::1 -28::54::1 -28::56::1 -28::57::3 -28::58::1 -28::59::1 -28::60::1 -28::62::3 -28::63::1 -28::65::1 -28::75::1 -28::78::1 -28::81::5 -28::82::4 -28::83::1 -28::85::1 -28::88::2 -28::89::4 -28::90::1 -28::92::5 -28::94::1 -28::95::2 -28::98::1 -28::99::1 -29::3::1 -29::4::1 -29::5::1 -29::7::2 -29::9::1 -29::10::3 -29::11::1 -29::13::3 -29::14::1 -29::15::1 -29::17::3 -29::19::3 -29::22::3 -29::23::4 -29::25::1 -29::29::1 -29::31::1 -29::32::4 -29::33::2 -29::36::2 -29::38::3 -29::39::1 -29::42::1 -29::46::5 -29::49::3 -29::51::2 -29::59::1 -29::61::1 -29::62::1 -29::67::1 -29::68::3 -29::69::1 -29::70::1 -29::74::1 -29::75::1 -29::79::2 -29::80::1 -29::81::2 -29::83::1 -29::85::1 -29::86::1 -29::90::4 -29::93::1 -29::94::4 -29::97::1 -29::99::1 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_multiclass_classification_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_multiclass_classification_data.txt deleted file mode 100644 index a0d7f90..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_multiclass_classification_data.txt +++ /dev/null @@ -1,150 +0,0 @@ -1 1:-0.222222 2:0.5 3:-0.762712 4:-0.833333 -1 1:-0.555556 2:0.25 3:-0.864407 4:-0.916667 -1 1:-0.722222 2:-0.166667 3:-0.864407 4:-0.833333 -1 1:-0.722222 2:0.166667 3:-0.694915 4:-0.916667 -0 1:0.166667 2:-0.416667 3:0.457627 4:0.5 -1 1:-0.833333 
3:-0.864407 4:-0.916667 -2 1:-1.32455e-07 2:-0.166667 3:0.220339 4:0.0833333 -2 1:-1.32455e-07 2:-0.333333 3:0.0169491 4:-4.03573e-08 -1 1:-0.5 2:0.75 3:-0.830508 4:-1 -0 1:0.611111 3:0.694915 4:0.416667 -0 1:0.222222 2:-0.166667 3:0.423729 4:0.583333 -1 1:-0.722222 2:-0.166667 3:-0.864407 4:-1 -1 1:-0.5 2:0.166667 3:-0.864407 4:-0.916667 -2 1:-0.222222 2:-0.333333 3:0.0508474 4:-4.03573e-08 -2 1:-0.0555556 2:-0.833333 3:0.0169491 4:-0.25 -2 1:-0.166667 2:-0.416667 3:-0.0169491 4:-0.0833333 -1 1:-0.944444 3:-0.898305 4:-0.916667 -2 1:-0.277778 2:-0.583333 3:-0.0169491 4:-0.166667 -0 1:0.111111 2:-0.333333 3:0.38983 4:0.166667 -2 1:-0.222222 2:-0.166667 3:0.0847457 4:-0.0833333 -0 1:0.166667 2:-0.333333 3:0.559322 4:0.666667 -1 1:-0.611111 2:0.0833333 3:-0.864407 4:-0.916667 -2 1:-0.333333 2:-0.583333 3:0.0169491 4:-4.03573e-08 -0 1:0.555555 2:-0.166667 3:0.661017 4:0.666667 -2 1:0.166667 3:0.186441 4:0.166667 -2 1:0.111111 2:-0.75 3:0.152542 4:-4.03573e-08 -2 1:0.166667 2:-0.25 3:0.118644 4:-4.03573e-08 -0 1:-0.0555556 2:-0.833333 3:0.355932 4:0.166667 -0 1:-0.277778 2:-0.333333 3:0.322034 4:0.583333 -2 1:-0.222222 2:-0.5 3:-0.152542 4:-0.25 -2 1:-0.111111 3:0.288136 4:0.416667 -2 1:-0.0555556 2:-0.25 3:0.186441 4:0.166667 -2 1:0.333333 2:-0.166667 3:0.355932 4:0.333333 -1 1:-0.611111 2:0.25 3:-0.898305 4:-0.833333 -0 1:0.166667 2:-0.333333 3:0.559322 4:0.75 -0 1:0.111111 2:-0.25 3:0.559322 4:0.416667 -0 1:0.833333 2:-0.166667 3:0.898305 4:0.666667 -2 1:-0.277778 2:-0.166667 3:0.186441 4:0.166667 -0 1:-0.666667 2:-0.583333 3:0.186441 4:0.333333 -1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 -1 1:-0.166667 2:0.666667 3:-0.932203 4:-0.916667 -0 1:0.0555554 2:-0.333333 3:0.288136 4:0.416667 -1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 -1 1:-0.833333 2:0.166667 3:-0.864407 4:-0.833333 -0 1:0.0555554 2:0.166667 3:0.491525 4:0.833333 -0 1:0.722222 2:-0.333333 3:0.728813 4:0.5 -2 1:-0.166667 2:-0.416667 3:0.0508474 4:-0.25 -2 1:0.5 3:0.254237 4:0.0833333 -0 1:0.111111 2:-0.583333 3:0.355932 4:0.5 -1 1:-0.944444 2:-0.166667 3:-0.898305 4:-0.916667 -2 1:0.277778 2:-0.25 3:0.220339 4:-4.03573e-08 -0 1:0.666667 2:-0.25 3:0.79661 4:0.416667 -0 1:0.111111 2:0.0833333 3:0.694915 4:1 -0 1:0.444444 3:0.59322 4:0.833333 -2 1:-0.0555556 2:0.166667 3:0.186441 4:0.25 -1 1:-0.833333 2:0.333333 3:-1 4:-0.916667 -1 1:-0.555556 2:0.416667 3:-0.830508 4:-0.75 -2 1:-0.333333 2:-0.5 3:0.152542 4:-0.0833333 -1 1:-1 2:-0.166667 3:-0.966102 4:-1 -1 1:-0.333333 2:0.25 3:-0.898305 4:-0.916667 -2 1:0.388889 2:-0.333333 3:0.288136 4:0.0833333 -2 1:0.277778 2:-0.166667 3:0.152542 4:0.0833333 -0 1:0.333333 2:0.0833333 3:0.59322 4:0.666667 -1 1:-0.777778 3:-0.79661 4:-0.916667 -1 1:-0.444444 2:0.416667 3:-0.830508 4:-0.916667 -0 1:0.222222 2:-0.166667 3:0.627119 4:0.75 -1 1:-0.555556 2:0.5 3:-0.79661 4:-0.916667 -1 1:-0.555556 2:0.5 3:-0.694915 4:-0.75 -2 1:-1.32455e-07 2:-0.25 3:0.254237 4:0.0833333 -1 1:-0.5 2:0.25 3:-0.830508 4:-0.916667 -0 1:0.166667 3:0.457627 4:0.833333 -2 1:0.444444 2:-0.0833334 3:0.322034 4:0.166667 -0 1:0.111111 2:0.166667 3:0.559322 4:0.916667 -1 1:-0.611111 2:0.25 3:-0.79661 4:-0.583333 -0 1:0.388889 3:0.661017 4:0.833333 -1 1:-0.722222 2:0.166667 3:-0.79661 4:-0.916667 -1 1:-0.722222 2:-0.0833334 3:-0.79661 4:-0.916667 -1 1:-0.555556 2:0.166667 3:-0.830508 4:-0.916667 -2 1:-0.666667 2:-0.666667 3:-0.220339 4:-0.25 -2 1:-0.611111 2:-0.75 3:-0.220339 4:-0.25 -2 1:0.0555554 2:-0.833333 3:0.186441 4:0.166667 -0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5 -0 1:0.611111 2:0.333333 3:0.728813 4:1 -2 
1:0.0555554 2:-0.25 3:0.118644 4:-4.03573e-08 -1 1:-0.666667 2:-0.166667 3:-0.864407 4:-0.916667 -1 1:-0.833333 2:-0.0833334 3:-0.830508 4:-0.916667 -0 1:0.611111 2:-0.166667 3:0.627119 4:0.25 -0 1:0.888889 2:0.5 3:0.932203 4:0.75 -2 1:0.222222 2:-0.333333 3:0.220339 4:0.166667 -1 1:-0.555556 2:0.25 3:-0.864407 4:-0.833333 -0 1:-1.32455e-07 2:-0.166667 3:0.322034 4:0.416667 -0 1:-1.32455e-07 2:-0.5 3:0.559322 4:0.0833333 -1 1:-0.611111 3:-0.932203 4:-0.916667 -1 1:-0.333333 2:0.833333 3:-0.864407 4:-0.916667 -0 1:-0.166667 2:-0.333333 3:0.38983 4:0.916667 -2 1:-0.333333 2:-0.666667 3:-0.0847458 4:-0.25 -2 1:-0.0555556 2:-0.416667 3:0.38983 4:0.25 -1 1:-0.388889 2:0.416667 3:-0.830508 4:-0.916667 -0 1:0.444444 2:-0.0833334 3:0.38983 4:0.833333 -1 1:-0.611111 2:0.333333 3:-0.864407 4:-0.916667 -0 1:0.111111 2:-0.416667 3:0.322034 4:0.416667 -0 1:0.166667 2:-0.0833334 3:0.525424 4:0.416667 -2 1:0.333333 2:-0.0833334 3:0.152542 4:0.0833333 -0 1:-0.0555556 2:-0.166667 3:0.288136 4:0.416667 -0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5 -1 1:-0.611111 2:0.166667 3:-0.830508 4:-0.916667 -0 1:0.888889 2:-0.166667 3:0.728813 4:0.833333 -2 1:-0.277778 2:-0.25 3:-0.118644 4:-4.03573e-08 -2 1:-0.222222 2:-0.333333 3:0.186441 4:-4.03573e-08 -0 1:0.333333 2:-0.583333 3:0.627119 4:0.416667 -0 1:0.444444 2:-0.0833334 3:0.491525 4:0.666667 -2 1:-0.222222 2:-0.25 3:0.0847457 4:-4.03573e-08 -1 1:-0.611111 2:0.166667 3:-0.79661 4:-0.75 -2 1:-0.277778 2:-0.166667 3:0.0508474 4:-4.03573e-08 -0 1:1 2:0.5 3:0.830508 4:0.583333 -2 1:-0.333333 2:-0.666667 3:-0.0508475 4:-0.166667 -2 1:-0.277778 2:-0.416667 3:0.0847457 4:-4.03573e-08 -0 1:0.888889 2:-0.333333 3:0.932203 4:0.583333 -2 1:-0.111111 2:-0.166667 3:0.0847457 4:0.166667 -2 1:0.111111 2:-0.583333 3:0.322034 4:0.166667 -0 1:0.333333 2:0.0833333 3:0.59322 4:1 -0 1:0.222222 2:-0.166667 3:0.525424 4:0.416667 -1 1:-0.555556 2:0.5 3:-0.830508 4:-0.833333 -0 1:-0.111111 2:-0.166667 3:0.38983 4:0.416667 -0 1:0.888889 2:-0.5 3:1 4:0.833333 -1 1:-0.388889 2:0.583333 3:-0.898305 4:-0.75 -2 1:0.111111 2:0.0833333 3:0.254237 4:0.25 -0 1:0.333333 2:-0.166667 3:0.423729 4:0.833333 -1 1:-0.388889 2:0.166667 3:-0.762712 4:-0.916667 -0 1:0.333333 2:-0.0833334 3:0.559322 4:0.916667 -2 1:-0.333333 2:-0.75 3:0.0169491 4:-4.03573e-08 -1 1:-0.222222 2:1 3:-0.830508 4:-0.75 -1 1:-0.388889 2:0.583333 3:-0.762712 4:-0.75 -2 1:-0.611111 2:-1 3:-0.152542 4:-0.25 -2 1:-1.32455e-07 2:-0.333333 3:0.254237 4:-0.0833333 -2 1:-0.5 2:-0.416667 3:-0.0169491 4:0.0833333 -1 1:-0.888889 2:-0.75 3:-0.898305 4:-0.833333 -1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 -2 1:-0.555556 2:-0.583333 3:-0.322034 4:-0.166667 -2 1:-0.166667 2:-0.5 3:0.0169491 4:-0.0833333 -1 1:-0.555556 2:0.0833333 3:-0.762712 4:-0.666667 -1 1:-0.777778 3:-0.898305 4:-0.916667 -0 1:0.388889 2:-0.166667 3:0.525424 4:0.666667 -0 1:0.222222 3:0.38983 4:0.583333 -2 1:0.333333 2:-0.0833334 3:0.254237 4:0.166667 -2 1:-0.388889 2:-0.166667 3:0.186441 4:0.166667 -0 1:-0.222222 2:-0.583333 3:0.355932 4:0.583333 -1 1:-0.611111 2:-0.166667 3:-0.79661 4:-0.916667 -1 1:-0.944444 2:-0.25 3:-0.864407 4:-0.916667 -1 1:-0.388889 2:0.166667 3:-0.830508 4:-0.75 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_svm_data.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_svm_data.txt deleted file mode 100644 index 7ab30bd..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/sample_svm_data.txt +++ /dev/null @@ -1,322 +0,0 @@ -1 0 2.52078447201548 0 0 0 2.004684436494304 2.000347299268466 0 2.228387042742021 
2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 0 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 12.72816758217773 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -1 0 0 0 0 4.745052855503306 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 0 0 2.061393766919624 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 0 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 0 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 
2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 0 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 0 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 0 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 0 0 2.004684436494304 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 6.857275130999357 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 0 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 10.4087817597473 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 
0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 0 0 2.061393766919624 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 12.72816758217773 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 4.745052855503306 0 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 2.52078447201548 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -1 0 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 0 0 
2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 0 2.52078447201548 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -0 0 2.52078447201548 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 2.061393766919624 0 0 2.004684436494304 0 2.122974378789621 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 2.122974378789621 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 4.745052855503306 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 2.52078447201548 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 
2.055002875864414 0 0 0 0 -0 0 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 0 0 2.619965104088255 0 0 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 2.122974378789621 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -1 0 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 0 2.52078447201548 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 0 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 6.857275130999357 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 4.745052855503306 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 2.52078447201548 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 2.122974378789621 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 
2.228387042742023 0 2.055002875864414 0 0 0 0 -1 0 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 0 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 4.745052855503306 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 0 0 0 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -0 0 2.52078447201548 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 0 2.061393766919624 2.619965104088255 4.745052855503306 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 0 2.52078447201548 0 0 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 0 0 0 0 0 0 0 0 -0 
0 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 0 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 6.857275130999357 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -0 0 2.52078447201548 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 0 0 2.061393766919624 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 0 2.52078447201548 0 0 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 
2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 0 0 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -1 0 0 2.061393766919624 0 0 2.004684436494304 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 0 2.52078447201548 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 4.745052855503306 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 2.122974378789621 0 0 
0 0 12.72816758217773 10.4087817597473 12.72816758217773 17.97228742438751 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 4.745052855503306 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 0 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 0 2.52078447201548 2.061393766919624 0 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 0 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 4.745052855503306 2.004684436494304 0 0 0 0 0 0 0 0 0 0 -1 0 0 2.061393766919624 0 0 2.004684436494304 2.000347299268466 0 0 0 6.857275130999357 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 4.745052855503306 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 
2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 6.857275130999357 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 0 4.745052855503306 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 0 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 4.745052855503306 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -0 0 2.52078447201548 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 4.745052855503306 2.004684436494304 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 0 2.52078447201548 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 
2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 0 0 0 2.000347299268466 2.122974378789621 0 0 6.857275130999357 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 4.745052855503306 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 2.619965104088255 0 0 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 0 0 2.061393766919624 0 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 2.061393766919624 0 0 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 6.857275130999357 0 0 0 0 0 -1 2.857738033247042 0 2.061393766919624 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 2.000347299268466 0 0 0 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 2.52078447201548 2.061393766919624 2.619965104088255 0 0 2.000347299268466 0 0 0 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 0 2.004684436494304 0 0 0 0 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 2.619965104088255 4.745052855503306 0 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 2.52078447201548 0 0 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 2.857738033247042 2.52078447201548 0 2.619965104088255 0 2.004684436494304 0 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 2.061393766919624 0 0 2.004684436494304 0 2.122974378789621 0 0 0 0 0 10.4087817597473 0 0 -0 2.857738033247042 0 
2.061393766919624 2.619965104088255 0 2.004684436494304 0 2.122974378789621 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -0 2.857738033247042 0 0 2.619965104088255 0 2.004684436494304 2.000347299268466 2.122974378789621 0 0 0 0 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 2.000347299268466 0 2.228387042742021 2.228387042742023 0 0 0 0 0 0 -1 0 2.52078447201548 0 2.619965104088255 0 0 0 0 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 -1 2.857738033247042 0 0 2.619965104088255 0 0 0 2.122974378789621 2.228387042742021 2.228387042742023 0 2.055002875864414 0 0 0 0 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/streaming_kmeans_data_test.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/streaming_kmeans_data_test.txt deleted file mode 100644 index 649a0d6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/mllib/streaming_kmeans_data_test.txt +++ /dev/null @@ -1,2 +0,0 @@ -(1.0), [1.7, 0.4, 0.9] -(2.0), [2.2, 1.8, 0.0] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/data/streaming/AFINN-111.txt b/scripts/spark-2.4.3-bin-hadoop2.7/data/streaming/AFINN-111.txt deleted file mode 100644 index 0f6fb8e..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/data/streaming/AFINN-111.txt +++ /dev/null @@ -1,2477 +0,0 @@ -abandon -2 -abandoned -2 -abandons -2 -abducted -2 -abduction -2 -abductions -2 -abhor -3 -abhorred -3 -abhorrent -3 -abhors -3 -abilities 2 -ability 2 -aboard 1 -absentee -1 -absentees -1 -absolve 2 -absolved 2 -absolves 2 -absolving 2 -absorbed 1 -abuse -3 -abused -3 -abuses -3 -abusive -3 -accept 1 -accepted 1 -accepting 1 -accepts 1 -accident -2 -accidental -2 -accidentally -2 -accidents -2 -accomplish 2 -accomplished 2 -accomplishes 2 -accusation -2 -accusations -2 -accuse -2 -accused -2 -accuses -2 -accusing -2 -ache -2 -achievable 1 -aching -2 -acquit 2 -acquits 2 -acquitted 2 -acquitting 2 -acrimonious -3 -active 1 -adequate 1 -admire 3 -admired 3 -admires 3 -admiring 3 -admit -1 -admits -1 -admitted -1 -admonish -2 -admonished -2 -adopt 1 -adopts 1 -adorable 3 -adore 3 -adored 3 -adores 3 -advanced 1 -advantage 2 -advantages 2 -adventure 2 -adventures 2 -adventurous 2 -affected -1 -affection 3 -affectionate 3 -afflicted -1 -affronted -1 -afraid -2 -aggravate -2 -aggravated -2 -aggravates -2 -aggravating -2 -aggression -2 -aggressions -2 -aggressive -2 -aghast -2 -agog 2 -agonise -3 -agonised -3 -agonises -3 -agonising -3 -agonize -3 -agonized -3 -agonizes -3 -agonizing -3 -agree 1 -agreeable 2 -agreed 1 -agreement 1 -agrees 1 -alarm -2 -alarmed -2 -alarmist -2 -alarmists -2 -alas -1 -alert -1 -alienation -2 -alive 1 -allergic -2 -allow 1 -alone -2 -amaze 2 -amazed 2 -amazes 2 -amazing 4 -ambitious 2 -ambivalent -1 -amuse 3 -amused 3 -amusement 3 -amusements 3 -anger -3 -angers -3 -angry -3 -anguish -3 -anguished -3 -animosity -2 -annoy -2 -annoyance -2 -annoyed -2 -annoying -2 -annoys -2 -antagonistic -2 -anti -1 -anticipation 1 -anxiety -2 -anxious -2 -apathetic -3 -apathy -3 -apeshit -3 -apocalyptic -2 -apologise -1 -apologised -1 -apologises -1 -apologising -1 -apologize -1 -apologized -1 -apologizes -1 -apologizing -1 -apology -1 -appalled -2 -appalling -2 -appease 2 -appeased 2 -appeases 2 -appeasing 2 -applaud 2 -applauded 2 -applauding 2 -applauds 2 -applause 2 -appreciate 2 -appreciated 2 -appreciates 2 -appreciating 2 -appreciation 2 -apprehensive -2 -approval 2 -approved 2 -approves 2 -ardent 1 -arrest -2 -arrested -3 -arrests -2 
-arrogant -2 -ashame -2 -ashamed -2 -ass -4 -assassination -3 -assassinations -3 -asset 2 -assets 2 -assfucking -4 -asshole -4 -astonished 2 -astound 3 -astounded 3 -astounding 3 -astoundingly 3 -astounds 3 -attack -1 -attacked -1 -attacking -1 -attacks -1 -attract 1 -attracted 1 -attracting 2 -attraction 2 -attractions 2 -attracts 1 -audacious 3 -authority 1 -avert -1 -averted -1 -averts -1 -avid 2 -avoid -1 -avoided -1 -avoids -1 -await -1 -awaited -1 -awaits -1 -award 3 -awarded 3 -awards 3 -awesome 4 -awful -3 -awkward -2 -axe -1 -axed -1 -backed 1 -backing 2 -backs 1 -bad -3 -badass -3 -badly -3 -bailout -2 -bamboozle -2 -bamboozled -2 -bamboozles -2 -ban -2 -banish -1 -bankrupt -3 -bankster -3 -banned -2 -bargain 2 -barrier -2 -bastard -5 -bastards -5 -battle -1 -battles -1 -beaten -2 -beatific 3 -beating -1 -beauties 3 -beautiful 3 -beautifully 3 -beautify 3 -belittle -2 -belittled -2 -beloved 3 -benefit 2 -benefits 2 -benefitted 2 -benefitting 2 -bereave -2 -bereaved -2 -bereaves -2 -bereaving -2 -best 3 -betray -3 -betrayal -3 -betrayed -3 -betraying -3 -betrays -3 -better 2 -bias -1 -biased -2 -big 1 -bitch -5 -bitches -5 -bitter -2 -bitterly -2 -bizarre -2 -blah -2 -blame -2 -blamed -2 -blames -2 -blaming -2 -bless 2 -blesses 2 -blessing 3 -blind -1 -bliss 3 -blissful 3 -blithe 2 -block -1 -blockbuster 3 -blocked -1 -blocking -1 -blocks -1 -bloody -3 -blurry -2 -boastful -2 -bold 2 -boldly 2 -bomb -1 -boost 1 -boosted 1 -boosting 1 -boosts 1 -bore -2 -bored -2 -boring -3 -bother -2 -bothered -2 -bothers -2 -bothersome -2 -boycott -2 -boycotted -2 -boycotting -2 -boycotts -2 -brainwashing -3 -brave 2 -breakthrough 3 -breathtaking 5 -bribe -3 -bright 1 -brightest 2 -brightness 1 -brilliant 4 -brisk 2 -broke -1 -broken -1 -brooding -2 -bullied -2 -bullshit -4 -bully -2 -bullying -2 -bummer -2 -buoyant 2 -burden -2 -burdened -2 -burdening -2 -burdens -2 -calm 2 -calmed 2 -calming 2 -calms 2 -can't stand -3 -cancel -1 -cancelled -1 -cancelling -1 -cancels -1 -cancer -1 -capable 1 -captivated 3 -care 2 -carefree 1 -careful 2 -carefully 2 -careless -2 -cares 2 -cashing in -2 -casualty -2 -catastrophe -3 -catastrophic -4 -cautious -1 -celebrate 3 -celebrated 3 -celebrates 3 -celebrating 3 -censor -2 -censored -2 -censors -2 -certain 1 -chagrin -2 -chagrined -2 -challenge -1 -chance 2 -chances 2 -chaos -2 -chaotic -2 -charged -3 -charges -2 -charm 3 -charming 3 -charmless -3 -chastise -3 -chastised -3 -chastises -3 -chastising -3 -cheat -3 -cheated -3 -cheater -3 -cheaters -3 -cheats -3 -cheer 2 -cheered 2 -cheerful 2 -cheering 2 -cheerless -2 -cheers 2 -cheery 3 -cherish 2 -cherished 2 -cherishes 2 -cherishing 2 -chic 2 -childish -2 -chilling -1 -choke -2 -choked -2 -chokes -2 -choking -2 -clarifies 2 -clarity 2 -clash -2 -classy 3 -clean 2 -cleaner 2 -clear 1 -cleared 1 -clearly 1 -clears 1 -clever 2 -clouded -1 -clueless -2 -cock -5 -cocksucker -5 -cocksuckers -5 -cocky -2 -coerced -2 -collapse -2 -collapsed -2 -collapses -2 -collapsing -2 -collide -1 -collides -1 -colliding -1 -collision -2 -collisions -2 -colluding -3 -combat -1 -combats -1 -comedy 1 -comfort 2 -comfortable 2 -comforting 2 -comforts 2 -commend 2 -commended 2 -commit 1 -commitment 2 -commits 1 -committed 1 -committing 1 -compassionate 2 -compelled 1 -competent 2 -competitive 2 -complacent -2 -complain -2 -complained -2 -complains -2 -comprehensive 2 -conciliate 2 -conciliated 2 -conciliates 2 -conciliating 2 -condemn -2 -condemnation -2 -condemned -2 -condemns -2 -confidence 2 -confident 2 -conflict -2 -conflicting 
-2 -conflictive -2 -conflicts -2 -confuse -2 -confused -2 -confusing -2 -congrats 2 -congratulate 2 -congratulation 2 -congratulations 2 -consent 2 -consents 2 -consolable 2 -conspiracy -3 -constrained -2 -contagion -2 -contagions -2 -contagious -1 -contempt -2 -contemptuous -2 -contemptuously -2 -contend -1 -contender -1 -contending -1 -contentious -2 -contestable -2 -controversial -2 -controversially -2 -convince 1 -convinced 1 -convinces 1 -convivial 2 -cool 1 -cool stuff 3 -cornered -2 -corpse -1 -costly -2 -courage 2 -courageous 2 -courteous 2 -courtesy 2 -cover-up -3 -coward -2 -cowardly -2 -coziness 2 -cramp -1 -crap -3 -crash -2 -crazier -2 -craziest -2 -crazy -2 -creative 2 -crestfallen -2 -cried -2 -cries -2 -crime -3 -criminal -3 -criminals -3 -crisis -3 -critic -2 -criticism -2 -criticize -2 -criticized -2 -criticizes -2 -criticizing -2 -critics -2 -cruel -3 -cruelty -3 -crush -1 -crushed -2 -crushes -1 -crushing -1 -cry -1 -crying -2 -cunt -5 -curious 1 -curse -1 -cut -1 -cute 2 -cuts -1 -cutting -1 -cynic -2 -cynical -2 -cynicism -2 -damage -3 -damages -3 -damn -4 -damned -4 -damnit -4 -danger -2 -daredevil 2 -daring 2 -darkest -2 -darkness -1 -dauntless 2 -dead -3 -deadlock -2 -deafening -1 -dear 2 -dearly 3 -death -2 -debonair 2 -debt -2 -deceit -3 -deceitful -3 -deceive -3 -deceived -3 -deceives -3 -deceiving -3 -deception -3 -decisive 1 -dedicated 2 -defeated -2 -defect -3 -defects -3 -defender 2 -defenders 2 -defenseless -2 -defer -1 -deferring -1 -defiant -1 -deficit -2 -degrade -2 -degraded -2 -degrades -2 -dehumanize -2 -dehumanized -2 -dehumanizes -2 -dehumanizing -2 -deject -2 -dejected -2 -dejecting -2 -dejects -2 -delay -1 -delayed -1 -delight 3 -delighted 3 -delighting 3 -delights 3 -demand -1 -demanded -1 -demanding -1 -demands -1 -demonstration -1 -demoralized -2 -denied -2 -denier -2 -deniers -2 -denies -2 -denounce -2 -denounces -2 -deny -2 -denying -2 -depressed -2 -depressing -2 -derail -2 -derailed -2 -derails -2 -deride -2 -derided -2 -derides -2 -deriding -2 -derision -2 -desirable 2 -desire 1 -desired 2 -desirous 2 -despair -3 -despairing -3 -despairs -3 -desperate -3 -desperately -3 -despondent -3 -destroy -3 -destroyed -3 -destroying -3 -destroys -3 -destruction -3 -destructive -3 -detached -1 -detain -2 -detained -2 -detention -2 -determined 2 -devastate -2 -devastated -2 -devastating -2 -devoted 3 -diamond 1 -dick -4 -dickhead -4 -die -3 -died -3 -difficult -1 -diffident -2 -dilemma -1 -dipshit -3 -dire -3 -direful -3 -dirt -2 -dirtier -2 -dirtiest -2 -dirty -2 -disabling -1 -disadvantage -2 -disadvantaged -2 -disappear -1 -disappeared -1 -disappears -1 -disappoint -2 -disappointed -2 -disappointing -2 -disappointment -2 -disappointments -2 -disappoints -2 -disaster -2 -disasters -2 -disastrous -3 -disbelieve -2 -discard -1 -discarded -1 -discarding -1 -discards -1 -disconsolate -2 -disconsolation -2 -discontented -2 -discord -2 -discounted -1 -discouraged -2 -discredited -2 -disdain -2 -disgrace -2 -disgraced -2 -disguise -1 -disguised -1 -disguises -1 -disguising -1 -disgust -3 -disgusted -3 -disgusting -3 -disheartened -2 -dishonest -2 -disillusioned -2 -disinclined -2 -disjointed -2 -dislike -2 -dismal -2 -dismayed -2 -disorder -2 -disorganized -2 -disoriented -2 -disparage -2 -disparaged -2 -disparages -2 -disparaging -2 -displeased -2 -dispute -2 -disputed -2 -disputes -2 -disputing -2 -disqualified -2 -disquiet -2 -disregard -2 -disregarded -2 -disregarding -2 -disregards -2 -disrespect -2 -disrespected -2 -disruption -2 -disruptions -2 
-disruptive -2 -dissatisfied -2 -distort -2 -distorted -2 -distorting -2 -distorts -2 -distract -2 -distracted -2 -distraction -2 -distracts -2 -distress -2 -distressed -2 -distresses -2 -distressing -2 -distrust -3 -distrustful -3 -disturb -2 -disturbed -2 -disturbing -2 -disturbs -2 -dithering -2 -dizzy -1 -dodging -2 -dodgy -2 -does not work -3 -dolorous -2 -dont like -2 -doom -2 -doomed -2 -doubt -1 -doubted -1 -doubtful -1 -doubting -1 -doubts -1 -douche -3 -douchebag -3 -downcast -2 -downhearted -2 -downside -2 -drag -1 -dragged -1 -drags -1 -drained -2 -dread -2 -dreaded -2 -dreadful -3 -dreading -2 -dream 1 -dreams 1 -dreary -2 -droopy -2 -drop -1 -drown -2 -drowned -2 -drowns -2 -drunk -2 -dubious -2 -dud -2 -dull -2 -dumb -3 -dumbass -3 -dump -1 -dumped -2 -dumps -1 -dupe -2 -duped -2 -dysfunction -2 -eager 2 -earnest 2 -ease 2 -easy 1 -ecstatic 4 -eerie -2 -eery -2 -effective 2 -effectively 2 -elated 3 -elation 3 -elegant 2 -elegantly 2 -embarrass -2 -embarrassed -2 -embarrasses -2 -embarrassing -2 -embarrassment -2 -embittered -2 -embrace 1 -emergency -2 -empathetic 2 -emptiness -1 -empty -1 -enchanted 2 -encourage 2 -encouraged 2 -encouragement 2 -encourages 2 -endorse 2 -endorsed 2 -endorsement 2 -endorses 2 -enemies -2 -enemy -2 -energetic 2 -engage 1 -engages 1 -engrossed 1 -enjoy 2 -enjoying 2 -enjoys 2 -enlighten 2 -enlightened 2 -enlightening 2 -enlightens 2 -ennui -2 -enrage -2 -enraged -2 -enrages -2 -enraging -2 -enrapture 3 -enslave -2 -enslaved -2 -enslaves -2 -ensure 1 -ensuring 1 -enterprising 1 -entertaining 2 -enthral 3 -enthusiastic 3 -entitled 1 -entrusted 2 -envies -1 -envious -2 -envy -1 -envying -1 -erroneous -2 -error -2 -errors -2 -escape -1 -escapes -1 -escaping -1 -esteemed 2 -ethical 2 -euphoria 3 -euphoric 4 -eviction -1 -evil -3 -exaggerate -2 -exaggerated -2 -exaggerates -2 -exaggerating -2 -exasperated 2 -excellence 3 -excellent 3 -excite 3 -excited 3 -excitement 3 -exciting 3 -exclude -1 -excluded -2 -exclusion -1 -exclusive 2 -excuse -1 -exempt -1 -exhausted -2 -exhilarated 3 -exhilarates 3 -exhilarating 3 -exonerate 2 -exonerated 2 -exonerates 2 -exonerating 2 -expand 1 -expands 1 -expel -2 -expelled -2 -expelling -2 -expels -2 -exploit -2 -exploited -2 -exploiting -2 -exploits -2 -exploration 1 -explorations 1 -expose -1 -exposed -1 -exposes -1 -exposing -1 -extend 1 -extends 1 -exuberant 4 -exultant 3 -exultantly 3 -fabulous 4 -fad -2 -fag -3 -faggot -3 -faggots -3 -fail -2 -failed -2 -failing -2 -fails -2 -failure -2 -failures -2 -fainthearted -2 -fair 2 -faith 1 -faithful 3 -fake -3 -fakes -3 -faking -3 -fallen -2 -falling -1 -falsified -3 -falsify -3 -fame 1 -fan 3 -fantastic 4 -farce -1 -fascinate 3 -fascinated 3 -fascinates 3 -fascinating 3 -fascist -2 -fascists -2 -fatalities -3 -fatality -3 -fatigue -2 -fatigued -2 -fatigues -2 -fatiguing -2 -favor 2 -favored 2 -favorite 2 -favorited 2 -favorites 2 -favors 2 -fear -2 -fearful -2 -fearing -2 -fearless 2 -fearsome -2 -fed up -3 -feeble -2 -feeling 1 -felonies -3 -felony -3 -fervent 2 -fervid 2 -festive 2 -fiasco -3 -fidgety -2 -fight -1 -fine 2 -fire -2 -fired -2 -firing -2 -fit 1 -fitness 1 -flagship 2 -flees -1 -flop -2 -flops -2 -flu -2 -flustered -2 -focused 2 -fond 2 -fondness 2 -fool -2 -foolish -2 -fools -2 -forced -1 -foreclosure -2 -foreclosures -2 -forget -1 -forgetful -2 -forgive 1 -forgiving 1 -forgotten -1 -fortunate 2 -frantic -1 -fraud -4 -frauds -4 -fraudster -4 -fraudsters -4 -fraudulence -4 -fraudulent -4 -free 1 -freedom 2 -frenzy -3 -fresh 1 -friendly 2 -fright -2 
-frightened -2 -frightening -3 -frikin -2 -frisky 2 -frowning -1 -frustrate -2 -frustrated -2 -frustrates -2 -frustrating -2 -frustration -2 -ftw 3 -fuck -4 -fucked -4 -fucker -4 -fuckers -4 -fuckface -4 -fuckhead -4 -fucking -4 -fucktard -4 -fud -3 -fuked -4 -fuking -4 -fulfill 2 -fulfilled 2 -fulfills 2 -fuming -2 -fun 4 -funeral -1 -funerals -1 -funky 2 -funnier 4 -funny 4 -furious -3 -futile 2 -gag -2 -gagged -2 -gain 2 -gained 2 -gaining 2 -gains 2 -gallant 3 -gallantly 3 -gallantry 3 -generous 2 -genial 3 -ghost -1 -giddy -2 -gift 2 -glad 3 -glamorous 3 -glamourous 3 -glee 3 -gleeful 3 -gloom -1 -gloomy -2 -glorious 2 -glory 2 -glum -2 -god 1 -goddamn -3 -godsend 4 -good 3 -goodness 3 -grace 1 -gracious 3 -grand 3 -grant 1 -granted 1 -granting 1 -grants 1 -grateful 3 -gratification 2 -grave -2 -gray -1 -great 3 -greater 3 -greatest 3 -greed -3 -greedy -2 -green wash -3 -green washing -3 -greenwash -3 -greenwasher -3 -greenwashers -3 -greenwashing -3 -greet 1 -greeted 1 -greeting 1 -greetings 2 -greets 1 -grey -1 -grief -2 -grieved -2 -gross -2 -growing 1 -growth 2 -guarantee 1 -guilt -3 -guilty -3 -gullibility -2 -gullible -2 -gun -1 -ha 2 -hacked -1 -haha 3 -hahaha 3 -hahahah 3 -hail 2 -hailed 2 -hapless -2 -haplessness -2 -happiness 3 -happy 3 -hard -1 -hardier 2 -hardship -2 -hardy 2 -harm -2 -harmed -2 -harmful -2 -harming -2 -harms -2 -harried -2 -harsh -2 -harsher -2 -harshest -2 -hate -3 -hated -3 -haters -3 -hates -3 -hating -3 -haunt -1 -haunted -2 -haunting 1 -haunts -1 -havoc -2 -healthy 2 -heartbreaking -3 -heartbroken -3 -heartfelt 3 -heaven 2 -heavenly 4 -heavyhearted -2 -hell -4 -help 2 -helpful 2 -helping 2 -helpless -2 -helps 2 -hero 2 -heroes 2 -heroic 3 -hesitant -2 -hesitate -2 -hid -1 -hide -1 -hides -1 -hiding -1 -highlight 2 -hilarious 2 -hindrance -2 -hoax -2 -homesick -2 -honest 2 -honor 2 -honored 2 -honoring 2 -honour 2 -honoured 2 -honouring 2 -hooligan -2 -hooliganism -2 -hooligans -2 -hope 2 -hopeful 2 -hopefully 2 -hopeless -2 -hopelessness -2 -hopes 2 -hoping 2 -horrendous -3 -horrible -3 -horrific -3 -horrified -3 -hostile -2 -huckster -2 -hug 2 -huge 1 -hugs 2 -humerous 3 -humiliated -3 -humiliation -3 -humor 2 -humorous 2 -humour 2 -humourous 2 -hunger -2 -hurrah 5 -hurt -2 -hurting -2 -hurts -2 -hypocritical -2 -hysteria -3 -hysterical -3 -hysterics -3 -idiot -3 -idiotic -3 -ignorance -2 -ignorant -2 -ignore -1 -ignored -2 -ignores -1 -ill -2 -illegal -3 -illiteracy -2 -illness -2 -illnesses -2 -imbecile -3 -immobilized -1 -immortal 2 -immune 1 -impatient -2 -imperfect -2 -importance 2 -important 2 -impose -1 -imposed -1 -imposes -1 -imposing -1 -impotent -2 -impress 3 -impressed 3 -impresses 3 -impressive 3 -imprisoned -2 -improve 2 -improved 2 -improvement 2 -improves 2 -improving 2 -inability -2 -inaction -2 -inadequate -2 -incapable -2 -incapacitated -2 -incensed -2 -incompetence -2 -incompetent -2 -inconsiderate -2 -inconvenience -2 -inconvenient -2 -increase 1 -increased 1 -indecisive -2 -indestructible 2 -indifference -2 -indifferent -2 -indignant -2 -indignation -2 -indoctrinate -2 -indoctrinated -2 -indoctrinates -2 -indoctrinating -2 -ineffective -2 -ineffectively -2 -infatuated 2 -infatuation 2 -infected -2 -inferior -2 -inflamed -2 -influential 2 -infringement -2 -infuriate -2 -infuriated -2 -infuriates -2 -infuriating -2 -inhibit -1 -injured -2 -injury -2 -injustice -2 -innovate 1 -innovates 1 -innovation 1 -innovative 2 -inquisition -2 -inquisitive 2 -insane -2 -insanity -2 -insecure -2 -insensitive -2 -insensitivity -2 -insignificant 
-2 -insipid -2 -inspiration 2 -inspirational 2 -inspire 2 -inspired 2 -inspires 2 -inspiring 3 -insult -2 -insulted -2 -insulting -2 -insults -2 -intact 2 -integrity 2 -intelligent 2 -intense 1 -interest 1 -interested 2 -interesting 2 -interests 1 -interrogated -2 -interrupt -2 -interrupted -2 -interrupting -2 -interruption -2 -interrupts -2 -intimidate -2 -intimidated -2 -intimidates -2 -intimidating -2 -intimidation -2 -intricate 2 -intrigues 1 -invincible 2 -invite 1 -inviting 1 -invulnerable 2 -irate -3 -ironic -1 -irony -1 -irrational -1 -irresistible 2 -irresolute -2 -irresponsible 2 -irreversible -1 -irritate -3 -irritated -3 -irritating -3 -isolated -1 -itchy -2 -jackass -4 -jackasses -4 -jailed -2 -jaunty 2 -jealous -2 -jeopardy -2 -jerk -3 -jesus 1 -jewel 1 -jewels 1 -jocular 2 -join 1 -joke 2 -jokes 2 -jolly 2 -jovial 2 -joy 3 -joyful 3 -joyfully 3 -joyless -2 -joyous 3 -jubilant 3 -jumpy -1 -justice 2 -justifiably 2 -justified 2 -keen 1 -kill -3 -killed -3 -killing -3 -kills -3 -kind 2 -kinder 2 -kiss 2 -kudos 3 -lack -2 -lackadaisical -2 -lag -1 -lagged -2 -lagging -2 -lags -2 -lame -2 -landmark 2 -laugh 1 -laughed 1 -laughing 1 -laughs 1 -laughting 1 -launched 1 -lawl 3 -lawsuit -2 -lawsuits -2 -lazy -1 -leak -1 -leaked -1 -leave -1 -legal 1 -legally 1 -lenient 1 -lethargic -2 -lethargy -2 -liar -3 -liars -3 -libelous -2 -lied -2 -lifesaver 4 -lighthearted 1 -like 2 -liked 2 -likes 2 -limitation -1 -limited -1 -limits -1 -litigation -1 -litigious -2 -lively 2 -livid -2 -lmao 4 -lmfao 4 -loathe -3 -loathed -3 -loathes -3 -loathing -3 -lobby -2 -lobbying -2 -lol 3 -lonely -2 -lonesome -2 -longing -1 -loom -1 -loomed -1 -looming -1 -looms -1 -loose -3 -looses -3 -loser -3 -losing -3 -loss -3 -lost -3 -lovable 3 -love 3 -loved 3 -lovelies 3 -lovely 3 -loving 2 -lowest -1 -loyal 3 -loyalty 3 -luck 3 -luckily 3 -lucky 3 -lugubrious -2 -lunatic -3 -lunatics -3 -lurk -1 -lurking -1 -lurks -1 -mad -3 -maddening -3 -made-up -1 -madly -3 -madness -3 -mandatory -1 -manipulated -1 -manipulating -1 -manipulation -1 -marvel 3 -marvelous 3 -marvels 3 -masterpiece 4 -masterpieces 4 -matter 1 -matters 1 -mature 2 -meaningful 2 -meaningless -2 -medal 3 -mediocrity -3 -meditative 1 -melancholy -2 -menace -2 -menaced -2 -mercy 2 -merry 3 -mess -2 -messed -2 -messing up -2 -methodical 2 -mindless -2 -miracle 4 -mirth 3 -mirthful 3 -mirthfully 3 -misbehave -2 -misbehaved -2 -misbehaves -2 -misbehaving -2 -mischief -1 -mischiefs -1 -miserable -3 -misery -2 -misgiving -2 -misinformation -2 -misinformed -2 -misinterpreted -2 -misleading -3 -misread -1 -misreporting -2 -misrepresentation -2 -miss -2 -missed -2 -missing -2 -mistake -2 -mistaken -2 -mistakes -2 -mistaking -2 -misunderstand -2 -misunderstanding -2 -misunderstands -2 -misunderstood -2 -moan -2 -moaned -2 -moaning -2 -moans -2 -mock -2 -mocked -2 -mocking -2 -mocks -2 -mongering -2 -monopolize -2 -monopolized -2 -monopolizes -2 -monopolizing -2 -moody -1 -mope -1 -moping -1 -moron -3 -motherfucker -5 -motherfucking -5 -motivate 1 -motivated 2 -motivating 2 -motivation 1 -mourn -2 -mourned -2 -mournful -2 -mourning -2 -mourns -2 -mumpish -2 -murder -2 -murderer -2 -murdering -3 -murderous -3 -murders -2 -myth -1 -n00b -2 -naive -2 -nasty -3 -natural 1 -naïve -2 -needy -2 -negative -2 -negativity -2 -neglect -2 -neglected -2 -neglecting -2 -neglects -2 -nerves -1 -nervous -2 -nervously -2 -nice 3 -nifty 2 -niggas -5 -nigger -5 -no -1 -no fun -3 -noble 2 -noisy -1 -nonsense -2 -noob -2 -nosey -2 -not good -2 -not working -3 -notorious -2 
-novel 2 -numb -1 -nuts -3 -obliterate -2 -obliterated -2 -obnoxious -3 -obscene -2 -obsessed 2 -obsolete -2 -obstacle -2 -obstacles -2 -obstinate -2 -odd -2 -offend -2 -offended -2 -offender -2 -offending -2 -offends -2 -offline -1 -oks 2 -ominous 3 -once-in-a-lifetime 3 -opportunities 2 -opportunity 2 -oppressed -2 -oppressive -2 -optimism 2 -optimistic 2 -optionless -2 -outcry -2 -outmaneuvered -2 -outrage -3 -outraged -3 -outreach 2 -outstanding 5 -overjoyed 4 -overload -1 -overlooked -1 -overreact -2 -overreacted -2 -overreaction -2 -overreacts -2 -oversell -2 -overselling -2 -oversells -2 -oversimplification -2 -oversimplified -2 -oversimplifies -2 -oversimplify -2 -overstatement -2 -overstatements -2 -overweight -1 -oxymoron -1 -pain -2 -pained -2 -panic -3 -panicked -3 -panics -3 -paradise 3 -paradox -1 -pardon 2 -pardoned 2 -pardoning 2 -pardons 2 -parley -1 -passionate 2 -passive -1 -passively -1 -pathetic -2 -pay -1 -peace 2 -peaceful 2 -peacefully 2 -penalty -2 -pensive -1 -perfect 3 -perfected 2 -perfectly 3 -perfects 2 -peril -2 -perjury -3 -perpetrator -2 -perpetrators -2 -perplexed -2 -persecute -2 -persecuted -2 -persecutes -2 -persecuting -2 -perturbed -2 -pesky -2 -pessimism -2 -pessimistic -2 -petrified -2 -phobic -2 -picturesque 2 -pileup -1 -pique -2 -piqued -2 -piss -4 -pissed -4 -pissing -3 -piteous -2 -pitied -1 -pity -2 -playful 2 -pleasant 3 -please 1 -pleased 3 -pleasure 3 -poised -2 -poison -2 -poisoned -2 -poisons -2 -pollute -2 -polluted -2 -polluter -2 -polluters -2 -pollutes -2 -poor -2 -poorer -2 -poorest -2 -popular 3 -positive 2 -positively 2 -possessive -2 -postpone -1 -postponed -1 -postpones -1 -postponing -1 -poverty -1 -powerful 2 -powerless -2 -praise 3 -praised 3 -praises 3 -praising 3 -pray 1 -praying 1 -prays 1 -prblm -2 -prblms -2 -prepared 1 -pressure -1 -pressured -2 -pretend -1 -pretending -1 -pretends -1 -pretty 1 -prevent -1 -prevented -1 -preventing -1 -prevents -1 -prick -5 -prison -2 -prisoner -2 -prisoners -2 -privileged 2 -proactive 2 -problem -2 -problems -2 -profiteer -2 -progress 2 -prominent 2 -promise 1 -promised 1 -promises 1 -promote 1 -promoted 1 -promotes 1 -promoting 1 -propaganda -2 -prosecute -1 -prosecuted -2 -prosecutes -1 -prosecution -1 -prospect 1 -prospects 1 -prosperous 3 -protect 1 -protected 1 -protects 1 -protest -2 -protesters -2 -protesting -2 -protests -2 -proud 2 -proudly 2 -provoke -1 -provoked -1 -provokes -1 -provoking -1 -pseudoscience -3 -punish -2 -punished -2 -punishes -2 -punitive -2 -pushy -1 -puzzled -2 -quaking -2 -questionable -2 -questioned -1 -questioning -1 -racism -3 -racist -3 -racists -3 -rage -2 -rageful -2 -rainy -1 -rant -3 -ranter -3 -ranters -3 -rants -3 -rape -4 -rapist -4 -rapture 2 -raptured 2 -raptures 2 -rapturous 4 -rash -2 -ratified 2 -reach 1 -reached 1 -reaches 1 -reaching 1 -reassure 1 -reassured 1 -reassures 1 -reassuring 2 -rebellion -2 -recession -2 -reckless -2 -recommend 2 -recommended 2 -recommends 2 -redeemed 2 -refuse -2 -refused -2 -refusing -2 -regret -2 -regretful -2 -regrets -2 -regretted -2 -regretting -2 -reject -1 -rejected -1 -rejecting -1 -rejects -1 -rejoice 4 -rejoiced 4 -rejoices 4 -rejoicing 4 -relaxed 2 -relentless -1 -reliant 2 -relieve 1 -relieved 2 -relieves 1 -relieving 2 -relishing 2 -remarkable 2 -remorse -2 -repulse -1 -repulsed -2 -rescue 2 -rescued 2 -rescues 2 -resentful -2 -resign -1 -resigned -1 -resigning -1 -resigns -1 -resolute 2 -resolve 2 -resolved 2 -resolves 2 -resolving 2 -respected 2 -responsible 2 -responsive 2 -restful 2 -restless 
-2 -restore 1 -restored 1 -restores 1 -restoring 1 -restrict -2 -restricted -2 -restricting -2 -restriction -2 -restricts -2 -retained -1 -retard -2 -retarded -2 -retreat -1 -revenge -2 -revengeful -2 -revered 2 -revive 2 -revives 2 -reward 2 -rewarded 2 -rewarding 2 -rewards 2 -rich 2 -ridiculous -3 -rig -1 -rigged -1 -right direction 3 -rigorous 3 -rigorously 3 -riot -2 -riots -2 -risk -2 -risks -2 -rob -2 -robber -2 -robed -2 -robing -2 -robs -2 -robust 2 -rofl 4 -roflcopter 4 -roflmao 4 -romance 2 -rotfl 4 -rotflmfao 4 -rotflol 4 -ruin -2 -ruined -2 -ruining -2 -ruins -2 -sabotage -2 -sad -2 -sadden -2 -saddened -2 -sadly -2 -safe 1 -safely 1 -safety 1 -salient 1 -sappy -1 -sarcastic -2 -satisfied 2 -save 2 -saved 2 -scam -2 -scams -2 -scandal -3 -scandalous -3 -scandals -3 -scapegoat -2 -scapegoats -2 -scare -2 -scared -2 -scary -2 -sceptical -2 -scold -2 -scoop 3 -scorn -2 -scornful -2 -scream -2 -screamed -2 -screaming -2 -screams -2 -screwed -2 -screwed up -3 -scumbag -4 -secure 2 -secured 2 -secures 2 -sedition -2 -seditious -2 -seduced -1 -self-confident 2 -self-deluded -2 -selfish -3 -selfishness -3 -sentence -2 -sentenced -2 -sentences -2 -sentencing -2 -serene 2 -severe -2 -sexy 3 -shaky -2 -shame -2 -shamed -2 -shameful -2 -share 1 -shared 1 -shares 1 -shattered -2 -shit -4 -shithead -4 -shitty -3 -shock -2 -shocked -2 -shocking -2 -shocks -2 -shoot -1 -short-sighted -2 -short-sightedness -2 -shortage -2 -shortages -2 -shrew -4 -shy -1 -sick -2 -sigh -2 -significance 1 -significant 1 -silencing -1 -silly -1 -sincere 2 -sincerely 2 -sincerest 2 -sincerity 2 -sinful -3 -singleminded -2 -skeptic -2 -skeptical -2 -skepticism -2 -skeptics -2 -slam -2 -slash -2 -slashed -2 -slashes -2 -slashing -2 -slavery -3 -sleeplessness -2 -slick 2 -slicker 2 -slickest 2 -sluggish -2 -slut -5 -smart 1 -smarter 2 -smartest 2 -smear -2 -smile 2 -smiled 2 -smiles 2 -smiling 2 -smog -2 -sneaky -1 -snub -2 -snubbed -2 -snubbing -2 -snubs -2 -sobering 1 -solemn -1 -solid 2 -solidarity 2 -solution 1 -solutions 1 -solve 1 -solved 1 -solves 1 -solving 1 -somber -2 -some kind 0 -son-of-a-bitch -5 -soothe 3 -soothed 3 -soothing 3 -sophisticated 2 -sore -1 -sorrow -2 -sorrowful -2 -sorry -1 -spam -2 -spammer -3 -spammers -3 -spamming -2 -spark 1 -sparkle 3 -sparkles 3 -sparkling 3 -speculative -2 -spirit 1 -spirited 2 -spiritless -2 -spiteful -2 -splendid 3 -sprightly 2 -squelched -1 -stab -2 -stabbed -2 -stable 2 -stabs -2 -stall -2 -stalled -2 -stalling -2 -stamina 2 -stampede -2 -startled -2 -starve -2 -starved -2 -starves -2 -starving -2 -steadfast 2 -steal -2 -steals -2 -stereotype -2 -stereotyped -2 -stifled -1 -stimulate 1 -stimulated 1 -stimulates 1 -stimulating 2 -stingy -2 -stolen -2 -stop -1 -stopped -1 -stopping -1 -stops -1 -stout 2 -straight 1 -strange -1 -strangely -1 -strangled -2 -strength 2 -strengthen 2 -strengthened 2 -strengthening 2 -strengthens 2 -stressed -2 -stressor -2 -stressors -2 -stricken -2 -strike -1 -strikers -2 -strikes -1 -strong 2 -stronger 2 -strongest 2 -struck -1 -struggle -2 -struggled -2 -struggles -2 -struggling -2 -stubborn -2 -stuck -2 -stunned -2 -stunning 4 -stupid -2 -stupidly -2 -suave 2 -substantial 1 -substantially 1 -subversive -2 -success 2 -successful 3 -suck -3 -sucks -3 -suffer -2 -suffering -2 -suffers -2 -suicidal -2 -suicide -2 -suing -2 -sulking -2 -sulky -2 -sullen -2 -sunshine 2 -super 3 -superb 5 -superior 2 -support 2 -supported 2 -supporter 1 -supporters 1 -supporting 1 -supportive 2 -supports 2 -survived 2 -surviving 2 -survivor 2 -suspect -1 
-suspected -1 -suspecting -1 -suspects -1 -suspend -1 -suspended -1 -suspicious -2 -swear -2 -swearing -2 -swears -2 -sweet 2 -swift 2 -swiftly 2 -swindle -3 -swindles -3 -swindling -3 -sympathetic 2 -sympathy 2 -tard -2 -tears -2 -tender 2 -tense -2 -tension -1 -terrible -3 -terribly -3 -terrific 4 -terrified -3 -terror -3 -terrorize -3 -terrorized -3 -terrorizes -3 -thank 2 -thankful 2 -thanks 2 -thorny -2 -thoughtful 2 -thoughtless -2 -threat -2 -threaten -2 -threatened -2 -threatening -2 -threatens -2 -threats -2 -thrilled 5 -thwart -2 -thwarted -2 -thwarting -2 -thwarts -2 -timid -2 -timorous -2 -tired -2 -tits -2 -tolerant 2 -toothless -2 -top 2 -tops 2 -torn -2 -torture -4 -tortured -4 -tortures -4 -torturing -4 -totalitarian -2 -totalitarianism -2 -tout -2 -touted -2 -touting -2 -touts -2 -tragedy -2 -tragic -2 -tranquil 2 -trap -1 -trapped -2 -trauma -3 -traumatic -3 -travesty -2 -treason -3 -treasonous -3 -treasure 2 -treasures 2 -trembling -2 -tremulous -2 -tricked -2 -trickery -2 -triumph 4 -triumphant 4 -trouble -2 -troubled -2 -troubles -2 -true 2 -trust 1 -trusted 2 -tumor -2 -twat -5 -ugly -3 -unacceptable -2 -unappreciated -2 -unapproved -2 -unaware -2 -unbelievable -1 -unbelieving -1 -unbiased 2 -uncertain -1 -unclear -1 -uncomfortable -2 -unconcerned -2 -unconfirmed -1 -unconvinced -1 -uncredited -1 -undecided -1 -underestimate -1 -underestimated -1 -underestimates -1 -underestimating -1 -undermine -2 -undermined -2 -undermines -2 -undermining -2 -undeserving -2 -undesirable -2 -uneasy -2 -unemployment -2 -unequal -1 -unequaled 2 -unethical -2 -unfair -2 -unfocused -2 -unfulfilled -2 -unhappy -2 -unhealthy -2 -unified 1 -unimpressed -2 -unintelligent -2 -united 1 -unjust -2 -unlovable -2 -unloved -2 -unmatched 1 -unmotivated -2 -unprofessional -2 -unresearched -2 -unsatisfied -2 -unsecured -2 -unsettled -1 -unsophisticated -2 -unstable -2 -unstoppable 2 -unsupported -2 -unsure -1 -untarnished 2 -unwanted -2 -unworthy -2 -upset -2 -upsets -2 -upsetting -2 -uptight -2 -urgent -1 -useful 2 -usefulness 2 -useless -2 -uselessness -2 -vague -2 -validate 1 -validated 1 -validates 1 -validating 1 -verdict -1 -verdicts -1 -vested 1 -vexation -2 -vexing -2 -vibrant 3 -vicious -2 -victim -3 -victimize -3 -victimized -3 -victimizes -3 -victimizing -3 -victims -3 -vigilant 3 -vile -3 -vindicate 2 -vindicated 2 -vindicates 2 -vindicating 2 -violate -2 -violated -2 -violates -2 -violating -2 -violence -3 -violent -3 -virtuous 2 -virulent -2 -vision 1 -visionary 3 -visioning 1 -visions 1 -vitality 3 -vitamin 1 -vitriolic -3 -vivacious 3 -vociferous -1 -vulnerability -2 -vulnerable -2 -walkout -2 -walkouts -2 -wanker -3 -want 1 -war -2 -warfare -2 -warm 1 -warmth 2 -warn -2 -warned -2 -warning -3 -warnings -3 -warns -2 -waste -1 -wasted -2 -wasting -2 -wavering -1 -weak -2 -weakness -2 -wealth 3 -wealthy 2 -weary -2 -weep -2 -weeping -2 -weird -2 -welcome 2 -welcomed 2 -welcomes 2 -whimsical 1 -whitewash -3 -whore -4 -wicked -2 -widowed -1 -willingness 2 -win 4 -winner 4 -winning 4 -wins 4 -winwin 3 -wish 1 -wishes 1 -wishing 1 -withdrawal -3 -woebegone -2 -woeful -3 -won 3 -wonderful 4 -woo 3 -woohoo 3 -wooo 4 -woow 4 -worn -1 -worried -3 -worry -3 -worrying -3 -worse -3 -worsen -3 -worsened -3 -worsening -3 -worsens -3 -worshiped 3 -worst -3 -worth 2 -worthless -2 -worthy 2 -wow 4 -wowow 4 -wowww 4 -wrathful -3 -wreck -2 -wrong -2 -wronged -2 -wtf -4 -yeah 1 -yearning 1 -yeees 2 -yes 1 -youthful 2 -yucky -2 -yummy 3 -zealot -2 -zealots -2 -zealous 2 \ No newline at end of file diff 
--git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/jars/scopt_2.11-3.7.0.jar b/scripts/spark-2.4.3-bin-hadoop2.7/examples/jars/scopt_2.11-3.7.0.jar deleted file mode 100644 index eac19a2..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/examples/jars/scopt_2.11-3.7.0.jar and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/jars/spark-examples_2.11-2.4.3.jar b/scripts/spark-2.4.3-bin-hadoop2.7/examples/jars/spark-examples_2.11-2.4.3.jar deleted file mode 100644 index b7a0913..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/examples/jars/spark-examples_2.11-2.4.3.jar and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaHdfsLR.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaHdfsLR.java deleted file mode 100644 index 362bd44..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaHdfsLR.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.sql.SparkSession; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.Random; -import java.util.regex.Pattern; - -/** - * Logistic regression based classification. - * - * This is an example implementation for learning how to use Spark. For more conventional use, - * please refer to org.apache.spark.ml.classification.LogisticRegression. 
- */ -public final class JavaHdfsLR { - - private static final int D = 10; // Number of dimensions - private static final Random rand = new Random(42); - - static void showWarning() { - String warning = "WARN: This is a naive implementation of Logistic Regression " + - "and is given as an example!\n" + - "Please use org.apache.spark.ml.classification.LogisticRegression " + - "for more conventional use."; - System.err.println(warning); - } - - static class DataPoint implements Serializable { - DataPoint(double[] x, double y) { - this.x = x; - this.y = y; - } - - double[] x; - double y; - } - - static class ParsePoint implements Function { - private static final Pattern SPACE = Pattern.compile(" "); - - @Override - public DataPoint call(String line) { - String[] tok = SPACE.split(line); - double y = Double.parseDouble(tok[0]); - double[] x = new double[D]; - for (int i = 0; i < D; i++) { - x[i] = Double.parseDouble(tok[i + 1]); - } - return new DataPoint(x, y); - } - } - - static class VectorSum implements Function2 { - @Override - public double[] call(double[] a, double[] b) { - double[] result = new double[D]; - for (int j = 0; j < D; j++) { - result[j] = a[j] + b[j]; - } - return result; - } - } - - static class ComputeGradient implements Function { - private final double[] weights; - - ComputeGradient(double[] weights) { - this.weights = weights; - } - - @Override - public double[] call(DataPoint p) { - double[] gradient = new double[D]; - for (int i = 0; i < D; i++) { - double dot = dot(weights, p.x); - gradient[i] = (1 / (1 + Math.exp(-p.y * dot)) - 1) * p.y * p.x[i]; - } - return gradient; - } - } - - public static double dot(double[] a, double[] b) { - double x = 0; - for (int i = 0; i < D; i++) { - x += a[i] * b[i]; - } - return x; - } - - public static void printWeights(double[] a) { - System.out.println(Arrays.toString(a)); - } - - public static void main(String[] args) { - - if (args.length < 2) { - System.err.println("Usage: JavaHdfsLR "); - System.exit(1); - } - - showWarning(); - - SparkSession spark = SparkSession - .builder() - .appName("JavaHdfsLR") - .getOrCreate(); - - JavaRDD lines = spark.read().textFile(args[0]).javaRDD(); - JavaRDD points = lines.map(new ParsePoint()).cache(); - int ITERATIONS = Integer.parseInt(args[1]); - - // Initialize w to a random value - double[] w = new double[D]; - for (int i = 0; i < D; i++) { - w[i] = 2 * rand.nextDouble() - 1; - } - - System.out.print("Initial w: "); - printWeights(w); - - for (int i = 1; i <= ITERATIONS; i++) { - System.out.println("On iteration " + i); - - double[] gradient = points.map( - new ComputeGradient(w) - ).reduce(new VectorSum()); - - for (int j = 0; j < D; j++) { - w[j] -= gradient[j]; - } - - } - - System.out.print("Final w: "); - printWeights(w); - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java deleted file mode 100644 index cf12de3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples; - -import scala.Tuple2; -import scala.Tuple3; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * Executes a roll up-style query against Apache logs. - * - * Usage: JavaLogQuery [logFile] - */ -public final class JavaLogQuery { - - public static final List exampleApacheLogs = Arrays.asList( - "10.10.10.10 - \"FRED\" [18/Jan/2013:17:56:07 +1100] \"GET http://images.com/2013/Generic.jpg " + - "HTTP/1.1\" 304 315 \"http://referall.com/\" \"Mozilla/4.0 (compatible; MSIE 7.0; " + - "Windows NT 5.1; GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; " + - ".NET CLR 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR " + - "3.5.30729; Release=ARP)\" \"UD-1\" - \"image/jpeg\" \"whatever\" 0.350 \"-\" - \"\" 265 923 934 \"\" " + - "62.24.11.25 images.com 1358492167 - Whatup", - "10.10.10.10 - \"FRED\" [18/Jan/2013:18:02:37 +1100] \"GET http://images.com/2013/Generic.jpg " + - "HTTP/1.1\" 304 306 \"http:/referall.com\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; " + - "GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR " + - "3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR " + - "3.5.30729; Release=ARP)\" \"UD-1\" - \"image/jpeg\" \"whatever\" 0.352 \"-\" - \"\" 256 977 988 \"\" " + - "0 73.23.2.15 images.com 1358492557 - Whatup"); - - public static final Pattern apacheLogRegex = Pattern.compile( - "^([\\d.]+) (\\S+) (\\S+) \\[([\\w\\d:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) ([\\d\\-]+) \"([^\"]+)\" \"([^\"]+)\".*"); - - /** Tracks the total query count and number of aggregate bytes for a particular group. 
*/ - public static class Stats implements Serializable { - - private final int count; - private final int numBytes; - - public Stats(int count, int numBytes) { - this.count = count; - this.numBytes = numBytes; - } - public Stats merge(Stats other) { - return new Stats(count + other.count, numBytes + other.numBytes); - } - - public String toString() { - return String.format("bytes=%s\tn=%s", numBytes, count); - } - } - - public static Tuple3 extractKey(String line) { - Matcher m = apacheLogRegex.matcher(line); - if (m.find()) { - String ip = m.group(1); - String user = m.group(3); - String query = m.group(5); - if (!user.equalsIgnoreCase("-")) { - return new Tuple3<>(ip, user, query); - } - } - return new Tuple3<>(null, null, null); - } - - public static Stats extractStats(String line) { - Matcher m = apacheLogRegex.matcher(line); - if (m.find()) { - int bytes = Integer.parseInt(m.group(7)); - return new Stats(1, bytes); - } else { - return new Stats(1, 0); - } - } - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaLogQuery") - .getOrCreate(); - - JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs); - - JavaPairRDD, Stats> extracted = - dataSet.mapToPair(s -> new Tuple2<>(extractKey(s), extractStats(s))); - - JavaPairRDD, Stats> counts = extracted.reduceByKey(Stats::merge); - - List, Stats>> output = counts.collect(); - for (Tuple2 t : output) { - System.out.println(t._1() + "\t" + t._2()); - } - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java deleted file mode 100644 index b5b4703..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Pattern; - -import scala.Tuple2; - -import com.google.common.collect.Iterables; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.sql.SparkSession; - -/** - * Computes the PageRank of URLs from an input file. Input file should - * be in format of: - * URL neighbor URL - * URL neighbor URL - * URL neighbor URL - * ... - * where URL and their neighbors are separated by space(s). - * - * This is an example implementation for learning how to use Spark. 
For more conventional use, - * please refer to org.apache.spark.graphx.lib.PageRank - * - * Example Usage: - * <pre> - * bin/run-example JavaPageRank data/mllib/pagerank_data.txt 10 - * </pre>
- */ -public final class JavaPageRank { - private static final Pattern SPACES = Pattern.compile("\\s+"); - - static void showWarning() { - String warning = "WARN: This is a naive implementation of PageRank " + - "and is given as an example! \n" + - "Please use the PageRank implementation found in " + - "org.apache.spark.graphx.lib.PageRank for more conventional use."; - System.err.println(warning); - } - - private static class Sum implements Function2 { - @Override - public Double call(Double a, Double b) { - return a + b; - } - } - - public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println("Usage: JavaPageRank "); - System.exit(1); - } - - showWarning(); - - SparkSession spark = SparkSession - .builder() - .appName("JavaPageRank") - .getOrCreate(); - - // Loads in input file. It should be in format of: - // URL neighbor URL - // URL neighbor URL - // URL neighbor URL - // ... - JavaRDD lines = spark.read().textFile(args[0]).javaRDD(); - - // Loads all URLs from input file and initialize their neighbors. - JavaPairRDD> links = lines.mapToPair(s -> { - String[] parts = SPACES.split(s); - return new Tuple2<>(parts[0], parts[1]); - }).distinct().groupByKey().cache(); - - // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one. - JavaPairRDD ranks = links.mapValues(rs -> 1.0); - - // Calculates and updates URL ranks continuously using PageRank algorithm. - for (int current = 0; current < Integer.parseInt(args[1]); current++) { - // Calculates URL contributions to the rank of other URLs. - JavaPairRDD contribs = links.join(ranks).values() - .flatMapToPair(s -> { - int urlCount = Iterables.size(s._1()); - List> results = new ArrayList<>(); - for (String n : s._1) { - results.add(new Tuple2<>(n, s._2() / urlCount)); - } - return results.iterator(); - }); - - // Re-calculates URL ranks based on neighbor contributions. - ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85); - } - - // Collects all URL ranks and dump them to console. - List> output = ranks.collect(); - for (Tuple2 tuple : output) { - System.out.println(tuple._1() + " has rank: " + tuple._2() + "."); - } - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java deleted file mode 100644 index 37bd8ff..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; - -import java.util.ArrayList; -import java.util.List; - -/** - * Computes an approximation to pi - * Usage: JavaSparkPi [partitions] - */ -public final class JavaSparkPi { - - public static void main(String[] args) throws Exception { - SparkSession spark = SparkSession - .builder() - .appName("JavaSparkPi") - .getOrCreate(); - - JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext()); - - int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2; - int n = 100000 * slices; - List l = new ArrayList<>(n); - for (int i = 0; i < n; i++) { - l.add(i); - } - - JavaRDD dataSet = jsc.parallelize(l, slices); - - int count = dataSet.map(integer -> { - double x = Math.random() * 2 - 1; - double y = Math.random() * 2 - 1; - return (x * x + y * y <= 1) ? 1 : 0; - }).reduce((integer, integer2) -> integer + integer2); - - System.out.println("Pi is roughly " + 4.0 * count / n); - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaStatusTrackerDemo.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaStatusTrackerDemo.java deleted file mode 100644 index b0ebedf..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaStatusTrackerDemo.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples; - -import org.apache.spark.SparkJobInfo; -import org.apache.spark.SparkStageInfo; -import org.apache.spark.api.java.JavaFutureAction; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.sql.SparkSession; - -import java.util.Arrays; -import java.util.List; - -/** - * Example of using Spark's status APIs from Java. - */ -public final class JavaStatusTrackerDemo { - - public static final String APP_NAME = "JavaStatusAPIDemo"; - - public static final class IdentityWithDelay implements Function { - @Override - public T call(T x) throws Exception { - Thread.sleep(2 * 1000); // 2 seconds - return x; - } - } - - public static void main(String[] args) throws Exception { - SparkSession spark = SparkSession - .builder() - .appName(APP_NAME) - .getOrCreate(); - - JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext()); - - // Example of implementing a progress reporter for a simple job. 
- JavaRDD rdd = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 5).map( - new IdentityWithDelay<>()); - JavaFutureAction> jobFuture = rdd.collectAsync(); - while (!jobFuture.isDone()) { - Thread.sleep(1000); // 1 second - List jobIds = jobFuture.jobIds(); - if (jobIds.isEmpty()) { - continue; - } - int currentJobId = jobIds.get(jobIds.size() - 1); - SparkJobInfo jobInfo = jsc.statusTracker().getJobInfo(currentJobId); - SparkStageInfo stageInfo = jsc.statusTracker().getStageInfo(jobInfo.stageIds()[0]); - System.out.println(stageInfo.numTasks() + " tasks total: " + stageInfo.numActiveTasks() + - " active, " + stageInfo.numCompletedTasks() + " complete"); - } - - System.out.println("Job results are: " + jobFuture.get()); - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaTC.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaTC.java deleted file mode 100644 index c9ca9c9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaTC.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Random; -import java.util.Set; - -import scala.Tuple2; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.SparkSession; - -/** - * Transitive closure on a graph, implemented in Java. - * Usage: JavaTC [partitions] - */ -public final class JavaTC { - - private static final int numEdges = 200; - private static final int numVertices = 100; - private static final Random rand = new Random(42); - - static List> generateGraph() { - Set> edges = new HashSet<>(numEdges); - while (edges.size() < numEdges) { - int from = rand.nextInt(numVertices); - int to = rand.nextInt(numVertices); - Tuple2 e = new Tuple2<>(from, to); - if (from != to) { - edges.add(e); - } - } - return new ArrayList<>(edges); - } - - static class ProjectFn implements PairFunction>, - Integer, Integer> { - static final ProjectFn INSTANCE = new ProjectFn(); - - @Override - public Tuple2 call(Tuple2> triple) { - return new Tuple2<>(triple._2()._2(), triple._2()._1()); - } - } - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaTC") - .getOrCreate(); - - JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext()); - - Integer slices = (args.length > 0) ? 
Integer.parseInt(args[0]): 2; - JavaPairRDD tc = jsc.parallelizePairs(generateGraph(), slices).cache(); - - // Linear transitive closure: each round grows paths by one edge, - // by joining the graph's edges with the already-discovered paths. - // e.g. join the path (y, z) from the TC with the edge (x, y) from - // the graph to obtain the path (x, z). - - // Because join() joins on keys, the edges are stored in reversed order. - JavaPairRDD edges = tc.mapToPair(e -> new Tuple2<>(e._2(), e._1())); - - long oldCount; - long nextCount = tc.count(); - do { - oldCount = nextCount; - // Perform the join, obtaining an RDD of (y, (z, x)) pairs, - // then project the result to obtain the new (x, z) paths. - tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache(); - nextCount = tc.count(); - } while (nextCount != oldCount); - - System.out.println("TC has " + tc.count() + " edges."); - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaWordCount.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaWordCount.java deleted file mode 100644 index f1ce1e9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/JavaWordCount.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples; - -import scala.Tuple2; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.sql.SparkSession; - -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; - -public final class JavaWordCount { - private static final Pattern SPACE = Pattern.compile(" "); - - public static void main(String[] args) throws Exception { - - if (args.length < 1) { - System.err.println("Usage: JavaWordCount "); - System.exit(1); - } - - SparkSession spark = SparkSession - .builder() - .appName("JavaWordCount") - .getOrCreate(); - - JavaRDD lines = spark.read().textFile(args[0]).javaRDD(); - - JavaRDD words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator()); - - JavaPairRDD ones = words.mapToPair(s -> new Tuple2<>(s, 1)); - - JavaPairRDD counts = ones.reduceByKey((i1, i2) -> i1 + i2); - - List> output = counts.collect(); - for (Tuple2 tuple : output) { - System.out.println(tuple._1() + ": " + tuple._2()); - } - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java deleted file mode 100644 index 7c741ff..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.regression.AFTSurvivalRegression; -import org.apache.spark.ml.regression.AFTSurvivalRegressionModel; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -/** - * An example demonstrating AFTSurvivalRegression. - * Run with - *
- * bin/run-example ml.JavaAFTSurvivalRegressionExample
- * </pre>
- */ -public class JavaAFTSurvivalRegressionExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaAFTSurvivalRegressionExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(1.218, 1.0, Vectors.dense(1.560, -0.605)), - RowFactory.create(2.949, 0.0, Vectors.dense(0.346, 2.158)), - RowFactory.create(3.627, 0.0, Vectors.dense(1.380, 0.231)), - RowFactory.create(0.273, 1.0, Vectors.dense(0.520, 1.151)), - RowFactory.create(4.199, 0.0, Vectors.dense(0.795, -0.226)) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("features", new VectorUDT(), false, Metadata.empty()) - }); - Dataset training = spark.createDataFrame(data, schema); - double[] quantileProbabilities = new double[]{0.3, 0.6}; - AFTSurvivalRegression aft = new AFTSurvivalRegression() - .setQuantileProbabilities(quantileProbabilities) - .setQuantilesCol("quantiles"); - - AFTSurvivalRegressionModel model = aft.fit(training); - - // Print the coefficients, intercept and scale parameter for AFT survival regression - System.out.println("Coefficients: " + model.coefficients()); - System.out.println("Intercept: " + model.intercept()); - System.out.println("Scale: " + model.scale()); - model.transform(training).show(false); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java deleted file mode 100644 index 27052be..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaALSExample.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.io.Serializable; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.ml.evaluation.RegressionEvaluator; -import org.apache.spark.ml.recommendation.ALS; -import org.apache.spark.ml.recommendation.ALSModel; -// $example off$ - -public class JavaALSExample { - - // $example on$ - public static class Rating implements Serializable { - private int userId; - private int movieId; - private float rating; - private long timestamp; - - public Rating() {} - - public Rating(int userId, int movieId, float rating, long timestamp) { - this.userId = userId; - this.movieId = movieId; - this.rating = rating; - this.timestamp = timestamp; - } - - public int getUserId() { - return userId; - } - - public int getMovieId() { - return movieId; - } - - public float getRating() { - return rating; - } - - public long getTimestamp() { - return timestamp; - } - - public static Rating parseRating(String str) { - String[] fields = str.split("::"); - if (fields.length != 4) { - throw new IllegalArgumentException("Each line must contain 4 fields"); - } - int userId = Integer.parseInt(fields[0]); - int movieId = Integer.parseInt(fields[1]); - float rating = Float.parseFloat(fields[2]); - long timestamp = Long.parseLong(fields[3]); - return new Rating(userId, movieId, rating, timestamp); - } - } - // $example off$ - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaALSExample") - .getOrCreate(); - - // $example on$ - JavaRDD ratingsRDD = spark - .read().textFile("data/mllib/als/sample_movielens_ratings.txt").javaRDD() - .map(Rating::parseRating); - Dataset ratings = spark.createDataFrame(ratingsRDD, Rating.class); - Dataset[] splits = ratings.randomSplit(new double[]{0.8, 0.2}); - Dataset training = splits[0]; - Dataset test = splits[1]; - - // Build the recommendation model using ALS on the training data - ALS als = new ALS() - .setMaxIter(5) - .setRegParam(0.01) - .setUserCol("userId") - .setItemCol("movieId") - .setRatingCol("rating"); - ALSModel model = als.fit(training); - - // Evaluate the model by computing the RMSE on the test data - // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics - model.setColdStartStrategy("drop"); - Dataset predictions = model.transform(test); - - RegressionEvaluator evaluator = new RegressionEvaluator() - .setMetricName("rmse") - .setLabelCol("rating") - .setPredictionCol("prediction"); - Double rmse = evaluator.evaluate(predictions); - System.out.println("Root-mean-square error = " + rmse); - - // Generate top 10 movie recommendations for each user - Dataset userRecs = model.recommendForAllUsers(10); - // Generate top 10 user recommendations for each movie - Dataset movieRecs = model.recommendForAllItems(10); - - // Generate top 10 movie recommendations for a specified set of users - Dataset users = ratings.select(als.getUserCol()).distinct().limit(3); - Dataset userSubsetRecs = model.recommendForUserSubset(users, 10); - // Generate top 10 user recommendations for a specified set of movies - Dataset movies = ratings.select(als.getItemCol()).distinct().limit(3); - Dataset movieSubSetRecs = model.recommendForItemSubset(movies, 10); - // $example off$ - userRecs.show(); - movieRecs.show(); - userSubsetRecs.show(); - movieSubSetRecs.show(); - - spark.stop(); - } -} diff --git 
a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java deleted file mode 100644 index 3090d8f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.Binarizer; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaBinarizerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaBinarizerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, 0.1), - RowFactory.create(1, 0.8), - RowFactory.create(2, 0.2) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) - }); - Dataset continuousDataFrame = spark.createDataFrame(data, schema); - - Binarizer binarizer = new Binarizer() - .setInputCol("feature") - .setOutputCol("binarized_feature") - .setThreshold(0.5); - - Dataset binarizedDataFrame = binarizer.transform(continuousDataFrame); - - System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold()); - binarizedDataFrame.show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java deleted file mode 100644 index 8c82aaa..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.clustering.BisectingKMeans; -import org.apache.spark.ml.clustering.BisectingKMeansModel; -import org.apache.spark.ml.linalg.Vector; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ -import org.apache.spark.sql.SparkSession; - - -/** - * An example demonstrating bisecting k-means clustering. - * Run with - *
- * bin/run-example ml.JavaBisectingKMeansExample
- * </pre>
- */ -public class JavaBisectingKMeansExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaBisectingKMeansExample") - .getOrCreate(); - - // $example on$ - // Loads data. - Dataset dataset = spark.read().format("libsvm").load("data/mllib/sample_kmeans_data.txt"); - - // Trains a bisecting k-means model. - BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1); - BisectingKMeansModel model = bkm.fit(dataset); - - // Evaluate clustering. - double cost = model.computeCost(dataset); - System.out.println("Within Set Sum of Squared Errors = " + cost); - - // Shows the result. - System.out.println("Cluster Centers: "); - Vector[] centers = model.clusterCenters(); - for (Vector center : centers) { - System.out.println(center); - } - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java deleted file mode 100644 index ff917b7..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.BucketedRandomProjectionLSH; -import org.apache.spark.ml.feature.BucketedRandomProjectionLSHModel; -import org.apache.spark.ml.linalg.Vector; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -import static org.apache.spark.sql.functions.col; -// $example off$ - -/** - * An example demonstrating BucketedRandomProjectionLSH. 
- * Run with: - * bin/run-example ml.JavaBucketedRandomProjectionLSHExample - */ -public class JavaBucketedRandomProjectionLSHExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaBucketedRandomProjectionLSHExample") - .getOrCreate(); - - // $example on$ - List dataA = Arrays.asList( - RowFactory.create(0, Vectors.dense(1.0, 1.0)), - RowFactory.create(1, Vectors.dense(1.0, -1.0)), - RowFactory.create(2, Vectors.dense(-1.0, -1.0)), - RowFactory.create(3, Vectors.dense(-1.0, 1.0)) - ); - - List dataB = Arrays.asList( - RowFactory.create(4, Vectors.dense(1.0, 0.0)), - RowFactory.create(5, Vectors.dense(-1.0, 0.0)), - RowFactory.create(6, Vectors.dense(0.0, 1.0)), - RowFactory.create(7, Vectors.dense(0.0, -1.0)) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("features", new VectorUDT(), false, Metadata.empty()) - }); - Dataset dfA = spark.createDataFrame(dataA, schema); - Dataset dfB = spark.createDataFrame(dataB, schema); - - Vector key = Vectors.dense(1.0, 0.0); - - BucketedRandomProjectionLSH mh = new BucketedRandomProjectionLSH() - .setBucketLength(2.0) - .setNumHashTables(3) - .setInputCol("features") - .setOutputCol("hashes"); - - BucketedRandomProjectionLSHModel model = mh.fit(dfA); - - // Feature Transformation - System.out.println("The hashed dataset where hashed values are stored in the column 'hashes':"); - model.transform(dfA).show(); - - // Compute the locality sensitive hashes for the input rows, then perform approximate - // similarity join. - // We could avoid computing hashes by passing in the already-transformed dataset, e.g. - // `model.approxSimilarityJoin(transformedA, transformedB, 1.5)` - System.out.println("Approximately joining dfA and dfB on distance smaller than 1.5:"); - model.approxSimilarityJoin(dfA, dfB, 1.5, "EuclideanDistance") - .select(col("datasetA.id").alias("idA"), - col("datasetB.id").alias("idB"), - col("EuclideanDistance")).show(); - - // Compute the locality sensitive hashes for the input rows, then perform approximate nearest - // neighbor search. - // We could avoid computing hashes by passing in the already-transformed dataset, e.g. - // `model.approxNearestNeighbors(transformedA, key, 2)` - System.out.println("Approximately searching dfA for 2 nearest neighbors of the key:"); - model.approxNearestNeighbors(dfA, key, 2).show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java deleted file mode 100644 index 3e49bf0..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.Bucketizer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -/** - * An example for Bucketizer. - * Run with - *
- * bin/run-example ml.JavaBucketizerExample
- * </pre>
- */ -public class JavaBucketizerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaBucketizerExample") - .getOrCreate(); - - // $example on$ - double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY}; - - List data = Arrays.asList( - RowFactory.create(-999.9), - RowFactory.create(-0.5), - RowFactory.create(-0.3), - RowFactory.create(0.0), - RowFactory.create(0.2), - RowFactory.create(999.9) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) - }); - Dataset dataFrame = spark.createDataFrame(data, schema); - - Bucketizer bucketizer = new Bucketizer() - .setInputCol("features") - .setOutputCol("bucketedFeatures") - .setSplits(splits); - - // Transform original data into its bucket index. - Dataset bucketedData = bucketizer.transform(dataFrame); - - System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets"); - bucketedData.show(); - // $example off$ - - // $example on$ - // Bucketize multiple columns at one pass. - double[][] splitsArray = { - {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY}, - {Double.NEGATIVE_INFINITY, -0.3, 0.0, 0.3, Double.POSITIVE_INFINITY} - }; - - List data2 = Arrays.asList( - RowFactory.create(-999.9, -999.9), - RowFactory.create(-0.5, -0.2), - RowFactory.create(-0.3, -0.1), - RowFactory.create(0.0, 0.0), - RowFactory.create(0.2, 0.4), - RowFactory.create(999.9, 999.9) - ); - StructType schema2 = new StructType(new StructField[]{ - new StructField("features1", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("features2", DataTypes.DoubleType, false, Metadata.empty()) - }); - Dataset dataFrame2 = spark.createDataFrame(data2, schema2); - - Bucketizer bucketizer2 = new Bucketizer() - .setInputCols(new String[] {"features1", "features2"}) - .setOutputCols(new String[] {"bucketedFeatures1", "bucketedFeatures2"}) - .setSplitsArray(splitsArray); - // Transform original data into its bucket index. - Dataset bucketedData2 = bucketizer2.transform(dataFrame2); - - System.out.println("Bucketizer output with [" + - (bucketizer2.getSplitsArray()[0].length-1) + ", " + - (bucketizer2.getSplitsArray()[1].length-1) + "] buckets for each input column"); - bucketedData2.show(); - // $example off$ - - spark.stop(); - } -} - - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java deleted file mode 100644 index 7373896..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSqSelectorExample.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.ChiSqSelector; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaChiSqSelectorExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaChiSqSelectorExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0), - RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0), - RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("features", new VectorUDT(), false, Metadata.empty()), - new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty()) - }); - - Dataset df = spark.createDataFrame(data, schema); - - ChiSqSelector selector = new ChiSqSelector() - .setNumTopFeatures(1) - .setFeaturesCol("features") - .setLabelCol("clicked") - .setOutputCol("selectedFeatures"); - - Dataset result = selector.fit(df).transform(df); - - System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures() - + " features selected"); - result.show(); - - // $example off$ - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSquareTestExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSquareTestExample.java deleted file mode 100644 index 4b39350..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaChiSquareTestExample.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.stat.ChiSquareTest; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.*; -// $example off$ - -/** - * An example for Chi-square hypothesis testing. - * Run with - *
- * bin/run-example ml.JavaChiSquareTestExample
- * </pre>
- */ -public class JavaChiSquareTestExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaChiSquareTestExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0.0, Vectors.dense(0.5, 10.0)), - RowFactory.create(0.0, Vectors.dense(1.5, 20.0)), - RowFactory.create(1.0, Vectors.dense(1.5, 30.0)), - RowFactory.create(0.0, Vectors.dense(3.5, 30.0)), - RowFactory.create(0.0, Vectors.dense(3.5, 40.0)), - RowFactory.create(1.0, Vectors.dense(3.5, 40.0)) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("features", new VectorUDT(), false, Metadata.empty()), - }); - - Dataset df = spark.createDataFrame(data, schema); - Row r = ChiSquareTest.test(df, "features", "label").head(); - System.out.println("pValues: " + r.get(0).toString()); - System.out.println("degreesOfFreedom: " + r.getList(1).toString()); - System.out.println("statistics: " + r.get(2).toString()); - - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationExample.java deleted file mode 100644 index 2a6d62a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationExample.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.stat.Correlation; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.*; -// $example off$ - -/** - * An example for computing correlation matrix. - * Run with - *
- * bin/run-example ml.JavaCorrelationExample
- * </pre>
- */ -public class JavaCorrelationExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaCorrelationExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(Vectors.sparse(4, new int[]{0, 3}, new double[]{1.0, -2.0})), - RowFactory.create(Vectors.dense(4.0, 5.0, 0.0, 3.0)), - RowFactory.create(Vectors.dense(6.0, 7.0, 0.0, 8.0)), - RowFactory.create(Vectors.sparse(4, new int[]{0, 3}, new double[]{9.0, 1.0})) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("features", new VectorUDT(), false, Metadata.empty()), - }); - - Dataset df = spark.createDataFrame(data, schema); - Row r1 = Correlation.corr(df, "features").head(); - System.out.println("Pearson correlation matrix:\n" + r1.get(0).toString()); - - Row r2 = Correlation.corr(df, "features", "spearman").head(); - System.out.println("Spearman correlation matrix:\n" + r2.get(0).toString()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java deleted file mode 100644 index ac2a86c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.CountVectorizer; -import org.apache.spark.ml.feature.CountVectorizerModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.*; -// $example off$ - -public class JavaCountVectorizerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaCountVectorizerExample") - .getOrCreate(); - - // $example on$ - // Input data: Each row is a bag of words from a sentence or document. 
- List data = Arrays.asList( - RowFactory.create(Arrays.asList("a", "b", "c")), - RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")) - ); - StructType schema = new StructType(new StructField [] { - new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) - }); - Dataset df = spark.createDataFrame(data, schema); - - // fit a CountVectorizerModel from the corpus - CountVectorizerModel cvModel = new CountVectorizer() - .setInputCol("text") - .setOutputCol("feature") - .setVocabSize(3) - .setMinDF(2) - .fit(df); - - // alternatively, define CountVectorizerModel with a-priori vocabulary - CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"}) - .setInputCol("text") - .setOutputCol("feature"); - - cvModel.transform(df).show(false); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java deleted file mode 100644 index 04546d2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.DCT; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaDCTExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaDCTExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)), - RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)), - RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("features", new VectorUDT(), false, Metadata.empty()), - }); - Dataset df = spark.createDataFrame(data, schema); - - DCT dct = new DCT() - .setInputCol("features") - .setOutputCol("featuresDCT") - .setInverse(false); - - Dataset dctDf = dct.transform(df); - - dctDf.select("featuresDCT").show(false); - // $example off$ - - spark.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java deleted file mode 100644 index a9c6e7f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -// scalastyle:off println -package org.apache.spark.examples.ml; -// $example on$ -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.classification.DecisionTreeClassifier; -import org.apache.spark.ml.classification.DecisionTreeClassificationModel; -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; -import org.apache.spark.ml.feature.*; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaDecisionTreeClassificationExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaDecisionTreeClassificationExample") - .getOrCreate(); - - // $example on$ - // Load the data stored in LIBSVM format as a DataFrame. - Dataset data = spark - .read() - .format("libsvm") - .load("data/mllib/sample_libsvm_data.txt"); - - // Index labels, adding metadata to the label column. - // Fit on whole dataset to include all labels in index. - StringIndexerModel labelIndexer = new StringIndexer() - .setInputCol("label") - .setOutputCol("indexedLabel") - .fit(data); - - // Automatically identify categorical features, and index them. - VectorIndexerModel featureIndexer = new VectorIndexer() - .setInputCol("features") - .setOutputCol("indexedFeatures") - .setMaxCategories(4) // features with > 4 distinct values are treated as continuous. - .fit(data); - - // Split the data into training and test sets (30% held out for testing). - Dataset[] splits = data.randomSplit(new double[]{0.7, 0.3}); - Dataset trainingData = splits[0]; - Dataset testData = splits[1]; - - // Train a DecisionTree model. - DecisionTreeClassifier dt = new DecisionTreeClassifier() - .setLabelCol("indexedLabel") - .setFeaturesCol("indexedFeatures"); - - // Convert indexed labels back to original labels. - IndexToString labelConverter = new IndexToString() - .setInputCol("prediction") - .setOutputCol("predictedLabel") - .setLabels(labelIndexer.labels()); - - // Chain indexers and tree in a Pipeline. - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter}); - - // Train model. This also runs the indexers. - PipelineModel model = pipeline.fit(trainingData); - - // Make predictions. - Dataset predictions = model.transform(testData); - - // Select example rows to display. - predictions.select("predictedLabel", "label", "features").show(5); - - // Select (prediction, true label) and compute test error. 
- MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() - .setLabelCol("indexedLabel") - .setPredictionCol("prediction") - .setMetricName("accuracy"); - double accuracy = evaluator.evaluate(predictions); - System.out.println("Test Error = " + (1.0 - accuracy)); - - DecisionTreeClassificationModel treeModel = - (DecisionTreeClassificationModel) (model.stages()[2]); - System.out.println("Learned classification tree model:\n" + treeModel.toDebugString()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java deleted file mode 100644 index cffb713..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -// scalastyle:off println -package org.apache.spark.examples.ml; -// $example on$ -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.evaluation.RegressionEvaluator; -import org.apache.spark.ml.feature.VectorIndexer; -import org.apache.spark.ml.feature.VectorIndexerModel; -import org.apache.spark.ml.regression.DecisionTreeRegressionModel; -import org.apache.spark.ml.regression.DecisionTreeRegressor; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaDecisionTreeRegressionExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaDecisionTreeRegressionExample") - .getOrCreate(); - // $example on$ - // Load the data stored in LIBSVM format as a DataFrame. - Dataset data = spark.read().format("libsvm") - .load("data/mllib/sample_libsvm_data.txt"); - - // Automatically identify categorical features, and index them. - // Set maxCategories so features with > 4 distinct values are treated as continuous. - VectorIndexerModel featureIndexer = new VectorIndexer() - .setInputCol("features") - .setOutputCol("indexedFeatures") - .setMaxCategories(4) - .fit(data); - - // Split the data into training and test sets (30% held out for testing). - Dataset[] splits = data.randomSplit(new double[]{0.7, 0.3}); - Dataset trainingData = splits[0]; - Dataset testData = splits[1]; - - // Train a DecisionTree model. - DecisionTreeRegressor dt = new DecisionTreeRegressor() - .setFeaturesCol("indexedFeatures"); - - // Chain indexer and tree in a Pipeline. 
- Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[]{featureIndexer, dt}); - - // Train model. This also runs the indexer. - PipelineModel model = pipeline.fit(trainingData); - - // Make predictions. - Dataset predictions = model.transform(testData); - - // Select example rows to display. - predictions.select("label", "features").show(5); - - // Select (prediction, true label) and compute test error. - RegressionEvaluator evaluator = new RegressionEvaluator() - .setLabelCol("label") - .setPredictionCol("prediction") - .setMetricName("rmse"); - double rmse = evaluator.evaluate(predictions); - System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); - - DecisionTreeRegressionModel treeModel = - (DecisionTreeRegressionModel) (model.stages()[1]); - System.out.println("Learned regression tree model:\n" + treeModel.toDebugString()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDocument.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDocument.java deleted file mode 100644 index 6459dab..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaDocument.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import java.io.Serializable; - -/** - * Unlabeled instance type, Spark SQL can infer schema from Java Beans. - */ -@SuppressWarnings("serial") -public class JavaDocument implements Serializable { - - private long id; - private String text; - - public JavaDocument(long id, String text) { - this.id = id; - this.text = text; - } - - public long getId() { - return this.id; - } - - public String getText() { - return this.text; - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java deleted file mode 100644 index d2e70c2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.ElementwiseProduct; -import org.apache.spark.ml.linalg.Vector; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaElementwiseProductExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaElementwiseProductExample") - .getOrCreate(); - - // $example on$ - // Create some vector data; also works for sparse vectors - List data = Arrays.asList( - RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)), - RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0)) - ); - - List fields = new ArrayList<>(2); - fields.add(DataTypes.createStructField("id", DataTypes.StringType, false)); - fields.add(DataTypes.createStructField("vector", new VectorUDT(), false)); - - StructType schema = DataTypes.createStructType(fields); - - Dataset dataFrame = spark.createDataFrame(data, schema); - - Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0); - - ElementwiseProduct transformer = new ElementwiseProduct() - .setScalingVec(transformingVector) - .setInputCol("vector") - .setOutputCol("transformedVector"); - - // Batch transform the vectors to create new column: - transformer.transform(dataFrame).show(); - // $example off$ - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java deleted file mode 100644 index 9e07a0c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.ml.param.ParamMap; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ -import org.apache.spark.sql.SparkSession; - -/** - * Java example for Estimator, Transformer, and Param. - */ -public class JavaEstimatorTransformerParamExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaEstimatorTransformerParamExample") - .getOrCreate(); - - // $example on$ - // Prepare training data. - List dataTraining = Arrays.asList( - RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)), - RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)), - RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)), - RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5)) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("features", new VectorUDT(), false, Metadata.empty()) - }); - Dataset training = spark.createDataFrame(dataTraining, schema); - - // Create a LogisticRegression instance. This instance is an Estimator. - LogisticRegression lr = new LogisticRegression(); - // Print out the parameters, documentation, and any default values. - System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n"); - - // We may set parameters using setter methods. - lr.setMaxIter(10).setRegParam(0.01); - - // Learn a LogisticRegression model. This uses the parameters stored in lr. - LogisticRegressionModel model1 = lr.fit(training); - // Since model1 is a Model (i.e., a Transformer produced by an Estimator), - // we can view the parameters it used during fit(). - // This prints the parameter (name: value) pairs, where names are unique IDs for this - // LogisticRegression instance. - System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap()); - - // We may alternatively specify parameters using a ParamMap. - ParamMap paramMap = new ParamMap() - .put(lr.maxIter().w(20)) // Specify 1 Param. - .put(lr.maxIter(), 30) // This overwrites the original maxIter. - .put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params. - - // One can also combine ParamMaps. - ParamMap paramMap2 = new ParamMap() - .put(lr.probabilityCol().w("myProbability")); // Change output column name - ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2); - - // Now learn a new model using the paramMapCombined parameters. - // paramMapCombined overrides all parameters set earlier via lr.set* methods. - LogisticRegressionModel model2 = lr.fit(training, paramMapCombined); - System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap()); - - // Prepare test documents. 
- List dataTest = Arrays.asList( - RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)), - RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)), - RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5)) - ); - Dataset test = spark.createDataFrame(dataTest, schema); - - // Make predictions on test documents using the Transformer.transform() method. - // LogisticRegression.transform will only use the 'features' column. - // Note that model2.transform() outputs a 'myProbability' column instead of the usual - // 'probability' column since we renamed the lr.probabilityCol parameter previously. - Dataset results = model2.transform(test); - Dataset rows = results.select("features", "label", "myProbability", "prediction"); - for (Row r: rows.collectAsList()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) - + ", prediction=" + r.get(3)); - } - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaFPGrowthExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaFPGrowthExample.java deleted file mode 100644 index 717ec21..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaFPGrowthExample.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.fpm.FPGrowth; -import org.apache.spark.ml.fpm.FPGrowthModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.*; -// $example off$ - -/** - * An example demonstrating FPGrowth. - * Run with - *
- * bin/run-example ml.JavaFPGrowthExample
- *
- */ -public class JavaFPGrowthExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaFPGrowthExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(Arrays.asList("1 2 5".split(" "))), - RowFactory.create(Arrays.asList("1 2 3 5".split(" "))), - RowFactory.create(Arrays.asList("1 2".split(" "))) - ); - StructType schema = new StructType(new StructField[]{ new StructField( - "items", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) - }); - Dataset itemsDF = spark.createDataFrame(data, schema); - - FPGrowthModel model = new FPGrowth() - .setItemsCol("items") - .setMinSupport(0.5) - .setMinConfidence(0.6) - .fit(itemsDF); - - // Display frequent itemsets. - model.freqItemsets().show(); - - // Display generated association rules. - model.associationRules().show(); - - // transform examines the input items against all the association rules and summarize the - // consequents as prediction - model.transform(itemsDF).show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaFeatureHasherExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaFeatureHasherExample.java deleted file mode 100644 index 9730d42..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaFeatureHasherExample.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.FeatureHasher; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaFeatureHasherExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaFeatureHasherExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(2.2, true, "1", "foo"), - RowFactory.create(3.3, false, "2", "bar"), - RowFactory.create(4.4, false, "3", "baz"), - RowFactory.create(5.5, false, "4", "foo") - ); - StructType schema = new StructType(new StructField[]{ - new StructField("real", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("bool", DataTypes.BooleanType, false, Metadata.empty()), - new StructField("stringNum", DataTypes.StringType, false, Metadata.empty()), - new StructField("string", DataTypes.StringType, false, Metadata.empty()) - }); - Dataset dataset = spark.createDataFrame(data, schema); - - FeatureHasher hasher = new FeatureHasher() - .setInputCols(new String[]{"real", "bool", "stringNum", "string"}) - .setOutputCol("features"); - - Dataset featurized = hasher.transform(dataset); - - featurized.show(false); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java deleted file mode 100644 index 72bd5d0..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.clustering.GaussianMixture; -import org.apache.spark.ml.clustering.GaussianMixtureModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ -import org.apache.spark.sql.SparkSession; - - -/** - * An example demonstrating Gaussian Mixture Model. - * Run with - *
- * bin/run-example ml.JavaGaussianMixtureExample
- *
- */ -public class JavaGaussianMixtureExample { - - public static void main(String[] args) { - - // Creates a SparkSession - SparkSession spark = SparkSession - .builder() - .appName("JavaGaussianMixtureExample") - .getOrCreate(); - - // $example on$ - // Loads data - Dataset dataset = spark.read().format("libsvm").load("data/mllib/sample_kmeans_data.txt"); - - // Trains a GaussianMixture model - GaussianMixture gmm = new GaussianMixture() - .setK(2); - GaussianMixtureModel model = gmm.fit(dataset); - - // Output the parameters of the mixture model - for (int i = 0; i < model.getK(); i++) { - System.out.printf("Gaussian %d:\nweight=%f\nmu=%s\nsigma=\n%s\n\n", - i, model.weights()[i], model.gaussians()[i].mean(), model.gaussians()[i].cov()); - } - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGeneralizedLinearRegressionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGeneralizedLinearRegressionExample.java deleted file mode 100644 index 3f072d1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGeneralizedLinearRegressionExample.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; - -import org.apache.spark.ml.regression.GeneralizedLinearRegression; -import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel; -import org.apache.spark.ml.regression.GeneralizedLinearRegressionTrainingSummary; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ -import org.apache.spark.sql.SparkSession; - -/** - * An example demonstrating generalized linear regression. - * Run with - *
- * bin/run-example ml.JavaGeneralizedLinearRegressionExample
- *
- */ - -public class JavaGeneralizedLinearRegressionExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaGeneralizedLinearRegressionExample") - .getOrCreate(); - - // $example on$ - // Load training data - Dataset dataset = spark.read().format("libsvm") - .load("data/mllib/sample_linear_regression_data.txt"); - - GeneralizedLinearRegression glr = new GeneralizedLinearRegression() - .setFamily("gaussian") - .setLink("identity") - .setMaxIter(10) - .setRegParam(0.3); - - // Fit the model - GeneralizedLinearRegressionModel model = glr.fit(dataset); - - // Print the coefficients and intercept for generalized linear regression model - System.out.println("Coefficients: " + model.coefficients()); - System.out.println("Intercept: " + model.intercept()); - - // Summarize the model over the training set and print out some metrics - GeneralizedLinearRegressionTrainingSummary summary = model.summary(); - System.out.println("Coefficient Standard Errors: " - + Arrays.toString(summary.coefficientStandardErrors())); - System.out.println("T Values: " + Arrays.toString(summary.tValues())); - System.out.println("P Values: " + Arrays.toString(summary.pValues())); - System.out.println("Dispersion: " + summary.dispersion()); - System.out.println("Null Deviance: " + summary.nullDeviance()); - System.out.println("Residual Degree Of Freedom Null: " + summary.residualDegreeOfFreedomNull()); - System.out.println("Deviance: " + summary.deviance()); - System.out.println("Residual Degree Of Freedom: " + summary.residualDegreeOfFreedom()); - System.out.println("AIC: " + summary.aic()); - System.out.println("Deviance Residuals: "); - summary.residuals().show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java deleted file mode 100644 index 3e9eb99..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.classification.GBTClassificationModel; -import org.apache.spark.ml.classification.GBTClassifier; -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; -import org.apache.spark.ml.feature.*; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaGradientBoostedTreeClassifierExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaGradientBoostedTreeClassifierExample") - .getOrCreate(); - - // $example on$ - // Load and parse the data file, converting it to a DataFrame. - Dataset data = spark - .read() - .format("libsvm") - .load("data/mllib/sample_libsvm_data.txt"); - - // Index labels, adding metadata to the label column. - // Fit on whole dataset to include all labels in index. - StringIndexerModel labelIndexer = new StringIndexer() - .setInputCol("label") - .setOutputCol("indexedLabel") - .fit(data); - // Automatically identify categorical features, and index them. - // Set maxCategories so features with > 4 distinct values are treated as continuous. - VectorIndexerModel featureIndexer = new VectorIndexer() - .setInputCol("features") - .setOutputCol("indexedFeatures") - .setMaxCategories(4) - .fit(data); - - // Split the data into training and test sets (30% held out for testing) - Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}); - Dataset trainingData = splits[0]; - Dataset testData = splits[1]; - - // Train a GBT model. - GBTClassifier gbt = new GBTClassifier() - .setLabelCol("indexedLabel") - .setFeaturesCol("indexedFeatures") - .setMaxIter(10); - - // Convert indexed labels back to original labels. - IndexToString labelConverter = new IndexToString() - .setInputCol("prediction") - .setOutputCol("predictedLabel") - .setLabels(labelIndexer.labels()); - - // Chain indexers and GBT in a Pipeline. - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter}); - - // Train model. This also runs the indexers. - PipelineModel model = pipeline.fit(trainingData); - - // Make predictions. - Dataset predictions = model.transform(testData); - - // Select example rows to display. - predictions.select("predictedLabel", "label", "features").show(5); - - // Select (prediction, true label) and compute test error. 
- MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() - .setLabelCol("indexedLabel") - .setPredictionCol("prediction") - .setMetricName("accuracy"); - double accuracy = evaluator.evaluate(predictions); - System.out.println("Test Error = " + (1.0 - accuracy)); - - GBTClassificationModel gbtModel = (GBTClassificationModel)(model.stages()[2]); - System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java deleted file mode 100644 index 769b5c3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.evaluation.RegressionEvaluator; -import org.apache.spark.ml.feature.VectorIndexer; -import org.apache.spark.ml.feature.VectorIndexerModel; -import org.apache.spark.ml.regression.GBTRegressionModel; -import org.apache.spark.ml.regression.GBTRegressor; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaGradientBoostedTreeRegressorExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaGradientBoostedTreeRegressorExample") - .getOrCreate(); - - // $example on$ - // Load and parse the data file, converting it to a DataFrame. - Dataset data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); - - // Automatically identify categorical features, and index them. - // Set maxCategories so features with > 4 distinct values are treated as continuous. - VectorIndexerModel featureIndexer = new VectorIndexer() - .setInputCol("features") - .setOutputCol("indexedFeatures") - .setMaxCategories(4) - .fit(data); - - // Split the data into training and test sets (30% held out for testing). - Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}); - Dataset trainingData = splits[0]; - Dataset testData = splits[1]; - - // Train a GBT model. - GBTRegressor gbt = new GBTRegressor() - .setLabelCol("label") - .setFeaturesCol("indexedFeatures") - .setMaxIter(10); - - // Chain indexer and GBT in a Pipeline. 
- Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {featureIndexer, gbt}); - - // Train model. This also runs the indexer. - PipelineModel model = pipeline.fit(trainingData); - - // Make predictions. - Dataset predictions = model.transform(testData); - - // Select example rows to display. - predictions.select("prediction", "label", "features").show(5); - - // Select (prediction, true label) and compute test error. - RegressionEvaluator evaluator = new RegressionEvaluator() - .setLabelCol("label") - .setPredictionCol("prediction") - .setMetricName("rmse"); - double rmse = evaluator.evaluate(predictions); - System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); - - GBTRegressionModel gbtModel = (GBTRegressionModel)(model.stages()[1]); - System.out.println("Learned regression GBT model:\n" + gbtModel.toDebugString()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaImputerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaImputerExample.java deleted file mode 100644 index ac40ccd..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaImputerExample.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.Imputer; -import org.apache.spark.ml.feature.ImputerModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.*; -// $example off$ - -import static org.apache.spark.sql.types.DataTypes.*; - -/** - * An example demonstrating Imputer. 
- * Run with: - * bin/run-example ml.JavaImputerExample - */ -public class JavaImputerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaImputerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(1.0, Double.NaN), - RowFactory.create(2.0, Double.NaN), - RowFactory.create(Double.NaN, 3.0), - RowFactory.create(4.0, 4.0), - RowFactory.create(5.0, 5.0) - ); - StructType schema = new StructType(new StructField[]{ - createStructField("a", DoubleType, false), - createStructField("b", DoubleType, false) - }); - Dataset df = spark.createDataFrame(data, schema); - - Imputer imputer = new Imputer() - .setInputCols(new String[]{"a", "b"}) - .setOutputCols(new String[]{"out_a", "out_b"}); - - ImputerModel model = imputer.fit(df); - model.transform(df).show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java deleted file mode 100644 index 6965512..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaIndexToStringExample.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.attribute.Attribute; -import org.apache.spark.ml.feature.IndexToString; -import org.apache.spark.ml.feature.StringIndexer; -import org.apache.spark.ml.feature.StringIndexerModel; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaIndexToStringExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaIndexToStringExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, "a"), - RowFactory.create(1, "b"), - RowFactory.create(2, "c"), - RowFactory.create(3, "a"), - RowFactory.create(4, "a"), - RowFactory.create(5, "c") - ); - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("category", DataTypes.StringType, false, Metadata.empty()) - }); - Dataset df = spark.createDataFrame(data, schema); - - StringIndexerModel indexer = new StringIndexer() - .setInputCol("category") - .setOutputCol("categoryIndex") - .fit(df); - Dataset indexed = indexer.transform(df); - - System.out.println("Transformed string column '" + indexer.getInputCol() + "' " + - "to indexed column '" + indexer.getOutputCol() + "'"); - indexed.show(); - - StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol()); - System.out.println("StringIndexer will store labels in output column metadata: " + - Attribute.fromStructField(inputColSchema).toString() + "\n"); - - IndexToString converter = new IndexToString() - .setInputCol("categoryIndex") - .setOutputCol("originalCategory"); - Dataset converted = converter.transform(indexed); - - System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " + - "original string column '" + converter.getOutputCol() + "' using labels in metadata"); - converted.select("id", "categoryIndex", "originalCategory").show(); - - // $example off$ - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java deleted file mode 100644 index 3684a87..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.ml.feature.Interaction; -import org.apache.spark.ml.feature.VectorAssembler; -import org.apache.spark.sql.*; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -import java.util.Arrays; -import java.util.List; - -// $example on$ -// $example off$ - -public class JavaInteractionExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaInteractionExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(1, 1, 2, 3, 8, 4, 5), - RowFactory.create(2, 4, 3, 8, 7, 9, 8), - RowFactory.create(3, 6, 1, 9, 2, 3, 6), - RowFactory.create(4, 10, 8, 6, 9, 4, 5), - RowFactory.create(5, 9, 2, 7, 10, 7, 3), - RowFactory.create(6, 1, 1, 4, 2, 8, 4) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("id1", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("id2", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("id3", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("id4", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("id5", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("id6", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("id7", DataTypes.IntegerType, false, Metadata.empty()) - }); - - Dataset df = spark.createDataFrame(data, schema); - - VectorAssembler assembler1 = new VectorAssembler() - .setInputCols(new String[]{"id2", "id3", "id4"}) - .setOutputCol("vec1"); - - Dataset assembled1 = assembler1.transform(df); - - VectorAssembler assembler2 = new VectorAssembler() - .setInputCols(new String[]{"id5", "id6", "id7"}) - .setOutputCol("vec2"); - - Dataset assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2"); - - Interaction interaction = new Interaction() - .setInputCols(new String[]{"id1","vec1","vec2"}) - .setOutputCol("interactedCol"); - - Dataset interacted = interaction.transform(assembled2); - - interacted.show(false); - // $example off$ - - spark.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java deleted file mode 100644 index a7de8e6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaIsotonicRegressionExample.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.examples.ml; - -// $example on$ - -import org.apache.spark.ml.regression.IsotonicRegression; -import org.apache.spark.ml.regression.IsotonicRegressionModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ -import org.apache.spark.sql.SparkSession; - -/** - * An example demonstrating IsotonicRegression. - * Run with - *
- * bin/run-example ml.JavaIsotonicRegressionExample
- *
- */ -public class JavaIsotonicRegressionExample { - - public static void main(String[] args) { - // Create a SparkSession. - SparkSession spark = SparkSession - .builder() - .appName("JavaIsotonicRegressionExample") - .getOrCreate(); - - // $example on$ - // Loads data. - Dataset dataset = spark.read().format("libsvm") - .load("data/mllib/sample_isotonic_regression_libsvm_data.txt"); - - // Trains an isotonic regression model. - IsotonicRegression ir = new IsotonicRegression(); - IsotonicRegressionModel model = ir.fit(dataset); - - System.out.println("Boundaries in increasing order: " + model.boundaries() + "\n"); - System.out.println("Predictions associated with the boundaries: " + model.predictions() + "\n"); - - // Makes predictions. - model.transform(dataset).show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java deleted file mode 100644 index dc4b0bc..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.clustering.KMeansModel; -import org.apache.spark.ml.clustering.KMeans; -import org.apache.spark.ml.evaluation.ClusteringEvaluator; -import org.apache.spark.ml.linalg.Vector; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ -import org.apache.spark.sql.SparkSession; - - -/** - * An example demonstrating k-means clustering. - * Run with - *
- * bin/run-example ml.JavaKMeansExample
- *
- */ -public class JavaKMeansExample { - - public static void main(String[] args) { - // Create a SparkSession. - SparkSession spark = SparkSession - .builder() - .appName("JavaKMeansExample") - .getOrCreate(); - - // $example on$ - // Loads data. - Dataset dataset = spark.read().format("libsvm").load("data/mllib/sample_kmeans_data.txt"); - - // Trains a k-means model. - KMeans kmeans = new KMeans().setK(2).setSeed(1L); - KMeansModel model = kmeans.fit(dataset); - - // Make predictions - Dataset predictions = model.transform(dataset); - - // Evaluate clustering by computing Silhouette score - ClusteringEvaluator evaluator = new ClusteringEvaluator(); - - double silhouette = evaluator.evaluate(predictions); - System.out.println("Silhouette with squared euclidean distance = " + silhouette); - - // Shows the result. - Vector[] centers = model.clusterCenters(); - System.out.println("Cluster Centers: "); - for (Vector center: centers) { - System.out.println(center); - } - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java deleted file mode 100644 index 0e5d005..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; -// $example on$ -import org.apache.spark.ml.clustering.LDA; -import org.apache.spark.ml.clustering.LDAModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -/** - * An example demonstrating LDA. - * Run with - *
- * bin/run-example ml.JavaLDAExample
- *
- */ -public class JavaLDAExample { - - public static void main(String[] args) { - // Creates a SparkSession - SparkSession spark = SparkSession - .builder() - .appName("JavaLDAExample") - .getOrCreate(); - - // $example on$ - // Loads data. - Dataset dataset = spark.read().format("libsvm") - .load("data/mllib/sample_lda_libsvm_data.txt"); - - // Trains a LDA model. - LDA lda = new LDA().setK(10).setMaxIter(10); - LDAModel model = lda.fit(dataset); - - double ll = model.logLikelihood(dataset); - double lp = model.logPerplexity(dataset); - System.out.println("The lower bound on the log likelihood of the entire corpus: " + ll); - System.out.println("The upper bound on perplexity: " + lp); - - // Describe topics. - Dataset topics = model.describeTopics(3); - System.out.println("The topics described by their top-weighted terms:"); - topics.show(false); - - // Shows the result. - Dataset transformed = model.transform(dataset); - transformed.show(false); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLabeledDocument.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLabeledDocument.java deleted file mode 100644 index 68d1caf..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLabeledDocument.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import java.io.Serializable; - -/** - * Labeled instance type, Spark SQL can infer schema from Java Beans. - */ -@SuppressWarnings("serial") -public class JavaLabeledDocument extends JavaDocument implements Serializable { - - private double label; - - public JavaLabeledDocument(long id, String text, double label) { - super(id, text); - this.label = label; - } - - public double getLabel() { - return this.label; - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java deleted file mode 100644 index a561b6d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.regression.LinearRegression; -import org.apache.spark.ml.regression.LinearRegressionModel; -import org.apache.spark.ml.regression.LinearRegressionTrainingSummary; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaLinearRegressionWithElasticNetExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaLinearRegressionWithElasticNetExample") - .getOrCreate(); - - // $example on$ - // Load training data. - Dataset training = spark.read().format("libsvm") - .load("data/mllib/sample_linear_regression_data.txt"); - - LinearRegression lr = new LinearRegression() - .setMaxIter(10) - .setRegParam(0.3) - .setElasticNetParam(0.8); - - // Fit the model. - LinearRegressionModel lrModel = lr.fit(training); - - // Print the coefficients and intercept for linear regression. - System.out.println("Coefficients: " - + lrModel.coefficients() + " Intercept: " + lrModel.intercept()); - - // Summarize the model over the training set and print out some metrics. - LinearRegressionTrainingSummary trainingSummary = lrModel.summary(); - System.out.println("numIterations: " + trainingSummary.totalIterations()); - System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory())); - trainingSummary.residuals().show(); - System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError()); - System.out.println("r2: " + trainingSummary.r2()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java deleted file mode 100644 index a18ed1d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLinearSVCExample.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.classification.LinearSVC; -import org.apache.spark.ml.classification.LinearSVCModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaLinearSVCExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaLinearSVCExample") - .getOrCreate(); - - // $example on$ - // Load training data - Dataset training = spark.read().format("libsvm") - .load("data/mllib/sample_libsvm_data.txt"); - - LinearSVC lsvc = new LinearSVC() - .setMaxIter(10) - .setRegParam(0.1); - - // Fit the model - LinearSVCModel lsvcModel = lsvc.fit(training); - - // Print the coefficients and intercept for LinearSVC - System.out.println("Coefficients: " - + lsvcModel.coefficients() + " Intercept: " + lsvcModel.intercept()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java deleted file mode 100644 index 1529da1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary; -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.functions; -// $example off$ - -public class JavaLogisticRegressionSummaryExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaLogisticRegressionSummaryExample") - .getOrCreate(); - - // Load training data - Dataset training = spark.read().format("libsvm") - .load("data/mllib/sample_libsvm_data.txt"); - - LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.3) - .setElasticNetParam(0.8); - - // Fit the model - LogisticRegressionModel lrModel = lr.fit(training); - - // $example on$ - // Extract the summary from the returned LogisticRegressionModel instance trained in the earlier - // example - BinaryLogisticRegressionTrainingSummary trainingSummary = lrModel.binarySummary(); - - // Obtain the loss per iteration. 
- double[] objectiveHistory = trainingSummary.objectiveHistory(); - for (double lossPerIteration : objectiveHistory) { - System.out.println(lossPerIteration); - } - - // Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. - Dataset roc = trainingSummary.roc(); - roc.show(); - roc.select("FPR").show(); - System.out.println(trainingSummary.areaUnderROC()); - - // Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with - // this selected threshold. - Dataset fMeasure = trainingSummary.fMeasureByThreshold(); - double maxFMeasure = fMeasure.select(functions.max("F-Measure")).head().getDouble(0); - double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)) - .select("threshold").head().getDouble(0); - lrModel.setThreshold(bestThreshold); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java deleted file mode 100644 index 4cdec21..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaLogisticRegressionWithElasticNetExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaLogisticRegressionWithElasticNetExample") - .getOrCreate(); - - // $example on$ - // Load training data - Dataset training = spark.read().format("libsvm") - .load("data/mllib/sample_libsvm_data.txt"); - - LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.3) - .setElasticNetParam(0.8); - - // Fit the model - LogisticRegressionModel lrModel = lr.fit(training); - - // Print the coefficients and intercept for logistic regression - System.out.println("Coefficients: " - + lrModel.coefficients() + " Intercept: " + lrModel.intercept()); - - // We can also use the multinomial family for binary classification - LogisticRegression mlr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.3) - .setElasticNetParam(0.8) - .setFamily("multinomial"); - - // Fit the model - LogisticRegressionModel mlrModel = mlr.fit(training); - - // Print the coefficients and intercepts for logistic regression with multinomial family - System.out.println("Multinomial coefficients: " + lrModel.coefficientMatrix() - + "\nMultinomial intercepts: " + mlrModel.interceptVector()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java deleted file mode 100644 index 9f1ce46..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMaxAbsScalerExample.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.MaxAbsScaler; -import org.apache.spark.ml.feature.MaxAbsScalerModel; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ -import org.apache.spark.sql.SparkSession; - -public class JavaMaxAbsScalerExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaMaxAbsScalerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)), - RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)), - RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0)) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("features", new VectorUDT(), false, Metadata.empty()) - }); - Dataset dataFrame = spark.createDataFrame(data, schema); - - MaxAbsScaler scaler = new MaxAbsScaler() - .setInputCol("features") - .setOutputCol("scaledFeatures"); - - // Compute summary statistics and generate MaxAbsScalerModel - MaxAbsScalerModel scalerModel = scaler.fit(dataFrame); - - // rescale each feature to range [-1, 1]. - Dataset scaledData = scalerModel.transform(dataFrame); - scaledData.select("features", "scaledFeatures").show(); - // $example off$ - - spark.stop(); - } - -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java deleted file mode 100644 index e164598..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.MinHashLSH; -import org.apache.spark.ml.feature.MinHashLSHModel; -import org.apache.spark.ml.linalg.Vector; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -import static org.apache.spark.sql.functions.col; -// $example off$ - -/** - * An example demonstrating MinHashLSH. - * Run with: - * bin/run-example ml.JavaMinHashLSHExample - */ -public class JavaMinHashLSHExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaMinHashLSHExample") - .getOrCreate(); - - // $example on$ - List dataA = Arrays.asList( - RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})), - RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})), - RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0})) - ); - - List dataB = Arrays.asList( - RowFactory.create(0, Vectors.sparse(6, new int[]{1, 3, 5}, new double[]{1.0, 1.0, 1.0})), - RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 5}, new double[]{1.0, 1.0, 1.0})), - RowFactory.create(2, Vectors.sparse(6, new int[]{1, 2, 4}, new double[]{1.0, 1.0, 1.0})) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("features", new VectorUDT(), false, Metadata.empty()) - }); - Dataset dfA = spark.createDataFrame(dataA, schema); - Dataset dfB = spark.createDataFrame(dataB, schema); - - int[] indices = {1, 3}; - double[] values = {1.0, 1.0}; - Vector key = Vectors.sparse(6, indices, values); - - MinHashLSH mh = new MinHashLSH() - .setNumHashTables(5) - .setInputCol("features") - .setOutputCol("hashes"); - - MinHashLSHModel model = mh.fit(dfA); - - // Feature Transformation - System.out.println("The hashed dataset where hashed values are stored in the column 'hashes':"); - model.transform(dfA).show(); - - // Compute the locality sensitive hashes for the input rows, then perform approximate - // similarity join. - // We could avoid computing hashes by passing in the already-transformed dataset, e.g. - // `model.approxSimilarityJoin(transformedA, transformedB, 0.6)` - System.out.println("Approximately joining dfA and dfB on Jaccard distance smaller than 0.6:"); - model.approxSimilarityJoin(dfA, dfB, 0.6, "JaccardDistance") - .select(col("datasetA.id").alias("idA"), - col("datasetB.id").alias("idB"), - col("JaccardDistance")).show(); - - // Compute the locality sensitive hashes for the input rows, then perform approximate nearest - // neighbor search. - // We could avoid computing hashes by passing in the already-transformed dataset, e.g. - // `model.approxNearestNeighbors(transformedA, key, 2)` - // It may return less than 2 rows when not enough approximate near-neighbor candidates are - // found. 
- System.out.println("Approximately searching dfA for 2 nearest neighbors of the key:"); - model.approxNearestNeighbors(dfA, key, 2).show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java deleted file mode 100644 index 2757af8..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.MinMaxScaler; -import org.apache.spark.ml.feature.MinMaxScalerModel; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaMinMaxScalerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaMinMaxScalerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)), - RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)), - RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0)) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("features", new VectorUDT(), false, Metadata.empty()) - }); - Dataset dataFrame = spark.createDataFrame(data, schema); - - MinMaxScaler scaler = new MinMaxScaler() - .setInputCol("features") - .setOutputCol("scaledFeatures"); - - // Compute summary statistics and generate MinMaxScalerModel - MinMaxScalerModel scalerModel = scaler.fit(dataFrame); - - // rescale each feature to range [min, max]. 
- Dataset scaledData = scalerModel.transform(dataFrame); - System.out.println("Features scaled to range: [" + scaler.getMin() + ", " - + scaler.getMax() + "]"); - scaledData.select("features", "scaledFeatures").show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java deleted file mode 100644 index d973279..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -// $example off$ - -// $example on$ -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator; -import org.apache.spark.ml.feature.HashingTF; -import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.ml.param.ParamMap; -import org.apache.spark.ml.tuning.CrossValidator; -import org.apache.spark.ml.tuning.CrossValidatorModel; -import org.apache.spark.ml.tuning.ParamGridBuilder; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ -import org.apache.spark.sql.SparkSession; - -/** - * Java example for Model Selection via Cross Validation. - */ -public class JavaModelSelectionViaCrossValidationExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaModelSelectionViaCrossValidationExample") - .getOrCreate(); - - // $example on$ - // Prepare training documents, which are labeled. - Dataset training = spark.createDataFrame(Arrays.asList( - new JavaLabeledDocument(0L, "a b c d e spark", 1.0), - new JavaLabeledDocument(1L, "b d", 0.0), - new JavaLabeledDocument(2L,"spark f g h", 1.0), - new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0), - new JavaLabeledDocument(4L, "b spark who", 1.0), - new JavaLabeledDocument(5L, "g d a y", 0.0), - new JavaLabeledDocument(6L, "spark fly", 1.0), - new JavaLabeledDocument(7L, "was mapreduce", 0.0), - new JavaLabeledDocument(8L, "e spark program", 1.0), - new JavaLabeledDocument(9L, "a e c l", 0.0), - new JavaLabeledDocument(10L, "spark compile", 1.0), - new JavaLabeledDocument(11L, "hadoop software", 0.0) - ), JavaLabeledDocument.class); - - // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. 
- Tokenizer tokenizer = new Tokenizer() - .setInputCol("text") - .setOutputCol("words"); - HashingTF hashingTF = new HashingTF() - .setNumFeatures(1000) - .setInputCol(tokenizer.getOutputCol()) - .setOutputCol("features"); - LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.01); - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); - - // We use a ParamGridBuilder to construct a grid of parameters to search over. - // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, - // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. - ParamMap[] paramGrid = new ParamGridBuilder() - .addGrid(hashingTF.numFeatures(), new int[] {10, 100, 1000}) - .addGrid(lr.regParam(), new double[] {0.1, 0.01}) - .build(); - - // We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. - // This will allow us to jointly choose parameters for all Pipeline stages. - // A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. - // Note that the evaluator here is a BinaryClassificationEvaluator and its default metric - // is areaUnderROC. - CrossValidator cv = new CrossValidator() - .setEstimator(pipeline) - .setEvaluator(new BinaryClassificationEvaluator()) - .setEstimatorParamMaps(paramGrid) - .setNumFolds(2) // Use 3+ in practice - .setParallelism(2); // Evaluate up to 2 parameter settings in parallel - - // Run cross-validation, and choose the best set of parameters. - CrossValidatorModel cvModel = cv.fit(training); - - // Prepare test documents, which are unlabeled. - Dataset test = spark.createDataFrame(Arrays.asList( - new JavaDocument(4L, "spark i j k"), - new JavaDocument(5L, "l m n"), - new JavaDocument(6L, "mapreduce spark"), - new JavaDocument(7L, "apache hadoop") - ), JavaDocument.class); - - // Make predictions on test documents. cvModel uses the best model found (lrModel). - Dataset predictions = cvModel.transform(test); - for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) - + ", prediction=" + r.get(3)); - } - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java deleted file mode 100644 index 2ef8bea..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.evaluation.RegressionEvaluator; -import org.apache.spark.ml.param.ParamMap; -import org.apache.spark.ml.regression.LinearRegression; -import org.apache.spark.ml.tuning.ParamGridBuilder; -import org.apache.spark.ml.tuning.TrainValidationSplit; -import org.apache.spark.ml.tuning.TrainValidationSplitModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ -import org.apache.spark.sql.SparkSession; - -/** - * Java example demonstrating model selection using TrainValidationSplit. - * - * Run with - * {{{ - * bin/run-example ml.JavaModelSelectionViaTrainValidationSplitExample - * }}} - */ -public class JavaModelSelectionViaTrainValidationSplitExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaModelSelectionViaTrainValidationSplitExample") - .getOrCreate(); - - // $example on$ - Dataset data = spark.read().format("libsvm") - .load("data/mllib/sample_linear_regression_data.txt"); - - // Prepare training and test data. - Dataset[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345); - Dataset training = splits[0]; - Dataset test = splits[1]; - - LinearRegression lr = new LinearRegression(); - - // We use a ParamGridBuilder to construct a grid of parameters to search over. - // TrainValidationSplit will try all combinations of values and determine best model using - // the evaluator. - ParamMap[] paramGrid = new ParamGridBuilder() - .addGrid(lr.regParam(), new double[] {0.1, 0.01}) - .addGrid(lr.fitIntercept()) - .addGrid(lr.elasticNetParam(), new double[] {0.0, 0.5, 1.0}) - .build(); - - // In this case the estimator is simply the linear regression. - // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. - TrainValidationSplit trainValidationSplit = new TrainValidationSplit() - .setEstimator(lr) - .setEvaluator(new RegressionEvaluator()) - .setEstimatorParamMaps(paramGrid) - .setTrainRatio(0.8) // 80% for training and the remaining 20% for validation - .setParallelism(2); // Evaluate up to 2 parameter settings in parallel - - // Run train validation split, and choose the best set of parameters. - TrainValidationSplitModel model = trainValidationSplit.fit(training); - - // Make predictions on test data. model is the model with combination of parameters - // that performed best. - model.transform(test) - .select("features", "label", "prediction") - .show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMulticlassLogisticRegressionWithElasticNetExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMulticlassLogisticRegressionWithElasticNetExample.java deleted file mode 100644 index 801a82c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMulticlassLogisticRegressionWithElasticNetExample.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.classification.LogisticRegressionModel; -import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaMulticlassLogisticRegressionWithElasticNetExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaMulticlassLogisticRegressionWithElasticNetExample") - .getOrCreate(); - - // $example on$ - // Load training data - Dataset training = spark.read().format("libsvm") - .load("data/mllib/sample_multiclass_classification_data.txt"); - - LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.3) - .setElasticNetParam(0.8); - - // Fit the model - LogisticRegressionModel lrModel = lr.fit(training); - - // Print the coefficients and intercept for multinomial logistic regression - System.out.println("Coefficients: \n" - + lrModel.coefficientMatrix() + " \nIntercept: " + lrModel.interceptVector()); - LogisticRegressionTrainingSummary trainingSummary = lrModel.summary(); - - // Obtain the loss per iteration. 
- double[] objectiveHistory = trainingSummary.objectiveHistory(); - for (double lossPerIteration : objectiveHistory) { - System.out.println(lossPerIteration); - } - - // for multiclass, we can inspect metrics on a per-label basis - System.out.println("False positive rate by label:"); - int i = 0; - double[] fprLabel = trainingSummary.falsePositiveRateByLabel(); - for (double fpr : fprLabel) { - System.out.println("label " + i + ": " + fpr); - i++; - } - - System.out.println("True positive rate by label:"); - i = 0; - double[] tprLabel = trainingSummary.truePositiveRateByLabel(); - for (double tpr : tprLabel) { - System.out.println("label " + i + ": " + tpr); - i++; - } - - System.out.println("Precision by label:"); - i = 0; - double[] precLabel = trainingSummary.precisionByLabel(); - for (double prec : precLabel) { - System.out.println("label " + i + ": " + prec); - i++; - } - - System.out.println("Recall by label:"); - i = 0; - double[] recLabel = trainingSummary.recallByLabel(); - for (double rec : recLabel) { - System.out.println("label " + i + ": " + rec); - i++; - } - - System.out.println("F-measure by label:"); - i = 0; - double[] fLabel = trainingSummary.fMeasureByLabel(); - for (double f : fLabel) { - System.out.println("label " + i + ": " + f); - i++; - } - - double accuracy = trainingSummary.accuracy(); - double falsePositiveRate = trainingSummary.weightedFalsePositiveRate(); - double truePositiveRate = trainingSummary.weightedTruePositiveRate(); - double fMeasure = trainingSummary.weightedFMeasure(); - double precision = trainingSummary.weightedPrecision(); - double recall = trainingSummary.weightedRecall(); - System.out.println("Accuracy: " + accuracy); - System.out.println("FPR: " + falsePositiveRate); - System.out.println("TPR: " + truePositiveRate); - System.out.println("F-measure: " + fMeasure); - System.out.println("Precision: " + precision); - System.out.println("Recall: " + recall); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java deleted file mode 100644 index 43db41c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel; -import org.apache.spark.ml.classification.MultilayerPerceptronClassifier; -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; -// $example off$ - -/** - * An example for Multilayer Perceptron Classification. - */ -public class JavaMultilayerPerceptronClassifierExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaMultilayerPerceptronClassifierExample") - .getOrCreate(); - - // $example on$ - // Load training data - String path = "data/mllib/sample_multiclass_classification_data.txt"; - Dataset dataFrame = spark.read().format("libsvm").load(path); - - // Split the data into train and test - Dataset[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); - Dataset train = splits[0]; - Dataset test = splits[1]; - - // specify layers for the neural network: - // input layer of size 4 (features), two intermediate of size 5 and 4 - // and output of size 3 (classes) - int[] layers = new int[] {4, 5, 4, 3}; - - // create the trainer and set its parameters - MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier() - .setLayers(layers) - .setBlockSize(128) - .setSeed(1234L) - .setMaxIter(100); - - // train the model - MultilayerPerceptronClassificationModel model = trainer.fit(train); - - // compute accuracy on the test set - Dataset result = model.transform(test); - Dataset predictionAndLabels = result.select("prediction", "label"); - MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() - .setMetricName("accuracy"); - - System.out.println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels)); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java deleted file mode 100644 index 5427e46..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.NGram; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaNGramExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaNGramExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")), - RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")), - RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat")) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField( - "words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) - }); - - Dataset wordDataFrame = spark.createDataFrame(data, schema); - - NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams"); - - Dataset ngramDataFrame = ngramTransformer.transform(wordDataFrame); - ngramDataFrame.select("ngrams").show(false); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java deleted file mode 100644 index be578dc..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaNaiveBayesExample.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.classification.NaiveBayes; -import org.apache.spark.ml.classification.NaiveBayesModel; -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -/** - * An example for Naive Bayes Classification. 
- */ -public class JavaNaiveBayesExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaNaiveBayesExample") - .getOrCreate(); - - // $example on$ - // Load training data - Dataset dataFrame = - spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); - // Split the data into train and test - Dataset[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); - Dataset train = splits[0]; - Dataset test = splits[1]; - - // create the trainer and set its parameters - NaiveBayes nb = new NaiveBayes(); - - // train the model - NaiveBayesModel model = nb.fit(train); - - // Select example rows to display. - Dataset predictions = model.transform(test); - predictions.show(); - - // compute accuracy on the test set - MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() - .setLabelCol("label") - .setPredictionCol("prediction") - .setMetricName("accuracy"); - double accuracy = evaluator.evaluate(predictions); - System.out.println("Test set accuracy = " + accuracy); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java deleted file mode 100644 index f878c42..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.Normalizer; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaNormalizerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaNormalizerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)), - RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)), - RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0)) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("features", new VectorUDT(), false, Metadata.empty()) - }); - Dataset dataFrame = spark.createDataFrame(data, schema); - - // Normalize each Vector using $L^1$ norm. - Normalizer normalizer = new Normalizer() - .setInputCol("features") - .setOutputCol("normFeatures") - .setP(1.0); - - Dataset l1NormData = normalizer.transform(dataFrame); - l1NormData.show(); - - // Normalize each Vector using $L^\infty$ norm. - Dataset lInfNormData = - normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY)); - lInfNormData.show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java deleted file mode 100644 index 6f93cff..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.OneHotEncoderEstimator; -import org.apache.spark.ml.feature.OneHotEncoderModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaOneHotEncoderEstimatorExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaOneHotEncoderEstimatorExample") - .getOrCreate(); - - // Note: categorical features are usually first encoded with StringIndexer - // $example on$ - List data = Arrays.asList( - RowFactory.create(0.0, 1.0), - RowFactory.create(1.0, 0.0), - RowFactory.create(2.0, 1.0), - RowFactory.create(0.0, 2.0), - RowFactory.create(0.0, 1.0), - RowFactory.create(2.0, 0.0) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("categoryIndex1", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("categoryIndex2", DataTypes.DoubleType, false, Metadata.empty()) - }); - - Dataset df = spark.createDataFrame(data, schema); - - OneHotEncoderEstimator encoder = new OneHotEncoderEstimator() - .setInputCols(new String[] {"categoryIndex1", "categoryIndex2"}) - .setOutputCols(new String[] {"categoryVec1", "categoryVec2"}); - - OneHotEncoderModel model = encoder.fit(df); - Dataset encoded = model.transform(df); - encoded.show(); - // $example off$ - - spark.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java deleted file mode 100644 index 82fb540..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.classification.OneVsRest; -import org.apache.spark.ml.classification.OneVsRestModel; -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ -import org.apache.spark.sql.SparkSession; - - -/** - * An example of Multiclass to Binary Reduction with One Vs Rest, - * using Logistic Regression as the base classifier. - * Run with - *
- * <pre>
- * bin/run-example ml.JavaOneVsRestExample
- * </pre>
- */ -public class JavaOneVsRestExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaOneVsRestExample") - .getOrCreate(); - - // $example on$ - // load data file. - Dataset inputData = spark.read().format("libsvm") - .load("data/mllib/sample_multiclass_classification_data.txt"); - - // generate the train/test split. - Dataset[] tmp = inputData.randomSplit(new double[]{0.8, 0.2}); - Dataset train = tmp[0]; - Dataset test = tmp[1]; - - // configure the base classifier. - LogisticRegression classifier = new LogisticRegression() - .setMaxIter(10) - .setTol(1E-6) - .setFitIntercept(true); - - // instantiate the One Vs Rest Classifier. - OneVsRest ovr = new OneVsRest().setClassifier(classifier); - - // train the multiclass model. - OneVsRestModel ovrModel = ovr.fit(train); - - // score the model on test data. - Dataset predictions = ovrModel.transform(test) - .select("prediction", "label"); - - // obtain evaluator. - MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() - .setMetricName("accuracy"); - - // compute the classification error on test data. - double accuracy = evaluator.evaluate(predictions); - System.out.println("Test Error = " + (1 - accuracy)); - // $example off$ - - spark.stop(); - } - -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java deleted file mode 100644 index 6951a65..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.PCA; -import org.apache.spark.ml.feature.PCAModel; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaPCAExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaPCAExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})), - RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)), - RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("features", new VectorUDT(), false, Metadata.empty()), - }); - - Dataset df = spark.createDataFrame(data, schema); - - PCAModel pca = new PCA() - .setInputCol("features") - .setOutputCol("pcaFeatures") - .setK(3) - .fit(df); - - Dataset result = pca.transform(df).select("pcaFeatures"); - result.show(false); - // $example off$ - spark.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java deleted file mode 100644 index 4ccd8f6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; - -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.feature.HashingTF; -import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ -import org.apache.spark.sql.SparkSession; - -/** - * Java example for simple text document 'Pipeline'. - */ -public class JavaPipelineExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaPipelineExample") - .getOrCreate(); - - // $example on$ - // Prepare training documents, which are labeled. 
- Dataset training = spark.createDataFrame(Arrays.asList( - new JavaLabeledDocument(0L, "a b c d e spark", 1.0), - new JavaLabeledDocument(1L, "b d", 0.0), - new JavaLabeledDocument(2L, "spark f g h", 1.0), - new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0) - ), JavaLabeledDocument.class); - - // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. - Tokenizer tokenizer = new Tokenizer() - .setInputCol("text") - .setOutputCol("words"); - HashingTF hashingTF = new HashingTF() - .setNumFeatures(1000) - .setInputCol(tokenizer.getOutputCol()) - .setOutputCol("features"); - LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.001); - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); - - // Fit the pipeline to training documents. - PipelineModel model = pipeline.fit(training); - - // Prepare test documents, which are unlabeled. - Dataset test = spark.createDataFrame(Arrays.asList( - new JavaDocument(4L, "spark i j k"), - new JavaDocument(5L, "l m n"), - new JavaDocument(6L, "spark hadoop spark"), - new JavaDocument(7L, "apache hadoop") - ), JavaDocument.class); - - // Make predictions on test documents. - Dataset predictions = model.transform(test); - for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) - + ", prediction=" + r.get(3)); - } - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java deleted file mode 100644 index 43c636c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.PolynomialExpansion; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaPolynomialExpansionExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaPolynomialExpansionExample") - .getOrCreate(); - - // $example on$ - PolynomialExpansion polyExpansion = new PolynomialExpansion() - .setInputCol("features") - .setOutputCol("polyFeatures") - .setDegree(3); - - List data = Arrays.asList( - RowFactory.create(Vectors.dense(2.0, 1.0)), - RowFactory.create(Vectors.dense(0.0, 0.0)), - RowFactory.create(Vectors.dense(3.0, -1.0)) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("features", new VectorUDT(), false, Metadata.empty()), - }); - Dataset df = spark.createDataFrame(data, schema); - - Dataset polyDF = polyExpansion.transform(df); - polyDF.show(false); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPowerIterationClusteringExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPowerIterationClusteringExample.java deleted file mode 100644 index 5186563..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPowerIterationClusteringExample.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.clustering.PowerIterationClustering; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaPowerIterationClusteringExample { - public static void main(String[] args) { - // Create a SparkSession. 
- SparkSession spark = SparkSession - .builder() - .appName("JavaPowerIterationClustering") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0L, 1L, 1.0), - RowFactory.create(0L, 2L, 1.0), - RowFactory.create(1L, 2L, 1.0), - RowFactory.create(3L, 4L, 1.0), - RowFactory.create(4L, 0L, 0.1) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("src", DataTypes.LongType, false, Metadata.empty()), - new StructField("dst", DataTypes.LongType, false, Metadata.empty()), - new StructField("weight", DataTypes.DoubleType, false, Metadata.empty()) - }); - - Dataset df = spark.createDataFrame(data, schema); - - PowerIterationClustering model = new PowerIterationClustering() - .setK(2) - .setMaxIter(10) - .setInitMode("degree") - .setWeightCol("weight"); - - Dataset result = model.assignClusters(df); - result.show(false); - // $example off$ - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java deleted file mode 100644 index 98ffd4f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ - -import org.apache.spark.ml.fpm.PrefixSpan; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.*; - -import java.util.Arrays; -import java.util.List; -// $example off$ - -/** - * An example demonstrating PrefixSpan. - * Run with - *
- * <pre>
- * bin/run-example ml.JavaPrefixSpanExample
- * </pre>
- */ -public class JavaPrefixSpanExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaPrefixSpanExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3))), - RowFactory.create(Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), Arrays.asList(1,2))), - RowFactory.create(Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5))), - RowFactory.create(Arrays.asList(Arrays.asList(6))) - ); - StructType schema = new StructType(new StructField[]{ new StructField( - "sequence", new ArrayType(new ArrayType(DataTypes.IntegerType, true), true), - false, Metadata.empty()) - }); - Dataset sequenceDF = spark.createDataFrame(data, schema); - - PrefixSpan prefixSpan = new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5); - - // Finding frequent sequential patterns - prefixSpan.findFrequentSequentialPatterns(sequenceDF).show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java deleted file mode 100644 index 43cc30c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.QuantileDiscretizer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaQuantileDiscretizerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaQuantileDiscretizerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, 18.0), - RowFactory.create(1, 19.0), - RowFactory.create(2, 8.0), - RowFactory.create(3, 5.0), - RowFactory.create(4, 2.2) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("hour", DataTypes.DoubleType, false, Metadata.empty()) - }); - - Dataset df = spark.createDataFrame(data, schema); - // $example off$ - // Output of QuantileDiscretizer for such small datasets can depend on the number of - // partitions. Here we force a single partition to ensure consistent results. - // Note this is not necessary for normal use cases - df = df.repartition(1); - // $example on$ - QuantileDiscretizer discretizer = new QuantileDiscretizer() - .setInputCol("hour") - .setOutputCol("result") - .setNumBuckets(3); - - Dataset result = discretizer.fit(df).transform(df); - result.show(false); - // $example off$ - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java deleted file mode 100644 index 428067e..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.RFormula; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -import static org.apache.spark.sql.types.DataTypes.*; -// $example off$ - -public class JavaRFormulaExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaRFormulaExample") - .getOrCreate(); - - // $example on$ - StructType schema = createStructType(new StructField[]{ - createStructField("id", IntegerType, false), - createStructField("country", StringType, false), - createStructField("hour", IntegerType, false), - createStructField("clicked", DoubleType, false) - }); - - List data = Arrays.asList( - RowFactory.create(7, "US", 18, 1.0), - RowFactory.create(8, "CA", 12, 0.0), - RowFactory.create(9, "NZ", 15, 0.0) - ); - - Dataset dataset = spark.createDataFrame(data, schema); - RFormula formula = new RFormula() - .setFormula("clicked ~ country + hour") - .setFeaturesCol("features") - .setLabelCol("label"); - Dataset output = formula.fit(dataset).transform(dataset); - output.select("features", "label").show(); - // $example off$ - spark.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java deleted file mode 100644 index da2633e..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.classification.RandomForestClassificationModel; -import org.apache.spark.ml.classification.RandomForestClassifier; -import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; -import org.apache.spark.ml.feature.*; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaRandomForestClassifierExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaRandomForestClassifierExample") - .getOrCreate(); - - // $example on$ - // Load and parse the data file, converting it to a DataFrame. - Dataset data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); - - // Index labels, adding metadata to the label column. - // Fit on whole dataset to include all labels in index. - StringIndexerModel labelIndexer = new StringIndexer() - .setInputCol("label") - .setOutputCol("indexedLabel") - .fit(data); - // Automatically identify categorical features, and index them. - // Set maxCategories so features with > 4 distinct values are treated as continuous. - VectorIndexerModel featureIndexer = new VectorIndexer() - .setInputCol("features") - .setOutputCol("indexedFeatures") - .setMaxCategories(4) - .fit(data); - - // Split the data into training and test sets (30% held out for testing) - Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}); - Dataset trainingData = splits[0]; - Dataset testData = splits[1]; - - // Train a RandomForest model. - RandomForestClassifier rf = new RandomForestClassifier() - .setLabelCol("indexedLabel") - .setFeaturesCol("indexedFeatures"); - - // Convert indexed labels back to original labels. - IndexToString labelConverter = new IndexToString() - .setInputCol("prediction") - .setOutputCol("predictedLabel") - .setLabels(labelIndexer.labels()); - - // Chain indexers and forest in a Pipeline - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter}); - - // Train model. This also runs the indexers. - PipelineModel model = pipeline.fit(trainingData); - - // Make predictions. - Dataset predictions = model.transform(testData); - - // Select example rows to display. 
- predictions.select("predictedLabel", "label", "features").show(5); - - // Select (prediction, true label) and compute test error - MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() - .setLabelCol("indexedLabel") - .setPredictionCol("prediction") - .setMetricName("accuracy"); - double accuracy = evaluator.evaluate(predictions); - System.out.println("Test Error = " + (1.0 - accuracy)); - - RandomForestClassificationModel rfModel = (RandomForestClassificationModel)(model.stages()[2]); - System.out.println("Learned classification forest model:\n" + rfModel.toDebugString()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java deleted file mode 100644 index a707845..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -// $example on$ -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.evaluation.RegressionEvaluator; -import org.apache.spark.ml.feature.VectorIndexer; -import org.apache.spark.ml.feature.VectorIndexerModel; -import org.apache.spark.ml.regression.RandomForestRegressionModel; -import org.apache.spark.ml.regression.RandomForestRegressor; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off$ - -public class JavaRandomForestRegressorExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaRandomForestRegressorExample") - .getOrCreate(); - - // $example on$ - // Load and parse the data file, converting it to a DataFrame. - Dataset data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); - - // Automatically identify categorical features, and index them. - // Set maxCategories so features with > 4 distinct values are treated as continuous. - VectorIndexerModel featureIndexer = new VectorIndexer() - .setInputCol("features") - .setOutputCol("indexedFeatures") - .setMaxCategories(4) - .fit(data); - - // Split the data into training and test sets (30% held out for testing) - Dataset[] splits = data.randomSplit(new double[] {0.7, 0.3}); - Dataset trainingData = splits[0]; - Dataset testData = splits[1]; - - // Train a RandomForest model. 
- RandomForestRegressor rf = new RandomForestRegressor() - .setLabelCol("label") - .setFeaturesCol("indexedFeatures"); - - // Chain indexer and forest in a Pipeline - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[] {featureIndexer, rf}); - - // Train model. This also runs the indexer. - PipelineModel model = pipeline.fit(trainingData); - - // Make predictions. - Dataset predictions = model.transform(testData); - - // Select example rows to display. - predictions.select("prediction", "label", "features").show(5); - - // Select (prediction, true label) and compute test error - RegressionEvaluator evaluator = new RegressionEvaluator() - .setLabelCol("label") - .setPredictionCol("prediction") - .setMetricName("rmse"); - double rmse = evaluator.evaluate(predictions); - System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); - - RandomForestRegressionModel rfModel = (RandomForestRegressionModel)(model.stages()[1]); - System.out.println("Learned regression forest model:\n" + rfModel.toDebugString()); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java deleted file mode 100644 index 2a3d62d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.SQLTransformer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.*; -// $example off$ - -public class JavaSQLTransformerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaSQLTransformerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, 1.0, 3.0), - RowFactory.create(2, 2.0, 5.0) - ); - StructType schema = new StructType(new StructField [] { - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("v2", DataTypes.DoubleType, false, Metadata.empty()) - }); - Dataset df = spark.createDataFrame(data, schema); - - SQLTransformer sqlTrans = new SQLTransformer().setStatement( - "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"); - - sqlTrans.transform(df).show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java deleted file mode 100644 index 08ea285..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import org.apache.spark.ml.feature.StandardScaler; -import org.apache.spark.ml.feature.StandardScalerModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ - -public class JavaStandardScalerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaStandardScalerExample") - .getOrCreate(); - - // $example on$ - Dataset dataFrame = - spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); - - StandardScaler scaler = new StandardScaler() - .setInputCol("features") - .setOutputCol("scaledFeatures") - .setWithStd(true) - .setWithMean(false); - - // Compute summary statistics by fitting the StandardScaler - StandardScalerModel scalerModel = scaler.fit(dataFrame); - - // Normalize each feature to have unit standard deviation. 
- Dataset scaledData = scalerModel.transform(dataFrame); - scaledData.show(); - // $example off$ - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java deleted file mode 100644 index 94ead62..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.StopWordsRemover; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaStopWordsRemoverExample { - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaStopWordsRemoverExample") - .getOrCreate(); - - // $example on$ - StopWordsRemover remover = new StopWordsRemover() - .setInputCol("raw") - .setOutputCol("filtered"); - - List data = Arrays.asList( - RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")), - RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField( - "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) - }); - - Dataset dataset = spark.createDataFrame(data, schema); - remover.transform(dataset).show(false); - // $example off$ - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java deleted file mode 100644 index cf9747a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.StringIndexer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -import static org.apache.spark.sql.types.DataTypes.*; -// $example off$ - -public class JavaStringIndexerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaStringIndexerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, "a"), - RowFactory.create(1, "b"), - RowFactory.create(2, "c"), - RowFactory.create(3, "a"), - RowFactory.create(4, "a"), - RowFactory.create(5, "c") - ); - StructType schema = new StructType(new StructField[]{ - createStructField("id", IntegerType, false), - createStructField("category", StringType, false) - }); - Dataset df = spark.createDataFrame(data, schema); - - StringIndexer indexer = new StringIndexer() - .setInputCol("category") - .setOutputCol("categoryIndex"); - - Dataset indexed = indexer.fit(df).transform(df); - indexed.show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java deleted file mode 100644 index e9b8436..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaSummarizerExample.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.*; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.linalg.Vector; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.stat.Summarizer; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaSummarizerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaSummarizerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(Vectors.dense(2.0, 3.0, 5.0), 1.0), - RowFactory.create(Vectors.dense(4.0, 6.0, 7.0), 2.0) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("features", new VectorUDT(), false, Metadata.empty()), - new StructField("weight", DataTypes.DoubleType, false, Metadata.empty()) - }); - - Dataset df = spark.createDataFrame(data, schema); - - Row result1 = df.select(Summarizer.metrics("mean", "variance") - .summary(new Column("features"), new Column("weight")).as("summary")) - .select("summary.mean", "summary.variance").first(); - System.out.println("with weight: mean = " + result1.getAs(0).toString() + - ", variance = " + result1.getAs(1).toString()); - - Row result2 = df.select( - Summarizer.mean(new Column("features")), - Summarizer.variance(new Column("features")) - ).first(); - System.out.println("without weight: mean = " + result2.getAs(0).toString() + - ", variance = " + result2.getAs(1).toString()); - // $example off$ - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java deleted file mode 100644 index b740cd0..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.HashingTF; -import org.apache.spark.ml.feature.IDF; -import org.apache.spark.ml.feature.IDFModel; -import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off$ - -public class JavaTfIdfExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaTfIdfExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0.0, "Hi I heard about Spark"), - RowFactory.create(0.0, "I wish Java could use case classes"), - RowFactory.create(1.0, "Logistic regression models are neat") - ); - StructType schema = new StructType(new StructField[]{ - new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), - new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) - }); - Dataset sentenceData = spark.createDataFrame(data, schema); - - Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); - Dataset wordsData = tokenizer.transform(sentenceData); - - int numFeatures = 20; - HashingTF hashingTF = new HashingTF() - .setInputCol("words") - .setOutputCol("rawFeatures") - .setNumFeatures(numFeatures); - - Dataset featurizedData = hashingTF.transform(wordsData); - // alternatively, CountVectorizer can also be used to get term frequency vectors - - IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features"); - IDFModel idfModel = idf.fit(featurizedData); - - Dataset rescaledData = idfModel.transform(featurizedData); - rescaledData.select("label", "features").show(); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java deleted file mode 100644 index a0979aa..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import scala.collection.mutable.WrappedArray; - -import org.apache.spark.ml.feature.RegexTokenizer; -import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; - -// col("...") is preferable to df.col("...") -import static org.apache.spark.sql.functions.callUDF; -import static org.apache.spark.sql.functions.col; -// $example off$ - -public class JavaTokenizerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaTokenizerExample") - .getOrCreate(); - - // $example on$ - List data = Arrays.asList( - RowFactory.create(0, "Hi I heard about Spark"), - RowFactory.create(1, "I wish Java could use case classes"), - RowFactory.create(2, "Logistic,regression,models,are,neat") - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("sentence", DataTypes.StringType, false, Metadata.empty()) - }); - - Dataset sentenceDataFrame = spark.createDataFrame(data, schema); - - Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words"); - - RegexTokenizer regexTokenizer = new RegexTokenizer() - .setInputCol("sentence") - .setOutputCol("words") - .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false); - - spark.udf().register( - "countTokens", (WrappedArray words) -> words.size(), DataTypes.IntegerType); - - Dataset tokenized = tokenizer.transform(sentenceDataFrame); - tokenized.select("sentence", "words") - .withColumn("tokens", callUDF("countTokens", col("words"))) - .show(false); - - Dataset regexTokenized = regexTokenizer.transform(sentenceDataFrame); - regexTokenized.select("sentence", "words") - .withColumn("tokens", callUDF("countTokens", col("words"))) - .show(false); - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java deleted file mode 100644 index 384e09c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; - -import org.apache.spark.ml.feature.VectorAssembler; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.*; -import static org.apache.spark.sql.types.DataTypes.*; -// $example off$ - -public class JavaVectorAssemblerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaVectorAssemblerExample") - .getOrCreate(); - - // $example on$ - StructType schema = createStructType(new StructField[]{ - createStructField("id", IntegerType, false), - createStructField("hour", IntegerType, false), - createStructField("mobile", DoubleType, false), - createStructField("userFeatures", new VectorUDT(), false), - createStructField("clicked", DoubleType, false) - }); - Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0); - Dataset dataset = spark.createDataFrame(Arrays.asList(row), schema); - - VectorAssembler assembler = new VectorAssembler() - .setInputCols(new String[]{"hour", "mobile", "userFeatures"}) - .setOutputCol("features"); - - Dataset output = assembler.transform(dataset); - System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " + - "'features'"); - output.select("features", "clicked").show(false); - // $example off$ - - spark.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java deleted file mode 100644 index dd9d757..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Map; - -import org.apache.spark.ml.feature.VectorIndexer; -import org.apache.spark.ml.feature.VectorIndexerModel; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off$ - -public class JavaVectorIndexerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaVectorIndexerExample") - .getOrCreate(); - - // $example on$ - Dataset data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); - - VectorIndexer indexer = new VectorIndexer() - .setInputCol("features") - .setOutputCol("indexed") - .setMaxCategories(10); - VectorIndexerModel indexerModel = indexer.fit(data); - - Map> categoryMaps = indexerModel.javaCategoryMaps(); - System.out.print("Chose " + categoryMaps.size() + " categorical features:"); - - for (Integer feature : categoryMaps.keySet()) { - System.out.print(" " + feature); - } - System.out.println(); - - // Create new column "indexed" with categorical values transformed to indices - Dataset indexedData = indexerModel.transform(data); - indexedData.show(); - // $example off$ - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java deleted file mode 100644 index d649a2c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; - -import org.apache.spark.ml.feature.VectorAssembler; -import org.apache.spark.ml.feature.VectorSizeHint; -import org.apache.spark.ml.linalg.VectorUDT; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import static org.apache.spark.sql.types.DataTypes.*; -// $example off$ - -public class JavaVectorSizeHintExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaVectorSizeHintExample") - .getOrCreate(); - - // $example on$ - StructType schema = createStructType(new StructField[]{ - createStructField("id", IntegerType, false), - createStructField("hour", IntegerType, false), - createStructField("mobile", DoubleType, false), - createStructField("userFeatures", new VectorUDT(), false), - createStructField("clicked", DoubleType, false) - }); - Row row0 = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0); - Row row1 = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0); - Dataset dataset = spark.createDataFrame(Arrays.asList(row0, row1), schema); - - VectorSizeHint sizeHint = new VectorSizeHint() - .setInputCol("userFeatures") - .setHandleInvalid("skip") - .setSize(3); - - Dataset datasetWithSize = sizeHint.transform(dataset); - System.out.println("Rows where 'userFeatures' is not the right size are filtered out"); - datasetWithSize.show(false); - - VectorAssembler assembler = new VectorAssembler() - .setInputCols(new String[]{"hour", "mobile", "userFeatures"}) - .setOutputCol("features"); - - // This dataframe can be used by downstream transformers as before - Dataset output = assembler.transform(datasetWithSize); - System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " + - "'features'"); - output.select("features", "clicked").show(false); - // $example off$ - - spark.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java deleted file mode 100644 index 1ae48be..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.sql.SparkSession; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.attribute.Attribute; -import org.apache.spark.ml.attribute.AttributeGroup; -import org.apache.spark.ml.attribute.NumericAttribute; -import org.apache.spark.ml.feature.VectorSlicer; -import org.apache.spark.ml.linalg.Vectors; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.types.*; -// $example off$ - -public class JavaVectorSlicerExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaVectorSlicerExample") - .getOrCreate(); - - // $example on$ - Attribute[] attrs = { - NumericAttribute.defaultAttr().withName("f1"), - NumericAttribute.defaultAttr().withName("f2"), - NumericAttribute.defaultAttr().withName("f3") - }; - AttributeGroup group = new AttributeGroup("userFeatures", attrs); - - List data = Arrays.asList( - RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), - RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) - ); - - Dataset dataset = - spark.createDataFrame(data, (new StructType()).add(group.toStructField())); - - VectorSlicer vectorSlicer = new VectorSlicer() - .setInputCol("userFeatures").setOutputCol("features"); - - vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"}); - // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"}) - - Dataset output = vectorSlicer.transform(dataset); - output.show(false); - // $example off$ - - spark.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java deleted file mode 100644 index fc9b459..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/ml/JavaWord2VecExample.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.ml.feature.Word2Vec; -import org.apache.spark.ml.feature.Word2VecModel; -import org.apache.spark.ml.linalg.Vector; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.*; -// $example off$ - -public class JavaWord2VecExample { - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("JavaWord2VecExample") - .getOrCreate(); - - // $example on$ - // Input data: Each row is a bag of words from a sentence or document. - List data = Arrays.asList( - RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))), - RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))), - RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" "))) - ); - StructType schema = new StructType(new StructField[]{ - new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) - }); - Dataset documentDF = spark.createDataFrame(data, schema); - - // Learn a mapping from words to Vectors. - Word2Vec word2Vec = new Word2Vec() - .setInputCol("text") - .setOutputCol("result") - .setVectorSize(3) - .setMinCount(0); - - Word2VecModel model = word2Vec.fit(documentDF); - Dataset result = model.transform(documentDF); - - for (Row row : result.collectAsList()) { - List text = row.getList(0); - Vector vector = (Vector) row.get(1); - System.out.println("Text: " + text + " => \nVector: " + vector + "\n"); - } - // $example off$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaALS.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaALS.java deleted file mode 100644 index 95a430f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaALS.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; - -import org.apache.spark.mllib.recommendation.ALS; -import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; -import org.apache.spark.mllib.recommendation.Rating; - -import java.util.Arrays; -import java.util.regex.Pattern; - -import scala.Tuple2; - -/** - * Example using MLlib ALS from Java. 
- */ -public final class JavaALS { - - static class ParseRating implements Function { - private static final Pattern COMMA = Pattern.compile(","); - - @Override - public Rating call(String line) { - String[] tok = COMMA.split(line); - int x = Integer.parseInt(tok[0]); - int y = Integer.parseInt(tok[1]); - double rating = Double.parseDouble(tok[2]); - return new Rating(x, y, rating); - } - } - - static class FeaturesToString implements Function, String> { - @Override - public String call(Tuple2 element) { - return element._1() + "," + Arrays.toString(element._2()); - } - } - - public static void main(String[] args) { - - if (args.length < 4) { - System.err.println( - "Usage: JavaALS []"); - System.exit(1); - } - SparkConf sparkConf = new SparkConf().setAppName("JavaALS"); - int rank = Integer.parseInt(args[1]); - int iterations = Integer.parseInt(args[2]); - String outputDir = args[3]; - int blocks = -1; - if (args.length == 5) { - blocks = Integer.parseInt(args[4]); - } - - JavaSparkContext sc = new JavaSparkContext(sparkConf); - JavaRDD lines = sc.textFile(args[0]); - - JavaRDD ratings = lines.map(new ParseRating()); - - MatrixFactorizationModel model = ALS.train(ratings.rdd(), rank, iterations, 0.01, blocks); - - model.userFeatures().toJavaRDD().map(new FeaturesToString()).saveAsTextFile( - outputDir + "/userFeatures"); - model.productFeatures().toJavaRDD().map(new FeaturesToString()).saveAsTextFile( - outputDir + "/productFeatures"); - System.out.println("Final user/product features written to " + outputDir); - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaAssociationRulesExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaAssociationRulesExample.java deleted file mode 100644 index 5f43603..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaAssociationRulesExample.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.fpm.AssociationRules; -import org.apache.spark.mllib.fpm.FPGrowth; -import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset; -// $example off$ - -import org.apache.spark.SparkConf; - -public class JavaAssociationRulesExample { - - public static void main(String[] args) { - - SparkConf sparkConf = new SparkConf().setAppName("JavaAssociationRulesExample"); - JavaSparkContext sc = new JavaSparkContext(sparkConf); - - // $example on$ - JavaRDD> freqItemsets = sc.parallelize(Arrays.asList( - new FreqItemset<>(new String[] {"a"}, 15L), - new FreqItemset<>(new String[] {"b"}, 35L), - new FreqItemset<>(new String[] {"a", "b"}, 12L) - )); - - AssociationRules arules = new AssociationRules() - .setMinConfidence(0.8); - JavaRDD> results = arules.run(freqItemsets); - - for (AssociationRules.Rule rule : results.collect()) { - System.out.println( - rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence()); - } - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java deleted file mode 100644 index b9d0313..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -public class JavaBinaryClassificationMetricsExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Java Binary Classification Metrics Example"); - SparkContext sc = new SparkContext(conf); - // $example on$ - String path = "data/mllib/sample_binary_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. 
- JavaRDD<LabeledPoint>[] splits = - data.randomSplit(new double[]{0.6, 0.4}, 11L); - JavaRDD<LabeledPoint> training = splits[0].cache(); - JavaRDD<LabeledPoint> test = splits[1]; - - // Run training algorithm to build the model. - LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training.rdd()); - - // Clear the prediction threshold so the model will return probabilities - model.clearThreshold(); - - // Compute raw scores on the test set. - JavaPairRDD<Object, Object> predictionAndLabels = test.mapToPair(p -> - new Tuple2<>(model.predict(p.features()), p.label())); - - // Get evaluation metrics. - BinaryClassificationMetrics metrics = - new BinaryClassificationMetrics(predictionAndLabels.rdd()); - - // Precision by threshold - JavaRDD<Tuple2<Object, Object>> precision = metrics.precisionByThreshold().toJavaRDD(); - System.out.println("Precision by threshold: " + precision.collect()); - - // Recall by threshold - JavaRDD<?> recall = metrics.recallByThreshold().toJavaRDD(); - System.out.println("Recall by threshold: " + recall.collect()); - - // F Score by threshold - JavaRDD<?> f1Score = metrics.fMeasureByThreshold().toJavaRDD(); - System.out.println("F1 Score by threshold: " + f1Score.collect()); - - JavaRDD<?> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD(); - System.out.println("F2 Score by threshold: " + f2Score.collect()); - - // Precision-recall curve - JavaRDD<?> prc = metrics.pr().toJavaRDD(); - System.out.println("Precision-recall curve: " + prc.collect()); - - // Thresholds - JavaRDD<Double> thresholds = precision.map(t -> Double.parseDouble(t._1().toString())); - - // ROC Curve - JavaRDD<?> roc = metrics.roc().toJavaRDD(); - System.out.println("ROC curve: " + roc.collect()); - - // AUPRC - System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR()); - - // AUROC - System.out.println("Area under ROC = " + metrics.areaUnderROC()); - - // Save and load model - model.save(sc, "target/tmp/LogisticRegressionModel"); - LogisticRegressionModel.load(sc, "target/tmp/LogisticRegressionModel"); - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java deleted file mode 100644 index f878b55..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.Arrays; -import java.util.List; -// $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.clustering.BisectingKMeans; -import org.apache.spark.mllib.clustering.BisectingKMeansModel; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -// $example off$ - -/** - * Java example for bisecting k-means clustering. - */ -public class JavaBisectingKMeansExample { - public static void main(String[] args) { - SparkConf sparkConf = new SparkConf().setAppName("JavaBisectingKMeansExample"); - JavaSparkContext sc = new JavaSparkContext(sparkConf); - - // $example on$ - List localData = Arrays.asList( - Vectors.dense(0.1, 0.1), Vectors.dense(0.3, 0.3), - Vectors.dense(10.1, 10.1), Vectors.dense(10.3, 10.3), - Vectors.dense(20.1, 20.1), Vectors.dense(20.3, 20.3), - Vectors.dense(30.1, 30.1), Vectors.dense(30.3, 30.3) - ); - JavaRDD data = sc.parallelize(localData, 2); - - BisectingKMeans bkm = new BisectingKMeans() - .setK(4); - BisectingKMeansModel model = bkm.run(data); - - System.out.println("Compute Cost: " + model.computeCost(data)); - - Vector[] clusterCenters = model.clusterCenters(); - for (int i = 0; i < clusterCenters.length; i++) { - Vector clusterCenter = clusterCenters[i]; - System.out.println("Cluster Center " + i + ": " + clusterCenter); - } - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java deleted file mode 100644 index ce354af..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.feature.ChiSqSelector; -import org.apache.spark.mllib.feature.ChiSqSelectorModel; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ - -public class JavaChiSqSelectorExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaChiSqSelectorExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - JavaRDD points = MLUtils.loadLibSVMFile(jsc.sc(), - "data/mllib/sample_libsvm_data.txt").toJavaRDD().cache(); - - // Discretize data in 16 equal bins since ChiSqSelector requires categorical features - // Although features are doubles, the ChiSqSelector treats each unique value as a category - JavaRDD discretizedData = points.map(lp -> { - double[] discretizedFeatures = new double[lp.features().size()]; - for (int i = 0; i < lp.features().size(); ++i) { - discretizedFeatures[i] = Math.floor(lp.features().apply(i) / 16); - } - return new LabeledPoint(lp.label(), Vectors.dense(discretizedFeatures)); - }); - - // Create ChiSqSelector that will select top 50 of 692 features - ChiSqSelector selector = new ChiSqSelector(50); - // Create ChiSqSelector model (selecting features) - ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd()); - // Filter the top 50 features from each feature vector - JavaRDD filteredData = discretizedData.map(lp -> - new LabeledPoint(lp.label(), transformer.transform(lp.features()))); - // $example off$ - - System.out.println("filtered data: "); - filteredData.foreach(System.out::println); - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java deleted file mode 100644 index c0fa0b3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.stat.Statistics; -// $example off$ - -public class JavaCorrelationsExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - JavaDoubleRDD seriesX = jsc.parallelizeDoubles( - Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series - - // must have the same number of partitions and cardinality as seriesX - JavaDoubleRDD seriesY = jsc.parallelizeDoubles( - Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0)); - - // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. - // If a method is not specified, Pearson's method will be used by default. - Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); - System.out.println("Correlation is: " + correlation); - - // note that each Vector is a row and not a column - JavaRDD data = jsc.parallelize( - Arrays.asList( - Vectors.dense(1.0, 10.0, 100.0), - Vectors.dense(2.0, 20.0, 200.0), - Vectors.dense(5.0, 33.0, 366.0) - ) - ); - - // calculate the correlation matrix using Pearson's method. - // Use "spearman" for Spearman's method. - // If a method is not specified, Pearson's method will be used by default. - Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); - System.out.println(correlMatrix.toString()); - // $example off$ - - jsc.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeClassificationExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeClassificationExample.java deleted file mode 100644 index 032c168..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeClassificationExample.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.HashMap; -import java.util.Map; - -import scala.Tuple2; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.tree.DecisionTree; -import org.apache.spark.mllib.tree.model.DecisionTreeModel; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ - -class JavaDecisionTreeClassificationExample { - - public static void main(String[] args) { - - // $example on$ - SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeClassificationExample"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - // Load and parse the data file. - String datapath = "data/mllib/sample_libsvm_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD(); - // Split the data into training and test sets (30% held out for testing) - JavaRDD[] splits = data.randomSplit(new double[]{0.7, 0.3}); - JavaRDD trainingData = splits[0]; - JavaRDD testData = splits[1]; - - // Set parameters. - // Empty categoricalFeaturesInfo indicates all features are continuous. - int numClasses = 2; - Map categoricalFeaturesInfo = new HashMap<>(); - String impurity = "gini"; - int maxDepth = 5; - int maxBins = 32; - - // Train a DecisionTree model for classification. - DecisionTreeModel model = DecisionTree.trainClassifier(trainingData, numClasses, - categoricalFeaturesInfo, impurity, maxDepth, maxBins); - - // Evaluate model on test instances and compute test error - JavaPairRDD predictionAndLabel = - testData.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label())); - double testErr = - predictionAndLabel.filter(pl -> !pl._1().equals(pl._2())).count() / (double) testData.count(); - - System.out.println("Test Error: " + testErr); - System.out.println("Learned classification tree model:\n" + model.toDebugString()); - - // Save and load model - model.save(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel"); - DecisionTreeModel sameModel = DecisionTreeModel - .load(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel"); - // $example off$ - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeRegressionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeRegressionExample.java deleted file mode 100644 index f222c38..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeRegressionExample.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.HashMap; -import java.util.Map; - -import scala.Tuple2; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.tree.DecisionTree; -import org.apache.spark.mllib.tree.model.DecisionTreeModel; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ - -class JavaDecisionTreeRegressionExample { - - public static void main(String[] args) { - - // $example on$ - SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeRegressionExample"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - // Load and parse the data file. - String datapath = "data/mllib/sample_libsvm_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD(); - // Split the data into training and test sets (30% held out for testing) - JavaRDD[] splits = data.randomSplit(new double[]{0.7, 0.3}); - JavaRDD trainingData = splits[0]; - JavaRDD testData = splits[1]; - - // Set parameters. - // Empty categoricalFeaturesInfo indicates all features are continuous. - Map categoricalFeaturesInfo = new HashMap<>(); - String impurity = "variance"; - int maxDepth = 5; - int maxBins = 32; - - // Train a DecisionTree model. - DecisionTreeModel model = DecisionTree.trainRegressor(trainingData, - categoricalFeaturesInfo, impurity, maxDepth, maxBins); - - // Evaluate model on test instances and compute test error - JavaPairRDD predictionAndLabel = - testData.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label())); - double testMSE = predictionAndLabel.mapToDouble(pl -> { - double diff = pl._1() - pl._2(); - return diff * diff; - }).mean(); - System.out.println("Test Mean Squared Error: " + testMSE); - System.out.println("Learned regression tree model:\n" + model.toDebugString()); - - // Save and load model - model.save(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel"); - DecisionTreeModel sameModel = DecisionTreeModel - .load(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel"); - // $example off$ - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java deleted file mode 100644 index 2d45c61..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.Arrays; -// $example off$ - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.feature.ElementwiseProduct; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -// $example off$ - -public class JavaElementwiseProductExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaElementwiseProductExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // Create some vector data; also works for sparse vectors - JavaRDD data = jsc.parallelize(Arrays.asList( - Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))); - Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0); - ElementwiseProduct transformer = new ElementwiseProduct(transformingVector); - - // Batch transform and per-row transform give the same results: - JavaRDD transformedData = transformer.transform(data); - JavaRDD transformedData2 = data.map(transformer::transform); - // $example off$ - - System.out.println("transformedData: "); - transformedData.foreach(System.out::println); - - System.out.println("transformedData2: "); - transformedData2.foreach(System.out::println); - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaGaussianMixtureExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaGaussianMixtureExample.java deleted file mode 100644 index 5792e5a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaGaussianMixtureExample.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; - -// $example on$ -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.clustering.GaussianMixture; -import org.apache.spark.mllib.clustering.GaussianMixtureModel; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -// $example off$ - -public class JavaGaussianMixtureExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaGaussianMixtureExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // Load and parse data - String path = "data/mllib/gmm_data.txt"; - JavaRDD data = jsc.textFile(path); - JavaRDD parsedData = data.map(s -> { - String[] sarray = s.trim().split(" "); - double[] values = new double[sarray.length]; - for (int i = 0; i < sarray.length; i++) { - values[i] = Double.parseDouble(sarray[i]); - } - return Vectors.dense(values); - }); - parsedData.cache(); - - // Cluster the data into two classes using GaussianMixture - GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd()); - - // Save and load GaussianMixtureModel - gmm.save(jsc.sc(), "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel"); - GaussianMixtureModel sameModel = GaussianMixtureModel.load(jsc.sc(), - "target/org.apache.spark.JavaGaussianMixtureExample/GaussianMixtureModel"); - - // Output the parameters of the mixture model - for (int j = 0; j < gmm.k(); j++) { - System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n", - gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma()); - } - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostingClassificationExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostingClassificationExample.java deleted file mode 100644 index 521ee96..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostingClassificationExample.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.HashMap; -import java.util.Map; - -import scala.Tuple2; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.tree.GradientBoostedTrees; -import org.apache.spark.mllib.tree.configuration.BoostingStrategy; -import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ - -public class JavaGradientBoostingClassificationExample { - public static void main(String[] args) { - // $example on$ - SparkConf sparkConf = new SparkConf() - .setAppName("JavaGradientBoostedTreesClassificationExample"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - - // Load and parse the data file. - String datapath = "data/mllib/sample_libsvm_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD(); - // Split the data into training and test sets (30% held out for testing) - JavaRDD[] splits = data.randomSplit(new double[]{0.7, 0.3}); - JavaRDD trainingData = splits[0]; - JavaRDD testData = splits[1]; - - // Train a GradientBoostedTrees model. - // The defaultParams for Classification use LogLoss by default. - BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Classification"); - boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice. - boostingStrategy.getTreeStrategy().setNumClasses(2); - boostingStrategy.getTreeStrategy().setMaxDepth(5); - // Empty categoricalFeaturesInfo indicates all features are continuous. - Map categoricalFeaturesInfo = new HashMap<>(); - boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo); - - GradientBoostedTreesModel model = GradientBoostedTrees.train(trainingData, boostingStrategy); - - // Evaluate model on test instances and compute test error - JavaPairRDD predictionAndLabel = - testData.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label())); - double testErr = - predictionAndLabel.filter(pl -> !pl._1().equals(pl._2())).count() / (double) testData.count(); - System.out.println("Test Error: " + testErr); - System.out.println("Learned classification GBT model:\n" + model.toDebugString()); - - // Save and load model - model.save(jsc.sc(), "target/tmp/myGradientBoostingClassificationModel"); - GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(), - "target/tmp/myGradientBoostingClassificationModel"); - // $example off$ - - jsc.stop(); - } - -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostingRegressionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostingRegressionExample.java deleted file mode 100644 index b345d19..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostingRegressionExample.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.HashMap; -import java.util.Map; - -import scala.Tuple2; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.tree.GradientBoostedTrees; -import org.apache.spark.mllib.tree.configuration.BoostingStrategy; -import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ - -public class JavaGradientBoostingRegressionExample { - public static void main(String[] args) { - // $example on$ - SparkConf sparkConf = new SparkConf() - .setAppName("JavaGradientBoostedTreesRegressionExample"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - // Load and parse the data file. - String datapath = "data/mllib/sample_libsvm_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD(); - // Split the data into training and test sets (30% held out for testing) - JavaRDD[] splits = data.randomSplit(new double[]{0.7, 0.3}); - JavaRDD trainingData = splits[0]; - JavaRDD testData = splits[1]; - - // Train a GradientBoostedTrees model. - // The defaultParams for Regression use SquaredError by default. - BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Regression"); - boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice. - boostingStrategy.getTreeStrategy().setMaxDepth(5); - // Empty categoricalFeaturesInfo indicates all features are continuous. 
- Map categoricalFeaturesInfo = new HashMap<>(); - boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo); - - GradientBoostedTreesModel model = GradientBoostedTrees.train(trainingData, boostingStrategy); - - // Evaluate model on test instances and compute test error - JavaPairRDD predictionAndLabel = - testData.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label())); - double testMSE = predictionAndLabel.mapToDouble(pl -> { - double diff = pl._1() - pl._2(); - return diff * diff; - }).mean(); - System.out.println("Test Mean Squared Error: " + testMSE); - System.out.println("Learned regression GBT model:\n" + model.toDebugString()); - - // Save and load model - model.save(jsc.sc(), "target/tmp/myGradientBoostingRegressionModel"); - GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(), - "target/tmp/myGradientBoostingRegressionModel"); - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java deleted file mode 100644 index 2732736..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; - -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Matrices; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.stat.Statistics; -import org.apache.spark.mllib.stat.test.ChiSqTestResult; -// $example off$ - -public class JavaHypothesisTestingExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // a vector composed of the frequencies of events - Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); - - // compute the goodness of fit. If a second vector to test against is not supplied - // as a parameter, the test runs against a uniform distribution. - ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); - // summary of the test including the p-value, degrees of freedom, test statistic, - // the method used, and the null hypothesis. 
- System.out.println(goodnessOfFitTestResult + "\n"); - - // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); - - // conduct Pearson's independence test on the input contingency matrix - ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); - // summary of the test including the p-value, degrees of freedom... - System.out.println(independenceTestResult + "\n"); - - // an RDD of labeled points - JavaRDD obs = jsc.parallelize( - Arrays.asList( - new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)), - new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)), - new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)) - ) - ); - - // The contingency table is constructed from the raw (label, feature) pairs and used to conduct - // the independence test. Returns an array containing the ChiSquaredTestResult for every feature - // against the label. - ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); - int i = 1; - for (ChiSqTestResult result : featureTestResults) { - System.out.println("Column " + i + ":"); - System.out.println(result + "\n"); // summary of the test - i++; - } - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java deleted file mode 100644 index fe611c9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.mllib.stat.Statistics; -import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; -// $example off$ - -public class JavaHypothesisTestingKolmogorovSmirnovTestExample { - public static void main(String[] args) { - - SparkConf conf = - new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); - KolmogorovSmirnovTestResult testResult = - Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); - // summary of the test including the p-value, test statistic, and null hypothesis - // if our p-value indicates significance, we can reject the null hypothesis - System.out.println(testResult); - // $example off$ - - jsc.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java deleted file mode 100644 index adebafe..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaIsotonicRegressionExample.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.examples.mllib; - -// $example on$ - -import scala.Tuple2; -import scala.Tuple3; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.regression.IsotonicRegression; -import org.apache.spark.mllib.regression.IsotonicRegressionModel; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ -import org.apache.spark.SparkConf; - -public class JavaIsotonicRegressionExample { - public static void main(String[] args) { - SparkConf sparkConf = new SparkConf().setAppName("JavaIsotonicRegressionExample"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - // $example on$ - JavaRDD data = MLUtils.loadLibSVMFile( - jsc.sc(), "data/mllib/sample_isotonic_regression_libsvm_data.txt").toJavaRDD(); - - // Create label, feature, weight tuples from input data with weight set to default value 1.0. - JavaRDD> parsedData = data.map(point -> - new Tuple3<>(point.label(), point.features().apply(0), 1.0)); - - // Split data into training (60%) and test (40%) sets. 
- JavaRDD>[] splits = - parsedData.randomSplit(new double[]{0.6, 0.4}, 11L); - JavaRDD> training = splits[0]; - JavaRDD> test = splits[1]; - - // Create isotonic regression model from training data. - // Isotonic parameter defaults to true so it is only shown for demonstration - IsotonicRegressionModel model = new IsotonicRegression().setIsotonic(true).run(training); - - // Create tuples of predicted and real labels. - JavaPairRDD predictionAndLabel = test.mapToPair(point -> - new Tuple2<>(model.predict(point._2()), point._1())); - - // Calculate mean squared error between predicted and real labels. - double meanSquaredError = predictionAndLabel.mapToDouble(pl -> { - double diff = pl._1() - pl._2(); - return diff * diff; - }).mean(); - System.out.println("Mean Squared Error = " + meanSquaredError); - - // Save and load model - model.save(jsc.sc(), "target/tmp/myIsotonicRegressionModel"); - IsotonicRegressionModel sameModel = - IsotonicRegressionModel.load(jsc.sc(), "target/tmp/myIsotonicRegressionModel"); - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeansExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeansExample.java deleted file mode 100644 index f172756..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeansExample.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; - -// $example on$ -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.clustering.KMeans; -import org.apache.spark.mllib.clustering.KMeansModel; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -// $example off$ - -public class JavaKMeansExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaKMeansExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // Load and parse data - String path = "data/mllib/kmeans_data.txt"; - JavaRDD data = jsc.textFile(path); - JavaRDD parsedData = data.map(s -> { - String[] sarray = s.split(" "); - double[] values = new double[sarray.length]; - for (int i = 0; i < sarray.length; i++) { - values[i] = Double.parseDouble(sarray[i]); - } - return Vectors.dense(values); - }); - parsedData.cache(); - - // Cluster the data into two classes using KMeans - int numClusters = 2; - int numIterations = 20; - KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations); - - System.out.println("Cluster centers:"); - for (Vector center: clusters.clusterCenters()) { - System.out.println(" " + center); - } - double cost = clusters.computeCost(parsedData.rdd()); - System.out.println("Cost: " + cost); - - // Evaluate clustering by computing Within Set Sum of Squared Errors - double WSSSE = clusters.computeCost(parsedData.rdd()); - System.out.println("Within Set Sum of Squared Errors = " + WSSSE); - - // Save and load model - clusters.save(jsc.sc(), "target/org/apache/spark/JavaKMeansExample/KMeansModel"); - KMeansModel sameModel = KMeansModel.load(jsc.sc(), - "target/org/apache/spark/JavaKMeansExample/KMeansModel"); - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java deleted file mode 100644 index 41de0d9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.stat.KernelDensity; -// $example off$ - -public class JavaKernelDensityEstimationExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // an RDD of sample data - JavaRDD data = jsc.parallelize( - Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); - - // Construct the density estimator with the sample data - // and a standard deviation for the Gaussian kernels - KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0); - - // Find density estimates for the given values - double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0}); - - System.out.println(Arrays.toString(densities)); - // $example off$ - - jsc.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLBFGSExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLBFGSExample.java deleted file mode 100644 index 3fdc03a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLBFGSExample.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.Arrays; - -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.optimization.*; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; -// $example off$ - -public class JavaLBFGSExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("L-BFGS Example"); - SparkContext sc = new SparkContext(conf); - - // $example on$ - String path = "data/mllib/sample_libsvm_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - int numFeatures = data.take(1).get(0).features().size(); - - // Split initial RDD into two... [60% training data, 40% testing data]. 
- JavaRDD trainingInit = data.sample(false, 0.6, 11L); - JavaRDD test = data.subtract(trainingInit); - - // Append 1 into the training data as intercept. - JavaPairRDD training = data.mapToPair(p -> - new Tuple2<>(p.label(), MLUtils.appendBias(p.features()))); - training.cache(); - - // Run training algorithm to build the model. - int numCorrections = 10; - double convergenceTol = 1e-4; - int maxNumIterations = 20; - double regParam = 0.1; - Vector initialWeightsWithIntercept = Vectors.dense(new double[numFeatures + 1]); - - Tuple2 result = LBFGS.runLBFGS( - training.rdd(), - new LogisticGradient(), - new SquaredL2Updater(), - numCorrections, - convergenceTol, - maxNumIterations, - regParam, - initialWeightsWithIntercept); - Vector weightsWithIntercept = result._1(); - double[] loss = result._2(); - - LogisticRegressionModel model = new LogisticRegressionModel( - Vectors.dense(Arrays.copyOf(weightsWithIntercept.toArray(), weightsWithIntercept.size() - 1)), - (weightsWithIntercept.toArray())[weightsWithIntercept.size() - 1]); - - // Clear the default threshold. - model.clearThreshold(); - - // Compute raw scores on the test set. - JavaPairRDD scoreAndLabels = test.mapToPair(p -> - new Tuple2<>(model.predict(p.features()), p.label())); - - // Get evaluation metrics. - BinaryClassificationMetrics metrics = - new BinaryClassificationMetrics(scoreAndLabels.rdd()); - double auROC = metrics.areaUnderROC(); - - System.out.println("Loss of each step in training process"); - for (double l : loss) { - System.out.println(l); - } - System.out.println("Area under ROC = " + auROC); - // $example off$ - - sc.stop(); - } -} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLatentDirichletAllocationExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLatentDirichletAllocationExample.java deleted file mode 100644 index 887edf8..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLatentDirichletAllocationExample.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; - -// $example on$ -import scala.Tuple2; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.clustering.DistributedLDAModel; -import org.apache.spark.mllib.clustering.LDA; -import org.apache.spark.mllib.clustering.LDAModel; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -// $example off$ - -public class JavaLatentDirichletAllocationExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // Load and parse the data - String path = "data/mllib/sample_lda_data.txt"; - JavaRDD data = jsc.textFile(path); - JavaRDD parsedData = data.map(s -> { - String[] sarray = s.trim().split(" "); - double[] values = new double[sarray.length]; - for (int i = 0; i < sarray.length; i++) { - values[i] = Double.parseDouble(sarray[i]); - } - return Vectors.dense(values); - }); - // Index documents with unique IDs - JavaPairRDD corpus = - JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(Tuple2::swap)); - corpus.cache(); - - // Cluster the documents into three topics using LDA - LDAModel ldaModel = new LDA().setK(3).run(corpus); - - // Output topics. Each is a distribution over words (matching word count vectors) - System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize() - + " words):"); - Matrix topics = ldaModel.topicsMatrix(); - for (int topic = 0; topic < 3; topic++) { - System.out.print("Topic " + topic + ":"); - for (int word = 0; word < ldaModel.vocabSize(); word++) { - System.out.print(" " + topics.apply(word, topic)); - } - System.out.println(); - } - - ldaModel.save(jsc.sc(), - "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel"); - DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(), - "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel"); - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionWithSGDExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionWithSGDExample.java deleted file mode 100644 index 324a781..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionWithSGDExample.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; - -// $example on$ -import scala.Tuple2; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.regression.LinearRegressionModel; -import org.apache.spark.mllib.regression.LinearRegressionWithSGD; -// $example off$ - -/** - * Example for LinearRegressionWithSGD. - */ -public class JavaLinearRegressionWithSGDExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithSGDExample"); - JavaSparkContext sc = new JavaSparkContext(conf); - - // $example on$ - // Load and parse the data - String path = "data/mllib/ridge-data/lpsa.data"; - JavaRDD data = sc.textFile(path); - JavaRDD parsedData = data.map(line -> { - String[] parts = line.split(","); - String[] features = parts[1].split(" "); - double[] v = new double[features.length]; - for (int i = 0; i < features.length - 1; i++) { - v[i] = Double.parseDouble(features[i]); - } - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); - }); - parsedData.cache(); - - // Building the model - int numIterations = 100; - double stepSize = 0.00000001; - LinearRegressionModel model = - LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize); - - // Evaluate model on training examples and compute training error - JavaPairRDD valuesAndPreds = parsedData.mapToPair(point -> - new Tuple2<>(model.predict(point.features()), point.label())); - - double MSE = valuesAndPreds.mapToDouble(pair -> { - double diff = pair._1() - pair._2(); - return diff * diff; - }).mean(); - System.out.println("training Mean Squared Error = " + MSE); - - // Save and load model - model.save(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), - "target/tmp/javaLinearRegressionWithSGDModel"); - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java deleted file mode 100644 index 26b8a6e..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -// $example on$ -import scala.Tuple2; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; -import org.apache.spark.mllib.evaluation.MulticlassMetrics; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ - -/** - * Example for LogisticRegressionWithLBFGS. - */ -public class JavaLogisticRegressionWithLBFGSExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample"); - SparkContext sc = new SparkContext(conf); - // $example on$ - String path = "data/mllib/sample_libsvm_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. - JavaRDD[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; - - // Run training algorithm to build the model. - LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(10) - .run(training.rdd()); - - // Compute raw scores on the test set. - JavaPairRDD predictionAndLabels = test.mapToPair(p -> - new Tuple2<>(model.predict(p.features()), p.label())); - - // Get evaluation metrics. - MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); - double accuracy = metrics.accuracy(); - System.out.println("Accuracy = " + accuracy); - - // Save and load model - model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, - "target/tmp/javaLogisticRegressionWithLBFGSModel"); - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java deleted file mode 100644 index bc99dc0..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.mllib.evaluation.MultilabelMetrics; -import org.apache.spark.SparkConf; -// $example off$ - -public class JavaMultiLabelClassificationMetricsExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics Example"); - JavaSparkContext sc = new JavaSparkContext(conf); - // $example on$ - List> data = Arrays.asList( - new Tuple2<>(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), - new Tuple2<>(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2<>(new double[]{}, new double[]{0.0}), - new Tuple2<>(new double[]{2.0}, new double[]{2.0}), - new Tuple2<>(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), - new Tuple2<>(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2<>(new double[]{1.0}, new double[]{1.0, 2.0}) - ); - JavaRDD> scoreAndLabels = sc.parallelize(data); - - // Instantiate metrics object - MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd()); - - // Summary stats - System.out.format("Recall = %f\n", metrics.recall()); - System.out.format("Precision = %f\n", metrics.precision()); - System.out.format("F1 measure = %f\n", metrics.f1Measure()); - System.out.format("Accuracy = %f\n", metrics.accuracy()); - - // Stats by labels - for (int i = 0; i < metrics.labels().length - 1; i++) { - System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision( - metrics.labels()[i])); - System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall( - metrics.labels()[i])); - System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure( - metrics.labels()[i])); - } - - // Micro stats - System.out.format("Micro recall = %f\n", metrics.microRecall()); - System.out.format("Micro precision = %f\n", metrics.microPrecision()); - System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure()); - - // Hamming loss - System.out.format("Hamming loss = %f\n", metrics.hammingLoss()); - - // Subset accuracy - System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java deleted file mode 100644 index 0367038..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; -import org.apache.spark.mllib.evaluation.MulticlassMetrics; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.mllib.linalg.Matrix; -// $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -public class JavaMulticlassClassificationMetricsExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics Example"); - SparkContext sc = new SparkContext(conf); - // $example on$ - String path = "data/mllib/sample_multiclass_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. - JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; - - // Run training algorithm to build the model. - LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(3) - .run(training.rdd()); - - // Compute raw scores on the test set. - JavaPairRDD predictionAndLabels = test.mapToPair(p -> - new Tuple2<>(model.predict(p.features()), p.label())); - - // Get evaluation metrics. - MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); - - // Confusion matrix - Matrix confusion = metrics.confusionMatrix(); - System.out.println("Confusion matrix: \n" + confusion); - - // Overall statistics - System.out.println("Accuracy = " + metrics.accuracy()); - - // Stats by labels - for (int i = 0; i < metrics.labels().length; i++) { - System.out.format("Class %f precision = %f\n", metrics.labels()[i],metrics.precision( - metrics.labels()[i])); - System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall( - metrics.labels()[i])); - System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure( - metrics.labels()[i])); - } - - //Weighted stats - System.out.format("Weighted precision = %f\n", metrics.weightedPrecision()); - System.out.format("Weighted recall = %f\n", metrics.weightedRecall()); - System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure()); - System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); - - // Save and load model - model.save(sc, "target/tmp/LogisticRegressionModel"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, - "target/tmp/LogisticRegressionModel"); - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java deleted file mode 100644 index d80dbe8..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import scala.Tuple2; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.classification.NaiveBayes; -import org.apache.spark.mllib.classification.NaiveBayesModel; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ -import org.apache.spark.SparkConf; - -public class JavaNaiveBayesExample { - public static void main(String[] args) { - SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - // $example on$ - String path = "data/mllib/sample_libsvm_data.txt"; - JavaRDD inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD(); - JavaRDD[] tmp = inputData.randomSplit(new double[]{0.6, 0.4}); - JavaRDD training = tmp[0]; // training set - JavaRDD test = tmp[1]; // test set - NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0); - JavaPairRDD predictionAndLabel = - test.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label())); - double accuracy = - predictionAndLabel.filter(pl -> pl._1().equals(pl._2())).count() / (double) test.count(); - - // Save and load model - model.save(jsc.sc(), "target/tmp/myNaiveBayesModel"); - NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel"); - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java deleted file mode 100644 index 0a7dc62..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.Arrays; -import java.util.List; -// $example off$ - -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; -// $example on$ -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; -// $example off$ - -/** - * Example for compute principal components on a 'RowMatrix'. - */ -public class JavaPCAExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("PCA Example"); - SparkContext sc = new SparkContext(conf); - JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); - - // $example on$ - List data = Arrays.asList( - Vectors.sparse(5, new int[] {1, 3}, new double[] {1.0, 7.0}), - Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), - Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) - ); - - JavaRDD rows = jsc.parallelize(data); - - // Create a RowMatrix from JavaRDD. - RowMatrix mat = new RowMatrix(rows.rdd()); - - // Compute the top 4 principal components. - // Principal components are stored in a local dense matrix. - Matrix pc = mat.computePrincipalComponents(4); - - // Project the rows to the linear space spanned by the top 4 principal components. - RowMatrix projected = mat.multiply(pc); - // $example off$ - Vector[] collectPartitions = (Vector[])projected.rows().collect(); - System.out.println("Projected vector of principal component:"); - for (Vector vector : collectPartitions) { - System.out.println("\t" + vector); - } - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java deleted file mode 100644 index 5155f18..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import java.util.Arrays; - -import scala.Tuple3; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import org.apache.spark.mllib.clustering.PowerIterationClustering; -import org.apache.spark.mllib.clustering.PowerIterationClusteringModel; -// $example off$ - -/** - * Java example for graph clustering using power iteration clustering (PIC). 
- */ -public class JavaPowerIterationClusteringExample { - public static void main(String[] args) { - SparkConf sparkConf = new SparkConf().setAppName("JavaPowerIterationClusteringExample"); - JavaSparkContext sc = new JavaSparkContext(sparkConf); - - @SuppressWarnings("unchecked") - // $example on$ - JavaRDD> similarities = sc.parallelize(Arrays.asList( - new Tuple3<>(0L, 1L, 0.9), - new Tuple3<>(1L, 2L, 0.9), - new Tuple3<>(2L, 3L, 0.9), - new Tuple3<>(3L, 4L, 0.1), - new Tuple3<>(4L, 5L, 0.9))); - - PowerIterationClustering pic = new PowerIterationClustering() - .setK(2) - .setMaxIterations(10); - PowerIterationClusteringModel model = pic.run(similarities); - - for (PowerIterationClustering.Assignment a: model.assignments().toJavaRDD().collect()) { - System.out.println(a.id() + " -> " + a.cluster()); - } - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaPrefixSpanExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaPrefixSpanExample.java deleted file mode 100644 index 1634075..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaPrefixSpanExample.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.Arrays; -import java.util.List; -// $example off$ -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import org.apache.spark.mllib.fpm.PrefixSpan; -import org.apache.spark.mllib.fpm.PrefixSpanModel; -// $example off$ -import org.apache.spark.SparkConf; - -public class JavaPrefixSpanExample { - - public static void main(String[] args) { - - SparkConf sparkConf = new SparkConf().setAppName("JavaPrefixSpanExample"); - JavaSparkContext sc = new JavaSparkContext(sparkConf); - - // $example on$ - JavaRDD>> sequences = sc.parallelize(Arrays.asList( - Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3)), - Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), Arrays.asList(1, 2)), - Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5)), - Arrays.asList(Arrays.asList(6)) - ), 2); - PrefixSpan prefixSpan = new PrefixSpan() - .setMinSupport(0.5) - .setMaxPatternLength(5); - PrefixSpanModel model = prefixSpan.run(sequences); - for (PrefixSpan.FreqSequence freqSeq: model.freqSequences().toJavaRDD().collect()) { - System.out.println(freqSeq.javaSequence() + ", " + freqSeq.freq()); - } - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestClassificationExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestClassificationExample.java deleted file mode 100644 index 6998ce2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestClassificationExample.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.HashMap; -import java.util.Map; - -import scala.Tuple2; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.tree.RandomForest; -import org.apache.spark.mllib.tree.model.RandomForestModel; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ - -public class JavaRandomForestClassificationExample { - public static void main(String[] args) { - // $example on$ - SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestClassificationExample"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - // Load and parse the data file. 
- String datapath = "data/mllib/sample_libsvm_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD(); - // Split the data into training and test sets (30% held out for testing) - JavaRDD[] splits = data.randomSplit(new double[]{0.7, 0.3}); - JavaRDD trainingData = splits[0]; - JavaRDD testData = splits[1]; - - // Train a RandomForest model. - // Empty categoricalFeaturesInfo indicates all features are continuous. - Integer numClasses = 2; - Map categoricalFeaturesInfo = new HashMap<>(); - Integer numTrees = 3; // Use more in practice. - String featureSubsetStrategy = "auto"; // Let the algorithm choose. - String impurity = "gini"; - Integer maxDepth = 5; - Integer maxBins = 32; - Integer seed = 12345; - - RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses, - categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, - seed); - - // Evaluate model on test instances and compute test error - JavaPairRDD predictionAndLabel = - testData.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label())); - double testErr = - predictionAndLabel.filter(pl -> !pl._1().equals(pl._2())).count() / (double) testData.count(); - System.out.println("Test Error: " + testErr); - System.out.println("Learned classification forest model:\n" + model.toDebugString()); - - // Save and load model - model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel"); - RandomForestModel sameModel = RandomForestModel.load(jsc.sc(), - "target/tmp/myRandomForestClassificationModel"); - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestRegressionExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestRegressionExample.java deleted file mode 100644 index 4a0f55f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestRegressionExample.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.HashMap; -import java.util.Map; - -import scala.Tuple2; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.tree.RandomForest; -import org.apache.spark.mllib.tree.model.RandomForestModel; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.SparkConf; -// $example off$ - -public class JavaRandomForestRegressionExample { - public static void main(String[] args) { - // $example on$ - SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestRegressionExample"); - JavaSparkContext jsc = new JavaSparkContext(sparkConf); - // Load and parse the data file. - String datapath = "data/mllib/sample_libsvm_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD(); - // Split the data into training and test sets (30% held out for testing) - JavaRDD[] splits = data.randomSplit(new double[]{0.7, 0.3}); - JavaRDD trainingData = splits[0]; - JavaRDD testData = splits[1]; - - // Set parameters. - // Empty categoricalFeaturesInfo indicates all features are continuous. - Map categoricalFeaturesInfo = new HashMap<>(); - int numTrees = 3; // Use more in practice. - String featureSubsetStrategy = "auto"; // Let the algorithm choose. - String impurity = "variance"; - int maxDepth = 4; - int maxBins = 32; - int seed = 12345; - // Train a RandomForest model. - RandomForestModel model = RandomForest.trainRegressor(trainingData, - categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed); - - // Evaluate model on test instances and compute test error - JavaPairRDD predictionAndLabel = - testData.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label())); - double testMSE = predictionAndLabel.mapToDouble(pl -> { - double diff = pl._1() - pl._2(); - return diff * diff; - }).mean(); - System.out.println("Test Mean Squared Error: " + testMSE); - System.out.println("Learned regression forest model:\n" + model.toDebugString()); - - // Save and load model - model.save(jsc.sc(), "target/tmp/myRandomForestRegressionModel"); - RandomForestModel sameModel = RandomForestModel.load(jsc.sc(), - "target/tmp/myRandomForestRegressionModel"); - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java deleted file mode 100644 index dc9970d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.*; - -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.mllib.evaluation.RegressionMetrics; -import org.apache.spark.mllib.evaluation.RankingMetrics; -import org.apache.spark.mllib.recommendation.ALS; -import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; -import org.apache.spark.mllib.recommendation.Rating; -// $example off$ -import org.apache.spark.SparkConf; - -public class JavaRankingMetricsExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Java Ranking Metrics Example"); - JavaSparkContext sc = new JavaSparkContext(conf); - // $example on$ - String path = "data/mllib/sample_movielens_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD ratings = data.map(line -> { - String[] parts = line.split("::"); - return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double - .parseDouble(parts[2]) - 2.5); - }); - ratings.cache(); - - // Train an ALS model - MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01); - - // Get top 10 recommendations for every user and scale ratings from 0 to 1 - JavaRDD> userRecs = model.recommendProductsForUsers(10).toJavaRDD(); - JavaRDD> userRecsScaled = userRecs.map(t -> { - Rating[] scaledRatings = new Rating[t._2().length]; - for (int i = 0; i < scaledRatings.length; i++) { - double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); - scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); - } - return new Tuple2<>(t._1(), scaledRatings); - }); - JavaPairRDD userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); - - // Map ratings to 1 or 0, 1 indicating a movie that should be recommended - JavaRDD binarizedRatings = ratings.map(r -> { - double binaryRating; - if (r.rating() > 0.0) { - binaryRating = 1.0; - } else { - binaryRating = 0.0; - } - return new Rating(r.user(), r.product(), binaryRating); - }); - - // Group ratings by common user - JavaPairRDD> userMovies = binarizedRatings.groupBy(Rating::user); - - // Get true relevant documents from all user ratings - JavaPairRDD> userMoviesList = userMovies.mapValues(docs -> { - List products = new ArrayList<>(); - for (Rating r : docs) { - if (r.rating() > 0.0) { - products.add(r.product()); - } - } - return products; - }); - - // Extract the product id from each recommendation - JavaPairRDD> userRecommendedList = userRecommended.mapValues(docs -> { - List products = new ArrayList<>(); - for (Rating r : docs) { - products.add(r.product()); - } - return products; - }); - JavaRDD, List>> relevantDocs = userMoviesList.join( - userRecommendedList).values(); - - // Instantiate the metrics object - RankingMetrics metrics = RankingMetrics.of(relevantDocs); - - // Precision and NDCG at k - Integer[] kVector = {1, 3, 5}; - for (Integer k : kVector) { - System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k)); - System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k)); - } - - // Mean average precision - 
System.out.format("Mean average precision = %f\n", metrics.meanAveragePrecision()); - - // Evaluate the model using numerical ratings and regression metrics - JavaRDD> userProducts = - ratings.map(r -> new Tuple2<>(r.user(), r.product())); - - JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( - model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(r -> - new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()))); - JavaRDD> ratesAndPreds = - JavaPairRDD.fromJavaRDD(ratings.map(r -> - new Tuple2, Object>( - new Tuple2<>(r.user(), r.product()), - r.rating()) - )).join(predictions).values(); - - // Create regression metrics object - RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); - - // Root mean squared error - System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError()); - - // R-squared - System.out.format("R-squared = %f\n", regressionMetrics.r2()); - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java deleted file mode 100644 index 1ee68da..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRecommendationExample.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.mllib.recommendation.ALS; -import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; -import org.apache.spark.mllib.recommendation.Rating; -import org.apache.spark.SparkConf; -// $example off$ - -public class JavaRecommendationExample { - public static void main(String[] args) { - // $example on$ - SparkConf conf = new SparkConf().setAppName("Java Collaborative Filtering Example"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // Load and parse the data - String path = "data/mllib/als/test.data"; - JavaRDD data = jsc.textFile(path); - JavaRDD ratings = data.map(s -> { - String[] sarray = s.split(","); - return new Rating(Integer.parseInt(sarray[0]), - Integer.parseInt(sarray[1]), - Double.parseDouble(sarray[2])); - }); - - // Build the recommendation model using ALS - int rank = 10; - int numIterations = 10; - MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01); - - // Evaluate the model on rating data - JavaRDD> userProducts = - ratings.map(r -> new Tuple2<>(r.user(), r.product())); - JavaPairRDD, Double> predictions = JavaPairRDD.fromJavaRDD( - model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD() - .map(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating())) - ); - JavaRDD> ratesAndPreds = JavaPairRDD.fromJavaRDD( - ratings.map(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()))) - .join(predictions).values(); - double MSE = ratesAndPreds.mapToDouble(pair -> { - double err = pair._1() - pair._2(); - return err * err; - }).mean(); - System.out.println("Mean Squared Error = " + MSE); - - // Save and load model - model.save(jsc.sc(), "target/tmp/myCollaborativeFilter"); - MatrixFactorizationModel sameModel = MatrixFactorizationModel.load(jsc.sc(), - "target/tmp/myCollaborativeFilter"); - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java deleted file mode 100644 index 00033b5..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.regression.LinearRegressionModel; -import org.apache.spark.mllib.regression.LinearRegressionWithSGD; -import org.apache.spark.mllib.evaluation.RegressionMetrics; -import org.apache.spark.SparkConf; -// $example off$ - -public class JavaRegressionMetricsExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Java Regression Metrics Example"); - JavaSparkContext sc = new JavaSparkContext(conf); - // $example on$ - // Load and parse the data - String path = "data/mllib/sample_linear_regression_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD parsedData = data.map(line -> { - String[] parts = line.split(" "); - double[] v = new double[parts.length - 1]; - for (int i = 1; i < parts.length; i++) { - v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); - } - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); - }); - parsedData.cache(); - - // Building the model - int numIterations = 100; - LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), - numIterations); - - // Evaluate model on training examples and compute training error - JavaPairRDD valuesAndPreds = parsedData.mapToPair(point -> - new Tuple2<>(model.predict(point.features()), point.label())); - - // Instantiate metrics object - RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd()); - - // Squared error - System.out.format("MSE = %f\n", metrics.meanSquaredError()); - System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError()); - - // R-squared - System.out.format("R Squared = %f\n", metrics.r2()); - - // Mean absolute error - System.out.format("MAE = %f\n", metrics.meanAbsoluteError()); - - // Explained variance - System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); - - // Save and load model - model.save(sc.sc(), "target/tmp/LogisticRegressionModel"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), - "target/tmp/LogisticRegressionModel"); - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java deleted file mode 100644 index 802be39..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.Arrays; -import java.util.List; -// $example off$ - -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; -// $example on$ -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.SingularValueDecomposition; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; -// $example off$ - -/** - * Example for SingularValueDecomposition. - */ -public class JavaSVDExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("SVD Example"); - SparkContext sc = new SparkContext(conf); - JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); - - // $example on$ - List data = Arrays.asList( - Vectors.sparse(5, new int[] {1, 3}, new double[] {1.0, 7.0}), - Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), - Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) - ); - - JavaRDD rows = jsc.parallelize(data); - - // Create a RowMatrix from JavaRDD. - RowMatrix mat = new RowMatrix(rows.rdd()); - - // Compute the top 5 singular values and corresponding singular vectors. - SingularValueDecomposition svd = mat.computeSVD(5, true, 1.0E-9d); - RowMatrix U = svd.U(); // The U factor is a RowMatrix. - Vector s = svd.s(); // The singular values are stored in a local dense vector. - Matrix V = svd.V(); // The V factor is a local dense matrix. - // $example off$ - Vector[] collectPartitions = (Vector[]) U.rows().collect(); - System.out.println("U factor is:"); - for (Vector vector : collectPartitions) { - System.out.println("\t" + vector); - } - System.out.println("Singular values are: " + s); - System.out.println("V factor is:\n" + V); - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java deleted file mode 100644 index 866a221..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -// $example on$ -import scala.Tuple2; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.classification.SVMModel; -import org.apache.spark.mllib.classification.SVMWithSGD; -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -// $example off$ - -/** - * Example for SVMWithSGD. - */ -public class JavaSVMWithSGDExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaSVMWithSGDExample"); - SparkContext sc = new SparkContext(conf); - // $example on$ - String path = "data/mllib/sample_libsvm_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. - JavaRDD training = data.sample(false, 0.6, 11L); - training.cache(); - JavaRDD test = data.subtract(training); - - // Run training algorithm to build the model. - int numIterations = 100; - SVMModel model = SVMWithSGD.train(training.rdd(), numIterations); - - // Clear the default threshold. - model.clearThreshold(); - - // Compute raw scores on the test set. - JavaRDD> scoreAndLabels = test.map(p -> - new Tuple2<>(model.predict(p.features()), p.label())); - - // Get evaluation metrics. - BinaryClassificationMetrics metrics = - new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels)); - double auROC = metrics.areaUnderROC(); - - System.out.println("Area under ROC = " + auROC); - - // Save and load model - model.save(sc, "target/tmp/javaSVMWithSGDModel"); - SVMModel sameModel = SVMModel.load(sc, "target/tmp/javaSVMWithSGDModel"); - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSimpleFPGrowth.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSimpleFPGrowth.java deleted file mode 100644 index f9198e7..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSimpleFPGrowth.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.fpm.AssociationRules; -import org.apache.spark.mllib.fpm.FPGrowth; -import org.apache.spark.mllib.fpm.FPGrowthModel; -// $example off$ - -import org.apache.spark.SparkConf; - -public class JavaSimpleFPGrowth { - - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("FP-growth Example"); - JavaSparkContext sc = new JavaSparkContext(conf); - - // $example on$ - JavaRDD data = sc.textFile("data/mllib/sample_fpgrowth.txt"); - - JavaRDD> transactions = data.map(line -> Arrays.asList(line.split(" "))); - - FPGrowth fpg = new FPGrowth() - .setMinSupport(0.2) - .setNumPartitions(10); - FPGrowthModel model = fpg.run(transactions); - - for (FPGrowth.FreqItemset itemset: model.freqItemsets().toJavaRDD().collect()) { - System.out.println("[" + itemset.javaItems() + "], " + itemset.freq()); - } - - double minConfidence = 0.8; - for (AssociationRules.Rule rule - : model.generateAssociationRules(minConfidence).toJavaRDD().collect()) { - System.out.println( - rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence()); - } - // $example off$ - - sc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java deleted file mode 100644 index 286b95c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import com.google.common.collect.ImmutableMap; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; - -// $example on$ -import java.util.*; - -import scala.Tuple2; - -import org.apache.spark.api.java.JavaPairRDD; -// $example off$ - -public class JavaStratifiedSamplingExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - @SuppressWarnings("unchecked") - // $example on$ - List> list = Arrays.asList( - new Tuple2<>(1, 'a'), - new Tuple2<>(1, 'b'), - new Tuple2<>(2, 'c'), - new Tuple2<>(2, 'd'), - new Tuple2<>(2, 'e'), - new Tuple2<>(3, 'f') - ); - - JavaPairRDD data = jsc.parallelizePairs(list); - - // specify the exact fraction desired from each key Map - ImmutableMap fractions = ImmutableMap.of(1, 0.1, 2, 0.6, 3, 0.3); - - // Get an approximate sample from each stratum - JavaPairRDD approxSample = data.sampleByKey(false, fractions); - // Get an exact sample from each stratum - JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); - // $example off$ - - System.out.println("approxSample size is " + approxSample.collect().size()); - for (Tuple2 t : approxSample.collect()) { - System.out.println(t._1() + " " + t._2()); - } - - System.out.println("exactSample size is " + exactSample.collect().size()); - for (Tuple2 t : exactSample.collect()) { - System.out.println(t._1() + " " + t._2()); - } - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaStreamingTestExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaStreamingTestExample.java deleted file mode 100644 index 4be702c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaStreamingTestExample.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -// $example on$ -import org.apache.spark.mllib.stat.test.BinarySample; -import org.apache.spark.mllib.stat.test.StreamingTest; -import org.apache.spark.mllib.stat.test.StreamingTestResult; -// $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.streaming.Duration; -import org.apache.spark.streaming.Seconds; -import org.apache.spark.streaming.api.java.JavaDStream; -import org.apache.spark.streaming.api.java.JavaStreamingContext; -import org.apache.spark.util.Utils; - - -/** - * Perform streaming testing using Welch's 2-sample t-test on a stream of data, where the data - * stream arrives as text files in a directory. 
Stops when the two groups are statistically - * significant (p-value < 0.05) or after a user-specified timeout in number of batches is exceeded. - * - * The rows of the text files must be in the form `Boolean, Double`. For example: - * false, -3.92 - * true, 99.32 - * - * Usage: - * JavaStreamingTestExample - * - * To run on your local machine using the directory `dataDir` with 5 seconds between each batch and - * a timeout after 100 insignificant batches, call: - * $ bin/run-example mllib.JavaStreamingTestExample dataDir 5 100 - * - * As you add text files to `dataDir` the significance test wil continually update every - * `batchDuration` seconds until the test becomes significant (p-value < 0.05) or the number of - * batches processed exceeds `numBatchesTimeout`. - */ -public class JavaStreamingTestExample { - - private static int timeoutCounter = 0; - - public static void main(String[] args) throws Exception { - if (args.length != 3) { - System.err.println("Usage: JavaStreamingTestExample " + - " "); - System.exit(1); - } - - String dataDir = args[0]; - Duration batchDuration = Seconds.apply(Long.parseLong(args[1])); - int numBatchesTimeout = Integer.parseInt(args[2]); - - SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample"); - JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration); - - ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString()); - - // $example on$ - JavaDStream data = ssc.textFileStream(dataDir).map(line -> { - String[] ts = line.split(","); - boolean label = Boolean.parseBoolean(ts[0]); - double value = Double.parseDouble(ts[1]); - return new BinarySample(label, value); - }); - - StreamingTest streamingTest = new StreamingTest() - .setPeacePeriod(0) - .setWindowSize(0) - .setTestMethod("welch"); - - JavaDStream out = streamingTest.registerStream(data); - out.print(); - // $example off$ - - // Stop processing if test becomes significant or we time out - timeoutCounter = numBatchesTimeout; - - out.foreachRDD(rdd -> { - timeoutCounter -= 1; - boolean anySignificant = !rdd.filter(v -> v.pValue() < 0.05).isEmpty(); - if (timeoutCounter <= 0 || anySignificant) { - rdd.context().stop(); - } - }); - - ssc.start(); - ssc.awaitTermination(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java deleted file mode 100644 index 278706b..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -// $example on$ -import java.util.Arrays; - -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; -import org.apache.spark.mllib.stat.Statistics; -// $example off$ - -public class JavaSummaryStatisticsExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - JavaRDD mat = jsc.parallelize( - Arrays.asList( - Vectors.dense(1.0, 10.0, 100.0), - Vectors.dense(2.0, 20.0, 200.0), - Vectors.dense(3.0, 30.0, 300.0) - ) - ); // an RDD of Vectors - - // Compute column summary statistics. - MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); - System.out.println(summary.mean()); // a dense vector containing the mean value for each column - System.out.println(summary.variance()); // column-wise variance - System.out.println(summary.numNonzeros()); // number of nonzeros in each column - // $example off$ - - jsc.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java deleted file mode 100644 index 97e9ca3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.examples.sql; - -// $example on:schema_merging$ -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -// $example off:schema_merging$ -import java.util.Properties; - -// $example on:basic_parquet_example$ -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoders; -// $example on:schema_merging$ -// $example on:json_dataset$ -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off:json_dataset$ -// $example off:schema_merging$ -// $example off:basic_parquet_example$ -import org.apache.spark.sql.SparkSession; - -public class JavaSQLDataSourceExample { - - // $example on:schema_merging$ - public static class Square implements Serializable { - private int value; - private int square; - - // Getters and setters... 
- // $example off:schema_merging$ - public int getValue() { - return value; - } - - public void setValue(int value) { - this.value = value; - } - - public int getSquare() { - return square; - } - - public void setSquare(int square) { - this.square = square; - } - // $example on:schema_merging$ - } - // $example off:schema_merging$ - - // $example on:schema_merging$ - public static class Cube implements Serializable { - private int value; - private int cube; - - // Getters and setters... - // $example off:schema_merging$ - public int getValue() { - return value; - } - - public void setValue(int value) { - this.value = value; - } - - public int getCube() { - return cube; - } - - public void setCube(int cube) { - this.cube = cube; - } - // $example on:schema_merging$ - } - // $example off:schema_merging$ - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("Java Spark SQL data sources example") - .config("spark.some.config.option", "some-value") - .getOrCreate(); - - runBasicDataSourceExample(spark); - runBasicParquetExample(spark); - runParquetSchemaMergingExample(spark); - runJsonDatasetExample(spark); - runJdbcDatasetExample(spark); - - spark.stop(); - } - - private static void runBasicDataSourceExample(SparkSession spark) { - // $example on:generic_load_save_functions$ - Dataset usersDF = spark.read().load("examples/src/main/resources/users.parquet"); - usersDF.select("name", "favorite_color").write().save("namesAndFavColors.parquet"); - // $example off:generic_load_save_functions$ - // $example on:manual_load_options$ - Dataset peopleDF = - spark.read().format("json").load("examples/src/main/resources/people.json"); - peopleDF.select("name", "age").write().format("parquet").save("namesAndAges.parquet"); - // $example off:manual_load_options$ - // $example on:manual_load_options_csv$ - Dataset peopleDFCsv = spark.read().format("csv") - .option("sep", ";") - .option("inferSchema", "true") - .option("header", "true") - .load("examples/src/main/resources/people.csv"); - // $example off:manual_load_options_csv$ - // $example on:manual_save_options_orc$ - usersDF.write().format("orc") - .option("orc.bloom.filter.columns", "favorite_color") - .option("orc.dictionary.key.threshold", "1.0") - .save("users_with_options.orc"); - // $example off:manual_save_options_orc$ - // $example on:direct_sql$ - Dataset sqlDF = - spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); - // $example off:direct_sql$ - // $example on:write_sorting_and_bucketing$ - peopleDF.write().bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed"); - // $example off:write_sorting_and_bucketing$ - // $example on:write_partitioning$ - usersDF - .write() - .partitionBy("favorite_color") - .format("parquet") - .save("namesPartByColor.parquet"); - // $example off:write_partitioning$ - // $example on:write_partition_and_bucket$ - peopleDF - .write() - .partitionBy("favorite_color") - .bucketBy(42, "name") - .saveAsTable("people_partitioned_bucketed"); - // $example off:write_partition_and_bucket$ - - spark.sql("DROP TABLE IF EXISTS people_bucketed"); - spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed"); - } - - private static void runBasicParquetExample(SparkSession spark) { - // $example on:basic_parquet_example$ - Dataset peopleDF = spark.read().json("examples/src/main/resources/people.json"); - - // DataFrames can be saved as Parquet files, maintaining the schema information - peopleDF.write().parquet("people.parquet"); - - // Read 
in the Parquet file created above. - // Parquet files are self-describing so the schema is preserved - // The result of loading a parquet file is also a DataFrame - Dataset parquetFileDF = spark.read().parquet("people.parquet"); - - // Parquet files can also be used to create a temporary view and then used in SQL statements - parquetFileDF.createOrReplaceTempView("parquetFile"); - Dataset namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19"); - Dataset namesDS = namesDF.map( - (MapFunction) row -> "Name: " + row.getString(0), - Encoders.STRING()); - namesDS.show(); - // +------------+ - // | value| - // +------------+ - // |Name: Justin| - // +------------+ - // $example off:basic_parquet_example$ - } - - private static void runParquetSchemaMergingExample(SparkSession spark) { - // $example on:schema_merging$ - List squares = new ArrayList<>(); - for (int value = 1; value <= 5; value++) { - Square square = new Square(); - square.setValue(value); - square.setSquare(value * value); - squares.add(square); - } - - // Create a simple DataFrame, store into a partition directory - Dataset squaresDF = spark.createDataFrame(squares, Square.class); - squaresDF.write().parquet("data/test_table/key=1"); - - List cubes = new ArrayList<>(); - for (int value = 6; value <= 10; value++) { - Cube cube = new Cube(); - cube.setValue(value); - cube.setCube(value * value * value); - cubes.add(cube); - } - - // Create another DataFrame in a new partition directory, - // adding a new column and dropping an existing column - Dataset cubesDF = spark.createDataFrame(cubes, Cube.class); - cubesDF.write().parquet("data/test_table/key=2"); - - // Read the partitioned table - Dataset mergedDF = spark.read().option("mergeSchema", true).parquet("data/test_table"); - mergedDF.printSchema(); - - // The final schema consists of all 3 columns in the Parquet files together - // with the partitioning column appeared in the partition directory paths - // root - // |-- value: int (nullable = true) - // |-- square: int (nullable = true) - // |-- cube: int (nullable = true) - // |-- key: int (nullable = true) - // $example off:schema_merging$ - } - - private static void runJsonDatasetExample(SparkSession spark) { - // $example on:json_dataset$ - // A JSON dataset is pointed to by path. - // The path can be either a single text file or a directory storing text files - Dataset people = spark.read().json("examples/src/main/resources/people.json"); - - // The inferred schema can be visualized using the printSchema() method - people.printSchema(); - // root - // |-- age: long (nullable = true) - // |-- name: string (nullable = true) - - // Creates a temporary view using the DataFrame - people.createOrReplaceTempView("people"); - - // SQL statements can be run by using the sql methods provided by spark - Dataset namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19"); - namesDF.show(); - // +------+ - // | name| - // +------+ - // |Justin| - // +------+ - - // Alternatively, a DataFrame can be created for a JSON dataset represented by - // a Dataset storing one JSON object per string. 
- List jsonData = Arrays.asList( - "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); - Dataset anotherPeopleDataset = spark.createDataset(jsonData, Encoders.STRING()); - Dataset anotherPeople = spark.read().json(anotherPeopleDataset); - anotherPeople.show(); - // +---------------+----+ - // | address|name| - // +---------------+----+ - // |[Columbus,Ohio]| Yin| - // +---------------+----+ - // $example off:json_dataset$ - } - - private static void runJdbcDatasetExample(SparkSession spark) { - // $example on:jdbc_dataset$ - // Note: JDBC loading and saving can be achieved via either the load/save or jdbc methods - // Loading data from a JDBC source - Dataset jdbcDF = spark.read() - .format("jdbc") - .option("url", "jdbc:postgresql:dbserver") - .option("dbtable", "schema.tablename") - .option("user", "username") - .option("password", "password") - .load(); - - Properties connectionProperties = new Properties(); - connectionProperties.put("user", "username"); - connectionProperties.put("password", "password"); - Dataset jdbcDF2 = spark.read() - .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties); - - // Saving data to a JDBC source - jdbcDF.write() - .format("jdbc") - .option("url", "jdbc:postgresql:dbserver") - .option("dbtable", "schema.tablename") - .option("user", "username") - .option("password", "password") - .save(); - - jdbcDF2.write() - .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties); - - // Specifying create table column data types on write - jdbcDF.write() - .option("createTableColumnTypes", "name CHAR(64), comments VARCHAR(1024)") - .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties); - // $example off:jdbc_dataset$ - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java deleted file mode 100644 index 8605852..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.examples.sql; - -// $example on:programmatic_schema$ -import java.util.ArrayList; -import java.util.List; -// $example off:programmatic_schema$ -// $example on:create_ds$ -import java.util.Arrays; -import java.util.Collections; -import java.io.Serializable; -// $example off:create_ds$ - -// $example on:schema_inferring$ -// $example on:programmatic_schema$ -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; -// $example off:programmatic_schema$ -// $example on:create_ds$ -import org.apache.spark.api.java.function.MapFunction; -// $example on:create_df$ -// $example on:run_sql$ -// $example on:programmatic_schema$ -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -// $example off:programmatic_schema$ -// $example off:create_df$ -// $example off:run_sql$ -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -// $example off:create_ds$ -// $example off:schema_inferring$ -import org.apache.spark.sql.RowFactory; -// $example on:init_session$ -import org.apache.spark.sql.SparkSession; -// $example off:init_session$ -// $example on:programmatic_schema$ -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off:programmatic_schema$ -import org.apache.spark.sql.AnalysisException; - -// $example on:untyped_ops$ -// col("...") is preferable to df.col("...") -import static org.apache.spark.sql.functions.col; -// $example off:untyped_ops$ - -public class JavaSparkSQLExample { - // $example on:create_ds$ - public static class Person implements Serializable { - private String name; - private int age; - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public int getAge() { - return age; - } - - public void setAge(int age) { - this.age = age; - } - } - // $example off:create_ds$ - - public static void main(String[] args) throws AnalysisException { - // $example on:init_session$ - SparkSession spark = SparkSession - .builder() - .appName("Java Spark SQL basic example") - .config("spark.some.config.option", "some-value") - .getOrCreate(); - // $example off:init_session$ - - runBasicDataFrameExample(spark); - runDatasetCreationExample(spark); - runInferSchemaExample(spark); - runProgrammaticSchemaExample(spark); - - spark.stop(); - } - - private static void runBasicDataFrameExample(SparkSession spark) throws AnalysisException { - // $example on:create_df$ - Dataset df = spark.read().json("examples/src/main/resources/people.json"); - - // Displays the content of the DataFrame to stdout - df.show(); - // +----+-------+ - // | age| name| - // +----+-------+ - // |null|Michael| - // | 30| Andy| - // | 19| Justin| - // +----+-------+ - // $example off:create_df$ - - // $example on:untyped_ops$ - // Print the schema in a tree format - df.printSchema(); - // root - // |-- age: long (nullable = true) - // |-- name: string (nullable = true) - - // Select only the "name" column - df.select("name").show(); - // +-------+ - // | name| - // +-------+ - // |Michael| - // | Andy| - // | Justin| - // +-------+ - - // Select everybody, but increment the age by 1 - df.select(col("name"), col("age").plus(1)).show(); - // +-------+---------+ - // | name|(age + 1)| - // +-------+---------+ - // |Michael| null| - // | Andy| 31| - // | Justin| 20| - // +-------+---------+ - - // Select people older than 21 - df.filter(col("age").gt(21)).show(); - // 
+---+----+ - // |age|name| - // +---+----+ - // | 30|Andy| - // +---+----+ - - // Count people by age - df.groupBy("age").count().show(); - // +----+-----+ - // | age|count| - // +----+-----+ - // | 19| 1| - // |null| 1| - // | 30| 1| - // +----+-----+ - // $example off:untyped_ops$ - - // $example on:run_sql$ - // Register the DataFrame as a SQL temporary view - df.createOrReplaceTempView("people"); - - Dataset sqlDF = spark.sql("SELECT * FROM people"); - sqlDF.show(); - // +----+-------+ - // | age| name| - // +----+-------+ - // |null|Michael| - // | 30| Andy| - // | 19| Justin| - // +----+-------+ - // $example off:run_sql$ - - // $example on:global_temp_view$ - // Register the DataFrame as a global temporary view - df.createGlobalTempView("people"); - - // Global temporary view is tied to a system preserved database `global_temp` - spark.sql("SELECT * FROM global_temp.people").show(); - // +----+-------+ - // | age| name| - // +----+-------+ - // |null|Michael| - // | 30| Andy| - // | 19| Justin| - // +----+-------+ - - // Global temporary view is cross-session - spark.newSession().sql("SELECT * FROM global_temp.people").show(); - // +----+-------+ - // | age| name| - // +----+-------+ - // |null|Michael| - // | 30| Andy| - // | 19| Justin| - // +----+-------+ - // $example off:global_temp_view$ - } - - private static void runDatasetCreationExample(SparkSession spark) { - // $example on:create_ds$ - // Create an instance of a Bean class - Person person = new Person(); - person.setName("Andy"); - person.setAge(32); - - // Encoders are created for Java beans - Encoder personEncoder = Encoders.bean(Person.class); - Dataset javaBeanDS = spark.createDataset( - Collections.singletonList(person), - personEncoder - ); - javaBeanDS.show(); - // +---+----+ - // |age|name| - // +---+----+ - // | 32|Andy| - // +---+----+ - - // Encoders for most common types are provided in class Encoders - Encoder integerEncoder = Encoders.INT(); - Dataset primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder); - Dataset transformedDS = primitiveDS.map( - (MapFunction) value -> value + 1, - integerEncoder); - transformedDS.collect(); // Returns [2, 3, 4] - - // DataFrames can be converted to a Dataset by providing a class. 
Mapping based on name - String path = "examples/src/main/resources/people.json"; - Dataset peopleDS = spark.read().json(path).as(personEncoder); - peopleDS.show(); - // +----+-------+ - // | age| name| - // +----+-------+ - // |null|Michael| - // | 30| Andy| - // | 19| Justin| - // +----+-------+ - // $example off:create_ds$ - } - - private static void runInferSchemaExample(SparkSession spark) { - // $example on:schema_inferring$ - // Create an RDD of Person objects from a text file - JavaRDD peopleRDD = spark.read() - .textFile("examples/src/main/resources/people.txt") - .javaRDD() - .map(line -> { - String[] parts = line.split(","); - Person person = new Person(); - person.setName(parts[0]); - person.setAge(Integer.parseInt(parts[1].trim())); - return person; - }); - - // Apply a schema to an RDD of JavaBeans to get a DataFrame - Dataset peopleDF = spark.createDataFrame(peopleRDD, Person.class); - // Register the DataFrame as a temporary view - peopleDF.createOrReplaceTempView("people"); - - // SQL statements can be run by using the sql methods provided by spark - Dataset teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19"); - - // The columns of a row in the result can be accessed by field index - Encoder stringEncoder = Encoders.STRING(); - Dataset teenagerNamesByIndexDF = teenagersDF.map( - (MapFunction) row -> "Name: " + row.getString(0), - stringEncoder); - teenagerNamesByIndexDF.show(); - // +------------+ - // | value| - // +------------+ - // |Name: Justin| - // +------------+ - - // or by field name - Dataset teenagerNamesByFieldDF = teenagersDF.map( - (MapFunction) row -> "Name: " + row.getAs("name"), - stringEncoder); - teenagerNamesByFieldDF.show(); - // +------------+ - // | value| - // +------------+ - // |Name: Justin| - // +------------+ - // $example off:schema_inferring$ - } - - private static void runProgrammaticSchemaExample(SparkSession spark) { - // $example on:programmatic_schema$ - // Create an RDD - JavaRDD peopleRDD = spark.sparkContext() - .textFile("examples/src/main/resources/people.txt", 1) - .toJavaRDD(); - - // The schema is encoded in a string - String schemaString = "name age"; - - // Generate the schema based on the string of schema - List fields = new ArrayList<>(); - for (String fieldName : schemaString.split(" ")) { - StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true); - fields.add(field); - } - StructType schema = DataTypes.createStructType(fields); - - // Convert records of the RDD (people) to Rows - JavaRDD rowRDD = peopleRDD.map((Function) record -> { - String[] attributes = record.split(","); - return RowFactory.create(attributes[0], attributes[1].trim()); - }); - - // Apply the schema to the RDD - Dataset peopleDataFrame = spark.createDataFrame(rowRDD, schema); - - // Creates a temporary view using the DataFrame - peopleDataFrame.createOrReplaceTempView("people"); - - // SQL can be run over a temporary view created using DataFrames - Dataset results = spark.sql("SELECT name FROM people"); - - // The results of SQL queries are DataFrames and support all the normal RDD operations - // The columns of a row in the result can be accessed by field index or by field name - Dataset namesDS = results.map( - (MapFunction) row -> "Name: " + row.getString(0), - Encoders.STRING()); - namesDS.show(); - // +-------------+ - // | value| - // +-------------+ - // |Name: Michael| - // | Name: Andy| - // | Name: Justin| - // +-------------+ - // $example off:programmatic_schema$ - } -} diff --git 
a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaUserDefinedTypedAggregation.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaUserDefinedTypedAggregation.java deleted file mode 100644 index 78e9011..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaUserDefinedTypedAggregation.java +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.examples.sql; - -// $example on:typed_custom_aggregation$ -import java.io.Serializable; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.TypedColumn; -import org.apache.spark.sql.expressions.Aggregator; -// $example off:typed_custom_aggregation$ - -public class JavaUserDefinedTypedAggregation { - - // $example on:typed_custom_aggregation$ - public static class Employee implements Serializable { - private String name; - private long salary; - - // Constructors, getters, setters... - // $example off:typed_custom_aggregation$ - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public long getSalary() { - return salary; - } - - public void setSalary(long salary) { - this.salary = salary; - } - // $example on:typed_custom_aggregation$ - } - - public static class Average implements Serializable { - private long sum; - private long count; - - // Constructors, getters, setters... - // $example off:typed_custom_aggregation$ - public Average() { - } - - public Average(long sum, long count) { - this.sum = sum; - this.count = count; - } - - public long getSum() { - return sum; - } - - public void setSum(long sum) { - this.sum = sum; - } - - public long getCount() { - return count; - } - - public void setCount(long count) { - this.count = count; - } - // $example on:typed_custom_aggregation$ - } - - public static class MyAverage extends Aggregator { - // A zero value for this aggregation. Should satisfy the property that any b + zero = b - public Average zero() { - return new Average(0L, 0L); - } - // Combine two values to produce a new value. 
For performance, the function may modify `buffer` - // and return it instead of constructing a new object - public Average reduce(Average buffer, Employee employee) { - long newSum = buffer.getSum() + employee.getSalary(); - long newCount = buffer.getCount() + 1; - buffer.setSum(newSum); - buffer.setCount(newCount); - return buffer; - } - // Merge two intermediate values - public Average merge(Average b1, Average b2) { - long mergedSum = b1.getSum() + b2.getSum(); - long mergedCount = b1.getCount() + b2.getCount(); - b1.setSum(mergedSum); - b1.setCount(mergedCount); - return b1; - } - // Transform the output of the reduction - public Double finish(Average reduction) { - return ((double) reduction.getSum()) / reduction.getCount(); - } - // Specifies the Encoder for the intermediate value type - public Encoder bufferEncoder() { - return Encoders.bean(Average.class); - } - // Specifies the Encoder for the final output value type - public Encoder outputEncoder() { - return Encoders.DOUBLE(); - } - } - // $example off:typed_custom_aggregation$ - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("Java Spark SQL user-defined Datasets aggregation example") - .getOrCreate(); - - // $example on:typed_custom_aggregation$ - Encoder employeeEncoder = Encoders.bean(Employee.class); - String path = "examples/src/main/resources/employees.json"; - Dataset ds = spark.read().json(path).as(employeeEncoder); - ds.show(); - // +-------+------+ - // | name|salary| - // +-------+------+ - // |Michael| 3000| - // | Andy| 4500| - // | Justin| 3500| - // | Berta| 4000| - // +-------+------+ - - MyAverage myAverage = new MyAverage(); - // Convert the function to a `TypedColumn` and give it a name - TypedColumn averageSalary = myAverage.toColumn().name("average_salary"); - Dataset result = ds.select(averageSalary); - result.show(); - // +--------------+ - // |average_salary| - // +--------------+ - // | 3750.0| - // +--------------+ - // $example off:typed_custom_aggregation$ - spark.stop(); - } - -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaUserDefinedUntypedAggregation.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaUserDefinedUntypedAggregation.java deleted file mode 100644 index 6da60a1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/JavaUserDefinedUntypedAggregation.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.examples.sql; - -// $example on:untyped_custom_aggregation$ -import java.util.ArrayList; -import java.util.List; - -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.expressions.MutableAggregationBuffer; -import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -// $example off:untyped_custom_aggregation$ - -public class JavaUserDefinedUntypedAggregation { - - // $example on:untyped_custom_aggregation$ - public static class MyAverage extends UserDefinedAggregateFunction { - - private StructType inputSchema; - private StructType bufferSchema; - - public MyAverage() { - List inputFields = new ArrayList<>(); - inputFields.add(DataTypes.createStructField("inputColumn", DataTypes.LongType, true)); - inputSchema = DataTypes.createStructType(inputFields); - - List bufferFields = new ArrayList<>(); - bufferFields.add(DataTypes.createStructField("sum", DataTypes.LongType, true)); - bufferFields.add(DataTypes.createStructField("count", DataTypes.LongType, true)); - bufferSchema = DataTypes.createStructType(bufferFields); - } - // Data types of input arguments of this aggregate function - public StructType inputSchema() { - return inputSchema; - } - // Data types of values in the aggregation buffer - public StructType bufferSchema() { - return bufferSchema; - } - // The data type of the returned value - public DataType dataType() { - return DataTypes.DoubleType; - } - // Whether this function always returns the same output on the identical input - public boolean deterministic() { - return true; - } - // Initializes the given aggregation buffer. The buffer itself is a `Row` that in addition to - // standard methods like retrieving a value at an index (e.g., get(), getBoolean()), provides - // the opportunity to update its values. Note that arrays and maps inside the buffer are still - // immutable. 
- public void initialize(MutableAggregationBuffer buffer) { - buffer.update(0, 0L); - buffer.update(1, 0L); - } - // Updates the given aggregation buffer `buffer` with new input data from `input` - public void update(MutableAggregationBuffer buffer, Row input) { - if (!input.isNullAt(0)) { - long updatedSum = buffer.getLong(0) + input.getLong(0); - long updatedCount = buffer.getLong(1) + 1; - buffer.update(0, updatedSum); - buffer.update(1, updatedCount); - } - } - // Merges two aggregation buffers and stores the updated buffer values back to `buffer1` - public void merge(MutableAggregationBuffer buffer1, Row buffer2) { - long mergedSum = buffer1.getLong(0) + buffer2.getLong(0); - long mergedCount = buffer1.getLong(1) + buffer2.getLong(1); - buffer1.update(0, mergedSum); - buffer1.update(1, mergedCount); - } - // Calculates the final result - public Double evaluate(Row buffer) { - return ((double) buffer.getLong(0)) / buffer.getLong(1); - } - } - // $example off:untyped_custom_aggregation$ - - public static void main(String[] args) { - SparkSession spark = SparkSession - .builder() - .appName("Java Spark SQL user-defined DataFrames aggregation example") - .getOrCreate(); - - // $example on:untyped_custom_aggregation$ - // Register the function to access it - spark.udf().register("myAverage", new MyAverage()); - - Dataset df = spark.read().json("examples/src/main/resources/employees.json"); - df.createOrReplaceTempView("employees"); - df.show(); - // +-------+------+ - // | name|salary| - // +-------+------+ - // |Michael| 3000| - // | Andy| 4500| - // | Justin| 3500| - // | Berta| 4000| - // +-------+------+ - - Dataset result = spark.sql("SELECT myAverage(salary) as average_salary FROM employees"); - result.show(); - // +--------------+ - // |average_salary| - // +--------------+ - // | 3750.0| - // +--------------+ - // $example off:untyped_custom_aggregation$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java deleted file mode 100644 index 575a463..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.examples.sql.hive; - -// $example on:spark_hive$ -import java.io.File; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; - -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -// $example off:spark_hive$ - -public class JavaSparkHiveExample { - - // $example on:spark_hive$ - public static class Record implements Serializable { - private int key; - private String value; - - public int getKey() { - return key; - } - - public void setKey(int key) { - this.key = key; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } - } - // $example off:spark_hive$ - - public static void main(String[] args) { - // $example on:spark_hive$ - // warehouseLocation points to the default location for managed databases and tables - String warehouseLocation = new File("spark-warehouse").getAbsolutePath(); - SparkSession spark = SparkSession - .builder() - .appName("Java Spark Hive Example") - .config("spark.sql.warehouse.dir", warehouseLocation) - .enableHiveSupport() - .getOrCreate(); - - spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"); - spark.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"); - - // Queries are expressed in HiveQL - spark.sql("SELECT * FROM src").show(); - // +---+-------+ - // |key| value| - // +---+-------+ - // |238|val_238| - // | 86| val_86| - // |311|val_311| - // ... - - // Aggregation queries are also supported. - spark.sql("SELECT COUNT(*) FROM src").show(); - // +--------+ - // |count(1)| - // +--------+ - // | 500 | - // +--------+ - - // The results of SQL queries are themselves DataFrames and support all normal functions. - Dataset sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key"); - - // The items in DataFrames are of type Row, which lets you to access each column by ordinal. - Dataset stringsDS = sqlDF.map( - (MapFunction) row -> "Key: " + row.get(0) + ", Value: " + row.get(1), - Encoders.STRING()); - stringsDS.show(); - // +--------------------+ - // | value| - // +--------------------+ - // |Key: 0, Value: val_0| - // |Key: 0, Value: val_0| - // |Key: 0, Value: val_0| - // ... - - // You can also use DataFrames to create temporary views within a SparkSession. - List records = new ArrayList<>(); - for (int key = 1; key < 100; key++) { - Record record = new Record(); - record.setKey(key); - record.setValue("val_" + key); - records.add(record); - } - Dataset recordsDF = spark.createDataFrame(records, Record.class); - recordsDF.createOrReplaceTempView("records"); - - // Queries can then join DataFrames data with data stored in Hive. - spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show(); - // +---+------+---+------+ - // |key| value|key| value| - // +---+------+---+------+ - // | 2| val_2| 2| val_2| - // | 2| val_2| 2| val_2| - // | 4| val_4| 4| val_4| - // ... 
- // $example off:spark_hive$ - - spark.stop(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredKafkaWordCount.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredKafkaWordCount.java deleted file mode 100644 index 4e02719..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredKafkaWordCount.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.sql.streaming; - -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.streaming.StreamingQuery; - -import java.util.Arrays; - -/** - * Consumes messages from one or more topics in Kafka and does wordcount. - * Usage: JavaStructuredKafkaWordCount - * The Kafka "bootstrap.servers" configuration. A - * comma-separated list of host:port. - * There are three kinds of type, i.e. 'assign', 'subscribe', - * 'subscribePattern'. - * |- Specific TopicPartitions to consume. Json string - * | {"topicA":[0,1],"topicB":[2,4]}. - * |- The topic list to subscribe. A comma-separated list of - * | topics. - * |- The pattern used to subscribe to topic(s). - * | Java regex string. - * |- Only one of "assign, "subscribe" or "subscribePattern" options can be - * | specified for Kafka source. - * Different value format depends on the value of 'subscribe-type'. 
- * - * Example: - * `$ bin/run-example \ - * sql.streaming.JavaStructuredKafkaWordCount host1:port1,host2:port2 \ - * subscribe topic1,topic2` - */ -public final class JavaStructuredKafkaWordCount { - - public static void main(String[] args) throws Exception { - if (args.length < 3) { - System.err.println("Usage: JavaStructuredKafkaWordCount " + - " "); - System.exit(1); - } - - String bootstrapServers = args[0]; - String subscribeType = args[1]; - String topics = args[2]; - - SparkSession spark = SparkSession - .builder() - .appName("JavaStructuredKafkaWordCount") - .getOrCreate(); - - // Create DataSet representing the stream of input lines from kafka - Dataset lines = spark - .readStream() - .format("kafka") - .option("kafka.bootstrap.servers", bootstrapServers) - .option(subscribeType, topics) - .load() - .selectExpr("CAST(value AS STRING)") - .as(Encoders.STRING()); - - // Generate running word count - Dataset wordCounts = lines.flatMap( - (FlatMapFunction) x -> Arrays.asList(x.split(" ")).iterator(), - Encoders.STRING()).groupBy("value").count(); - - // Start running the query that prints the running counts to the console - StreamingQuery query = wordCounts.writeStream() - .outputMode("complete") - .format("console") - .start(); - - query.awaitTermination(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredNetworkWordCount.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredNetworkWordCount.java deleted file mode 100644 index 3af7869..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredNetworkWordCount.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.examples.sql.streaming; - -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.sql.*; -import org.apache.spark.sql.streaming.StreamingQuery; - -import java.util.Arrays; - -/** - * Counts words in UTF8 encoded, '\n' delimited text received from the network. - * - * Usage: JavaStructuredNetworkWordCount - * and describe the TCP server that Structured Streaming - * would connect to receive data. 
- * - * To run this on your local machine, you need to first run a Netcat server - * `$ nc -lk 9999` - * and then run the example - * `$ bin/run-example sql.streaming.JavaStructuredNetworkWordCount - * localhost 9999` - */ -public final class JavaStructuredNetworkWordCount { - - public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println("Usage: JavaStructuredNetworkWordCount "); - System.exit(1); - } - - String host = args[0]; - int port = Integer.parseInt(args[1]); - - SparkSession spark = SparkSession - .builder() - .appName("JavaStructuredNetworkWordCount") - .getOrCreate(); - - // Create DataFrame representing the stream of input lines from connection to host:port - Dataset lines = spark - .readStream() - .format("socket") - .option("host", host) - .option("port", port) - .load(); - - // Split the lines into words - Dataset words = lines.as(Encoders.STRING()).flatMap( - (FlatMapFunction) x -> Arrays.asList(x.split(" ")).iterator(), - Encoders.STRING()); - - // Generate running word count - Dataset wordCounts = words.groupBy("value").count(); - - // Start running the query that prints the running counts to the console - StreamingQuery query = wordCounts.writeStream() - .outputMode("complete") - .format("console") - .start(); - - query.awaitTermination(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredNetworkWordCountWindowed.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredNetworkWordCountWindowed.java deleted file mode 100644 index 93ec5e2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredNetworkWordCountWindowed.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.examples.sql.streaming; - -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.sql.*; -import org.apache.spark.sql.streaming.StreamingQuery; -import scala.Tuple2; - -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.List; - -/** - * Counts words in UTF8 encoded, '\n' delimited text received from the network over a - * sliding window of configurable duration. Each line from the network is tagged - * with a timestamp that is used to determine the windows into which it falls. - * - * Usage: JavaStructuredNetworkWordCountWindowed - * [] - * and describe the TCP server that Structured Streaming - * would connect to receive data. 
- * gives the size of window, specified as integer number of seconds - * gives the amount of time successive windows are offset from one another, - * given in the same units as above. should be less than or equal to - * . If the two are equal, successive windows have no overlap. If - * is not provided, it defaults to . - * - * To run this on your local machine, you need to first run a Netcat server - * `$ nc -lk 9999` - * and then run the example - * `$ bin/run-example sql.streaming.JavaStructuredNetworkWordCountWindowed - * localhost 9999 []` - * - * One recommended , pair is 10, 5 - */ -public final class JavaStructuredNetworkWordCountWindowed { - - public static void main(String[] args) throws Exception { - if (args.length < 3) { - System.err.println("Usage: JavaStructuredNetworkWordCountWindowed " + - " []"); - System.exit(1); - } - - String host = args[0]; - int port = Integer.parseInt(args[1]); - int windowSize = Integer.parseInt(args[2]); - int slideSize = (args.length == 3) ? windowSize : Integer.parseInt(args[3]); - if (slideSize > windowSize) { - System.err.println(" must be less than or equal to "); - } - String windowDuration = windowSize + " seconds"; - String slideDuration = slideSize + " seconds"; - - SparkSession spark = SparkSession - .builder() - .appName("JavaStructuredNetworkWordCountWindowed") - .getOrCreate(); - - // Create DataFrame representing the stream of input lines from connection to host:port - Dataset lines = spark - .readStream() - .format("socket") - .option("host", host) - .option("port", port) - .option("includeTimestamp", true) - .load(); - - // Split the lines into words, retaining timestamps - Dataset words = lines - .as(Encoders.tuple(Encoders.STRING(), Encoders.TIMESTAMP())) - .flatMap((FlatMapFunction, Tuple2>) t -> { - List> result = new ArrayList<>(); - for (String word : t._1.split(" ")) { - result.add(new Tuple2<>(word, t._2)); - } - return result.iterator(); - }, - Encoders.tuple(Encoders.STRING(), Encoders.TIMESTAMP()) - ).toDF("word", "timestamp"); - - // Group the data by window and word and compute the count of each group - Dataset windowedCounts = words.groupBy( - functions.window(words.col("timestamp"), windowDuration, slideDuration), - words.col("word") - ).count().orderBy("window"); - - // Start running the query that prints the windowed word counts to the console - StreamingQuery query = windowedCounts.writeStream() - .outputMode("complete") - .format("console") - .option("truncate", "false") - .start(); - - query.awaitTermination(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredSessionization.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredSessionization.java deleted file mode 100644 index 943e3d8..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredSessionization.java +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.examples.sql.streaming; - -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.MapGroupsWithStateFunction; -import org.apache.spark.sql.*; -import org.apache.spark.sql.streaming.GroupState; -import org.apache.spark.sql.streaming.GroupStateTimeout; -import org.apache.spark.sql.streaming.StreamingQuery; - -import java.io.Serializable; -import java.sql.Timestamp; -import java.util.*; - -/** - * Counts words in UTF8 encoded, '\n' delimited text received from the network. - *

- * Usage: JavaStructuredNetworkWordCount - * and describe the TCP server that Structured Streaming - * would connect to receive data. - *

- * To run this on your local machine, you need to first run a Netcat server - * `$ nc -lk 9999` - * and then run the example - * `$ bin/run-example sql.streaming.JavaStructuredSessionization - * localhost 9999` - */ -public final class JavaStructuredSessionization { - - public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println("Usage: JavaStructuredSessionization "); - System.exit(1); - } - - String host = args[0]; - int port = Integer.parseInt(args[1]); - - SparkSession spark = SparkSession - .builder() - .appName("JavaStructuredSessionization") - .getOrCreate(); - - // Create DataFrame representing the stream of input lines from connection to host:port - Dataset lines = spark - .readStream() - .format("socket") - .option("host", host) - .option("port", port) - .option("includeTimestamp", true) - .load(); - - FlatMapFunction linesToEvents = - new FlatMapFunction() { - @Override - public Iterator call(LineWithTimestamp lineWithTimestamp) { - ArrayList eventList = new ArrayList(); - for (String word : lineWithTimestamp.getLine().split(" ")) { - eventList.add(new Event(word, lineWithTimestamp.getTimestamp())); - } - return eventList.iterator(); - } - }; - - // Split the lines into words, treat words as sessionId of events - Dataset events = lines - .withColumnRenamed("value", "line") - .as(Encoders.bean(LineWithTimestamp.class)) - .flatMap(linesToEvents, Encoders.bean(Event.class)); - - // Sessionize the events. Track number of events, start and end timestamps of session, and - // and report session updates. - // - // Step 1: Define the state update function - MapGroupsWithStateFunction stateUpdateFunc = - new MapGroupsWithStateFunction() { - @Override public SessionUpdate call( - String sessionId, Iterator events, GroupState state) { - // If timed out, then remove session and send final update - if (state.hasTimedOut()) { - SessionUpdate finalUpdate = new SessionUpdate( - sessionId, state.get().calculateDuration(), state.get().getNumEvents(), true); - state.remove(); - return finalUpdate; - - } else { - // Find max and min timestamps in events - long maxTimestampMs = Long.MIN_VALUE; - long minTimestampMs = Long.MAX_VALUE; - int numNewEvents = 0; - while (events.hasNext()) { - Event e = events.next(); - long timestampMs = e.getTimestamp().getTime(); - maxTimestampMs = Math.max(timestampMs, maxTimestampMs); - minTimestampMs = Math.min(timestampMs, minTimestampMs); - numNewEvents += 1; - } - SessionInfo updatedSession = new SessionInfo(); - - // Update start and end timestamps in session - if (state.exists()) { - SessionInfo oldSession = state.get(); - updatedSession.setNumEvents(oldSession.numEvents + numNewEvents); - updatedSession.setStartTimestampMs(oldSession.startTimestampMs); - updatedSession.setEndTimestampMs(Math.max(oldSession.endTimestampMs, maxTimestampMs)); - } else { - updatedSession.setNumEvents(numNewEvents); - updatedSession.setStartTimestampMs(minTimestampMs); - updatedSession.setEndTimestampMs(maxTimestampMs); - } - state.update(updatedSession); - // Set timeout such that the session will be expired if no data received for 10 seconds - state.setTimeoutDuration("10 seconds"); - return new SessionUpdate( - sessionId, state.get().calculateDuration(), state.get().getNumEvents(), false); - } - } - }; - - // Step 2: Apply the state update function to the events streaming Dataset grouped by sessionId - Dataset sessionUpdates = events - .groupByKey( - new MapFunction() { - @Override public String call(Event event) { - return 
event.getSessionId(); - } - }, Encoders.STRING()) - .mapGroupsWithState( - stateUpdateFunc, - Encoders.bean(SessionInfo.class), - Encoders.bean(SessionUpdate.class), - GroupStateTimeout.ProcessingTimeTimeout()); - - // Start running the query that prints the session updates to the console - StreamingQuery query = sessionUpdates - .writeStream() - .outputMode("update") - .format("console") - .start(); - - query.awaitTermination(); - } - - /** - * User-defined data type representing the raw lines with timestamps. - */ - public static class LineWithTimestamp implements Serializable { - private String line; - private Timestamp timestamp; - - public Timestamp getTimestamp() { return timestamp; } - public void setTimestamp(Timestamp timestamp) { this.timestamp = timestamp; } - - public String getLine() { return line; } - public void setLine(String sessionId) { this.line = sessionId; } - } - - /** - * User-defined data type representing the input events - */ - public static class Event implements Serializable { - private String sessionId; - private Timestamp timestamp; - - public Event() { } - public Event(String sessionId, Timestamp timestamp) { - this.sessionId = sessionId; - this.timestamp = timestamp; - } - - public Timestamp getTimestamp() { return timestamp; } - public void setTimestamp(Timestamp timestamp) { this.timestamp = timestamp; } - - public String getSessionId() { return sessionId; } - public void setSessionId(String sessionId) { this.sessionId = sessionId; } - } - - /** - * User-defined data type for storing a session information as state in mapGroupsWithState. - */ - public static class SessionInfo implements Serializable { - private int numEvents = 0; - private long startTimestampMs = -1; - private long endTimestampMs = -1; - - public int getNumEvents() { return numEvents; } - public void setNumEvents(int numEvents) { this.numEvents = numEvents; } - - public long getStartTimestampMs() { return startTimestampMs; } - public void setStartTimestampMs(long startTimestampMs) { - this.startTimestampMs = startTimestampMs; - } - - public long getEndTimestampMs() { return endTimestampMs; } - public void setEndTimestampMs(long endTimestampMs) { this.endTimestampMs = endTimestampMs; } - - public long calculateDuration() { return endTimestampMs - startTimestampMs; } - - @Override public String toString() { - return "SessionInfo(numEvents = " + numEvents + - ", timestamps = " + startTimestampMs + " to " + endTimestampMs + ")"; - } - } - - /** - * User-defined data type representing the update information returned by mapGroupsWithState. 
- */ - public static class SessionUpdate implements Serializable { - private String id; - private long durationMs; - private int numEvents; - private boolean expired; - - public SessionUpdate() { } - - public SessionUpdate(String id, long durationMs, int numEvents, boolean expired) { - this.id = id; - this.durationMs = durationMs; - this.numEvents = numEvents; - this.expired = expired; - } - - public String getId() { return id; } - public void setId(String id) { this.id = id; } - - public long getDurationMs() { return durationMs; } - public void setDurationMs(long durationMs) { this.durationMs = durationMs; } - - public int getNumEvents() { return numEvents; } - public void setNumEvents(int numEvents) { this.numEvents = numEvents; } - - public boolean isExpired() { return expired; } - public void setExpired(boolean expired) { this.expired = expired; } - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java deleted file mode 100644 index 47692ec..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaCustomReceiver.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.streaming; - -import com.google.common.io.Closeables; - -import org.apache.spark.SparkConf; -import org.apache.spark.storage.StorageLevel; -import org.apache.spark.streaming.Duration; -import org.apache.spark.streaming.api.java.JavaDStream; -import org.apache.spark.streaming.api.java.JavaPairDStream; -import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; -import org.apache.spark.streaming.api.java.JavaStreamingContext; -import org.apache.spark.streaming.receiver.Receiver; -import scala.Tuple2; - -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.net.ConnectException; -import java.net.Socket; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.regex.Pattern; - -/** - * Custom Receiver that receives data over a socket. Received bytes is interpreted as - * text and \n delimited lines are considered as records. They are then counted and printed. - * - * Usage: JavaCustomReceiver - * is the Spark master URL. In local mode, should be 'local[n]' with n > 1. - * and of the TCP server that Spark Streaming would connect to receive data. 
- * - * To run this on your local machine, you need to first run a Netcat server - * `$ nc -lk 9999` - * and then run the example - * `$ bin/run-example org.apache.spark.examples.streaming.JavaCustomReceiver localhost 9999` - */ - -public class JavaCustomReceiver extends Receiver { - private static final Pattern SPACE = Pattern.compile(" "); - - public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println("Usage: JavaCustomReceiver "); - System.exit(1); - } - - StreamingExamples.setStreamingLogLevels(); - - // Create the context with a 1 second batch size - SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver"); - JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000)); - - // Create an input stream with the custom receiver on target ip:port and count the - // words in input stream of \n delimited text (eg. generated by 'nc') - JavaReceiverInputDStream lines = ssc.receiverStream( - new JavaCustomReceiver(args[0], Integer.parseInt(args[1]))); - JavaDStream words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator()); - JavaPairDStream wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1)) - .reduceByKey((i1, i2) -> i1 + i2); - - wordCounts.print(); - ssc.start(); - ssc.awaitTermination(); - } - - // ============= Receiver code that receives data over a socket ============== - - String host = null; - int port = -1; - - public JavaCustomReceiver(String host_ , int port_) { - super(StorageLevel.MEMORY_AND_DISK_2()); - host = host_; - port = port_; - } - - @Override - public void onStart() { - // Start the thread that receives data over a connection - new Thread(this::receive).start(); - } - - @Override - public void onStop() { - // There is nothing much to do as the thread calling receive() - // is designed to stop by itself isStopped() returns false - } - - /** Create a socket connection and receive data until receiver is stopped */ - private void receive() { - try { - Socket socket = null; - BufferedReader reader = null; - try { - // connect to the server - socket = new Socket(host, port); - reader = new BufferedReader( - new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)); - // Until stopped or connection broken continue reading - String userInput; - while (!isStopped() && (userInput = reader.readLine()) != null) { - System.out.println("Received data '" + userInput + "'"); - store(userInput); - } - } finally { - Closeables.close(reader, /* swallowIOException = */ true); - Closeables.close(socket, /* swallowIOException = */ true); - } - // Restart in an attempt to connect again when server is active again - restart("Trying to connect again"); - } catch(ConnectException ce) { - // restart if could not connect to server - restart("Could not connect", ce); - } catch(Throwable t) { - restart("Error receiving data", t); - } - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java deleted file mode 100644 index 748bf58..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.streaming; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Arrays; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - -import scala.Tuple2; - -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.common.serialization.StringDeserializer; - -import org.apache.spark.SparkConf; -import org.apache.spark.streaming.api.java.*; -import org.apache.spark.streaming.kafka010.ConsumerStrategies; -import org.apache.spark.streaming.kafka010.KafkaUtils; -import org.apache.spark.streaming.kafka010.LocationStrategies; -import org.apache.spark.streaming.Durations; - -/** - * Consumes messages from one or more topics in Kafka and does wordcount. - * Usage: JavaDirectKafkaWordCount - * is a list of one or more Kafka brokers - * is a consumer group name to consume from topics - * is a list of one or more kafka topics to consume from - * - * Example: - * $ bin/run-example streaming.JavaDirectKafkaWordCount broker1-host:port,broker2-host:port \ - * consumer-group topic1,topic2 - */ - -public final class JavaDirectKafkaWordCount { - private static final Pattern SPACE = Pattern.compile(" "); - - public static void main(String[] args) throws Exception { - if (args.length < 3) { - System.err.println("Usage: JavaDirectKafkaWordCount \n" + - " is a list of one or more Kafka brokers\n" + - " is a consumer group name to consume from topics\n" + - " is a list of one or more kafka topics to consume from\n\n"); - System.exit(1); - } - - StreamingExamples.setStreamingLogLevels(); - - String brokers = args[0]; - String groupId = args[1]; - String topics = args[2]; - - // Create context with a 2 seconds batch interval - SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount"); - JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2)); - - Set topicsSet = new HashSet<>(Arrays.asList(topics.split(","))); - Map kafkaParams = new HashMap<>(); - kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers); - kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); - kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); - kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); - - // Create direct kafka stream with brokers and topics - JavaInputDStream> messages = KafkaUtils.createDirectStream( - jssc, - LocationStrategies.PreferConsistent(), - ConsumerStrategies.Subscribe(topicsSet, kafkaParams)); - - // Get the lines, split them into words, count the words and print - JavaDStream lines = messages.map(ConsumerRecord::value); - JavaDStream words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator()); - JavaPairDStream wordCounts = 
words.mapToPair(s -> new Tuple2<>(s, 1)) - .reduceByKey((i1, i2) -> i1 + i2); - wordCounts.print(); - - // Start the computation - jssc.start(); - jssc.awaitTermination(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaNetworkWordCount.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaNetworkWordCount.java deleted file mode 100644 index b217672..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaNetworkWordCount.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.streaming; - -import java.util.Arrays; -import java.util.regex.Pattern; - -import scala.Tuple2; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.StorageLevels; -import org.apache.spark.streaming.Durations; -import org.apache.spark.streaming.api.java.JavaDStream; -import org.apache.spark.streaming.api.java.JavaPairDStream; -import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; -import org.apache.spark.streaming.api.java.JavaStreamingContext; - -/** - * Counts words in UTF8 encoded, '\n' delimited text received from the network every second. - * - * Usage: JavaNetworkWordCount - * and describe the TCP server that Spark Streaming would connect to receive data. - * - * To run this on your local machine, you need to first run a Netcat server - * `$ nc -lk 9999` - * and then run the example - * `$ bin/run-example org.apache.spark.examples.streaming.JavaNetworkWordCount localhost 9999` - */ -public final class JavaNetworkWordCount { - private static final Pattern SPACE = Pattern.compile(" "); - - public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println("Usage: JavaNetworkWordCount "); - System.exit(1); - } - - StreamingExamples.setStreamingLogLevels(); - - // Create the context with a 1 second batch size - SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount"); - JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); - - // Create a JavaReceiverInputDStream on target ip:port and count the - // words in input stream of \n delimited text (eg. generated by 'nc') - // Note that no duplication in storage level only for running locally. - // Replication necessary in distributed scenario for fault tolerance. 
- JavaReceiverInputDStream lines = ssc.socketTextStream( - args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER); - JavaDStream words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator()); - JavaPairDStream wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1)) - .reduceByKey((i1, i2) -> i1 + i2); - - wordCounts.print(); - ssc.start(); - ssc.awaitTermination(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaQueueStream.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaQueueStream.java deleted file mode 100644 index e86f8ab..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaQueueStream.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.streaming; - -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Queue; - -import scala.Tuple2; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.streaming.Duration; -import org.apache.spark.streaming.api.java.JavaDStream; -import org.apache.spark.streaming.api.java.JavaPairDStream; -import org.apache.spark.streaming.api.java.JavaStreamingContext; - -public final class JavaQueueStream { - private JavaQueueStream() { - } - - public static void main(String[] args) throws Exception { - - StreamingExamples.setStreamingLogLevels(); - SparkConf sparkConf = new SparkConf().setAppName("JavaQueueStream"); - - // Create the context - JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000)); - - // Create the queue through which RDDs can be pushed to - // a QueueInputDStream - - // Create and push some RDDs into the queue - List list = new ArrayList<>(); - for (int i = 0; i < 1000; i++) { - list.add(i); - } - - Queue> rddQueue = new LinkedList<>(); - for (int i = 0; i < 30; i++) { - rddQueue.add(ssc.sparkContext().parallelize(list)); - } - - // Create the QueueInputDStream and use it do some processing - JavaDStream inputStream = ssc.queueStream(rddQueue); - JavaPairDStream mappedStream = inputStream.mapToPair( - i -> new Tuple2<>(i % 10, 1)); - JavaPairDStream reducedStream = mappedStream.reduceByKey( - (i1, i2) -> i1 + i2); - - reducedStream.print(); - ssc.start(); - ssc.awaitTermination(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaRecord.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaRecord.java deleted file mode 100644 index e63697a..0000000 --- 
a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaRecord.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.streaming; - -/** Java Bean class to be used with the example JavaSqlNetworkWordCount. */ -public class JavaRecord implements java.io.Serializable { - private String word; - - public String getWord() { - return word; - } - - public void setWord(String word) { - this.word = word; - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaRecoverableNetworkWordCount.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaRecoverableNetworkWordCount.java deleted file mode 100644 index 45a876d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaRecoverableNetworkWordCount.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.streaming; - -import java.io.File; -import java.nio.charset.Charset; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; - -import scala.Tuple2; - -import com.google.common.io.Files; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.*; -import org.apache.spark.broadcast.Broadcast; -import org.apache.spark.streaming.Durations; -import org.apache.spark.streaming.api.java.JavaDStream; -import org.apache.spark.streaming.api.java.JavaPairDStream; -import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; -import org.apache.spark.streaming.api.java.JavaStreamingContext; -import org.apache.spark.util.LongAccumulator; - -/** - * Use this singleton to get or register a Broadcast variable. 
- */ -class JavaWordBlacklist { - - private static volatile Broadcast> instance = null; - - public static Broadcast> getInstance(JavaSparkContext jsc) { - if (instance == null) { - synchronized (JavaWordBlacklist.class) { - if (instance == null) { - List wordBlacklist = Arrays.asList("a", "b", "c"); - instance = jsc.broadcast(wordBlacklist); - } - } - } - return instance; - } -} - -/** - * Use this singleton to get or register an Accumulator. - */ -class JavaDroppedWordsCounter { - - private static volatile LongAccumulator instance = null; - - public static LongAccumulator getInstance(JavaSparkContext jsc) { - if (instance == null) { - synchronized (JavaDroppedWordsCounter.class) { - if (instance == null) { - instance = jsc.sc().longAccumulator("WordsInBlacklistCounter"); - } - } - } - return instance; - } -} - -/** - * Counts words in text encoded with UTF8 received from the network every second. This example also - * shows how to use lazily instantiated singleton instances for Accumulator and Broadcast so that - * they can be registered on driver failures. - * - * Usage: JavaRecoverableNetworkWordCount - * and describe the TCP server that Spark Streaming would connect to receive - * data. directory to HDFS-compatible file system which checkpoint data - * file to which the word counts will be appended - * - * and must be absolute paths - * - * To run this on your local machine, you need to first run a Netcat server - * - * `$ nc -lk 9999` - * - * and run the example as - * - * `$ ./bin/run-example org.apache.spark.examples.streaming.JavaRecoverableNetworkWordCount \ - * localhost 9999 ~/checkpoint/ ~/out` - * - * If the directory ~/checkpoint/ does not exist (e.g. running for the first time), it will create - * a new StreamingContext (will print "Creating new context" to the console). Otherwise, if - * checkpoint data exists in ~/checkpoint/, then it will create StreamingContext from - * the checkpoint data. - * - * Refer to the online documentation for more details. - */ -public final class JavaRecoverableNetworkWordCount { - private static final Pattern SPACE = Pattern.compile(" "); - - private static JavaStreamingContext createContext(String ip, - int port, - String checkpointDirectory, - String outputPath) { - - // If you do not see this printed, that means the StreamingContext has been loaded - // from the new checkpoint - System.out.println("Creating new context"); - File outputFile = new File(outputPath); - if (outputFile.exists()) { - outputFile.delete(); - } - SparkConf sparkConf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount"); - // Create the context with a 1 second batch size - JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); - ssc.checkpoint(checkpointDirectory); - - // Create a socket stream on target ip:port and count the - // words in input stream of \n delimited text (eg. 
generated by 'nc') - JavaReceiverInputDStream lines = ssc.socketTextStream(ip, port); - JavaDStream words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator()); - JavaPairDStream wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1)) - .reduceByKey((i1, i2) -> i1 + i2); - - wordCounts.foreachRDD((rdd, time) -> { - // Get or register the blacklist Broadcast - Broadcast> blacklist = - JavaWordBlacklist.getInstance(new JavaSparkContext(rdd.context())); - // Get or register the droppedWordsCounter Accumulator - LongAccumulator droppedWordsCounter = - JavaDroppedWordsCounter.getInstance(new JavaSparkContext(rdd.context())); - // Use blacklist to drop words and use droppedWordsCounter to count them - String counts = rdd.filter(wordCount -> { - if (blacklist.value().contains(wordCount._1())) { - droppedWordsCounter.add(wordCount._2()); - return false; - } else { - return true; - } - }).collect().toString(); - String output = "Counts at time " + time + " " + counts; - System.out.println(output); - System.out.println("Dropped " + droppedWordsCounter.value() + " word(s) totally"); - System.out.println("Appending to " + outputFile.getAbsolutePath()); - Files.append(output + "\n", outputFile, Charset.defaultCharset()); - }); - - return ssc; - } - - public static void main(String[] args) throws Exception { - if (args.length != 4) { - System.err.println("You arguments were " + Arrays.asList(args)); - System.err.println( - "Usage: JavaRecoverableNetworkWordCount \n" + - " . and describe the TCP server that Spark\n" + - " Streaming would connect to receive data. directory to\n" + - " HDFS-compatible file system which checkpoint data file to which\n" + - " the word counts will be appended\n" + - "\n" + - "In local mode, should be 'local[n]' with n > 1\n" + - "Both and must be absolute paths"); - System.exit(1); - } - - String ip = args[0]; - int port = Integer.parseInt(args[1]); - String checkpointDirectory = args[2]; - String outputPath = args[3]; - - // Function to create JavaStreamingContext without any output operations - // (used to detect the new context) - Function0 createContextFunc = - () -> createContext(ip, port, checkpointDirectory, outputPath); - - JavaStreamingContext ssc = - JavaStreamingContext.getOrCreate(checkpointDirectory, createContextFunc); - ssc.start(); - ssc.awaitTermination(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java deleted file mode 100644 index 948d1a2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaSqlNetworkWordCount.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.streaming; - -import java.util.Arrays; -import java.util.regex.Pattern; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.api.java.StorageLevels; -import org.apache.spark.streaming.Durations; -import org.apache.spark.streaming.api.java.JavaDStream; -import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; -import org.apache.spark.streaming.api.java.JavaStreamingContext; - -/** - * Use DataFrames and SQL to count words in UTF8 encoded, '\n' delimited text received from the - * network every second. - * - * Usage: JavaSqlNetworkWordCount - * and describe the TCP server that Spark Streaming would connect to receive data. - * - * To run this on your local machine, you need to first run a Netcat server - * `$ nc -lk 9999` - * and then run the example - * `$ bin/run-example org.apache.spark.examples.streaming.JavaSqlNetworkWordCount localhost 9999` - */ -public final class JavaSqlNetworkWordCount { - private static final Pattern SPACE = Pattern.compile(" "); - - public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println("Usage: JavaNetworkWordCount "); - System.exit(1); - } - - StreamingExamples.setStreamingLogLevels(); - - // Create the context with a 1 second batch size - SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount"); - JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); - - // Create a JavaReceiverInputDStream on target ip:port and count the - // words in input stream of \n delimited text (eg. generated by 'nc') - // Note that no duplication in storage level only for running locally. - // Replication necessary in distributed scenario for fault tolerance. 
- JavaReceiverInputDStream lines = ssc.socketTextStream( - args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER); - JavaDStream words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator()); - - // Convert RDDs of the words DStream to DataFrame and run SQL query - words.foreachRDD((rdd, time) -> { - SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf()); - - // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame - JavaRDD rowRDD = rdd.map(word -> { - JavaRecord record = new JavaRecord(); - record.setWord(word); - return record; - }); - Dataset wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class); - - // Creates a temporary view using the DataFrame - wordsDataFrame.createOrReplaceTempView("words"); - - // Do word count on table using SQL and print it - Dataset wordCountsDataFrame = - spark.sql("select word, count(*) as total from words group by word"); - System.out.println("========= " + time + "========="); - wordCountsDataFrame.show(); - }); - - ssc.start(); - ssc.awaitTermination(); - } -} - -/** Lazily instantiated singleton instance of SparkSession */ -class JavaSparkSessionSingleton { - private static transient SparkSession instance = null; - public static SparkSession getInstance(SparkConf sparkConf) { - if (instance == null) { - instance = SparkSession - .builder() - .config(sparkConf) - .getOrCreate(); - } - return instance; - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java deleted file mode 100644 index 9d8bd7f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.streaming; - -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; - -import scala.Tuple2; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.*; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.Optional; -import org.apache.spark.api.java.StorageLevels; -import org.apache.spark.streaming.Durations; -import org.apache.spark.streaming.State; -import org.apache.spark.streaming.StateSpec; -import org.apache.spark.streaming.api.java.*; - -/** - * Counts words cumulatively in UTF8 encoded, '\n' delimited text received from the network every - * second starting with initial value of word count. 
- * Usage: JavaStatefulNetworkWordCount - * and describe the TCP server that Spark Streaming would connect to receive - * data. - *
<p>
- * To run this on your local machine, you need to first run a Netcat server - * `$ nc -lk 9999` - * and then run the example - * `$ bin/run-example - * org.apache.spark.examples.streaming.JavaStatefulNetworkWordCount localhost 9999` - */ -public class JavaStatefulNetworkWordCount { - private static final Pattern SPACE = Pattern.compile(" "); - - public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println("Usage: JavaStatefulNetworkWordCount "); - System.exit(1); - } - - StreamingExamples.setStreamingLogLevels(); - - // Create the context with a 1 second batch size - SparkConf sparkConf = new SparkConf().setAppName("JavaStatefulNetworkWordCount"); - JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); - ssc.checkpoint("."); - - // Initial state RDD input to mapWithState - @SuppressWarnings("unchecked") - List> tuples = - Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1)); - JavaPairRDD initialRDD = ssc.sparkContext().parallelizePairs(tuples); - - JavaReceiverInputDStream lines = ssc.socketTextStream( - args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER_2); - - JavaDStream words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator()); - - JavaPairDStream wordsDstream = words.mapToPair(s -> new Tuple2<>(s, 1)); - - // Update the cumulative count function - Function3, State, Tuple2> mappingFunc = - (word, one, state) -> { - int sum = one.orElse(0) + (state.exists() ? state.get() : 0); - Tuple2 output = new Tuple2<>(word, sum); - state.update(sum); - return output; - }; - - // DStream made of get cumulative counts that get updated in every batch - JavaMapWithStateDStream> stateDstream = - wordsDstream.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD)); - - stateDstream.print(); - ssc.start(); - ssc.awaitTermination(); - } -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/als.py b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/als.py deleted file mode 100755 index 6d32418..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/als.py +++ /dev/null @@ -1,108 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -This is an example implementation of ALS for learning how to use Spark. Please refer to -pyspark.ml.recommendation.ALS for more conventional use. 
- -This example requires numpy (http://www.numpy.org/) -""" -from __future__ import print_function - -import sys - -import numpy as np -from numpy.random import rand -from numpy import matrix -from pyspark.sql import SparkSession - -LAMBDA = 0.01 # regularization -np.random.seed(42) - - -def rmse(R, ms, us): - diff = R - ms * us.T - return np.sqrt(np.sum(np.power(diff, 2)) / (M * U)) - - -def update(i, mat, ratings): - uu = mat.shape[0] - ff = mat.shape[1] - - XtX = mat.T * mat - Xty = mat.T * ratings[i, :].T - - for j in range(ff): - XtX[j, j] += LAMBDA * uu - - return np.linalg.solve(XtX, Xty) - - -if __name__ == "__main__": - - """ - Usage: als [M] [U] [F] [iterations] [partitions]" - """ - - print("""WARN: This is a naive implementation of ALS and is given as an - example. Please use pyspark.ml.recommendation.ALS for more - conventional use.""", file=sys.stderr) - - spark = SparkSession\ - .builder\ - .appName("PythonALS")\ - .getOrCreate() - - sc = spark.sparkContext - - M = int(sys.argv[1]) if len(sys.argv) > 1 else 100 - U = int(sys.argv[2]) if len(sys.argv) > 2 else 500 - F = int(sys.argv[3]) if len(sys.argv) > 3 else 10 - ITERATIONS = int(sys.argv[4]) if len(sys.argv) > 4 else 5 - partitions = int(sys.argv[5]) if len(sys.argv) > 5 else 2 - - print("Running ALS with M=%d, U=%d, F=%d, iters=%d, partitions=%d\n" % - (M, U, F, ITERATIONS, partitions)) - - R = matrix(rand(M, F)) * matrix(rand(U, F).T) - ms = matrix(rand(M, F)) - us = matrix(rand(U, F)) - - Rb = sc.broadcast(R) - msb = sc.broadcast(ms) - usb = sc.broadcast(us) - - for i in range(ITERATIONS): - ms = sc.parallelize(range(M), partitions) \ - .map(lambda x: update(x, usb.value, Rb.value)) \ - .collect() - # collect() returns a list, so array ends up being - # a 3-d array, we take the first 2 dims for the matrix - ms = matrix(np.array(ms)[:, :, 0]) - msb = sc.broadcast(ms) - - us = sc.parallelize(range(U), partitions) \ - .map(lambda x: update(x, msb.value, Rb.value.T)) \ - .collect() - us = matrix(np.array(us)[:, :, 0]) - usb = sc.broadcast(us) - - error = rmse(R, ms, us) - print("Iteration %d:" % i) - print("\nRMSE: %5.4f\n" % error) - - spark.stop() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/avro_inputformat.py b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/avro_inputformat.py deleted file mode 100644 index a18722c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/avro_inputformat.py +++ /dev/null @@ -1,91 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -""" -Read data file users.avro in local Spark distro: - -$ cd $SPARK_HOME -$ ./bin/spark-submit --driver-class-path /path/to/example/jar \ -> ./examples/src/main/python/avro_inputformat.py \ -> examples/src/main/resources/users.avro -{u'favorite_color': None, u'name': u'Alyssa', u'favorite_numbers': [3, 9, 15, 20]} -{u'favorite_color': u'red', u'name': u'Ben', u'favorite_numbers': []} - -To read name and favorite_color fields only, specify the following reader schema: - -$ cat examples/src/main/resources/user.avsc -{"namespace": "example.avro", - "type": "record", - "name": "User", - "fields": [ - {"name": "name", "type": "string"}, - {"name": "favorite_color", "type": ["string", "null"]} - ] -} - -$ ./bin/spark-submit --driver-class-path /path/to/example/jar \ -> ./examples/src/main/python/avro_inputformat.py \ -> examples/src/main/resources/users.avro examples/src/main/resources/user.avsc -{u'favorite_color': None, u'name': u'Alyssa'} -{u'favorite_color': u'red', u'name': u'Ben'} -""" -from __future__ import print_function - -import sys - -from functools import reduce -from pyspark.sql import SparkSession - -if __name__ == "__main__": - if len(sys.argv) != 2 and len(sys.argv) != 3: - print(""" - Usage: avro_inputformat [reader_schema_file] - - Run with example jar: - ./bin/spark-submit --driver-class-path /path/to/example/jar \ - /path/to/examples/avro_inputformat.py [reader_schema_file] - Assumes you have Avro data stored in . Reader schema can be optionally specified - in [reader_schema_file]. - """, file=sys.stderr) - sys.exit(-1) - - path = sys.argv[1] - - spark = SparkSession\ - .builder\ - .appName("AvroKeyInputFormat")\ - .getOrCreate() - - sc = spark.sparkContext - - conf = None - if len(sys.argv) == 3: - schema_rdd = sc.textFile(sys.argv[2], 1).collect() - conf = {"avro.schema.input.key": reduce(lambda x, y: x + y, schema_rdd)} - - avro_rdd = sc.newAPIHadoopFile( - path, - "org.apache.avro.mapreduce.AvroKeyInputFormat", - "org.apache.avro.mapred.AvroKey", - "org.apache.hadoop.io.NullWritable", - keyConverter="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter", - conf=conf) - output = avro_rdd.map(lambda x: x[0]).collect() - for k in output: - print(k) - - spark.stop() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/kmeans.py b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/kmeans.py deleted file mode 100755 index a42d711..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/kmeans.py +++ /dev/null @@ -1,86 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -The K-means algorithm written from scratch against PySpark. In practice, -one may prefer to use the KMeans algorithm in ML, as shown in -examples/src/main/python/ml/kmeans_example.py. 
- -This example requires NumPy (http://www.numpy.org/). -""" -from __future__ import print_function - -import sys - -import numpy as np -from pyspark.sql import SparkSession - - -def parseVector(line): - return np.array([float(x) for x in line.split(' ')]) - - -def closestPoint(p, centers): - bestIndex = 0 - closest = float("+inf") - for i in range(len(centers)): - tempDist = np.sum((p - centers[i]) ** 2) - if tempDist < closest: - closest = tempDist - bestIndex = i - return bestIndex - - -if __name__ == "__main__": - - if len(sys.argv) != 4: - print("Usage: kmeans ", file=sys.stderr) - sys.exit(-1) - - print("""WARN: This is a naive implementation of KMeans Clustering and is given - as an example! Please refer to examples/src/main/python/ml/kmeans_example.py for an - example on how to use ML's KMeans implementation.""", file=sys.stderr) - - spark = SparkSession\ - .builder\ - .appName("PythonKMeans")\ - .getOrCreate() - - lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0]) - data = lines.map(parseVector).cache() - K = int(sys.argv[2]) - convergeDist = float(sys.argv[3]) - - kPoints = data.takeSample(False, K, 1) - tempDist = 1.0 - - while tempDist > convergeDist: - closest = data.map( - lambda p: (closestPoint(p, kPoints), (p, 1))) - pointStats = closest.reduceByKey( - lambda p1_c1, p2_c2: (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1])) - newPoints = pointStats.map( - lambda st: (st[0], st[1][0] / st[1][1])).collect() - - tempDist = sum(np.sum((kPoints[iK] - p) ** 2) for (iK, p) in newPoints) - - for (iK, p) in newPoints: - kPoints[iK] = p - - print("Final centers: " + str(kPoints)) - - spark.stop() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/logistic_regression.py b/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/logistic_regression.py deleted file mode 100755 index bcc4e0f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/examples/src/main/python/logistic_regression.py +++ /dev/null @@ -1,88 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -A logistic regression implementation that uses NumPy (http://www.numpy.org) -to act on batches of input data using efficient matrix operations. - -In practice, one may prefer to use the LogisticRegression algorithm in -ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py. -""" -from __future__ import print_function - -import sys - -import numpy as np -from pyspark.sql import SparkSession - - -D = 10 # Number of dimensions - - -# Read a batch of points from the input file into a NumPy matrix object. We operate on batches to -# make further computations faster. -# The data file contains lines of the form

MOZILLA PUBLIC LICENSE
Version 1.1 -

-


-
-

1. Definitions. -

    1.0.1. "Commercial Use" means distribution or otherwise making the - Covered Code available to a third party. -

    1.1. ''Contributor'' means each entity that creates or contributes - to the creation of Modifications. -

    1.2. ''Contributor Version'' means the combination of the Original - Code, prior Modifications used by a Contributor, and the Modifications made by - that particular Contributor. -

    1.3. ''Covered Code'' means the Original Code or Modifications or - the combination of the Original Code and Modifications, in each case including - portions thereof. -

    1.4. ''Electronic Distribution Mechanism'' means a mechanism - generally accepted in the software development community for the electronic - transfer of data. -

    1.5. ''Executable'' means Covered Code in any form other than Source - Code. -

    1.6. ''Initial Developer'' means the individual or entity identified - as the Initial Developer in the Source Code notice required by Exhibit - A. -

    1.7. ''Larger Work'' means a work which combines Covered Code or - portions thereof with code not governed by the terms of this License. -

    1.8. ''License'' means this document. -

    1.8.1. "Licensable" means having the right to grant, to the maximum - extent possible, whether at the time of the initial grant or subsequently - acquired, any and all of the rights conveyed herein. -

    1.9. ''Modifications'' means any addition to or deletion from the - substance or structure of either the Original Code or any previous - Modifications. When Covered Code is released as a series of files, a - Modification is: -

      A. Any addition to or deletion from the contents of a file - containing Original Code or previous Modifications. -

      B. Any new file that contains any part of the Original Code or - previous Modifications.
       

    1.10. ''Original Code'' -means Source Code of computer software code which is described in the Source -Code notice required by Exhibit A as Original Code, and which, at the -time of its release under this License is not already Covered Code governed by -this License. -

    1.10.1. "Patent Claims" means any patent claim(s), now owned or - hereafter acquired, including without limitation,  method, process, and - apparatus claims, in any patent Licensable by grantor. -

    1.11. ''Source Code'' means the preferred form of the Covered Code - for making modifications to it, including all modules it contains, plus any - associated interface definition files, scripts used to control compilation and - installation of an Executable, or source code differential comparisons against - either the Original Code or another well known, available Covered Code of the - Contributor's choice. The Source Code can be in a compressed or archival form, - provided the appropriate decompression or de-archiving software is widely - available for no charge. -

    1.12. "You'' (or "Your")  means an individual or a legal entity - exercising rights under, and complying with all of the terms of, this License - or a future version of this License issued under Section 6.1. For legal - entities, "You'' includes any entity which controls, is controlled by, or is - under common control with You. For purposes of this definition, "control'' - means (a) the power, direct or indirect, to cause the direction or management - of such entity, whether by contract or otherwise, or (b) ownership of more - than fifty percent (50%) of the outstanding shares or beneficial ownership of - such entity.

2. Source Code License. -
    2.1. The Initial Developer Grant.
    The Initial Developer hereby - grants You a world-wide, royalty-free, non-exclusive license, subject to third - party intellectual property claims: -
      (a)  under intellectual property rights (other than - patent or trademark) Licensable by Initial Developer to use, reproduce, - modify, display, perform, sublicense and distribute the Original Code (or - portions thereof) with or without Modifications, and/or as part of a Larger - Work; and -

      (b) under Patents Claims infringed by the making, using or selling - of Original Code, to make, have made, use, practice, sell, and offer for - sale, and/or otherwise dispose of the Original Code (or portions thereof). -

        -
        (c) the licenses granted in this Section 2.1(a) and (b) - are effective on the date Initial Developer first distributes Original Code - under the terms of this License. -

        (d) Notwithstanding Section 2.1(b) above, no patent license is - granted: 1) for code that You delete from the Original Code; 2) separate - from the Original Code;  or 3) for infringements caused by: i) the - modification of the Original Code or ii) the combination of the Original - Code with other software or devices.
         

      2.2. Contributor - Grant.
      Subject to third party intellectual property claims, each - Contributor hereby grants You a world-wide, royalty-free, non-exclusive - license -

        (a)  under intellectual property rights (other - than patent or trademark) Licensable by Contributor, to use, reproduce, - modify, display, perform, sublicense and distribute the Modifications - created by such Contributor (or portions thereof) either on an unmodified - basis, with other Modifications, as Covered Code and/or as part of a Larger - Work; and -

        (b) under Patent Claims infringed by the making, using, or selling - of  Modifications made by that Contributor either alone and/or in combination with its Contributor Version (or portions of such - combination), to make, use, sell, offer for sale, have made, and/or - otherwise dispose of: 1) Modifications made by that Contributor (or portions - thereof); and 2) the combination of  Modifications made by that - Contributor with its Contributor Version (or portions of such - combination). -

        (c) the licenses granted in Sections 2.2(a) and 2.2(b) are - effective on the date Contributor first makes Commercial Use of the Covered - Code. -

        (d)    Notwithstanding Section 2.2(b) above, no - patent license is granted: 1) for any code that Contributor has deleted from - the Contributor Version; 2)  separate from the Contributor - Version;  3)  for infringements caused by: i) third party - modifications of Contributor Version or ii)  the combination of - Modifications made by that Contributor with other software  (except as - part of the Contributor Version) or other devices; or 4) under Patent Claims - infringed by Covered Code in the absence of Modifications made by that - Contributor.

    -


    3. Distribution Obligations. -

      3.1. Application of License.
      The Modifications which You create - or to which You contribute are governed by the terms of this License, - including without limitation Section 2.2. The Source Code version of - Covered Code may be distributed only under the terms of this License or a - future version of this License released under Section 6.1, and You must - include a copy of this License with every copy of the Source Code You - distribute. You may not offer or impose any terms on any Source Code version - that alters or restricts the applicable version of this License or the - recipients' rights hereunder. However, You may include an additional document - offering the additional rights described in Section 3.5. -

      3.2. Availability of Source Code.
      Any Modification which You - create or to which You contribute must be made available in Source Code form - under the terms of this License either on the same media as an Executable - version or via an accepted Electronic Distribution Mechanism to anyone to whom - you made an Executable version available; and if made available via Electronic - Distribution Mechanism, must remain available for at least twelve (12) months - after the date it initially became available, or at least six (6) months after - a subsequent version of that particular Modification has been made available - to such recipients. You are responsible for ensuring that the Source Code - version remains available even if the Electronic Distribution Mechanism is - maintained by a third party. -

      3.3. Description of Modifications.
      You must cause all Covered - Code to which You contribute to contain a file documenting the changes You - made to create that Covered Code and the date of any change. You must include - a prominent statement that the Modification is derived, directly or - indirectly, from Original Code provided by the Initial Developer and including - the name of the Initial Developer in (a) the Source Code, and (b) in any - notice in an Executable version or related documentation in which You describe - the origin or ownership of the Covered Code. -

      3.4. Intellectual Property Matters -

        (a) Third Party Claims.
        If Contributor has knowledge that a - license under a third party's intellectual property rights is required to - exercise the rights granted by such Contributor under Sections 2.1 or 2.2, - Contributor must include a text file with the Source Code distribution - titled "LEGAL'' which describes the claim and the party making the claim in - sufficient detail that a recipient will know whom to contact. If Contributor - obtains such knowledge after the Modification is made available as described - in Section 3.2, Contributor shall promptly modify the LEGAL file in all - copies Contributor makes available thereafter and shall take other steps - (such as notifying appropriate mailing lists or newsgroups) reasonably - calculated to inform those who received the Covered Code that new knowledge - has been obtained. -

        (b) Contributor APIs.
        If Contributor's Modifications include - an application programming interface and Contributor has knowledge of patent - licenses which are reasonably necessary to implement that API, Contributor - must also include this information in the LEGAL file. -
         

                -(c)    Representations. -
        Contributor represents that, except as disclosed pursuant to Section - 3.4(a) above, Contributor believes that Contributor's Modifications are - Contributor's original creation(s) and/or Contributor has sufficient rights - to grant the rights conveyed by this License.
      -


      3.5. Required Notices.
      You must duplicate the notice in - Exhibit A in each file of the Source Code.  If it is not possible - to put such notice in a particular Source Code file due to its structure, then - You must include such notice in a location (such as a relevant directory) - where a user would be likely to look for such a notice.  If You created - one or more Modification(s) You may add your name as a Contributor to the - notice described in Exhibit A.  You must also duplicate this - License in any documentation for the Source Code where You describe - recipients' rights or ownership rights relating to Covered Code.  You may - choose to offer, and to charge a fee for, warranty, support, indemnity or - liability obligations to one or more recipients of Covered Code. However, You - may do so only on Your own behalf, and not on behalf of the Initial Developer - or any Contributor. You must make it absolutely clear than any such warranty, - support, indemnity or liability obligation is offered by You alone, and You - hereby agree to indemnify the Initial Developer and every Contributor for any - liability incurred by the Initial Developer or such Contributor as a result of - warranty, support, indemnity or liability terms You offer. -

      3.6. Distribution of Executable Versions.
      You may distribute - Covered Code in Executable form only if the requirements of Section - 3.1-3.5 have been met for that Covered Code, and if You include a - notice stating that the Source Code version of the Covered Code is available - under the terms of this License, including a description of how and where You - have fulfilled the obligations of Section 3.2. The notice must be - conspicuously included in any notice in an Executable version, related - documentation or collateral in which You describe recipients' rights relating - to the Covered Code. You may distribute the Executable version of Covered Code - or ownership rights under a license of Your choice, which may contain terms - different from this License, provided that You are in compliance with the - terms of this License and that the license for the Executable version does not - attempt to limit or alter the recipient's rights in the Source Code version - from the rights set forth in this License. If You distribute the Executable - version under a different license You must make it absolutely clear that any - terms which differ from this License are offered by You alone, not by the - Initial Developer or any Contributor. You hereby agree to indemnify the - Initial Developer and every Contributor for any liability incurred by the - Initial Developer or such Contributor as a result of any such terms You offer. - -

      3.7. Larger Works.
      You may create a Larger Work by combining - Covered Code with other code not governed by the terms of this License and - distribute the Larger Work as a single product. In such a case, You must make - sure the requirements of this License are fulfilled for the Covered - Code.

    4. Inability to Comply Due to Statute or Regulation. -
      If it is impossible for You to comply with any of the terms of this - License with respect to some or all of the Covered Code due to statute, - judicial order, or regulation then You must: (a) comply with the terms of this - License to the maximum extent possible; and (b) describe the limitations and - the code they affect. Such description must be included in the LEGAL file - described in Section 3.4 and must be included with all distributions of - the Source Code. Except to the extent prohibited by statute or regulation, - such description must be sufficiently detailed for a recipient of ordinary - skill to be able to understand it.
    5. Application of this License. -
      This License applies to code to which the Initial Developer has attached - the notice in Exhibit A and to related Covered Code.
    6. Versions - of the License. -
      6.1. New Versions.
      Netscape Communications Corporation - (''Netscape'') may publish revised and/or new versions of the License from - time to time. Each version will be given a distinguishing version number. -

      6.2. Effect of New Versions.
      Once Covered Code has been - published under a particular version of the License, You may always continue - to use it under the terms of that version. You may also choose to use such - Covered Code under the terms of any subsequent version of the License - published by Netscape. No one other than Netscape has the right to modify the - terms applicable to Covered Code created under this License. -

      6.3. Derivative Works.
      If You create or use a modified version - of this License (which you may only do in order to apply it to code which is - not already Covered Code governed by this License), You must (a) rename Your - license so that the phrases ''Mozilla'', ''MOZILLAPL'', ''MOZPL'', - ''Netscape'', "MPL", ''NPL'' or any confusingly similar phrase do not appear - in your license (except to note that your license differs from this License) - and (b) otherwise make it clear that Your version of the license contains - terms which differ from the Mozilla Public License and Netscape Public - License. (Filling in the name of the Initial Developer, Original Code or - Contributor in the notice described in Exhibit A shall not of - themselves be deemed to be modifications of this License.)

    7. - DISCLAIMER OF WARRANTY. -
      COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS'' BASIS, WITHOUT - WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, WITHOUT - LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF DEFECTS, MERCHANTABLE, - FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE - QUALITY AND PERFORMANCE OF THE COVERED CODE IS WITH YOU. SHOULD ANY COVERED - CODE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR ANY - OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR - CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS - LICENSE. NO USE OF ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS - DISCLAIMER.
    8. TERMINATION. -
      8.1.  This License and the rights granted hereunder will - terminate automatically if You fail to comply with terms herein and fail to - cure such breach within 30 days of becoming aware of the breach. All - sublicenses to the Covered Code which are properly granted shall survive any - termination of this License. Provisions which, by their nature, must remain in - effect beyond the termination of this License shall survive. -

      8.2.  If You initiate litigation by asserting a patent - infringement claim (excluding declatory judgment actions) against Initial - Developer or a Contributor (the Initial Developer or Contributor against whom - You file such action is referred to as "Participant")  alleging that: -

      (a)  such Participant's Contributor Version directly or - indirectly infringes any patent, then any and all rights granted by such - Participant to You under Sections 2.1 and/or 2.2 of this License shall, upon - 60 days notice from Participant terminate prospectively, unless if within 60 - days after receipt of notice You either: (i)  agree in writing to pay - Participant a mutually agreeable reasonable royalty for Your past and future - use of Modifications made by such Participant, or (ii) withdraw Your - litigation claim with respect to the Contributor Version against such - Participant.  If within 60 days of notice, a reasonable royalty and - payment arrangement are not mutually agreed upon in writing by the parties or - the litigation claim is not withdrawn, the rights granted by Participant to - You under Sections 2.1 and/or 2.2 automatically terminate at the expiration of - the 60 day notice period specified above. -

      (b)  any software, hardware, or device, other than such - Participant's Contributor Version, directly or indirectly infringes any - patent, then any rights granted to You by such Participant under Sections - 2.1(b) and 2.2(b) are revoked effective as of the date You first made, used, - sold, distributed, or had made, Modifications made by that Participant. -

      8.3.  If You assert a patent infringement claim against - Participant alleging that such Participant's Contributor Version directly or - indirectly infringes any patent where such claim is resolved (such as by - license or settlement) prior to the initiation of patent infringement - litigation, then the reasonable value of the licenses granted by such - Participant under Sections 2.1 or 2.2 shall be taken into account in - determining the amount or value of any payment or license. -

      8.4.  In the event of termination under Sections 8.1 or 8.2 - above,  all end user license agreements (excluding distributors and - resellers) which have been validly granted by You or any distributor hereunder - prior to termination shall survive termination.

    9. LIMITATION OF - LIABILITY. -
      UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING - NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL DEVELOPER, ANY - OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, OR ANY SUPPLIER OF ANY - OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, - INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT - LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR - MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH - PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS - LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL - INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW - PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR - LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND - LIMITATION MAY NOT APPLY TO YOU.
    10. U.S. GOVERNMENT END USERS. -
      The Covered Code is a ''commercial item,'' as that term is defined in 48 - C.F.R. 2.101 (Oct. 1995), consisting of ''commercial computer software'' and - ''commercial computer software documentation,'' as such terms are used in 48 - C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. - 227.7202-1 through 227.7202-4 (June 1995), all U.S. Government End Users - acquire Covered Code with only those rights set forth herein.
    11. - MISCELLANEOUS. -
      This License represents the complete agreement concerning subject matter - hereof. If any provision of this License is held to be unenforceable, such - provision shall be reformed only to the extent necessary to make it - enforceable. This License shall be governed by California law provisions - (except to the extent applicable law, if any, provides otherwise), excluding - its conflict-of-law provisions. With respect to disputes in which at least one - party is a citizen of, or an entity chartered or registered to do business in - the United States of America, any litigation relating to this License shall be - subject to the jurisdiction of the Federal Courts of the Northern District of - California, with venue lying in Santa Clara County, California, with the - losing party responsible for costs, including without limitation, court costs - and reasonable attorneys' fees and expenses. The application of the United - Nations Convention on Contracts for the International Sale of Goods is - expressly excluded. Any law or regulation which provides that the language of - a contract shall be construed against the drafter shall not apply to this - License.
    12. RESPONSIBILITY FOR CLAIMS. -
      As between Initial Developer and the Contributors, each party is - responsible for claims and damages arising, directly or indirectly, out of its - utilization of rights under this License and You agree to work with Initial - Developer and Contributors to distribute such responsibility on an equitable - basis. Nothing herein is intended or shall be deemed to constitute any - admission of liability.
    13. MULTIPLE-LICENSED CODE. -
      Initial Developer may designate portions of the Covered Code as - "Multiple-Licensed".  "Multiple-Licensed" means that the Initial - Developer permits you to utilize portions of the Covered Code under Your - choice of the MPL or the alternative licenses, if any, specified by the - Initial Developer in the file described in Exhibit A.
    -


    EXHIBIT A -Mozilla Public License. -

      The contents of this file are subject to the Mozilla Public License - Version 1.1 (the "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at -
      http://www.mozilla.org/MPL/ -

      Software distributed under the License is distributed on an "AS IS" basis, - WITHOUT WARRANTY OF
      ANY KIND, either express or implied. See the License - for the specific language governing rights and
      limitations under the - License. -

      The Original Code is Javassist. -

      The Initial Developer of the Original Code is Shigeru Chiba. - Portions created by the Initial Developer are
        - Copyright (C) 1999- Shigeru Chiba. All Rights Reserved. -

      Contributor(s): __Bill Burke, Jason T. Greene______________. - -

      Alternatively, the contents of this software may be used under the - terms of the GNU Lesser General Public License Version 2.1 or later - (the "LGPL"), or the Apache License Version 2.0 (the "AL"), - in which case the provisions of the LGPL or the AL are applicable - instead of those above. If you wish to allow use of your version of - this software only under the terms of either the LGPL or the AL, and not to allow others to - use your version of this software under the terms of the MPL, indicate - your decision by deleting the provisions above and replace them with - the notice and other provisions required by the LGPL or the AL. If you do not - delete the provisions above, a recipient may use your version of this - software under the terms of any one of the MPL, the LGPL or the AL. - -

    - - \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-javolution.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-javolution.txt deleted file mode 100644 index b64af4d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-javolution.txt +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Javolution - Java(tm) Solution for Real-Time and Embedded Systems - * Copyright (c) 2012, Javolution (http://javolution.org/) - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jline.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jline.txt deleted file mode 100644 index 2ec539d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jline.txt +++ /dev/null @@ -1,32 +0,0 @@ -Copyright (c) 2002-2006, Marc Prud'hommeaux -All rights reserved. - -Redistribution and use in source and binary forms, with or -without modification, are permitted provided that the following -conditions are met: - -Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with -the distribution. - -Neither the name of JLine nor the names of its contributors -may be used to endorse or promote products derived from this -software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, -BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY -AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO -EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jodd.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jodd.txt deleted file mode 100644 index cc6b458..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jodd.txt +++ /dev/null @@ -1,24 +0,0 @@ -Copyright (c) 2003-present, Jodd Team (https://jodd.org) -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-join.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-join.txt deleted file mode 100644 index 1d91609..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-join.txt +++ /dev/null @@ -1,30 +0,0 @@ -Copyright (c) 2011, Douban Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - - * Neither the name of the Douban Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jquery.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jquery.txt deleted file mode 100644 index 4593054..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jquery.txt +++ /dev/null @@ -1,20 +0,0 @@ -Copyright JS Foundation and other contributors, https://js.foundation/ - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-json-formatter.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-json-formatter.txt deleted file mode 100644 index 5193348..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-json-formatter.txt +++ /dev/null @@ -1,6 +0,0 @@ -Copyright 2014 Mohsen Azimi - -Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jtransforms.html b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jtransforms.html deleted file mode 100644 index 351c174..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-jtransforms.html +++ /dev/null @@ -1,388 +0,0 @@ - - -Mozilla Public License version 1.1 - - - - -

    Mozilla Public License Version 1.1

    -

    1. Definitions.

    -
    -
    1.0.1. "Commercial Use" -
    means distribution or otherwise making the Covered Code available to a third party. -
    1.1. "Contributor" -
    means each entity that creates or contributes to the creation of Modifications. -
    1.2. "Contributor Version" -
    means the combination of the Original Code, prior Modifications used by a Contributor, - and the Modifications made by that particular Contributor. -
    1.3. "Covered Code" -
    means the Original Code or Modifications or the combination of the Original Code and - Modifications, in each case including portions thereof. -
    1.4. "Electronic Distribution Mechanism" -
    means a mechanism generally accepted in the software development community for the - electronic transfer of data. -
    1.5. "Executable" -
    means Covered Code in any form other than Source Code. -
    1.6. "Initial Developer" -
    means the individual or entity identified as the Initial Developer in the Source Code - notice required by Exhibit A. -
    1.7. "Larger Work" -
    means a work which combines Covered Code or portions thereof with code not governed - by the terms of this License. -
    1.8. "License" -
    means this document. -
    1.8.1. "Licensable" -
    means having the right to grant, to the maximum extent possible, whether at the - time of the initial grant or subsequently acquired, any and all of the rights - conveyed herein. -
    1.9. "Modifications" -
    -

    means any addition to or deletion from the substance or structure of either the - Original Code or any previous Modifications. When Covered Code is released as a - series of files, a Modification is: -

      -
    1. Any addition to or deletion from the contents of a file - containing Original Code or previous Modifications. -
    2. Any new file that contains any part of the Original Code or - previous Modifications. -
    -
    1.10. "Original Code" -
    means Source Code of computer software code which is described in the Source Code - notice required by Exhibit A as Original Code, and which, - at the time of its release under this License is not already Covered Code governed - by this License. -
    1.10.1. "Patent Claims" -
    means any patent claim(s), now owned or hereafter acquired, including without - limitation, method, process, and apparatus claims, in any patent Licensable by - grantor. -
    1.11. "Source Code" -
    means the preferred form of the Covered Code for making modifications to it, - including all modules it contains, plus any associated interface definition files, - scripts used to control compilation and installation of an Executable, or source - code differential comparisons against either the Original Code or another well known, - available Covered Code of the Contributor's choice. The Source Code can be in a - compressed or archival form, provided the appropriate decompression or de-archiving - software is widely available for no charge. -
    1.12. "You" (or "Your") -
    means an individual or a legal entity exercising rights under, and complying with - all of the terms of, this License or a future version of this License issued under - Section 6.1. For legal entities, "You" includes any entity - which controls, is controlled by, or is under common control with You. For purposes of - this definition, "control" means (a) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or otherwise, or (b) - ownership of more than fifty percent (50%) of the outstanding shares or beneficial - ownership of such entity. -
    -

    2. Source Code License.

    -

    2.1. The Initial Developer Grant.

    -

    The Initial Developer hereby grants You a world-wide, royalty-free, non-exclusive - license, subject to third party intellectual property claims: -

      -
    1. under intellectual property rights (other than patent or - trademark) Licensable by Initial Developer to use, reproduce, modify, display, perform, - sublicense and distribute the Original Code (or portions thereof) with or without - Modifications, and/or as part of a Larger Work; and -
    2. under Patents Claims infringed by the making, using or selling - of Original Code, to make, have made, use, practice, sell, and offer for sale, and/or - otherwise dispose of the Original Code (or portions thereof). -
    3. the licenses granted in this Section 2.1 - (a) and (b) are effective on - the date Initial Developer first distributes Original Code under the terms of this - License. -
    4. Notwithstanding Section 2.1 (b) - above, no patent license is granted: 1) for code that You delete from the Original Code; - 2) separate from the Original Code; or 3) for infringements caused by: i) the - modification of the Original Code or ii) the combination of the Original Code with other - software or devices. -
    -

    2.2. Contributor Grant.

    -

    Subject to third party intellectual property claims, each Contributor hereby grants You - a world-wide, royalty-free, non-exclusive license -

      -
    1. under intellectual property rights (other than patent or trademark) - Licensable by Contributor, to use, reproduce, modify, display, perform, sublicense and - distribute the Modifications created by such Contributor (or portions thereof) either on - an unmodified basis, with other Modifications, as Covered Code and/or as part of a Larger - Work; and -
    2. under Patent Claims infringed by the making, using, or selling of - Modifications made by that Contributor either alone and/or in combination with its - Contributor Version (or portions of such combination), to make, use, sell, offer for - sale, have made, and/or otherwise dispose of: 1) Modifications made by that Contributor - (or portions thereof); and 2) the combination of Modifications made by that Contributor - with its Contributor Version (or portions of such combination). -
    3. the licenses granted in Sections 2.2 - (a) and 2.2 (b) are effective - on the date Contributor first makes Commercial Use of the Covered Code. -
    4. Notwithstanding Section 2.2 (b) - above, no patent license is granted: 1) for any code that Contributor has deleted from - the Contributor Version; 2) separate from the Contributor Version; 3) for infringements - caused by: i) third party modifications of Contributor Version or ii) the combination of - Modifications made by that Contributor with other software (except as part of the - Contributor Version) or other devices; or 4) under Patent Claims infringed by Covered Code - in the absence of Modifications made by that Contributor. -
    -

    3. Distribution Obligations.

    -

    3.1. Application of License.

    -

    The Modifications which You create or to which You contribute are governed by the terms - of this License, including without limitation Section 2.2. The - Source Code version of Covered Code may be distributed only under the terms of this License - or a future version of this License released under Section 6.1, - and You must include a copy of this License with every copy of the Source Code You - distribute. You may not offer or impose any terms on any Source Code version that alters or - restricts the applicable version of this License or the recipients' rights hereunder. - However, You may include an additional document offering the additional rights described in - Section 3.5. -

    3.2. Availability of Source Code.

    -

    Any Modification which You create or to which You contribute must be made available in - Source Code form under the terms of this License either on the same media as an Executable - version or via an accepted Electronic Distribution Mechanism to anyone to whom you made an - Executable version available; and if made available via Electronic Distribution Mechanism, - must remain available for at least twelve (12) months after the date it initially became - available, or at least six (6) months after a subsequent version of that particular - Modification has been made available to such recipients. You are responsible for ensuring - that the Source Code version remains available even if the Electronic Distribution - Mechanism is maintained by a third party. -

    3.3. Description of Modifications.

    -

    You must cause all Covered Code to which You contribute to contain a file documenting the - changes You made to create that Covered Code and the date of any change. You must include a - prominent statement that the Modification is derived, directly or indirectly, from Original - Code provided by the Initial Developer and including the name of the Initial Developer in - (a) the Source Code, and (b) in any notice in an Executable version or related documentation - in which You describe the origin or ownership of the Covered Code. -

    3.4. Intellectual Property Matters

    -

    (a) Third Party Claims

    -

    If Contributor has knowledge that a license under a third party's intellectual property - rights is required to exercise the rights granted by such Contributor under Sections - 2.1 or 2.2, Contributor must include a - text file with the Source Code distribution titled "LEGAL" which describes the claim and the - party making the claim in sufficient detail that a recipient will know whom to contact. If - Contributor obtains such knowledge after the Modification is made available as described in - Section 3.2, Contributor shall promptly modify the LEGAL file in - all copies Contributor makes available thereafter and shall take other steps (such as - notifying appropriate mailing lists or newsgroups) reasonably calculated to inform those who - received the Covered Code that new knowledge has been obtained. -

    (b) Contributor APIs

    -

    If Contributor's Modifications include an application programming interface and Contributor - has knowledge of patent licenses which are reasonably necessary to implement that - API, Contributor must also include this information in the - legal file. -

    (c) Representations.

    -

    Contributor represents that, except as disclosed pursuant to Section 3.4 - (a) above, Contributor believes that Contributor's Modifications - are Contributor's original creation(s) and/or Contributor has sufficient rights to grant the - rights conveyed by this License. -

    3.5. Required Notices.

    -

    You must duplicate the notice in Exhibit A in each file of the - Source Code. If it is not possible to put such notice in a particular Source Code file due to - its structure, then You must include such notice in a location (such as a relevant directory) - where a user would be likely to look for such a notice. If You created one or more - Modification(s) You may add your name as a Contributor to the notice described in - Exhibit A. You must also duplicate this License in any documentation - for the Source Code where You describe recipients' rights or ownership rights relating to - Covered Code. You may choose to offer, and to charge a fee for, warranty, support, indemnity - or liability obligations to one or more recipients of Covered Code. However, You may do so - only on Your own behalf, and not on behalf of the Initial Developer or any Contributor. You - must make it absolutely clear than any such warranty, support, indemnity or liability - obligation is offered by You alone, and You hereby agree to indemnify the Initial Developer - and every Contributor for any liability incurred by the Initial Developer or such Contributor - as a result of warranty, support, indemnity or liability terms You offer. -

    3.6. Distribution of Executable Versions.

    -

    You may distribute Covered Code in Executable form only if the requirements of Sections - 3.1, 3.2, - 3.3, 3.4 and - 3.5 have been met for that Covered Code, and if You include a - notice stating that the Source Code version of the Covered Code is available under the terms - of this License, including a description of how and where You have fulfilled the obligations - of Section 3.2. The notice must be conspicuously included in any - notice in an Executable version, related documentation or collateral in which You describe - recipients' rights relating to the Covered Code. You may distribute the Executable version of - Covered Code or ownership rights under a license of Your choice, which may contain terms - different from this License, provided that You are in compliance with the terms of this - License and that the license for the Executable version does not attempt to limit or alter the - recipient's rights in the Source Code version from the rights set forth in this License. If - You distribute the Executable version under a different license You must make it absolutely - clear that any terms which differ from this License are offered by You alone, not by the - Initial Developer or any Contributor. You hereby agree to indemnify the Initial Developer and - every Contributor for any liability incurred by the Initial Developer or such Contributor as - a result of any such terms You offer. -

    3.7. Larger Works.

    -

    You may create a Larger Work by combining Covered Code with other code not governed by the - terms of this License and distribute the Larger Work as a single product. In such a case, - You must make sure the requirements of this License are fulfilled for the Covered Code. -

    4. Inability to Comply Due to Statute or Regulation.

    -

    If it is impossible for You to comply with any of the terms of this License with respect to - some or all of the Covered Code due to statute, judicial order, or regulation then You must: - (a) comply with the terms of this License to the maximum extent possible; and (b) describe - the limitations and the code they affect. Such description must be included in the - legal file described in Section - 3.4 and must be included with all distributions of the Source Code. - Except to the extent prohibited by statute or regulation, such description must be - sufficiently detailed for a recipient of ordinary skill to be able to understand it. -

    5. Application of this License.

    -

    This License applies to code to which the Initial Developer has attached the notice in - Exhibit A and to related Covered Code. -

    6. Versions of the License.

    -

    6.1. New Versions

    -

    Netscape Communications Corporation ("Netscape") may publish revised and/or new versions - of the License from time to time. Each version will be given a distinguishing version number. -

    6.2. Effect of New Versions

    -

    Once Covered Code has been published under a particular version of the License, You may - always continue to use it under the terms of that version. You may also choose to use such - Covered Code under the terms of any subsequent version of the License published by Netscape. - No one other than Netscape has the right to modify the terms applicable to Covered Code - created under this License. -

    6.3. Derivative Works

    -

    If You create or use a modified version of this License (which you may only do in order to - apply it to code which is not already Covered Code governed by this License), You must (a) - rename Your license so that the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape", "MPL", - "NPL" or any confusingly similar phrase do not appear in your license (except to note that - your license differs from this License) and (b) otherwise make it clear that Your version of - the license contains terms which differ from the Mozilla Public License and Netscape Public - License. (Filling in the name of the Initial Developer, Original Code or Contributor in the - notice described in Exhibit A shall not of themselves be deemed to - be modifications of this License.) -

    7. Disclaimer of warranty

    -

    Covered code is provided under this license on an "as is" - basis, without warranty of any kind, either expressed or implied, including, without - limitation, warranties that the covered code is free of defects, merchantable, fit for a - particular purpose or non-infringing. The entire risk as to the quality and performance of - the covered code is with you. Should any covered code prove defective in any respect, you - (not the initial developer or any other contributor) assume the cost of any necessary - servicing, repair or correction. This disclaimer of warranty constitutes an essential part - of this license. No use of any covered code is authorized hereunder except under this - disclaimer. -

    8. Termination

    -

    8.1. This License and the rights granted hereunder will terminate - automatically if You fail to comply with terms herein and fail to cure such breach - within 30 days of becoming aware of the breach. All sublicenses to the Covered Code which - are properly granted shall survive any termination of this License. Provisions which, by - their nature, must remain in effect beyond the termination of this License shall survive. -

    8.2. If You initiate litigation by asserting a patent infringement - claim (excluding declatory judgment actions) against Initial Developer or a Contributor - (the Initial Developer or Contributor against whom You file such action is referred to - as "Participant") alleging that: -

      -
    1. such Participant's Contributor Version directly or indirectly - infringes any patent, then any and all rights granted by such Participant to You under - Sections 2.1 and/or 2.2 of this - License shall, upon 60 days notice from Participant terminate prospectively, unless if - within 60 days after receipt of notice You either: (i) agree in writing to pay - Participant a mutually agreeable reasonable royalty for Your past and future use of - Modifications made by such Participant, or (ii) withdraw Your litigation claim with - respect to the Contributor Version against such Participant. If within 60 days of - notice, a reasonable royalty and payment arrangement are not mutually agreed upon in - writing by the parties or the litigation claim is not withdrawn, the rights granted by - Participant to You under Sections 2.1 and/or - 2.2 automatically terminate at the expiration of the 60 day - notice period specified above. -
    2. any software, hardware, or device, other than such Participant's - Contributor Version, directly or indirectly infringes any patent, then any rights - granted to You by such Participant under Sections 2.1(b) - and 2.2(b) are revoked effective as of the date You first - made, used, sold, distributed, or had made, Modifications made by that Participant. -
    -

    8.3. If You assert a patent infringement claim against Participant - alleging that such Participant's Contributor Version directly or indirectly infringes - any patent where such claim is resolved (such as by license or settlement) prior to the - initiation of patent infringement litigation, then the reasonable value of the licenses - granted by such Participant under Sections 2.1 or - 2.2 shall be taken into account in determining the amount or - value of any payment or license. -

    8.4. In the event of termination under Sections - 8.1 or 8.2 above, all end user - license agreements (excluding distributors and resellers) which have been validly - granted by You or any distributor hereunder prior to termination shall survive - termination. -

    9. Limitation of liability

    -

    Under no circumstances and under no legal theory, whether - tort (including negligence), contract, or otherwise, shall you, the initial developer, - any other contributor, or any distributor of covered code, or any supplier of any of - such parties, be liable to any person for any indirect, special, incidental, or - consequential damages of any character including, without limitation, damages for loss - of goodwill, work stoppage, computer failure or malfunction, or any and all other - commercial damages or losses, even if such party shall have been informed of the - possibility of such damages. This limitation of liability shall not apply to liability - for death or personal injury resulting from such party's negligence to the extent - applicable law prohibits such limitation. Some jurisdictions do not allow the exclusion - or limitation of incidental or consequential damages, so this exclusion and limitation - may not apply to you. -

    10. U.S. government end users

    -

    The Covered Code is a "commercial item," as that term is defined in 48 - C.F.R. 2.101 (Oct. 1995), consisting of - "commercial computer software" and "commercial computer software documentation," as such - terms are used in 48 C.F.R. 12.212 (Sept. - 1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. - 227.7202-1 through 227.7202-4 (June 1995), all U.S. Government End Users - acquire Covered Code with only those rights set forth herein. -

    11. Miscellaneous

    -

    This License represents the complete agreement concerning subject matter hereof. If - any provision of this License is held to be unenforceable, such provision shall be - reformed only to the extent necessary to make it enforceable. This License shall be - governed by California law provisions (except to the extent applicable law, if any, - provides otherwise), excluding its conflict-of-law provisions. With respect to - disputes in which at least one party is a citizen of, or an entity chartered or - registered to do business in the United States of America, any litigation relating to - this License shall be subject to the jurisdiction of the Federal Courts of the - Northern District of California, with venue lying in Santa Clara County, California, - with the losing party responsible for costs, including without limitation, court - costs and reasonable attorneys' fees and expenses. The application of the United - Nations Convention on Contracts for the International Sale of Goods is expressly - excluded. Any law or regulation which provides that the language of a contract - shall be construed against the drafter shall not apply to this License. -

    12. Responsibility for claims

    -

    As between Initial Developer and the Contributors, each party is responsible for - claims and damages arising, directly or indirectly, out of its utilization of rights - under this License and You agree to work with Initial Developer and Contributors to - distribute such responsibility on an equitable basis. Nothing herein is intended or - shall be deemed to constitute any admission of liability. -

    13. Multiple-licensed code

    -

    Initial Developer may designate portions of the Covered Code as - "Multiple-Licensed". "Multiple-Licensed" means that the Initial Developer permits - you to utilize portions of the Covered Code under Your choice of the MPL - or the alternative licenses, if any, specified by the Initial Developer in the file - described in Exhibit A. -

    Exhibit A - Mozilla Public License.

    -
    "The contents of this file are subject to the Mozilla Public License
    -Version 1.1 (the "License"); you may not use this file except in
    -compliance with the License. You may obtain a copy of the License at
    -http://www.mozilla.org/MPL/
    -
    -Software distributed under the License is distributed on an "AS IS"
    -basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
    -License for the specific language governing rights and limitations
    -under the License.
    -
    -The Original Code is JTransforms.
    -
    -The Initial Developer of the Original Code is
    -Piotr Wendykier, Emory University.
    -Portions created by the Initial Developer are Copyright (C) 2007-2009
    -the Initial Developer. All Rights Reserved.
    -
    -Alternatively, the contents of this file may be used under the terms of
    -either the GNU General Public License Version 2 or later (the "GPL"), or
    -the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
    -in which case the provisions of the GPL or the LGPL are applicable instead
    -of those above. If you wish to allow use of your version of this file only
    -under the terms of either the GPL or the LGPL, and not to allow others to
    -use your version of this file under the terms of the MPL, indicate your
    -decision by deleting the provisions above and replace them with the notice
    -and other provisions required by the GPL or the LGPL. If you do not delete
    -the provisions above, a recipient may use your version of this file under
    -the terms of any one of the MPL, the GPL or the LGPL.
    -

    NOTE: The text of this Exhibit A may differ slightly from the text of - the notices in the Source Code files of the Original Code. You should - use the text of this Exhibit A rather than the text found in the - Original Code Source Code for Your Modifications. - -

    \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-kryo.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-kryo.txt deleted file mode 100644 index 3f6a160..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-kryo.txt +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2008, Nathan Sweet -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Esoteric Software nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-leveldbjni.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-leveldbjni.txt deleted file mode 100644 index b4dabb9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-leveldbjni.txt +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2011 FuseSource Corp. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of FuseSource Corp. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-machinist.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-machinist.txt deleted file mode 100644 index 68cc3a3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-machinist.txt +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2011-2014 Erik Osheim, Tom Switzer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-matchMedia-polyfill.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-matchMedia-polyfill.txt deleted file mode 100644 index 2fd0bc2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-matchMedia-polyfill.txt +++ /dev/null @@ -1 +0,0 @@ -matchMedia() polyfill - Test a CSS media type/query in JS. Authors & copyright (c) 2012: Scott Jehl, Paul Irish, Nicholas Zakas. Dual MIT/BSD license \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-minlog.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-minlog.txt deleted file mode 100644 index 3f6a160..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-minlog.txt +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2008, Nathan Sweet -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Esoteric Software nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-modernizr.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-modernizr.txt deleted file mode 100644 index 2bf24b9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-modernizr.txt +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-mustache.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-mustache.txt deleted file mode 100644 index 038cbb9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-mustache.txt +++ /dev/null @@ -1,11 +0,0 @@ -The MIT License - -Copyright (c) 2009 Chris Wanstrath (Ruby) -Copyright (c) 2010-2014 Jan Lehnardt (JavaScript) -Copyright (c) 2010-2015 The mustache.js community - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-netlib.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-netlib.txt deleted file mode 100644 index 75783ed..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-netlib.txt +++ /dev/null @@ -1,49 +0,0 @@ -Copyright (c) 2013 Samuel Halliday -Copyright (c) 1992-2011 The University of Tennessee and The University - of Tennessee Research Foundation. All rights - reserved. -Copyright (c) 2000-2011 The University of California Berkeley. All - rights reserved. -Copyright (c) 2006-2011 The University of Colorado Denver. All rights - reserved. - -$COPYRIGHT$ - -Additional copyrights may follow - -$HEADER$ - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -- Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -- Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer listed - in this license in the documentation and/or other materials - provided with the distribution. - -- Neither the name of the copyright holders nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -The copyright holders provide no reassurances that the source code -provided does not infringe any patent, copyright, or any other -intellectual property rights of third parties. The copyright holders -disclaim any liability to any recipient for claims brought against -recipient by any third party for infringement of that parties -intellectual property rights. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-paranamer.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-paranamer.txt deleted file mode 100644 index fca1847..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-paranamer.txt +++ /dev/null @@ -1,28 +0,0 @@ -[ ParaNamer used to be 'Pubic Domain', but since it includes a small piece of ASM it is now the same license as that: BSD ] - - Copyright (c) 2006 Paul Hammant & ThoughtWorks Inc - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. 
Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - 3. Neither the name of the copyright holders nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-pmml-model.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-pmml-model.txt deleted file mode 100644 index 69411d1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-pmml-model.txt +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2009, University of Tartu -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-protobuf.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-protobuf.txt deleted file mode 100644 index b4350ec..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-protobuf.txt +++ /dev/null @@ -1,42 +0,0 @@ -This license applies to all parts of Protocol Buffers except the following: - - - Atomicops support for generic gcc, located in - src/google/protobuf/stubs/atomicops_internals_generic_gcc.h. - This file is copyrighted by Red Hat Inc. - - - Atomicops support for AIX/POWER, located in - src/google/protobuf/stubs/atomicops_internals_aix.h. - This file is copyrighted by Bloomberg Finance LP. - -Copyright 2014, Google Inc. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Code generated by the Protocol Buffer compiler is owned by the owner -of the input file used when generating it. This code is not -standalone and requires a support library to be linked with it. This -support library is itself covered by the above license. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-py4j.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-py4j.txt deleted file mode 100644 index 70af3e6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-py4j.txt +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -- Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. - -- Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -- The name of the author may not be used to endorse or promote products -derived from this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-pyrolite.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-pyrolite.txt deleted file mode 100644 index 9457c7a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-pyrolite.txt +++ /dev/null @@ -1,28 +0,0 @@ - -Pyro - Python Remote Objects -Software License, copyright, and disclaimer - - Pyro is Copyright (c) by Irmen de Jong (irmen@razorvine.net). - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - -This is the "MIT Software License" which is OSI-certified, and GPL-compatible. -See http://www.opensource.org/licenses/mit-license.php - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-reflectasm.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-reflectasm.txt deleted file mode 100644 index 3f6a160..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-reflectasm.txt +++ /dev/null @@ -1,10 +0,0 @@ -Copyright (c) 2008, Nathan Sweet -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Esoteric Software nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-respond.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-respond.txt deleted file mode 100644 index dea4ff9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-respond.txt +++ /dev/null @@ -1,22 +0,0 @@ -Copyright (c) 2012 Scott Jehl - -Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation -files (the "Software"), to deal in the Software without -restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-sbt-launch-lib.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-sbt-launch-lib.txt deleted file mode 100644 index 3b9156b..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-sbt-launch-lib.txt +++ /dev/null @@ -1,26 +0,0 @@ -// Generated from http://www.opensource.org/licenses/bsd-license.php -Copyright (c) 2011, Paul Phillips. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - * Neither the name of the author nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-scala.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-scala.txt deleted file mode 100644 index 4846076..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-scala.txt +++ /dev/null @@ -1,30 +0,0 @@ -Copyright (c) 2002-2013 EPFL -Copyright (c) 2011-2013 Typesafe, Inc. - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -- Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -- Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -- Neither the name of the EPFL nor the names of its contributors may be - used to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-scopt.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-scopt.txt deleted file mode 100644 index e92e9b5..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-scopt.txt +++ /dev/null @@ -1,9 +0,0 @@ -This project is licensed under the MIT license. - -Copyright (c) scopt contributors - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-slf4j.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-slf4j.txt deleted file mode 100644 index 6548cd3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-slf4j.txt +++ /dev/null @@ -1,21 +0,0 @@ -Copyright (c) 2004-2013 QOS.ch - All rights reserved. - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-sorttable.js.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-sorttable.js.txt deleted file mode 100644 index b31a5b2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-sorttable.js.txt +++ /dev/null @@ -1,16 +0,0 @@ -Copyright (c) 1997-2007 Stuart Langridge - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-spire.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-spire.txt deleted file mode 100644 index 40af774..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-spire.txt +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2011-2012 Erik Osheim, Tom Switzer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-vis.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-vis.txt deleted file mode 100644 index 18b7323..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-vis.txt +++ /dev/null @@ -1,22 +0,0 @@ -vis.js -https://github.com/almende/vis - -A dynamic, browser-based visualization library. - -@version 4.16.1 -@date 2016-04-18 - -@license -Copyright (C) 2011-2016 Almende B.V, http://almende.com - -Vis.js is dual licensed under both - -* The Apache 2.0 License - http://www.apache.org/licenses/LICENSE-2.0 - -and - -* The MIT License - http://opensource.org/licenses/MIT - -Vis.js may be distributed under either license. \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-xmlenc.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-xmlenc.txt deleted file mode 100644 index 3a70c9b..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-xmlenc.txt +++ /dev/null @@ -1,27 +0,0 @@ -Copyright 2003-2005, Ernst de Haan -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-zstd-jni.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-zstd-jni.txt deleted file mode 100644 index 32c6bbd..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-zstd-jni.txt +++ /dev/null @@ -1,26 +0,0 @@ -Zstd-jni: JNI bindings to Zstd Library - -Copyright (c) 2015-2016, Luben Karavelov/ All rights reserved. - -BSD License - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, this - list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-zstd.txt b/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-zstd.txt deleted file mode 100644 index a793a80..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/licenses/LICENSE-zstd.txt +++ /dev/null @@ -1,30 +0,0 @@ -BSD License - -For Zstandard software - -Copyright (c) 2016-present, Facebook, Inc. All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name Facebook nor the names of its contributors may be used to - endorse or promote products derived from this software without specific - prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/.coveragerc b/scripts/spark-2.4.3-bin-hadoop2.7/python/.coveragerc deleted file mode 100644 index b3339cd..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/.coveragerc +++ /dev/null @@ -1,21 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -[run] -branch = true -parallel = true -data_file = ${COVERAGE_DIR}/coverage_data/coverage diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/.gitignore b/scripts/spark-2.4.3-bin-hadoop2.7/python/.gitignore deleted file mode 100644 index 52128cf..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*.pyc -docs/_build/ -pyspark.egg-info -build/ -dist/ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/MANIFEST.in b/scripts/spark-2.4.3-bin-hadoop2.7/python/MANIFEST.in deleted file mode 100644 index 40f1fb2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/MANIFEST.in +++ /dev/null @@ -1,24 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -global-exclude *.py[cod] __pycache__ .DS_Store -recursive-include deps/jars *.jar -graft deps/bin -recursive-include deps/data *.data *.txt -recursive-include deps/licenses *.txt -recursive-include deps/examples *.py -recursive-include lib *.zip -include README.md diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/README.md b/scripts/spark-2.4.3-bin-hadoop2.7/python/README.md deleted file mode 100644 index c020d84..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Apache Spark - -Spark is a fast and general cluster computing system for Big Data. It provides -high-level APIs in Scala, Java, Python, and R, and an optimized engine that -supports general computation graphs for data analysis. It also supports a -rich set of higher-level tools including Spark SQL for SQL and DataFrames, -MLlib for machine learning, GraphX for graph processing, -and Spark Streaming for stream processing. - - - -## Online Documentation - -You can find the latest Spark documentation, including a programming -guide, on the [project web page](http://spark.apache.org/documentation.html) - - -## Python Packaging - -This README file only contains basic information related to pip installed PySpark. -This packaging is currently experimental and may change in future versions (although we will do our best to keep compatibility). -Using PySpark requires the Spark JARs, and if you are building this from source please see the builder instructions at -["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html). - -The Python packaging for Spark is not intended to replace all of the other use cases. This Python packaged version of Spark is suitable for interacting with an existing cluster (be it Spark standalone, YARN, or Mesos) - but does not contain the tools required to set up your own standalone Spark cluster. You can download the full version of Spark from the [Apache Spark downloads page](http://spark.apache.org/downloads.html). - - -**NOTE:** If you are using this with a Spark standalone cluster you must ensure that the version (including minor version) matches or you may experience odd errors. - -## Python Requirements - -At its core PySpark depends on Py4J (currently version 0.10.7), but some additional sub-packages have their own extra requirements for some features (including numpy, pandas, and pyarrow). diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/Makefile b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/Makefile deleted file mode 100644 index 1ed1f33..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/Makefile +++ /dev/null @@ -1,204 +0,0 @@ -# Makefile for Sphinx documentation -# - -ifndef SPHINXBUILD -ifndef SPHINXPYTHON -SPHINXBUILD = sphinx-build -endif -endif - -ifdef SPHINXBUILD -# User-friendly check for sphinx-build. -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif -else -# Note that there is an issue with Python version and Sphinx in PySpark documentation generation. -# Please remove this check below when this issue is fixed. See SPARK-24530 for more details. 
-PYTHON_VERSION_CHECK = $(shell $(SPHINXPYTHON) -c 'import sys; print(sys.version_info < (3, 0, 0))') -ifeq ($(PYTHON_VERSION_CHECK), True) -$(error Note that Python 3 is required to generate PySpark documentation correctly for now. Current Python executable was less than Python 3. See SPARK-24530. To force Sphinx to use a specific Python executable, please set SPHINXPYTHON to point to the Python 3 executable.) -endif -# Check if Sphinx is installed. -ifeq ($(shell $(SPHINXPYTHON) -c 'import sphinx' >/dev/null 2>&1; echo $$?), 1) -$(error Python executable '$(SPHINXPYTHON)' did not have Sphinx installed. Make sure you have Sphinx installed, then set the SPHINXPYTHON environment variable to point to the Python executable having Sphinx installed. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif -# Use 'SPHINXPYTHON -msphinx' instead of 'sphinx-build'. See https://github.com/sphinx-doc/sphinx/pull/3523 for more details. -SPHINXBUILD = $(SPHINXPYTHON) -msphinx -endif - -# You can set these variables from the command line. -SPHINXOPTS ?= -PAPER ?= -BUILDDIR ?= _build -# You can set SPHINXBUILD to specify Sphinx build executable or SPHINXPYTHON to specify the Python executable used in Sphinx. -# They follow: -# 1. if SPHINXPYTHON is set, use Python. If SPHINXBUILD is set, use sphinx-build. -# 2. If both are set, SPHINXBUILD has a higher priority over SPHINXPYTHON -# 3. By default, SPHINXBUILD is used as 'sphinx-build'. - -export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.10.7-src.zip) - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext - -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
- -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyspark.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyspark.qhc" - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/pyspark" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyspark" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." 
- -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/_static/pyspark.css b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/_static/pyspark.css deleted file mode 100644 index 41106f2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/_static/pyspark.css +++ /dev/null @@ -1,90 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -body { - background-color: #ffffff; -} - -div.sphinxsidebar { - width: 274px; -} - -div.bodywrapper { - margin: 0 0 0 274px; -} - -div.sphinxsidebar ul { - margin-right: 10px; -} - -div.sphinxsidebar li a { - word-break: break-all; -} - -span.pys-tag { - font-size: 11px; - font-weight: bold; - margin: 0 0 0 2px; - padding: 1px 3px 1px 3px; - -moz-border-radius: 3px; - -webkit-border-radius: 3px; - border-radius: 3px; - text-align: center; - text-decoration: none; -} - -span.pys-tag-experimental { - background-color: rgb(37, 112, 128); - color: rgb(255, 255, 255); -} - -span.pys-tag-deprecated { - background-color: rgb(238, 238, 238); - color: rgb(62, 67, 73); -} - -div.pys-note-experimental { - background-color: rgb(88, 151, 165); - border-color: rgb(59, 115, 127); - color: rgb(255, 255, 255); -} - -div.pys-note-deprecated { -} - -.hasTooltip { - position:relative; -} -.hasTooltip span { - display:none; -} - -.hasTooltip:hover span.tooltip { - display: inline-block; - -moz-border-radius: 2px; - -webkit-border-radius: 2px; - border-radius: 2px; - background-color: rgb(250, 250, 250); - color: rgb(68, 68, 68); - font-weight: normal; - box-shadow: 1px 1px 3px rgb(127, 127, 127); - position: absolute; - padding: 0 3px 0 3px; - top: 1.3em; - left: 14px; - z-index: 9999 -} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/_static/pyspark.js b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/_static/pyspark.js deleted file mode 100644 index 75e4c42..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/_static/pyspark.js +++ /dev/null @@ -1,99 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. 
See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -$(function (){ - - function startsWith(s, prefix) { - return s && s.indexOf(prefix) === 0; - } - - function buildSidebarLinkMap() { - var linkMap = {}; - $('div.sphinxsidebar a.reference.internal').each(function (i,a) { - var href = $(a).attr('href'); - if (startsWith(href, '#module-')) { - var id = href.substr(8); - linkMap[id] = [$(a), null]; - } - }) - return linkMap; - }; - - function getAdNoteDivs(dd) { - var noteDivs = {}; - dd.find('> div.admonition.note > p.last').each(function (i, p) { - var text = $(p).text(); - if (!noteDivs.experimental && startsWith(text, 'Experimental')) { - noteDivs.experimental = $(p).parent(); - } - if (!noteDivs.deprecated && startsWith(text, 'Deprecated')) { - noteDivs.deprecated = $(p).parent(); - } - }); - return noteDivs; - } - - function getParentId(name) { - var last_idx = name.lastIndexOf('.'); - return last_idx == -1? '': name.substr(0, last_idx); - } - - function buildTag(text, cls, tooltip) { - return '' + text + '' - + tooltip + '' - } - - - var sidebarLinkMap = buildSidebarLinkMap(); - - $('dl.class, dl.function').each(function (i,dl) { - - dl = $(dl); - dt = dl.children('dt').eq(0); - dd = dl.children('dd').eq(0); - var id = dt.attr('id'); - var desc = dt.find('> .descname').text(); - var adNoteDivs = getAdNoteDivs(dd); - - if (id) { - var parent_id = getParentId(id); - - var r = sidebarLinkMap[parent_id]; - if (r) { - if (r[1] === null) { - r[1] = $('
      '); - r[0].parent().append(r[1]); - } - var tags = ''; - if (adNoteDivs.experimental) { - tags += buildTag('E', 'pys-tag-experimental', 'Experimental'); - adNoteDivs.experimental.addClass('pys-note pys-note-experimental'); - } - if (adNoteDivs.deprecated) { - tags += buildTag('D', 'pys-tag-deprecated', 'Deprecated'); - adNoteDivs.deprecated.addClass('pys-note pys-note-deprecated'); - } - var li = $('
    • '); - var a = $('' + desc + ''); - li.append(a); - li.append(tags); - r[1].append(li); - sidebarLinkMap[id] = [a, null]; - } - } - }); -}); diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/_templates/layout.html b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/_templates/layout.html deleted file mode 100644 index ab36eba..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/_templates/layout.html +++ /dev/null @@ -1,6 +0,0 @@ -{% extends "!layout.html" %} -{% set script_files = script_files + ["_static/pyspark.js"] %} -{% set css_files = css_files + ['_static/pyspark.css'] %} -{% block rootrellink %} - {{ super() }} -{% endblock %} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/conf.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/conf.py deleted file mode 100644 index 50fb317..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/conf.py +++ /dev/null @@ -1,340 +0,0 @@ -# -*- coding: utf-8 -*- -# -# pyspark documentation build configuration file, created by -# sphinx-quickstart on Thu Aug 28 15:17:47 2014. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -needs_sphinx = '1.2' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.viewcode', - 'epytext', - 'sphinx.ext.mathjax', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'PySpark' -copyright = u'' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = 'master' -# The full version, including alpha/beta/rc tags. -release = os.environ.get('RELEASE_VERSION', version) - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. 
-#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - -# -- Options for autodoc -------------------------------------------------- - -# Look at the first line of the docstring for function and method signatures. -autodoc_docstring_signature = True - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'nature' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -html_logo = "../../docs/img/spark-logo-hd.png" - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -html_domain_indices = False - -# If false, no index is generated. -html_use_index = False - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. 
The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'pysparkdoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - ('index', 'pyspark.tex', u'pyspark Documentation', - u'Author', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'pyspark', u'pyspark Documentation', - [u'Author'], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'pyspark', u'pyspark Documentation', - u'Author', 'pyspark', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False - - -# -- Options for Epub output ---------------------------------------------- - -# Bibliographic Dublin Core info. -epub_title = u'pyspark' -epub_author = u'Author' -epub_publisher = u'Author' -epub_copyright = u'2014, Author' - -# The basename for the epub file. It defaults to the project name. -#epub_basename = u'pyspark' - -# The HTML theme for the epub output. Since the default themes are not optimized -# for small screen space, using the same theme for HTML and epub output is -# usually not wise. This defaults to 'epub', a theme designed to save visual -# space. -#epub_theme = 'epub' - -# The language of the text. It defaults to the language option -# or en if the language is not set. -#epub_language = '' - -# The scheme of the identifier. Typical schemes are ISBN or URL. -#epub_scheme = '' - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -#epub_identifier = '' - -# A unique identification for the text. 
-#epub_uid = '' - -# A tuple containing the cover image and cover page html template filenames. -#epub_cover = () - -# A sequence of (type, uri, title) tuples for the guide element of content.opf. -#epub_guide = () - -# HTML files that should be inserted before the pages created by sphinx. -# The format is a list of tuples containing the path and title. -#epub_pre_files = [] - -# HTML files shat should be inserted after the pages created by sphinx. -# The format is a list of tuples containing the path and title. -#epub_post_files = [] - -# A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] - -# The depth of the table of contents in toc.ncx. -#epub_tocdepth = 3 - -# Allow duplicate toc entries. -#epub_tocdup = True - -# Choose between 'default' and 'includehidden'. -#epub_tocscope = 'default' - -# Fix unsupported image types using the PIL. -#epub_fix_images = False - -# Scale large images. -#epub_max_image_width = 0 - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#epub_show_urls = 'inline' - -# If false, no index is generated. -#epub_use_index = True - -# Skip sample endpoint link (not expected to resolve) -linkcheck_ignore = [r'https://kinesis.us-east-1.amazonaws.com'] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/epytext.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/epytext.py deleted file mode 100644 index 4bbbf65..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/epytext.py +++ /dev/null @@ -1,30 +0,0 @@ -import re - -RULES = ( - (r"<(!BLANKLINE)[\w.]+>", r""), - (r"L{([\w.()]+)}", r":class:`\1`"), - (r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"), - (r"C{([\w.()]+)}", r":class:`\1`"), - (r"[IBCM]{([^}]+)}", r"`\1`"), - ('pyspark.rdd.RDD', 'RDD'), -) - - -def _convert_epytext(line): - """ - >>> _convert_epytext("L{A}") - :class:`A` - """ - line = line.replace('@', ':') - for p, sub in RULES: - line = re.sub(p, sub, line) - return line - - -def _process_docstring(app, what, name, obj, options, lines): - for i in range(len(lines)): - lines[i] = _convert_epytext(lines[i]) - - -def setup(app): - app.connect("autodoc-process-docstring", _process_docstring) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/index.rst b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/index.rst deleted file mode 100644 index 0e7b623..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/index.rst +++ /dev/null @@ -1,52 +0,0 @@ -.. pyspark documentation master file, created by - sphinx-quickstart on Thu Aug 28 15:17:47 2014. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to Spark Python API Docs! -=================================== - -Contents: - -.. toctree:: - :maxdepth: 2 - - pyspark - pyspark.sql - pyspark.streaming - pyspark.ml - pyspark.mllib - - -Core classes: ---------------- - - :class:`pyspark.SparkContext` - - Main entry point for Spark functionality. - - :class:`pyspark.RDD` - - A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. - - :class:`pyspark.streaming.StreamingContext` - - Main entry point for Spark Streaming functionality. - - :class:`pyspark.streaming.DStream` - - A Discretized Stream (DStream), the basic abstraction in Spark Streaming. - - :class:`pyspark.sql.SparkSession` - - Main entry point for DataFrame and SQL functionality. - - :class:`pyspark.sql.DataFrame` - - A distributed collection of data grouped into named columns. 
- - -Indices and tables -================== - -* :ref:`search` diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/make.bat b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/make.bat deleted file mode 100644 index c011e82..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/make.bat +++ /dev/null @@ -1,6 +0,0 @@ -@ECHO OFF - -rem This is the entry point for running Sphinx documentation. To avoid polluting the -rem environment, it just launches a new cmd to do the real work. - -cmd /V /E /C %~dp0make2.bat %* diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/make2.bat b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/make2.bat deleted file mode 100644 index 7bcaeaf..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/make2.bat +++ /dev/null @@ -1,243 +0,0 @@ -@ECHO OFF - -REM Command file for Sphinx documentation - - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set BUILDDIR=_build -set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . -set I18NSPHINXOPTS=%SPHINXOPTS% . -if NOT "%PAPER%" == "" ( - set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% - set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% -) - -if "%1" == "" goto help - -if "%1" == "help" ( - :help - echo.Please use `make ^` where ^ is one of - echo. html to make standalone HTML files - echo. dirhtml to make HTML files named index.html in directories - echo. singlehtml to make a single large HTML file - echo. pickle to make pickle files - echo. json to make JSON files - echo. htmlhelp to make HTML files and a HTML help project - echo. qthelp to make HTML files and a qthelp project - echo. devhelp to make HTML files and a Devhelp project - echo. epub to make an epub - echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter - echo. text to make text files - echo. man to make manual pages - echo. texinfo to make Texinfo files - echo. gettext to make PO message catalogs - echo. changes to make an overview over all changed/added/deprecated items - echo. xml to make Docutils-native XML files - echo. pseudoxml to make pseudoxml-XML files for display purposes - echo. linkcheck to check all external links for integrity - echo. doctest to run all doctests embedded in the documentation if enabled - goto end -) - -if "%1" == "clean" ( - for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i - del /q /s %BUILDDIR%\* - goto end -) - - -%SPHINXBUILD% 2> nul -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "html" ( - %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/html. - goto end -) - -if "%1" == "dirhtml" ( - %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. - goto end -) - -if "%1" == "singlehtml" ( - %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 
- goto end -) - -if "%1" == "pickle" ( - %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the pickle files. - goto end -) - -if "%1" == "json" ( - %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the JSON files. - goto end -) - -if "%1" == "htmlhelp" ( - %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run HTML Help Workshop with the ^ -.hhp project file in %BUILDDIR%/htmlhelp. - goto end -) - -if "%1" == "qthelp" ( - %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run "qcollectiongenerator" with the ^ -.qhcp project file in %BUILDDIR%/qthelp, like this: - echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pyspark.qhcp - echo.To view the help file: - echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pyspark.ghc - goto end -) - -if "%1" == "devhelp" ( - %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. - goto end -) - -if "%1" == "epub" ( - %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The epub file is in %BUILDDIR%/epub. - goto end -) - -if "%1" == "latex" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "latexpdf" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf - cd %BUILDDIR%/.. - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "latexpdfja" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf-ja - cd %BUILDDIR%/.. - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "text" ( - %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The text files are in %BUILDDIR%/text. - goto end -) - -if "%1" == "man" ( - %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The manual pages are in %BUILDDIR%/man. - goto end -) - -if "%1" == "texinfo" ( - %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. - goto end -) - -if "%1" == "gettext" ( - %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The message catalogs are in %BUILDDIR%/locale. - goto end -) - -if "%1" == "changes" ( - %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes - if errorlevel 1 exit /b 1 - echo. - echo.The overview file is in %BUILDDIR%/changes. - goto end -) - -if "%1" == "linkcheck" ( - %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck - if errorlevel 1 exit /b 1 - echo. - echo.Link check complete; look for any errors in the above output ^ -or in %BUILDDIR%/linkcheck/output.txt. - goto end -) - -if "%1" == "doctest" ( - %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest - if errorlevel 1 exit /b 1 - echo. - echo.Testing of doctests in the sources finished, look at the ^ -results in %BUILDDIR%/doctest/output.txt. 
- goto end -) - -if "%1" == "xml" ( - %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The XML files are in %BUILDDIR%/xml. - goto end -) - -if "%1" == "pseudoxml" ( - %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. - goto end -) - -:end diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.ml.rst b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.ml.rst deleted file mode 100644 index 6a5d817..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.ml.rst +++ /dev/null @@ -1,114 +0,0 @@ -pyspark.ml package -================== - -ML Pipeline APIs ----------------- - -.. automodule:: pyspark.ml - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.param module ------------------------ - -.. automodule:: pyspark.ml.param - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.feature module -------------------------- - -.. automodule:: pyspark.ml.feature - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.classification module --------------------------------- - -.. automodule:: pyspark.ml.classification - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.clustering module ----------------------------- - -.. automodule:: pyspark.ml.clustering - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.linalg module ----------------------------- - -.. automodule:: pyspark.ml.linalg - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.recommendation module --------------------------------- - -.. automodule:: pyspark.ml.recommendation - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.regression module ----------------------------- - -.. automodule:: pyspark.ml.regression - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.stat module ----------------------- - -.. automodule:: pyspark.ml.stat - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.tuning module ------------------------- - -.. automodule:: pyspark.ml.tuning - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.evaluation module ----------------------------- - -.. automodule:: pyspark.ml.evaluation - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.fpm module ----------------------------- - -.. automodule:: pyspark.ml.fpm - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.image module ----------------------------- - -.. automodule:: pyspark.ml.image - :members: - :undoc-members: - :inherited-members: - -pyspark.ml.util module ----------------------------- - -.. automodule:: pyspark.ml.util - :members: - :undoc-members: - :inherited-members: diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.mllib.rst b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.mllib.rst deleted file mode 100644 index 2d54ab1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.mllib.rst +++ /dev/null @@ -1,99 +0,0 @@ -pyspark.mllib package -===================== - -pyspark.mllib.classification module ------------------------------------ - -.. automodule:: pyspark.mllib.classification - :members: - :undoc-members: - :inherited-members: - -pyspark.mllib.clustering module -------------------------------- - -.. automodule:: pyspark.mllib.clustering - :members: - :undoc-members: - -pyspark.mllib.evaluation module -------------------------------- - -.. 
automodule:: pyspark.mllib.evaluation - :members: - :undoc-members: - -pyspark.mllib.feature module -------------------------------- - -.. automodule:: pyspark.mllib.feature - :members: - :undoc-members: - :show-inheritance: - -pyspark.mllib.fpm module ------------------------- - -.. automodule:: pyspark.mllib.fpm - :members: - :undoc-members: - -pyspark.mllib.linalg module ---------------------------- - -.. automodule:: pyspark.mllib.linalg - :members: - :undoc-members: - :show-inheritance: - -pyspark.mllib.linalg.distributed module ---------------------------------------- - -.. automodule:: pyspark.mllib.linalg.distributed - :members: - :undoc-members: - :show-inheritance: - -pyspark.mllib.random module ---------------------------- - -.. automodule:: pyspark.mllib.random - :members: - :undoc-members: - -pyspark.mllib.recommendation module ------------------------------------ - -.. automodule:: pyspark.mllib.recommendation - :members: - :undoc-members: - -pyspark.mllib.regression module -------------------------------- - -.. automodule:: pyspark.mllib.regression - :members: - :undoc-members: - :inherited-members: - -pyspark.mllib.stat module -------------------------- - -.. automodule:: pyspark.mllib.stat - :members: - :undoc-members: - -pyspark.mllib.tree module -------------------------- - -.. automodule:: pyspark.mllib.tree - :members: - :undoc-members: - :inherited-members: - -pyspark.mllib.util module -------------------------- - -.. automodule:: pyspark.mllib.util - :members: - :undoc-members: diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.rst b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.rst deleted file mode 100644 index 0df12c4..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.rst +++ /dev/null @@ -1,20 +0,0 @@ -pyspark package -=============== - -Subpackages ------------ - -.. toctree:: - :maxdepth: 1 - - pyspark.sql - pyspark.streaming - pyspark.ml - pyspark.mllib - -Contents --------- - -.. automodule:: pyspark - :members: - :undoc-members: diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.sql.rst b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.sql.rst deleted file mode 100644 index 5c3b7e2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.sql.rst +++ /dev/null @@ -1,30 +0,0 @@ -pyspark.sql module -================== - -Module Context --------------- - -.. automodule:: pyspark.sql - :members: - :undoc-members: - :exclude-members: builder -.. We need `exclude-members` to prevent default description generations - as a workaround for old Sphinx (< 1.6.6). - -pyspark.sql.types module ------------------------- -.. automodule:: pyspark.sql.types - :members: - :undoc-members: - -pyspark.sql.functions module ----------------------------- -.. automodule:: pyspark.sql.functions - :members: - :undoc-members: - -pyspark.sql.streaming module ----------------------------- -.. automodule:: pyspark.sql.streaming - :members: - :undoc-members: diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.streaming.rst b/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.streaming.rst deleted file mode 100644 index 25ceaba..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/docs/pyspark.streaming.rst +++ /dev/null @@ -1,31 +0,0 @@ -pyspark.streaming module -======================== - -Module contents ---------------- - -.. automodule:: pyspark.streaming - :members: - :undoc-members: - :show-inheritance: - -pyspark.streaming.kafka module ------------------------------- -.. 
automodule:: pyspark.streaming.kafka - :members: - :undoc-members: - :show-inheritance: - -pyspark.streaming.kinesis module --------------------------------- -.. automodule:: pyspark.streaming.kinesis - :members: - :undoc-members: - :show-inheritance: - -pyspark.streaming.flume.module ------------------------------- -.. automodule:: pyspark.streaming.flume - :members: - :undoc-members: - :show-inheritance: diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/lib/PY4J_LICENSE.txt b/scripts/spark-2.4.3-bin-hadoop2.7/python/lib/PY4J_LICENSE.txt deleted file mode 100644 index a70279c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/lib/PY4J_LICENSE.txt +++ /dev/null @@ -1,27 +0,0 @@ - -Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -- Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. - -- Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -- The name of the author may not be used to endorse or promote products -derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip b/scripts/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip deleted file mode 100644 index 128e321..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip b/scripts/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip deleted file mode 100644 index 9cf3cd2..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pylintrc b/scripts/spark-2.4.3-bin-hadoop2.7/python/pylintrc deleted file mode 100644 index 6a67577..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pylintrc +++ /dev/null @@ -1,404 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -[MASTER] - -# Specify a configuration file. -#rcfile= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Profiled execution. -profile=no - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=pyspark.heapq3 - -# Pickle collected data for later comparisons. -persistent=yes - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Use multiple processes to speed up Pylint. -jobs=1 - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code -extension-pkg-whitelist= - -# Allow optimization of some AST trees. This will activate a peephole AST -# optimizer, which will apply various small optimizations. For instance, it can -# be used to obtain the result of joining multiple strings with the addition -# operator. Joining a lot of strings can lead to a maximum recursion error in -# Pylint and this flag can prevent that. It has one side effect, the resulting -# AST will be different than the one from reality. -optimize-ast=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time. See also the "--disable" option for examples. -enable= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" - -# These errors are arranged in order of number of warning given in pylint. -# If you would like to improve the code quality of pyspark, remove any of these disabled errors -# run ./dev/lint-python and see if the errors raised by pylint can be fixed. 
- -disable=invalid-name,missing-docstring,protected-access,unused-argument,no-member,unused-wildcard-import,redefined-builtin,too-many-arguments,unused-variable,too-few-public-methods,bad-continuation,duplicate-code,redefined-outer-name,too-many-ancestors,import-error,superfluous-parens,unused-import,line-too-long,no-name-in-module,unnecessary-lambda,import-self,no-self-use,unidiomatic-typecheck,fixme,too-many-locals,cyclic-import,too-many-branches,bare-except,wildcard-import,dangerous-default-value,broad-except,too-many-public-methods,deprecated-lambda,anomalous-backslash-in-string,too-many-lines,reimported,too-many-statements,bad-whitespace,unpacking-non-sequence,too-many-instance-attributes,abstract-method,old-style-class,global-statement,attribute-defined-outside-init,arguments-differ,undefined-all-variable,no-init,useless-else-on-loop,super-init-not-called,notimplemented-raised,too-many-return-statements,pointless-string-statement,global-variable-undefined,bad-classmethod-argument,too-many-format-args,parse-error,no-self-argument,pointless-statement,undefined-variable,undefined-loop-variable - - -[REPORTS] - -# Set the output format. Available formats are text, parseable, colorized, msvs -# (visual studio) and html. You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Put messages in a separate file for each module / package specified on the -# command line instead of printing them on stdout. Reports (if any) will be -# written in a file name "pylint_global.[txt|html]". -files-output=no - -# Tells whether to display a full report or only the messages -reports=no - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Add a comment according to your evaluation note. This is used by the global -# evaluation report (RP0004). -comment=no - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME,XXX,TODO - - -[BASIC] - -# Required attributes for module, separated by a comma -required-attributes= - -# List of builtins function names that should not be used, separated by a comma -bad-functions= - -# Good variable names which should always be accepted, separated by a comma -good-names=i,j,k,ex,Run,_ - -# Bad variable names which should always be refused, separated by a comma -bad-names=baz,toto,tutu,tata - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. 
-name-group= - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# Regular expression matching correct function names -function-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for function names -function-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct variable names -variable-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for variable names -variable-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct constant names -const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Naming hint for constant names -const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Regular expression matching correct attribute names -attr-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for attribute names -attr-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct argument names -argument-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for argument names -argument-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct class attribute names -class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Naming hint for class attribute names -class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Regular expression matching correct inline iteration names -inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ - -# Naming hint for inline iteration names -inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ - -# Regular expression matching correct class names -class-rgx=[A-Z_][a-zA-Z0-9]+$ - -# Naming hint for class names -class-name-hint=[A-Z_][a-zA-Z0-9]+$ - -# Regular expression matching correct module names -module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Naming hint for module names -module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Regular expression matching correct method names -method-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for method names -method-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=__.*__ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - - -[FORMAT] - -# Maximum number of characters on a single line. -max-line-length=100 - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - -# List of optional constructs for which whitespace checking is disabled -no-space-check=trailing-comma,dict-separator - -# Maximum number of lines in a module -max-module-lines=1000 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - - -[SIMILARITIES] - -# Minimum lines number of a similarity. -min-similarity-lines=4 - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - - -[VARIABLES] - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). 
-dummy-variables-rgx=_$|dummy - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging - - -[TYPECHECK] - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis -ignored-modules= - -# List of classes names for which member attributes should not be checked -# (useful for classes with attributes dynamically set). -ignored-classes=SQLObject - -# When zope mode is activated, add a predefined set of Zope acquired attributes -# to generated-members. -zope=no - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E0201 when accessed. Python regular -# expressions are accepted. -generated-members=REQUEST,acl_users,aq_parent - - -[CLASSES] - -# List of interface methods to ignore, separated by a comma. This is used for -# instance to not check methods defines in Zope's Interface base class. -ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__,__new__,setUp - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict,_fields,_replace,_source,_make - - -[IMPORTS] - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=regsub,TERMIOS,Bastion,rexec - -# Create a graph of every (i.e. 
internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - - -[DESIGN] - -# Maximum number of arguments for function / method -max-args=5 - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore -ignored-argument-names=_.* - -# Maximum number of locals for function / method body -max-locals=15 - -# Maximum number of return / yield for function / method body -max-returns=6 - -# Maximum number of branch for function / method body -max-branches=12 - -# Maximum number of statements in function / method body -max-statements=50 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/__init__.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/__init__.py deleted file mode 100644 index ee153af..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/__init__.py +++ /dev/null @@ -1,122 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -PySpark is the Python API for Spark. - -Public classes: - - - :class:`SparkContext`: - Main entry point for Spark functionality. - - :class:`RDD`: - A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. - - :class:`Broadcast`: - A broadcast variable that gets reused across tasks. - - :class:`Accumulator`: - An "add-only" shared variable that tasks can only add values to. - - :class:`SparkConf`: - For configuring Spark. - - :class:`SparkFiles`: - Access files shipped with jobs. - - :class:`StorageLevel`: - Finer-grained cache persistence levels. - - :class:`TaskContext`: - Information about the current running task, available on the workers and experimental. - - :class:`RDDBarrier`: - Wraps an RDD under a barrier stage for barrier execution. - - :class:`BarrierTaskContext`: - A :class:`TaskContext` that provides extra info and tooling for barrier execution. - - :class:`BarrierTaskInfo`: - Information about a barrier task. 
-""" - -from functools import wraps -import types - -from pyspark.conf import SparkConf -from pyspark.context import SparkContext -from pyspark.rdd import RDD, RDDBarrier -from pyspark.files import SparkFiles -from pyspark.storagelevel import StorageLevel -from pyspark.accumulators import Accumulator, AccumulatorParam -from pyspark.broadcast import Broadcast -from pyspark.serializers import MarshalSerializer, PickleSerializer -from pyspark.status import * -from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo -from pyspark.profiler import Profiler, BasicProfiler -from pyspark.version import __version__ -from pyspark._globals import _NoValue - - -def since(version): - """ - A decorator that annotates a function to append the version of Spark the function was added. - """ - import re - indent_p = re.compile(r'\n( +)') - - def deco(f): - indents = indent_p.findall(f.__doc__) - indent = ' ' * (min(len(m) for m in indents) if indents else 0) - f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version) - return f - return deco - - -def copy_func(f, name=None, sinceversion=None, doc=None): - """ - Returns a function with same code, globals, defaults, closure, and - name (or provide a new name). - """ - # See - # http://stackoverflow.com/questions/6527633/how-can-i-make-a-deepcopy-of-a-function-in-python - fn = types.FunctionType(f.__code__, f.__globals__, name or f.__name__, f.__defaults__, - f.__closure__) - # in case f was given attrs (note this dict is a shallow copy): - fn.__dict__.update(f.__dict__) - if doc is not None: - fn.__doc__ = doc - if sinceversion is not None: - fn = since(sinceversion)(fn) - return fn - - -def keyword_only(func): - """ - A decorator that forces keyword arguments in the wrapped method - and saves actual input keyword arguments in `_input_kwargs`. - - .. note:: Should only be used to wrap a method where first arg is `self` - """ - @wraps(func) - def wrapper(self, *args, **kwargs): - if len(args) > 0: - raise TypeError("Method %s forces keyword arguments." % func.__name__) - self._input_kwargs = kwargs - return func(self, **kwargs) - return wrapper - - -# for back compatibility -from pyspark.sql import SQLContext, HiveContext, Row - -__all__ = [ - "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast", - "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer", - "StatusTracker", "SparkJobInfo", "SparkStageInfo", "Profiler", "BasicProfiler", "TaskContext", - "RDDBarrier", "BarrierTaskContext", "BarrierTaskInfo", -] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/_globals.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/_globals.py deleted file mode 100644 index 8e6099d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/_globals.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Module defining global singleton classes. - -This module raises a RuntimeError if an attempt to reload it is made. In that -way the identities of the classes defined here are fixed and will remain so -even if pyspark itself is reloaded. In particular, a function like the following -will still work correctly after pyspark is reloaded: - - def foo(arg=pyspark._NoValue): - if arg is pyspark._NoValue: - ... - -See gh-7844 for a discussion of the reload problem that motivated this module. - -Note that this approach is taken after from NumPy. -""" - -__ALL__ = ['_NoValue'] - - -# Disallow reloading this module so as to preserve the identities of the -# classes defined here. -if '_is_loaded' in globals(): - raise RuntimeError('Reloading pyspark._globals is not allowed') -_is_loaded = True - - -class _NoValueType(object): - """Special keyword value. - - The instance of this class may be used as the default value assigned to a - deprecated keyword in order to check if it has been given a user defined - value. - - This class was copied from NumPy. - """ - __instance = None - - def __new__(cls): - # ensure that only one instance exists - if not cls.__instance: - cls.__instance = super(_NoValueType, cls).__new__(cls) - return cls.__instance - - # needed for python 2 to preserve identity through a pickle - def __reduce__(self): - return (self.__class__, ()) - - def __repr__(self): - return "" - - -_NoValue = _NoValueType() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/accumulators.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/accumulators.py deleted file mode 100644 index 855d8fb..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/accumulators.py +++ /dev/null @@ -1,302 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" ->>> from pyspark.context import SparkContext ->>> sc = SparkContext('local', 'test') ->>> a = sc.accumulator(1) ->>> a.value -1 ->>> a.value = 2 ->>> a.value -2 ->>> a += 5 ->>> a.value -7 - ->>> sc.accumulator(1.0).value -1.0 - ->>> sc.accumulator(1j).value -1j - ->>> rdd = sc.parallelize([1,2,3]) ->>> def f(x): -... global a -... a += x ->>> rdd.foreach(f) ->>> a.value -13 - ->>> b = sc.accumulator(0) ->>> def g(x): -... b.add(x) ->>> rdd.foreach(g) ->>> b.value -6 - ->>> from pyspark.accumulators import AccumulatorParam ->>> class VectorAccumulatorParam(AccumulatorParam): -... def zero(self, value): -... return [0.0] * len(value) -... def addInPlace(self, val1, val2): -... for i in range(len(val1)): -... val1[i] += val2[i] -... return val1 ->>> va = sc.accumulator([1.0, 2.0, 3.0], VectorAccumulatorParam()) ->>> va.value -[1.0, 2.0, 3.0] ->>> def g(x): -... global va -... 
va += [x] * 3 ->>> rdd.foreach(g) ->>> va.value -[7.0, 8.0, 9.0] - ->>> rdd.map(lambda x: a.value).collect() # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): - ... -Py4JJavaError:... - ->>> def h(x): -... global a -... a.value = 7 ->>> rdd.foreach(h) # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): - ... -Py4JJavaError:... - ->>> sc.accumulator([1.0, 2.0, 3.0]) # doctest: +IGNORE_EXCEPTION_DETAIL -Traceback (most recent call last): - ... -TypeError:... -""" - -import sys -import select -import struct -if sys.version < '3': - import SocketServer -else: - import socketserver as SocketServer -import threading -from pyspark.serializers import read_int, PickleSerializer - - -__all__ = ['Accumulator', 'AccumulatorParam'] - - -pickleSer = PickleSerializer() - -# Holds accumulators registered on the current machine, keyed by ID. This is then used to send -# the local accumulator updates back to the driver program at the end of a task. -_accumulatorRegistry = {} - - -def _deserialize_accumulator(aid, zero_value, accum_param): - from pyspark.accumulators import _accumulatorRegistry - # If this certain accumulator was deserialized, don't overwrite it. - if aid in _accumulatorRegistry: - return _accumulatorRegistry[aid] - else: - accum = Accumulator(aid, zero_value, accum_param) - accum._deserialized = True - _accumulatorRegistry[aid] = accum - return accum - - -class Accumulator(object): - - """ - A shared variable that can be accumulated, i.e., has a commutative and associative "add" - operation. Worker tasks on a Spark cluster can add values to an Accumulator with the C{+=} - operator, but only the driver program is allowed to access its value, using C{value}. - Updates from the workers get propagated automatically to the driver program. - - While C{SparkContext} supports accumulators for primitive data types like C{int} and - C{float}, users can also define accumulators for custom types by providing a custom - L{AccumulatorParam} object. Refer to the doctest of this module for an example. - """ - - def __init__(self, aid, value, accum_param): - """Create a new Accumulator with a given initial value and AccumulatorParam object""" - from pyspark.accumulators import _accumulatorRegistry - self.aid = aid - self.accum_param = accum_param - self._value = value - self._deserialized = False - _accumulatorRegistry[aid] = self - - def __reduce__(self): - """Custom serialization; saves the zero value from our AccumulatorParam""" - param = self.accum_param - return (_deserialize_accumulator, (self.aid, param.zero(self._value), param)) - - @property - def value(self): - """Get the accumulator's value; only usable in driver program""" - if self._deserialized: - raise Exception("Accumulator.value cannot be accessed inside tasks") - return self._value - - @value.setter - def value(self, value): - """Sets the accumulator's value; only usable in driver program""" - if self._deserialized: - raise Exception("Accumulator.value cannot be accessed inside tasks") - self._value = value - - def add(self, term): - """Adds a term to this accumulator's value""" - self._value = self.accum_param.addInPlace(self._value, term) - - def __iadd__(self, term): - """The += operator; adds a term to this accumulator's value""" - self.add(term) - return self - - def __str__(self): - return str(self._value) - - def __repr__(self): - return "Accumulator" % (self.aid, self._value) - - -class AccumulatorParam(object): - - """ - Helper object that defines how to accumulate values of a given type. 
- """ - - def zero(self, value): - """ - Provide a "zero value" for the type, compatible in dimensions with the - provided C{value} (e.g., a zero vector) - """ - raise NotImplementedError - - def addInPlace(self, value1, value2): - """ - Add two values of the accumulator's data type, returning a new value; - for efficiency, can also update C{value1} in place and return it. - """ - raise NotImplementedError - - -class AddingAccumulatorParam(AccumulatorParam): - - """ - An AccumulatorParam that uses the + operators to add values. Designed for simple types - such as integers, floats, and lists. Requires the zero value for the underlying type - as a parameter. - """ - - def __init__(self, zero_value): - self.zero_value = zero_value - - def zero(self, value): - return self.zero_value - - def addInPlace(self, value1, value2): - value1 += value2 - return value1 - - -# Singleton accumulator params for some standard types -INT_ACCUMULATOR_PARAM = AddingAccumulatorParam(0) -FLOAT_ACCUMULATOR_PARAM = AddingAccumulatorParam(0.0) -COMPLEX_ACCUMULATOR_PARAM = AddingAccumulatorParam(0.0j) - - -class _UpdateRequestHandler(SocketServer.StreamRequestHandler): - - """ - This handler will keep polling updates from the same socket until the - server is shutdown. - """ - - def handle(self): - from pyspark.accumulators import _accumulatorRegistry - auth_token = self.server.auth_token - - def poll(func): - while not self.server.server_shutdown: - # Poll every 1 second for new data -- don't block in case of shutdown. - r, _, _ = select.select([self.rfile], [], [], 1) - if self.rfile in r: - if func(): - break - - def accum_updates(): - num_updates = read_int(self.rfile) - for _ in range(num_updates): - (aid, update) = pickleSer._read_with_length(self.rfile) - _accumulatorRegistry[aid] += update - # Write a byte in acknowledgement - self.wfile.write(struct.pack("!b", 1)) - return False - - def authenticate_and_accum_updates(): - received_token = self.rfile.read(len(auth_token)) - if isinstance(received_token, bytes): - received_token = received_token.decode("utf-8") - if (received_token == auth_token): - accum_updates() - # we've authenticated, we can break out of the first loop now - return True - else: - raise Exception( - "The value of the provided token to the AccumulatorServer is not correct.") - - if auth_token is not None: - # first we keep polling till we've received the authentication token - poll(authenticate_and_accum_updates) - # now we've authenticated if needed, don't need to check for the token anymore - poll(accum_updates) - - -class AccumulatorServer(SocketServer.TCPServer): - - def __init__(self, server_address, RequestHandlerClass, auth_token): - SocketServer.TCPServer.__init__(self, server_address, RequestHandlerClass) - self.auth_token = auth_token - - """ - A simple TCP server that intercepts shutdown() in order to interrupt - our continuous polling on the handler. 
- """ - server_shutdown = False - - def shutdown(self): - self.server_shutdown = True - SocketServer.TCPServer.shutdown(self) - self.server_close() - - -def _start_update_server(auth_token): - """Start a TCP server to receive accumulator updates in a daemon thread, and returns it""" - server = AccumulatorServer(("localhost", 0), _UpdateRequestHandler, auth_token) - thread = threading.Thread(target=server.serve_forever) - thread.daemon = True - thread.start() - return server - -if __name__ == "__main__": - import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/broadcast.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/broadcast.py deleted file mode 100644 index 29358b5..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/broadcast.py +++ /dev/null @@ -1,204 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import gc -import os -import socket -import sys -from tempfile import NamedTemporaryFile -import threading - -from pyspark.cloudpickle import print_exec -from pyspark.java_gateway import local_connect_and_auth -from pyspark.serializers import ChunkedStream -from pyspark.util import _exception_message - -if sys.version < '3': - import cPickle as pickle -else: - import pickle - unicode = str - -__all__ = ['Broadcast'] - - -# Holds broadcasted data received from Java, keyed by its id. -_broadcastRegistry = {} - - -def _from_id(bid): - from pyspark.broadcast import _broadcastRegistry - if bid not in _broadcastRegistry: - raise Exception("Broadcast variable '%s' not loaded!" % bid) - return _broadcastRegistry[bid] - - -class Broadcast(object): - - """ - A broadcast variable created with L{SparkContext.broadcast()}. - Access its value through C{.value}. - - Examples: - - >>> from pyspark.context import SparkContext - >>> sc = SparkContext('local', 'test') - >>> b = sc.broadcast([1, 2, 3, 4, 5]) - >>> b.value - [1, 2, 3, 4, 5] - >>> sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect() - [1, 2, 3, 4, 5, 1, 2, 3, 4, 5] - >>> b.unpersist() - - >>> large_broadcast = sc.broadcast(range(10000)) - """ - - def __init__(self, sc=None, value=None, pickle_registry=None, path=None, - sock_file=None): - """ - Should not be called directly by users -- use L{SparkContext.broadcast()} - instead. - """ - if sc is not None: - # we're on the driver. 
We want the pickled data to end up in a file (maybe encrypted) - f = NamedTemporaryFile(delete=False, dir=sc._temp_dir) - self._path = f.name - self._sc = sc - self._python_broadcast = sc._jvm.PythonRDD.setupBroadcast(self._path) - if sc._encryption_enabled: - # with encryption, we ask the jvm to do the encryption for us, we send it data - # over a socket - port, auth_secret = self._python_broadcast.setupEncryptionServer() - (encryption_sock_file, _) = local_connect_and_auth(port, auth_secret) - broadcast_out = ChunkedStream(encryption_sock_file, 8192) - else: - # no encryption, we can just write pickled data directly to the file from python - broadcast_out = f - self.dump(value, broadcast_out) - if sc._encryption_enabled: - self._python_broadcast.waitTillDataReceived() - self._jbroadcast = sc._jsc.broadcast(self._python_broadcast) - self._pickle_registry = pickle_registry - else: - # we're on an executor - self._jbroadcast = None - self._sc = None - self._python_broadcast = None - if sock_file is not None: - # the jvm is doing decryption for us. Read the value - # immediately from the sock_file - self._value = self.load(sock_file) - else: - # the jvm just dumps the pickled data in path -- we'll unpickle lazily when - # the value is requested - assert(path is not None) - self._path = path - - def dump(self, value, f): - try: - pickle.dump(value, f, 2) - except pickle.PickleError: - raise - except Exception as e: - msg = "Could not serialize broadcast: %s: %s" \ - % (e.__class__.__name__, _exception_message(e)) - print_exec(sys.stderr) - raise pickle.PicklingError(msg) - f.close() - - def load_from_path(self, path): - with open(path, 'rb', 1 << 20) as f: - return self.load(f) - - def load(self, file): - # "file" could also be a socket - gc.disable() - try: - return pickle.load(file) - finally: - gc.enable() - - @property - def value(self): - """ Return the broadcasted value - """ - if not hasattr(self, "_value") and self._path is not None: - # we only need to decrypt it here when encryption is enabled and - # if its on the driver, since executor decryption is handled already - if self._sc is not None and self._sc._encryption_enabled: - port, auth_secret = self._python_broadcast.setupDecryptionServer() - (decrypted_sock_file, _) = local_connect_and_auth(port, auth_secret) - self._python_broadcast.waitTillBroadcastDataSent() - return self.load(decrypted_sock_file) - else: - self._value = self.load_from_path(self._path) - return self._value - - def unpersist(self, blocking=False): - """ - Delete cached copies of this broadcast on the executors. If the - broadcast is used after this is called, it will need to be - re-sent to each executor. - - :param blocking: Whether to block until unpersisting has completed - """ - if self._jbroadcast is None: - raise Exception("Broadcast can only be unpersisted in driver") - self._jbroadcast.unpersist(blocking) - - def destroy(self): - """ - Destroy all data and metadata related to this broadcast variable. - Use this with caution; once a broadcast variable has been destroyed, - it cannot be used again. This method blocks until destroy has - completed. 
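A short, hedged sketch of the Broadcast lifecycle documented above (create on the driver, read .value on the executors, then unpersist/destroy), again assuming an active SparkContext named sc:

    lookup = {"a": 1, "b": 2, "c": 3}
    bc = sc.broadcast(lookup)                      # shipped to each executor once

    rdd = sc.parallelize(["a", "b", "c", "a"])
    print(rdd.map(lambda k: bc.value[k]).sum())    # 7, workers read via .value

    bc.unpersist()   # drop executor copies; re-sent automatically if used again
    bc.destroy()     # irreversible: releases driver-side data and metadata too
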
- """ - if self._jbroadcast is None: - raise Exception("Broadcast can only be destroyed in driver") - self._jbroadcast.destroy() - os.unlink(self._path) - - def __reduce__(self): - if self._jbroadcast is None: - raise Exception("Broadcast can only be serialized in driver") - self._pickle_registry.add(self) - return _from_id, (self._jbroadcast.id(),) - - -class BroadcastPickleRegistry(threading.local): - """ Thread-local registry for broadcast variables that have been pickled - """ - - def __init__(self): - self.__dict__.setdefault("_registry", set()) - - def __iter__(self): - for bcast in self._registry: - yield bcast - - def add(self, bcast): - self._registry.add(bcast) - - def clear(self): - self._registry.clear() - - -if __name__ == "__main__": - import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py deleted file mode 100644 index 88519d7..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py +++ /dev/null @@ -1,1078 +0,0 @@ -""" -This class is defined to override standard pickle functionality - -The goals of it follow: --Serialize lambdas and nested functions to compiled byte code --Deal with main module correctly --Deal with other non-serializable objects - -It does not include an unpickler, as standard python unpickling suffices. - -This module was extracted from the `cloud` package, developed by `PiCloud, Inc. -`_. - -Copyright (c) 2012, Regents of the University of California. -Copyright (c) 2009 `PiCloud, Inc. `_. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the University of California, Berkeley nor the - names of its contributors may be used to endorse or promote - products derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-""" -from __future__ import print_function - -import dis -from functools import partial -import imp -import io -import itertools -import logging -import opcode -import operator -import pickle -import struct -import sys -import traceback -import types -import weakref - - -if sys.version < '3': - from pickle import Pickler - try: - from cStringIO import StringIO - except ImportError: - from StringIO import StringIO - PY3 = False -else: - types.ClassType = type - from pickle import _Pickler as Pickler - from io import BytesIO as StringIO - PY3 = True - - -def _make_cell_set_template_code(): - """Get the Python compiler to emit LOAD_FAST(arg); STORE_DEREF - - Notes - ----- - In Python 3, we could use an easier function: - - .. code-block:: python - - def f(): - cell = None - - def _stub(value): - nonlocal cell - cell = value - - return _stub - - _cell_set_template_code = f() - - This function is _only_ a LOAD_FAST(arg); STORE_DEREF, but that is - invalid syntax on Python 2. If we use this function we also don't need - to do the weird freevars/cellvars swap below - """ - def inner(value): - lambda: cell # make ``cell`` a closure so that we get a STORE_DEREF - cell = value - - co = inner.__code__ - - # NOTE: we are marking the cell variable as a free variable intentionally - # so that we simulate an inner function instead of the outer function. This - # is what gives us the ``nonlocal`` behavior in a Python 2 compatible way. - if not PY3: - return types.CodeType( - co.co_argcount, - co.co_nlocals, - co.co_stacksize, - co.co_flags, - co.co_code, - co.co_consts, - co.co_names, - co.co_varnames, - co.co_filename, - co.co_name, - co.co_firstlineno, - co.co_lnotab, - co.co_cellvars, # this is the trickery - (), - ) - else: - return types.CodeType( - co.co_argcount, - co.co_kwonlyargcount, - co.co_nlocals, - co.co_stacksize, - co.co_flags, - co.co_code, - co.co_consts, - co.co_names, - co.co_varnames, - co.co_filename, - co.co_name, - co.co_firstlineno, - co.co_lnotab, - co.co_cellvars, # this is the trickery - (), - ) - - -_cell_set_template_code = _make_cell_set_template_code() - - -def cell_set(cell, value): - """Set the value of a closure cell. - """ - return types.FunctionType( - _cell_set_template_code, - {}, - '_cell_set_inner', - (), - (cell,), - )(value) - - -#relevant opcodes -STORE_GLOBAL = opcode.opmap['STORE_GLOBAL'] -DELETE_GLOBAL = opcode.opmap['DELETE_GLOBAL'] -LOAD_GLOBAL = opcode.opmap['LOAD_GLOBAL'] -GLOBAL_OPS = (STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL) -HAVE_ARGUMENT = dis.HAVE_ARGUMENT -EXTENDED_ARG = dis.EXTENDED_ARG - - -def islambda(func): - return getattr(func,'__name__') == '' - - -_BUILTIN_TYPE_NAMES = {} -for k, v in types.__dict__.items(): - if type(v) is type: - _BUILTIN_TYPE_NAMES[v] = k - - -def _builtin_type(name): - return getattr(types, name) - - -def _make__new__factory(type_): - def _factory(): - return type_.__new__ - return _factory - - -# NOTE: These need to be module globals so that they're pickleable as globals. -_get_dict_new = _make__new__factory(dict) -_get_frozenset_new = _make__new__factory(frozenset) -_get_list_new = _make__new__factory(list) -_get_set_new = _make__new__factory(set) -_get_tuple_new = _make__new__factory(tuple) -_get_object_new = _make__new__factory(object) - -# Pre-defined set of builtin_function_or_method instances that can be -# serialized. 
-_BUILTIN_TYPE_CONSTRUCTORS = { - dict.__new__: _get_dict_new, - frozenset.__new__: _get_frozenset_new, - set.__new__: _get_set_new, - list.__new__: _get_list_new, - tuple.__new__: _get_tuple_new, - object.__new__: _get_object_new, -} - - -if sys.version_info < (3, 4): - def _walk_global_ops(code): - """ - Yield (opcode, argument number) tuples for all - global-referencing instructions in *code*. - """ - code = getattr(code, 'co_code', b'') - if not PY3: - code = map(ord, code) - - n = len(code) - i = 0 - extended_arg = 0 - while i < n: - op = code[i] - i += 1 - if op >= HAVE_ARGUMENT: - oparg = code[i] + code[i + 1] * 256 + extended_arg - extended_arg = 0 - i += 2 - if op == EXTENDED_ARG: - extended_arg = oparg * 65536 - if op in GLOBAL_OPS: - yield op, oparg - -else: - def _walk_global_ops(code): - """ - Yield (opcode, argument number) tuples for all - global-referencing instructions in *code*. - """ - for instr in dis.get_instructions(code): - op = instr.opcode - if op in GLOBAL_OPS: - yield op, instr.arg - - -class CloudPickler(Pickler): - - dispatch = Pickler.dispatch.copy() - - def __init__(self, file, protocol=None): - Pickler.__init__(self, file, protocol) - # set of modules to unpickle - self.modules = set() - # map ids to dictionary. used to ensure that functions can share global env - self.globals_ref = {} - - def dump(self, obj): - self.inject_addons() - try: - return Pickler.dump(self, obj) - except RuntimeError as e: - if 'recursion' in e.args[0]: - msg = """Could not pickle object as excessively deep recursion required.""" - raise pickle.PicklingError(msg) - else: - raise - - def save_memoryview(self, obj): - self.save(obj.tobytes()) - dispatch[memoryview] = save_memoryview - - if not PY3: - def save_buffer(self, obj): - self.save(str(obj)) - dispatch[buffer] = save_buffer # noqa: F821 'buffer' was removed in Python 3 - - def save_unsupported(self, obj): - raise pickle.PicklingError("Cannot pickle objects of type %s" % type(obj)) - dispatch[types.GeneratorType] = save_unsupported - - # itertools objects do not pickle! - for v in itertools.__dict__.values(): - if type(v) is type: - dispatch[v] = save_unsupported - - def save_module(self, obj): - """ - Save a module as an import - """ - mod_name = obj.__name__ - # If module is successfully found then it is not a dynamically created module - if hasattr(obj, '__file__'): - is_dynamic = False - else: - try: - _find_module(mod_name) - is_dynamic = False - except ImportError: - is_dynamic = True - - self.modules.add(obj) - if is_dynamic: - self.save_reduce(dynamic_subimport, (obj.__name__, vars(obj)), obj=obj) - else: - self.save_reduce(subimport, (obj.__name__,), obj=obj) - dispatch[types.ModuleType] = save_module - - def save_codeobject(self, obj): - """ - Save a code object - """ - if PY3: - args = ( - obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize, - obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, obj.co_varnames, - obj.co_filename, obj.co_name, obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, - obj.co_cellvars - ) - else: - args = ( - obj.co_argcount, obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code, - obj.co_consts, obj.co_names, obj.co_varnames, obj.co_filename, obj.co_name, - obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, obj.co_cellvars - ) - self.save_reduce(types.CodeType, args, obj=obj) - dispatch[types.CodeType] = save_codeobject - - def save_function(self, obj, name=None): - """ Registered with the dispatch to handle all function types. 
- - Determines what kind of function obj is (e.g. lambda, defined at - interactive prompt, etc) and handles the pickling appropriately. - """ - try: - should_special_case = obj in _BUILTIN_TYPE_CONSTRUCTORS - except TypeError: - # Methods of builtin types aren't hashable in python 2. - should_special_case = False - - if should_special_case: - # We keep a special-cased cache of built-in type constructors at - # global scope, because these functions are structured very - # differently in different python versions and implementations (for - # example, they're instances of types.BuiltinFunctionType in - # CPython, but they're ordinary types.FunctionType instances in - # PyPy). - # - # If the function we've received is in that cache, we just - # serialize it as a lookup into the cache. - return self.save_reduce(_BUILTIN_TYPE_CONSTRUCTORS[obj], (), obj=obj) - - write = self.write - - if name is None: - name = obj.__name__ - try: - # whichmodule() could fail, see - # https://bitbucket.org/gutworth/six/issues/63/importing-six-breaks-pickling - modname = pickle.whichmodule(obj, name) - except Exception: - modname = None - # print('which gives %s %s %s' % (modname, obj, name)) - try: - themodule = sys.modules[modname] - except KeyError: - # eval'd items such as namedtuple give invalid items for their function __module__ - modname = '__main__' - - if modname == '__main__': - themodule = None - - if themodule: - self.modules.add(themodule) - if getattr(themodule, name, None) is obj: - return self.save_global(obj, name) - - # a builtin_function_or_method which comes in as an attribute of some - # object (e.g., itertools.chain.from_iterable) will end - # up with modname "__main__" and so end up here. But these functions - # have no __code__ attribute in CPython, so the handling for - # user-defined functions below will fail. - # So we pickle them here using save_reduce; have to do it differently - # for different python versions. - if not hasattr(obj, '__code__'): - if PY3: - rv = obj.__reduce_ex__(self.proto) - else: - if hasattr(obj, '__self__'): - rv = (getattr, (obj.__self__, name)) - else: - raise pickle.PicklingError("Can't pickle %r" % obj) - return self.save_reduce(obj=obj, *rv) - - # if func is lambda, def'ed at prompt, is in main, or is nested, then - # we'll pickle the actual function object rather than simply saving a - # reference (as is done in default pickler), via save_function_tuple. - if (islambda(obj) - or getattr(obj.__code__, 'co_filename', None) == '' - or themodule is None): - self.save_function_tuple(obj) - return - else: - # func is nested - klass = getattr(themodule, name, None) - if klass is None or klass is not obj: - self.save_function_tuple(obj) - return - - if obj.__dict__: - # essentially save_reduce, but workaround needed to avoid recursion - self.save(_restore_attr) - write(pickle.MARK + pickle.GLOBAL + modname + '\n' + name + '\n') - self.memoize(obj) - self.save(obj.__dict__) - write(pickle.TUPLE + pickle.REDUCE) - else: - write(pickle.GLOBAL + modname + '\n' + name + '\n') - self.memoize(obj) - dispatch[types.FunctionType] = save_function - - def _save_subimports(self, code, top_level_dependencies): - """ - Ensure de-pickler imports any package child-modules that - are needed by the function - """ - # check if any known dependency is an imported package - for x in top_level_dependencies: - if isinstance(x, types.ModuleType) and hasattr(x, '__package__') and x.__package__: - # check if the package has any currently loaded sub-imports - prefix = x.__name__ + '.' 
- for name, module in sys.modules.items(): - # Older versions of pytest will add a "None" module to sys.modules. - if name is not None and name.startswith(prefix): - # check whether the function can address the sub-module - tokens = set(name[len(prefix):].split('.')) - if not tokens - set(code.co_names): - # ensure unpickler executes this import - self.save(module) - # then discards the reference to it - self.write(pickle.POP) - - def save_dynamic_class(self, obj): - """ - Save a class that can't be stored as module global. - - This method is used to serialize classes that are defined inside - functions, or that otherwise can't be serialized as attribute lookups - from global modules. - """ - clsdict = dict(obj.__dict__) # copy dict proxy to a dict - clsdict.pop('__weakref__', None) - - # On PyPy, __doc__ is a readonly attribute, so we need to include it in - # the initial skeleton class. This is safe because we know that the - # doc can't participate in a cycle with the original class. - type_kwargs = {'__doc__': clsdict.pop('__doc__', None)} - - # If type overrides __dict__ as a property, include it in the type kwargs. - # In Python 2, we can't set this attribute after construction. - __dict__ = clsdict.pop('__dict__', None) - if isinstance(__dict__, property): - type_kwargs['__dict__'] = __dict__ - - save = self.save - write = self.write - - # We write pickle instructions explicitly here to handle the - # possibility that the type object participates in a cycle with its own - # __dict__. We first write an empty "skeleton" version of the class and - # memoize it before writing the class' __dict__ itself. We then write - # instructions to "rehydrate" the skeleton class by restoring the - # attributes from the __dict__. - # - # A type can appear in a cycle with its __dict__ if an instance of the - # type appears in the type's __dict__ (which happens for the stdlib - # Enum class), or if the type defines methods that close over the name - # of the type, (which is common for Python 2-style super() calls). - - # Push the rehydration function. - save(_rehydrate_skeleton_class) - - # Mark the start of the args tuple for the rehydration function. - write(pickle.MARK) - - # Create and memoize an skeleton class with obj's name and bases. - tp = type(obj) - self.save_reduce(tp, (obj.__name__, obj.__bases__, type_kwargs), obj=obj) - - # Now save the rest of obj's __dict__. Any references to obj - # encountered while saving will point to the skeleton class. - save(clsdict) - - # Write a tuple of (skeleton_class, clsdict). - write(pickle.TUPLE) - - # Call _rehydrate_skeleton_class(skeleton_class, clsdict) - write(pickle.REDUCE) - - def save_function_tuple(self, func): - """ Pickles an actual func object. - - A func comprises: code, globals, defaults, closure, and dict. We - extract and save these, injecting reducing functions at certain points - to recreate the func object. Keep in mind that some of these pieces - can contain a ref to the func itself. Thus, a naive save on these - pieces could trigger an infinite loop of save's. To get around that, - we first create a skeleton func object using just the code (this is - safe, since this won't contain a ref to the func), and memoize it as - soon as it's created. The other stuff can then be filled in later. 
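The skeleton-function-then-fill strategy described above is what lets this pickler handle objects the standard library refuses, such as lambdas. A hedged usage sketch, assuming this vendored module is importable as pyspark.cloudpickle and runs under a Python version this Spark 2.4-era copy supports:

    import pickle
    from pyspark import cloudpickle

    square = lambda x: x * x
    try:
        pickle.dumps(square)               # stdlib pickle refuses lambdas
    except Exception as exc:               # PicklingError (or AttributeError for nested defs)
        print("stdlib pickle failed:", exc)

    payload = cloudpickle.dumps(square)    # serialized by value (code object + globals)
    restored = pickle.loads(payload)       # plain pickle is enough to load it back
    print(restored(7))                     # 49
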
- """ - if is_tornado_coroutine(func): - self.save_reduce(_rebuild_tornado_coroutine, (func.__wrapped__,), - obj=func) - return - - save = self.save - write = self.write - - code, f_globals, defaults, closure_values, dct, base_globals = self.extract_func_data(func) - - save(_fill_function) # skeleton function updater - write(pickle.MARK) # beginning of tuple that _fill_function expects - - self._save_subimports( - code, - itertools.chain(f_globals.values(), closure_values or ()), - ) - - # create a skeleton function object and memoize it - save(_make_skel_func) - save(( - code, - len(closure_values) if closure_values is not None else -1, - base_globals, - )) - write(pickle.REDUCE) - self.memoize(func) - - # save the rest of the func data needed by _fill_function - state = { - 'globals': f_globals, - 'defaults': defaults, - 'dict': dct, - 'module': func.__module__, - 'closure_values': closure_values, - } - if hasattr(func, '__qualname__'): - state['qualname'] = func.__qualname__ - save(state) - write(pickle.TUPLE) - write(pickle.REDUCE) # applies _fill_function on the tuple - - _extract_code_globals_cache = ( - weakref.WeakKeyDictionary() - if not hasattr(sys, "pypy_version_info") - else {}) - - @classmethod - def extract_code_globals(cls, co): - """ - Find all globals names read or written to by codeblock co - """ - out_names = cls._extract_code_globals_cache.get(co) - if out_names is None: - try: - names = co.co_names - except AttributeError: - # PyPy "builtin-code" object - out_names = set() - else: - out_names = set(names[oparg] - for op, oparg in _walk_global_ops(co)) - - # see if nested function have any global refs - if co.co_consts: - for const in co.co_consts: - if type(const) is types.CodeType: - out_names |= cls.extract_code_globals(const) - - cls._extract_code_globals_cache[co] = out_names - - return out_names - - def extract_func_data(self, func): - """ - Turn the function into a tuple of data necessary to recreate it: - code, globals, defaults, closure_values, dict - """ - code = func.__code__ - - # extract all global ref's - func_global_refs = self.extract_code_globals(code) - - # process all variables referenced by global environment - f_globals = {} - for var in func_global_refs: - if var in func.__globals__: - f_globals[var] = func.__globals__[var] - - # defaults requires no processing - defaults = func.__defaults__ - - # process closure - closure = ( - list(map(_get_cell_contents, func.__closure__)) - if func.__closure__ is not None - else None - ) - - # save the dict - dct = func.__dict__ - - base_globals = self.globals_ref.get(id(func.__globals__), {}) - self.globals_ref[id(func.__globals__)] = base_globals - - return (code, f_globals, defaults, closure, dct, base_globals) - - def save_builtin_function(self, obj): - if obj.__module__ == "__builtin__": - return self.save_global(obj) - return self.save_function(obj) - dispatch[types.BuiltinFunctionType] = save_builtin_function - - def save_global(self, obj, name=None, pack=struct.pack): - """ - Save a "global". - - The name of this method is somewhat misleading: all types get - dispatched here. 
- """ - if obj.__module__ == "__main__": - return self.save_dynamic_class(obj) - - try: - return Pickler.save_global(self, obj, name=name) - except Exception: - if obj.__module__ == "__builtin__" or obj.__module__ == "builtins": - if obj in _BUILTIN_TYPE_NAMES: - return self.save_reduce( - _builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj) - - typ = type(obj) - if typ is not obj and isinstance(obj, (type, types.ClassType)): - return self.save_dynamic_class(obj) - - raise - - dispatch[type] = save_global - dispatch[types.ClassType] = save_global - - def save_instancemethod(self, obj): - # Memoization rarely is ever useful due to python bounding - if obj.__self__ is None: - self.save_reduce(getattr, (obj.im_class, obj.__name__)) - else: - if PY3: - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj) - else: - self.save_reduce(types.MethodType, (obj.__func__, obj.__self__, obj.__self__.__class__), - obj=obj) - dispatch[types.MethodType] = save_instancemethod - - def save_inst(self, obj): - """Inner logic to save instance. Based off pickle.save_inst""" - cls = obj.__class__ - - # Try the dispatch table (pickle module doesn't do it) - f = self.dispatch.get(cls) - if f: - f(self, obj) # Call unbound method with explicit self - return - - memo = self.memo - write = self.write - save = self.save - - if hasattr(obj, '__getinitargs__'): - args = obj.__getinitargs__() - len(args) # XXX Assert it's a sequence - pickle._keep_alive(args, memo) - else: - args = () - - write(pickle.MARK) - - if self.bin: - save(cls) - for arg in args: - save(arg) - write(pickle.OBJ) - else: - for arg in args: - save(arg) - write(pickle.INST + cls.__module__ + '\n' + cls.__name__ + '\n') - - self.memoize(obj) - - try: - getstate = obj.__getstate__ - except AttributeError: - stuff = obj.__dict__ - else: - stuff = getstate() - pickle._keep_alive(stuff, memo) - save(stuff) - write(pickle.BUILD) - - if not PY3: - dispatch[types.InstanceType] = save_inst - - def save_property(self, obj): - # properties not correctly saved in python - self.save_reduce(property, (obj.fget, obj.fset, obj.fdel, obj.__doc__), obj=obj) - dispatch[property] = save_property - - def save_classmethod(self, obj): - orig_func = obj.__func__ - self.save_reduce(type(obj), (orig_func,), obj=obj) - dispatch[classmethod] = save_classmethod - dispatch[staticmethod] = save_classmethod - - def save_itemgetter(self, obj): - """itemgetter serializer (needed for namedtuple support)""" - class Dummy: - def __getitem__(self, item): - return item - items = obj(Dummy()) - if not isinstance(items, tuple): - items = (items, ) - return self.save_reduce(operator.itemgetter, items) - - if type(operator.itemgetter) is type: - dispatch[operator.itemgetter] = save_itemgetter - - def save_attrgetter(self, obj): - """attrgetter serializer""" - class Dummy(object): - def __init__(self, attrs, index=None): - self.attrs = attrs - self.index = index - def __getattribute__(self, item): - attrs = object.__getattribute__(self, "attrs") - index = object.__getattribute__(self, "index") - if index is None: - index = len(attrs) - attrs.append(item) - else: - attrs[index] = ".".join([attrs[index], item]) - return type(self)(attrs, index) - attrs = [] - obj(Dummy(attrs)) - return self.save_reduce(operator.attrgetter, tuple(attrs)) - - if type(operator.attrgetter) is type: - dispatch[operator.attrgetter] = save_attrgetter - - def save_file(self, obj): - """Save a file""" - try: - import StringIO as pystringIO #we can't use cStringIO as it lacks the name attribute - 
except ImportError: - import io as pystringIO - - if not hasattr(obj, 'name') or not hasattr(obj, 'mode'): - raise pickle.PicklingError("Cannot pickle files that do not map to an actual file") - if obj is sys.stdout: - return self.save_reduce(getattr, (sys,'stdout'), obj=obj) - if obj is sys.stderr: - return self.save_reduce(getattr, (sys,'stderr'), obj=obj) - if obj is sys.stdin: - raise pickle.PicklingError("Cannot pickle standard input") - if obj.closed: - raise pickle.PicklingError("Cannot pickle closed files") - if hasattr(obj, 'isatty') and obj.isatty(): - raise pickle.PicklingError("Cannot pickle files that map to tty objects") - if 'r' not in obj.mode and '+' not in obj.mode: - raise pickle.PicklingError("Cannot pickle files that are not opened for reading: %s" % obj.mode) - - name = obj.name - - retval = pystringIO.StringIO() - - try: - # Read the whole file - curloc = obj.tell() - obj.seek(0) - contents = obj.read() - obj.seek(curloc) - except IOError: - raise pickle.PicklingError("Cannot pickle file %s as it cannot be read" % name) - retval.write(contents) - retval.seek(curloc) - - retval.name = name - self.save(retval) - self.memoize(obj) - - def save_ellipsis(self, obj): - self.save_reduce(_gen_ellipsis, ()) - - def save_not_implemented(self, obj): - self.save_reduce(_gen_not_implemented, ()) - - try: # Python 2 - dispatch[file] = save_file - except NameError: # Python 3 - dispatch[io.TextIOWrapper] = save_file - - dispatch[type(Ellipsis)] = save_ellipsis - dispatch[type(NotImplemented)] = save_not_implemented - - def save_weakset(self, obj): - self.save_reduce(weakref.WeakSet, (list(obj),)) - - dispatch[weakref.WeakSet] = save_weakset - - def save_logger(self, obj): - self.save_reduce(logging.getLogger, (obj.name,), obj=obj) - - dispatch[logging.Logger] = save_logger - - def save_root_logger(self, obj): - self.save_reduce(logging.getLogger, (), obj=obj) - - dispatch[logging.RootLogger] = save_root_logger - - """Special functions for Add-on libraries""" - def inject_addons(self): - """Plug in system. Register additional pickling functions if modules already loaded""" - pass - - -# Tornado support - -def is_tornado_coroutine(func): - """ - Return whether *func* is a Tornado coroutine function. - Running coroutines are not supported. 
- """ - if 'tornado.gen' not in sys.modules: - return False - gen = sys.modules['tornado.gen'] - if not hasattr(gen, "is_coroutine_function"): - # Tornado version is too old - return False - return gen.is_coroutine_function(func) - -def _rebuild_tornado_coroutine(func): - from tornado import gen - return gen.coroutine(func) - - -# Shorthands for legacy support - -def dump(obj, file, protocol=2): - CloudPickler(file, protocol).dump(obj) - - -def dumps(obj, protocol=2): - file = StringIO() - try: - cp = CloudPickler(file,protocol) - cp.dump(obj) - return file.getvalue() - finally: - file.close() - -# including pickles unloading functions in this namespace -load = pickle.load -loads = pickle.loads - - -#hack for __import__ not working as desired -def subimport(name): - __import__(name) - return sys.modules[name] - - -def dynamic_subimport(name, vars): - mod = imp.new_module(name) - mod.__dict__.update(vars) - sys.modules[name] = mod - return mod - -# restores function attributes -def _restore_attr(obj, attr): - for key, val in attr.items(): - setattr(obj, key, val) - return obj - - -def _get_module_builtins(): - return pickle.__builtins__ - - -def print_exec(stream): - ei = sys.exc_info() - traceback.print_exception(ei[0], ei[1], ei[2], None, stream) - - -def _modules_to_main(modList): - """Force every module in modList to be placed into main""" - if not modList: - return - - main = sys.modules['__main__'] - for modname in modList: - if type(modname) is str: - try: - mod = __import__(modname) - except Exception as e: - sys.stderr.write('warning: could not import %s\n. ' - 'Your function may unexpectedly error due to this import failing;' - 'A version mismatch is likely. Specific error was:\n' % modname) - print_exec(sys.stderr) - else: - setattr(main, mod.__name__, mod) - - -#object generators: -def _genpartial(func, args, kwds): - if not args: - args = () - if not kwds: - kwds = {} - return partial(func, *args, **kwds) - -def _gen_ellipsis(): - return Ellipsis - -def _gen_not_implemented(): - return NotImplemented - - -def _get_cell_contents(cell): - try: - return cell.cell_contents - except ValueError: - # sentinel used by ``_fill_function`` which will leave the cell empty - return _empty_cell_value - - -def instance(cls): - """Create a new instance of a class. - - Parameters - ---------- - cls : type - The class to create an instance of. - - Returns - ------- - instance : cls - A new instance of ``cls``. - """ - return cls() - - -@instance -class _empty_cell_value(object): - """sentinel for empty closures - """ - @classmethod - def __reduce__(cls): - return cls.__name__ - - -def _fill_function(*args): - """Fills in the rest of function data into the skeleton function object - - The skeleton itself is create by _make_skel_func(). - """ - if len(args) == 2: - func = args[0] - state = args[1] - elif len(args) == 5: - # Backwards compat for cloudpickle v0.4.0, after which the `module` - # argument was introduced - func = args[0] - keys = ['globals', 'defaults', 'dict', 'closure_values'] - state = dict(zip(keys, args[1:])) - elif len(args) == 6: - # Backwards compat for cloudpickle v0.4.1, after which the function - # state was passed as a dict to the _fill_function it-self. 
- func = args[0] - keys = ['globals', 'defaults', 'dict', 'module', 'closure_values'] - state = dict(zip(keys, args[1:])) - else: - raise ValueError('Unexpected _fill_value arguments: %r' % (args,)) - - func.__globals__.update(state['globals']) - func.__defaults__ = state['defaults'] - func.__dict__ = state['dict'] - if 'module' in state: - func.__module__ = state['module'] - if 'qualname' in state: - func.__qualname__ = state['qualname'] - - cells = func.__closure__ - if cells is not None: - for cell, value in zip(cells, state['closure_values']): - if value is not _empty_cell_value: - cell_set(cell, value) - - return func - - -def _make_empty_cell(): - if False: - # trick the compiler into creating an empty cell in our lambda - cell = None - raise AssertionError('this route should not be executed') - - return (lambda: cell).__closure__[0] - - -def _make_skel_func(code, cell_count, base_globals=None): - """ Creates a skeleton function object that contains just the provided - code and the correct number of cells in func_closure. All other - func attributes (e.g. func_globals) are empty. - """ - if base_globals is None: - base_globals = {} - base_globals['__builtins__'] = __builtins__ - - closure = ( - tuple(_make_empty_cell() for _ in range(cell_count)) - if cell_count >= 0 else - None - ) - return types.FunctionType(code, base_globals, None, None, closure) - - -def _rehydrate_skeleton_class(skeleton_class, class_dict): - """Put attributes from `class_dict` back on `skeleton_class`. - - See CloudPickler.save_dynamic_class for more info. - """ - for attrname, attr in class_dict.items(): - setattr(skeleton_class, attrname, attr) - return skeleton_class - - -def _find_module(mod_name): - """ - Iterate over each part instead of calling imp.find_module directly. - This function is able to find submodules (e.g. sickit.tree) - """ - path = None - for part in mod_name.split('.'): - if path is not None: - path = [path] - file, path, description = imp.find_module(part, path) - if file is not None: - file.close() - return path, description - -"""Constructors for 3rd party libraries -Note: These can never be renamed due to client compatibility issues""" - -def _getobject(modname, attribute): - mod = __import__(modname, fromlist=[attribute]) - return mod.__dict__[attribute] - - -""" Use copy_reg to extend global pickle definitions """ - -if sys.version_info < (3, 4): - method_descriptor = type(str.upper) - - def _reduce_method_descriptor(obj): - return (getattr, (obj.__objclass__, obj.__name__)) - - try: - import copy_reg as copyreg - except ImportError: - import copyreg - copyreg.pickle(method_descriptor, _reduce_method_descriptor) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/conf.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/conf.py deleted file mode 100644 index ab429d9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/conf.py +++ /dev/null @@ -1,224 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" ->>> from pyspark.conf import SparkConf ->>> from pyspark.context import SparkContext ->>> conf = SparkConf() ->>> conf.setMaster("local").setAppName("My app") - ->>> conf.get("spark.master") -u'local' ->>> conf.get("spark.app.name") -u'My app' ->>> sc = SparkContext(conf=conf) ->>> sc.master -u'local' ->>> sc.appName -u'My app' ->>> sc.sparkHome is None -True - ->>> conf = SparkConf(loadDefaults=False) ->>> conf.setSparkHome("/path") - ->>> conf.get("spark.home") -u'/path' ->>> conf.setExecutorEnv("VAR1", "value1") - ->>> conf.setExecutorEnv(pairs = [("VAR3", "value3"), ("VAR4", "value4")]) - ->>> conf.get("spark.executorEnv.VAR1") -u'value1' ->>> print(conf.toDebugString()) -spark.executorEnv.VAR1=value1 -spark.executorEnv.VAR3=value3 -spark.executorEnv.VAR4=value4 -spark.home=/path ->>> sorted(conf.getAll(), key=lambda p: p[0]) -[(u'spark.executorEnv.VAR1', u'value1'), (u'spark.executorEnv.VAR3', u'value3'), \ -(u'spark.executorEnv.VAR4', u'value4'), (u'spark.home', u'/path')] ->>> conf._jconf.setExecutorEnv("VAR5", "value5") -JavaObject id... ->>> print(conf.toDebugString()) -spark.executorEnv.VAR1=value1 -spark.executorEnv.VAR3=value3 -spark.executorEnv.VAR4=value4 -spark.executorEnv.VAR5=value5 -spark.home=/path -""" - -__all__ = ['SparkConf'] - -import sys -import re - -if sys.version > '3': - unicode = str - __doc__ = re.sub(r"(\W|^)[uU](['])", r'\1\2', __doc__) - - -class SparkConf(object): - - """ - Configuration for a Spark application. Used to set various Spark - parameters as key-value pairs. - - Most of the time, you would create a SparkConf object with - C{SparkConf()}, which will load values from C{spark.*} Java system - properties as well. In this case, any parameters you set directly on - the C{SparkConf} object take priority over system properties. - - For unit tests, you can also call C{SparkConf(false)} to skip - loading external settings and get the same configuration no matter - what the system properties are. - - All setter methods in this class support chaining. For example, - you can write C{conf.setMaster("local").setAppName("My app")}. - - .. note:: Once a SparkConf object is passed to Spark, it is cloned - and can no longer be modified by the user. - """ - - def __init__(self, loadDefaults=True, _jvm=None, _jconf=None): - """ - Create a new Spark configuration. 
- - :param loadDefaults: whether to load values from Java system - properties (True by default) - :param _jvm: internal parameter used to pass a handle to the - Java VM; does not need to be set by users - :param _jconf: Optionally pass in an existing SparkConf handle - to use its parameters - """ - if _jconf: - self._jconf = _jconf - else: - from pyspark.context import SparkContext - _jvm = _jvm or SparkContext._jvm - - if _jvm is not None: - # JVM is created, so create self._jconf directly through JVM - self._jconf = _jvm.SparkConf(loadDefaults) - self._conf = None - else: - # JVM is not created, so store data in self._conf first - self._jconf = None - self._conf = {} - - def set(self, key, value): - """Set a configuration property.""" - # Try to set self._jconf first if JVM is created, set self._conf if JVM is not created yet. - if self._jconf is not None: - self._jconf.set(key, unicode(value)) - else: - self._conf[key] = unicode(value) - return self - - def setIfMissing(self, key, value): - """Set a configuration property, if not already set.""" - if self.get(key) is None: - self.set(key, value) - return self - - def setMaster(self, value): - """Set master URL to connect to.""" - self.set("spark.master", value) - return self - - def setAppName(self, value): - """Set application name.""" - self.set("spark.app.name", value) - return self - - def setSparkHome(self, value): - """Set path where Spark is installed on worker nodes.""" - self.set("spark.home", value) - return self - - def setExecutorEnv(self, key=None, value=None, pairs=None): - """Set an environment variable to be passed to executors.""" - if (key is not None and pairs is not None) or (key is None and pairs is None): - raise Exception("Either pass one key-value pair or a list of pairs") - elif key is not None: - self.set("spark.executorEnv." + key, value) - elif pairs is not None: - for (k, v) in pairs: - self.set("spark.executorEnv." + k, v) - return self - - def setAll(self, pairs): - """ - Set multiple parameters, passed as a list of key-value pairs. - - :param pairs: list of key-value pairs to set - """ - for (k, v) in pairs: - self.set(k, v) - return self - - def get(self, key, defaultValue=None): - """Get the configured value for some key, or return a default otherwise.""" - if defaultValue is None: # Py4J doesn't call the right get() if we pass None - if self._jconf is not None: - if not self._jconf.contains(key): - return None - return self._jconf.get(key) - else: - if key not in self._conf: - return None - return self._conf[key] - else: - if self._jconf is not None: - return self._jconf.get(key, defaultValue) - else: - return self._conf.get(key, defaultValue) - - def getAll(self): - """Get all values as a list of key-value pairs.""" - if self._jconf is not None: - return [(elem._1(), elem._2()) for elem in self._jconf.getAll()] - else: - return self._conf.items() - - def contains(self, key): - """Does this configuration contain a given key?""" - if self._jconf is not None: - return self._jconf.contains(key) - else: - return key in self._conf - - def toDebugString(self): - """ - Returns a printable version of the configuration, as a list of - key=value pairs, one per line. 
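A small sketch of the setter chaining and lookup behaviour documented above; with loadDefaults=False and no JVM launched yet, the values simply live in the Python-side dict, so no running cluster is needed. The application name and values are placeholders.

    from pyspark.conf import SparkConf

    conf = (SparkConf(loadDefaults=False)
            .setMaster("local[2]")
            .setAppName("conf-demo")
            .set("spark.executor.memory", "1g"))

    conf.setIfMissing("spark.executor.memory", "4g")   # ignored: already set
    conf.setIfMissing("spark.ui.port", "4050")         # added

    print(conf.get("spark.executor.memory"))           # 1g
    print(conf.toDebugString())                        # key=value pairs, one per line
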
- """ - if self._jconf is not None: - return self._jconf.toDebugString() - else: - return '\n'.join('%s=%s' % (k, v) for k, v in self._conf.items()) - - -def _test(): - import doctest - (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS) - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/context.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/context.py deleted file mode 100644 index aff3635..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/context.py +++ /dev/null @@ -1,1110 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import os -import shutil -import signal -import sys -import threading -import warnings -from threading import RLock -from tempfile import NamedTemporaryFile - -from py4j.protocol import Py4JError - -from pyspark import accumulators -from pyspark.accumulators import Accumulator -from pyspark.broadcast import Broadcast, BroadcastPickleRegistry -from pyspark.conf import SparkConf -from pyspark.files import SparkFiles -from pyspark.java_gateway import launch_gateway, local_connect_and_auth -from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer, \ - PairDeserializer, AutoBatchedSerializer, NoOpSerializer, ChunkedStream -from pyspark.storagelevel import StorageLevel -from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix -from pyspark.traceback_utils import CallSite, first_spark_call -from pyspark.status import StatusTracker -from pyspark.profiler import ProfilerCollector, BasicProfiler - -if sys.version > '3': - xrange = range - - -__all__ = ['SparkContext'] - - -# These are special default configs for PySpark, they will overwrite -# the default ones for Spark if they are not configured by user. -DEFAULT_CONFIGS = { - "spark.serializer.objectStreamReset": 100, - "spark.rdd.compress": True, -} - - -class SparkContext(object): - - """ - Main entry point for Spark functionality. A SparkContext represents the - connection to a Spark cluster, and can be used to create L{RDD} and - broadcast variables on that cluster. - - .. note:: :class:`SparkContext` instance is not supported to share across multiple - processes out of the box, and PySpark does not guarantee multi-processing execution. - Use threads instead for concurrent processing purpose. 
- """ - - _gateway = None - _jvm = None - _next_accum_id = 0 - _active_spark_context = None - _lock = RLock() - _python_includes = None # zip and egg files that need to be added to PYTHONPATH - - PACKAGE_EXTENSIONS = ('.zip', '.egg', '.jar') - - def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, - environment=None, batchSize=0, serializer=PickleSerializer(), conf=None, - gateway=None, jsc=None, profiler_cls=BasicProfiler): - """ - Create a new SparkContext. At least the master and app name should be set, - either through the named parameters here or through C{conf}. - - :param master: Cluster URL to connect to - (e.g. mesos://host:port, spark://host:port, local[4]). - :param appName: A name for your job, to display on the cluster web UI. - :param sparkHome: Location where Spark is installed on cluster nodes. - :param pyFiles: Collection of .zip or .py files to send to the cluster - and add to PYTHONPATH. These can be paths on the local file - system or HDFS, HTTP, HTTPS, or FTP URLs. - :param environment: A dictionary of environment variables to set on - worker nodes. - :param batchSize: The number of Python objects represented as a single - Java object. Set 1 to disable batching, 0 to automatically choose - the batch size based on object sizes, or -1 to use an unlimited - batch size - :param serializer: The serializer for RDDs. - :param conf: A L{SparkConf} object setting Spark properties. - :param gateway: Use an existing gateway and JVM, otherwise a new JVM - will be instantiated. - :param jsc: The JavaSparkContext instance (optional). - :param profiler_cls: A class of custom Profiler used to do profiling - (default is pyspark.profiler.BasicProfiler). - - - >>> from pyspark.context import SparkContext - >>> sc = SparkContext('local', 'test') - - >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... - """ - self._callsite = first_spark_call() or CallSite(None, None, None) - if gateway is not None and gateway.gateway_parameters.auth_token is None: - allow_insecure_env = os.environ.get("PYSPARK_ALLOW_INSECURE_GATEWAY", "0") - if allow_insecure_env == "1" or allow_insecure_env.lower() == "true": - warnings.warn( - "You are passing in an insecure Py4j gateway. This " - "presents a security risk, and will be completely forbidden in Spark 3.0") - else: - raise ValueError( - "You are trying to pass an insecure Py4j gateway to Spark. This" - " presents a security risk. If you are sure you understand and accept this" - " risk, you can set the environment variable" - " 'PYSPARK_ALLOW_INSECURE_GATEWAY=1', but" - " note this option will be removed in Spark 3.0") - - SparkContext._ensure_initialized(self, gateway=gateway, conf=conf) - try: - self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer, - conf, jsc, profiler_cls) - except: - # If an error occurs, clean up in order to allow future SparkContext creation: - self.stop() - raise - - def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, - conf, jsc, profiler_cls): - self.environment = environment or {} - # java gateway must have been launched at this point. - if conf is not None and conf._jconf is not None: - # conf has been initialized in JVM properly, so use conf directly. This represents the - # scenario that JVM has been launched before SparkConf is created (e.g. 
SparkContext is - # created and then stopped, and we create a new SparkConf and new SparkContext again) - self._conf = conf - else: - self._conf = SparkConf(_jvm=SparkContext._jvm) - if conf is not None: - for k, v in conf.getAll(): - self._conf.set(k, v) - - self._batchSize = batchSize # -1 represents an unlimited batch size - self._unbatched_serializer = serializer - if batchSize == 0: - self.serializer = AutoBatchedSerializer(self._unbatched_serializer) - else: - self.serializer = BatchedSerializer(self._unbatched_serializer, - batchSize) - - # Set any parameters passed directly to us on the conf - if master: - self._conf.setMaster(master) - if appName: - self._conf.setAppName(appName) - if sparkHome: - self._conf.setSparkHome(sparkHome) - if environment: - for key, value in environment.items(): - self._conf.setExecutorEnv(key, value) - for key, value in DEFAULT_CONFIGS.items(): - self._conf.setIfMissing(key, value) - - # Check that we have at least the required parameters - if not self._conf.contains("spark.master"): - raise Exception("A master URL must be set in your configuration") - if not self._conf.contains("spark.app.name"): - raise Exception("An application name must be set in your configuration") - - # Read back our properties from the conf in case we loaded some of them from - # the classpath or an external config file - self.master = self._conf.get("spark.master") - self.appName = self._conf.get("spark.app.name") - self.sparkHome = self._conf.get("spark.home", None) - - for (k, v) in self._conf.getAll(): - if k.startswith("spark.executorEnv."): - varName = k[len("spark.executorEnv."):] - self.environment[varName] = v - - self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0") - - # Create the Java SparkContext through Py4J - self._jsc = jsc or self._initialize_context(self._conf._jconf) - # Reset the SparkConf to the one actually used by the SparkContext in JVM. - self._conf = SparkConf(_jconf=self._jsc.sc().conf()) - - # Create a single Accumulator in Java that we'll send all our updates through; - # they will be passed back to us through a TCP server - auth_token = self._gateway.gateway_parameters.auth_token - self._accumulatorServer = accumulators._start_update_server(auth_token) - (host, port) = self._accumulatorServer.server_address - self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port, auth_token) - self._jsc.sc().register(self._javaAccumulator) - - # If encryption is enabled, we need to setup a server in the jvm to read broadcast - # data via a socket. - # scala's mangled names w/ $ in them require special treatment. - self._encryption_enabled = self._jvm.PythonUtils.getEncryptionEnabled(self._jsc) - - self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python') - self.pythonVer = "%d.%d" % sys.version_info[:2] - - # Broadcast's __reduce__ method stores Broadcast instances here. - # This allows other code to determine which Broadcast instances have - # been pickled, so it can determine which Java broadcast objects to - # send. 
- self._pickled_broadcast_vars = BroadcastPickleRegistry() - - SparkFiles._sc = self - root_dir = SparkFiles.getRootDirectory() - sys.path.insert(1, root_dir) - - # Deploy any code dependencies specified in the constructor - self._python_includes = list() - for path in (pyFiles or []): - self.addPyFile(path) - - # Deploy code dependencies set by spark-submit; these will already have been added - # with SparkContext.addFile, so we just need to add them to the PYTHONPATH - for path in self._conf.get("spark.submit.pyFiles", "").split(","): - if path != "": - (dirname, filename) = os.path.split(path) - try: - filepath = os.path.join(SparkFiles.getRootDirectory(), filename) - if not os.path.exists(filepath): - # In case of YARN with shell mode, 'spark.submit.pyFiles' files are - # not added via SparkContext.addFile. Here we check if the file exists, - # try to copy and then add it to the path. See SPARK-21945. - shutil.copyfile(path, filepath) - if filename[-4:].lower() in self.PACKAGE_EXTENSIONS: - self._python_includes.append(filename) - sys.path.insert(1, filepath) - except Exception: - warnings.warn( - "Failed to add file [%s] speficied in 'spark.submit.pyFiles' to " - "Python path:\n %s" % (path, "\n ".join(sys.path)), - RuntimeWarning) - - # Create a temporary directory inside spark.local.dir: - local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf()) - self._temp_dir = \ - self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \ - .getAbsolutePath() - - # profiling stats collected for each PythonRDD - if self._conf.get("spark.python.profile", "false") == "true": - dump_path = self._conf.get("spark.python.profile.dump", None) - self.profiler_collector = ProfilerCollector(profiler_cls, dump_path) - else: - self.profiler_collector = None - - # create a signal handler which would be invoked on receiving SIGINT - def signal_handler(signal, frame): - self.cancelAllJobs() - raise KeyboardInterrupt() - - # see http://stackoverflow.com/questions/23206787/ - if isinstance(threading.current_thread(), threading._MainThread): - signal.signal(signal.SIGINT, signal_handler) - - def __repr__(self): - return "".format( - master=self.master, - appName=self.appName, - ) - - def _repr_html_(self): - return """ -
-        <div>
-            <p><b>SparkContext</b></p>
-
-            <p><a href="{sc.uiWebUrl}">Spark UI</a></p>
-
-            <dl>
-              <dt>Version</dt>
-                <dd><code>v{sc.version}</code></dd>
-              <dt>Master</dt>
-                <dd><code>{sc.master}</code></dd>
-              <dt>AppName</dt>
-                <dd><code>{sc.appName}</code></dd>
-            </dl>
-        </div>
      - """.format( - sc=self - ) - - def _initialize_context(self, jconf): - """ - Initialize SparkContext in function to allow subclass specific initialization - """ - return self._jvm.JavaSparkContext(jconf) - - @classmethod - def _ensure_initialized(cls, instance=None, gateway=None, conf=None): - """ - Checks whether a SparkContext is initialized or not. - Throws error if a SparkContext is already running. - """ - with SparkContext._lock: - if not SparkContext._gateway: - SparkContext._gateway = gateway or launch_gateway(conf) - SparkContext._jvm = SparkContext._gateway.jvm - - if instance: - if (SparkContext._active_spark_context and - SparkContext._active_spark_context != instance): - currentMaster = SparkContext._active_spark_context.master - currentAppName = SparkContext._active_spark_context.appName - callsite = SparkContext._active_spark_context._callsite - - # Raise error if there is already a running Spark context - raise ValueError( - "Cannot run multiple SparkContexts at once; " - "existing SparkContext(app=%s, master=%s)" - " created by %s at %s:%s " - % (currentAppName, currentMaster, - callsite.function, callsite.file, callsite.linenum)) - else: - SparkContext._active_spark_context = instance - - def __getnewargs__(self): - # This method is called when attempting to pickle SparkContext, which is always an error: - raise Exception( - "It appears that you are attempting to reference SparkContext from a broadcast " - "variable, action, or transformation. SparkContext can only be used on the driver, " - "not in code that it run on workers. For more information, see SPARK-5063." - ) - - def __enter__(self): - """ - Enable 'with SparkContext(...) as sc: app(sc)' syntax. - """ - return self - - def __exit__(self, type, value, trace): - """ - Enable 'with SparkContext(...) as sc: app' syntax. - - Specifically stop the context on exit of the with block. - """ - self.stop() - - @classmethod - def getOrCreate(cls, conf=None): - """ - Get or instantiate a SparkContext and register it as a singleton object. - - :param conf: SparkConf (optional) - """ - with SparkContext._lock: - if SparkContext._active_spark_context is None: - SparkContext(conf=conf or SparkConf()) - return SparkContext._active_spark_context - - def setLogLevel(self, logLevel): - """ - Control our logLevel. This overrides any user-defined log settings. - Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN - """ - self._jsc.setLogLevel(logLevel) - - @classmethod - def setSystemProperty(cls, key, value): - """ - Set a Java system property, such as spark.executor.memory. This must - must be invoked before instantiating SparkContext. - """ - SparkContext._ensure_initialized() - SparkContext._jvm.java.lang.System.setProperty(key, value) - - @property - def version(self): - """ - The version of Spark on which this application is running. - """ - return self._jsc.version() - - @property - @ignore_unicode_prefix - def applicationId(self): - """ - A unique identifier for the Spark application. - Its format depends on the scheduler implementation. - - * in case of local spark app something like 'local-1433865536131' - * in case of YARN something like 'application_1433865536131_34483' - - >>> sc.applicationId # doctest: +ELLIPSIS - u'local-...' 
- """ - return self._jsc.sc().applicationId() - - @property - def uiWebUrl(self): - """Return the URL of the SparkUI instance started by this SparkContext""" - return self._jsc.sc().uiWebUrl().get() - - @property - def startTime(self): - """Return the epoch time when the Spark Context was started.""" - return self._jsc.startTime() - - @property - def defaultParallelism(self): - """ - Default level of parallelism to use when not given by user (e.g. for - reduce tasks) - """ - return self._jsc.sc().defaultParallelism() - - @property - def defaultMinPartitions(self): - """ - Default min number of partitions for Hadoop RDDs when not given by user - """ - return self._jsc.sc().defaultMinPartitions() - - def stop(self): - """ - Shut down the SparkContext. - """ - if getattr(self, "_jsc", None): - try: - self._jsc.stop() - except Py4JError: - # Case: SPARK-18523 - warnings.warn( - 'Unable to cleanly shutdown Spark JVM process.' - ' It is possible that the process has crashed,' - ' been killed or may also be in a zombie state.', - RuntimeWarning - ) - pass - finally: - self._jsc = None - if getattr(self, "_accumulatorServer", None): - self._accumulatorServer.shutdown() - self._accumulatorServer = None - with SparkContext._lock: - SparkContext._active_spark_context = None - - def emptyRDD(self): - """ - Create an RDD that has no partitions or elements. - """ - return RDD(self._jsc.emptyRDD(), self, NoOpSerializer()) - - def range(self, start, end=None, step=1, numSlices=None): - """ - Create a new RDD of int containing elements from `start` to `end` - (exclusive), increased by `step` every element. Can be called the same - way as python's built-in range() function. If called with a single argument, - the argument is interpreted as `end`, and `start` is set to 0. - - :param start: the start value - :param end: the end value (exclusive) - :param step: the incremental step (default: 1) - :param numSlices: the number of partitions of the new RDD - :return: An RDD of int - - >>> sc.range(5).collect() - [0, 1, 2, 3, 4] - >>> sc.range(2, 4).collect() - [2, 3] - >>> sc.range(1, 7, 2).collect() - [1, 3, 5] - """ - if end is None: - end = start - start = 0 - - return self.parallelize(xrange(start, end, step), numSlices) - - def parallelize(self, c, numSlices=None): - """ - Distribute a local Python collection to form an RDD. Using xrange - is recommended if the input represents a range for performance. 
- - >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect() - [[0], [2], [3], [4], [6]] - >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect() - [[], [0], [], [2], [4]] - """ - numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism - if isinstance(c, xrange): - size = len(c) - if size == 0: - return self.parallelize([], numSlices) - step = c[1] - c[0] if size > 1 else 1 - start0 = c[0] - - def getStart(split): - return start0 + int((split * size / numSlices)) * step - - def f(split, iterator): - return xrange(getStart(split), getStart(split + 1), step) - - return self.parallelize([], numSlices).mapPartitionsWithIndex(f) - - # Make sure we distribute data evenly if it's smaller than self.batchSize - if "__len__" not in dir(c): - c = list(c) # Make it a list so we can compute its length - batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024)) - serializer = BatchedSerializer(self._unbatched_serializer, batchSize) - - def reader_func(temp_filename): - return self._jvm.PythonRDD.readRDDFromFile(self._jsc, temp_filename, numSlices) - - def createRDDServer(): - return self._jvm.PythonParallelizeServer(self._jsc.sc(), numSlices) - - jrdd = self._serialize_to_jvm(c, serializer, reader_func, createRDDServer) - return RDD(jrdd, self, serializer) - - def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer): - """ - Using py4j to send a large dataset to the jvm is really slow, so we use either a file - or a socket if we have encryption enabled. - :param data: - :param serializer: - :param reader_func: A function which takes a filename and reads in the data in the jvm and - returns a JavaRDD. Only used when encryption is disabled. - :param createRDDServer: A function which creates a PythonRDDServer in the jvm to - accept the serialized data, for use when encryption is enabled. - :return: - """ - if self._encryption_enabled: - # with encryption, we open a server in java and send the data directly - server = createRDDServer() - (sock_file, _) = local_connect_and_auth(server.port(), server.secret()) - chunked_out = ChunkedStream(sock_file, 8192) - serializer.dump_stream(data, chunked_out) - chunked_out.close() - # this call will block until the server has read all the data and processed it (or - # throws an exception) - r = server.getResult() - return r - else: - # without encryption, we serialize to a file, and we read the file in java and - # parallelize from there. - tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) - try: - try: - serializer.dump_stream(data, tempFile) - finally: - tempFile.close() - return reader_func(tempFile.name) - finally: - # we eagerily reads the file so we can delete right after. - os.unlink(tempFile.name) - - def pickleFile(self, name, minPartitions=None): - """ - Load an RDD previously saved using L{RDD.saveAsPickleFile} method. - - >>> tmpFile = NamedTemporaryFile(delete=True) - >>> tmpFile.close() - >>> sc.parallelize(range(10)).saveAsPickleFile(tmpFile.name, 5) - >>> sorted(sc.pickleFile(tmpFile.name, 3).collect()) - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - """ - minPartitions = minPartitions or self.defaultMinPartitions - return RDD(self._jsc.objectFile(name, minPartitions), self) - - @ignore_unicode_prefix - def textFile(self, name, minPartitions=None, use_unicode=True): - """ - Read a text file from HDFS, a local file system (available on all - nodes), or any Hadoop-supported file system URI, and return it as an - RDD of Strings. 
- - If use_unicode is False, the strings will be kept as `str` (encoding - as `utf-8`), which is faster and smaller than unicode. (Added in - Spark 1.2) - - >>> path = os.path.join(tempdir, "sample-text.txt") - >>> with open(path, "w") as testFile: - ... _ = testFile.write("Hello world!") - >>> textFile = sc.textFile(path) - >>> textFile.collect() - [u'Hello world!'] - """ - minPartitions = minPartitions or min(self.defaultParallelism, 2) - return RDD(self._jsc.textFile(name, minPartitions), self, - UTF8Deserializer(use_unicode)) - - @ignore_unicode_prefix - def wholeTextFiles(self, path, minPartitions=None, use_unicode=True): - """ - Read a directory of text files from HDFS, a local file system - (available on all nodes), or any Hadoop-supported file system - URI. Each file is read as a single record and returned in a - key-value pair, where the key is the path of each file, the - value is the content of each file. - - If use_unicode is False, the strings will be kept as `str` (encoding - as `utf-8`), which is faster and smaller than unicode. (Added in - Spark 1.2) - - For example, if you have the following files:: - - hdfs://a-hdfs-path/part-00000 - hdfs://a-hdfs-path/part-00001 - ... - hdfs://a-hdfs-path/part-nnnnn - - Do C{rdd = sparkContext.wholeTextFiles("hdfs://a-hdfs-path")}, - then C{rdd} contains:: - - (a-hdfs-path/part-00000, its content) - (a-hdfs-path/part-00001, its content) - ... - (a-hdfs-path/part-nnnnn, its content) - - .. note:: Small files are preferred, as each file will be loaded - fully in memory. - - >>> dirPath = os.path.join(tempdir, "files") - >>> os.mkdir(dirPath) - >>> with open(os.path.join(dirPath, "1.txt"), "w") as file1: - ... _ = file1.write("1") - >>> with open(os.path.join(dirPath, "2.txt"), "w") as file2: - ... _ = file2.write("2") - >>> textFiles = sc.wholeTextFiles(dirPath) - >>> sorted(textFiles.collect()) - [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')] - """ - minPartitions = minPartitions or self.defaultMinPartitions - return RDD(self._jsc.wholeTextFiles(path, minPartitions), self, - PairDeserializer(UTF8Deserializer(use_unicode), UTF8Deserializer(use_unicode))) - - def binaryFiles(self, path, minPartitions=None): - """ - .. note:: Experimental - - Read a directory of binary files from HDFS, a local file system - (available on all nodes), or any Hadoop-supported file system URI - as a byte array. Each file is read as a single record and returned - in a key-value pair, where the key is the path of each file, the - value is the content of each file. - - .. note:: Small files are preferred, large file is also allowable, but - may cause bad performance. - """ - minPartitions = minPartitions or self.defaultMinPartitions - return RDD(self._jsc.binaryFiles(path, minPartitions), self, - PairDeserializer(UTF8Deserializer(), NoOpSerializer())) - - def binaryRecords(self, path, recordLength): - """ - .. note:: Experimental - - Load data from a flat binary file, assuming each record is a set of numbers - with the specified numerical format (see ByteBuffer), and the number of - bytes per record is constant. 
- - :param path: Directory to the input data files - :param recordLength: The length at which to split the records - """ - return RDD(self._jsc.binaryRecords(path, recordLength), self, NoOpSerializer()) - - def _dictToJavaMap(self, d): - jm = self._jvm.java.util.HashMap() - if not d: - d = {} - for k, v in d.items(): - jm[k] = v - return jm - - def sequenceFile(self, path, keyClass=None, valueClass=None, keyConverter=None, - valueConverter=None, minSplits=None, batchSize=0): - """ - Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS, - a local file system (available on all nodes), or any Hadoop-supported file system URI. - The mechanism is as follows: - - 1. A Java RDD is created from the SequenceFile or other InputFormat, and the key - and value Writable classes - 2. Serialization is attempted via Pyrolite pickling - 3. If this fails, the fallback is to call 'toString' on each key and value - 4. C{PickleSerializer} is used to deserialize pickled objects on the Python side - - :param path: path to sequncefile - :param keyClass: fully qualified classname of key Writable class - (e.g. "org.apache.hadoop.io.Text") - :param valueClass: fully qualified classname of value Writable class - (e.g. "org.apache.hadoop.io.LongWritable") - :param keyConverter: - :param valueConverter: - :param minSplits: minimum splits in dataset - (default min(2, sc.defaultParallelism)) - :param batchSize: The number of Python objects represented as a single - Java object. (default 0, choose batchSize automatically) - """ - minSplits = minSplits or min(self.defaultParallelism, 2) - jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, path, keyClass, valueClass, - keyConverter, valueConverter, minSplits, batchSize) - return RDD(jrdd, self) - - def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None, - valueConverter=None, conf=None, batchSize=0): - """ - Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS, - a local file system (available on all nodes), or any Hadoop-supported file system URI. - The mechanism is the same as for sc.sequenceFile. - - A Hadoop configuration can be passed in as a Python dict. This will be converted into a - Configuration in Java - - :param path: path to Hadoop file - :param inputFormatClass: fully qualified classname of Hadoop InputFormat - (e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat") - :param keyClass: fully qualified classname of key Writable class - (e.g. "org.apache.hadoop.io.Text") - :param valueClass: fully qualified classname of value Writable class - (e.g. "org.apache.hadoop.io.LongWritable") - :param keyConverter: (None by default) - :param valueConverter: (None by default) - :param conf: Hadoop configuration, passed in as a dict - (None by default) - :param batchSize: The number of Python objects represented as a single - Java object. (default 0, choose batchSize automatically) - """ - jconf = self._dictToJavaMap(conf) - jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, path, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, - jconf, batchSize) - return RDD(jrdd, self) - - def newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None, - valueConverter=None, conf=None, batchSize=0): - """ - Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary - Hadoop configuration, which is passed in as a Python dict. - This will be converted into a Configuration in Java. 
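# Hedged sketch of the Hadoop readers above; the Writable/InputFormat class names
# follow the docstring examples, the paths are illustrative.
seq = sc.sequenceFile(
    "hdfs:///data/example-seq",                                        # hypothetical path
    keyClass="org.apache.hadoop.io.Text",
    valueClass="org.apache.hadoop.io.LongWritable",
)
text = sc.newAPIHadoopFile(
    "hdfs:///data/example-text",                                       # hypothetical path
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",                               # key: byte offset
    "org.apache.hadoop.io.Text",                                       # value: the line
)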
- The mechanism is the same as for sc.sequenceFile. - - :param inputFormatClass: fully qualified classname of Hadoop InputFormat - (e.g. "org.apache.hadoop.mapreduce.lib.input.TextInputFormat") - :param keyClass: fully qualified classname of key Writable class - (e.g. "org.apache.hadoop.io.Text") - :param valueClass: fully qualified classname of value Writable class - (e.g. "org.apache.hadoop.io.LongWritable") - :param keyConverter: (None by default) - :param valueConverter: (None by default) - :param conf: Hadoop configuration, passed in as a dict - (None by default) - :param batchSize: The number of Python objects represented as a single - Java object. (default 0, choose batchSize automatically) - """ - jconf = self._dictToJavaMap(conf) - jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, - jconf, batchSize) - return RDD(jrdd, self) - - def hadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None, - valueConverter=None, conf=None, batchSize=0): - """ - Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS, - a local file system (available on all nodes), or any Hadoop-supported file system URI. - The mechanism is the same as for sc.sequenceFile. - - A Hadoop configuration can be passed in as a Python dict. This will be converted into a - Configuration in Java. - - :param path: path to Hadoop file - :param inputFormatClass: fully qualified classname of Hadoop InputFormat - (e.g. "org.apache.hadoop.mapred.TextInputFormat") - :param keyClass: fully qualified classname of key Writable class - (e.g. "org.apache.hadoop.io.Text") - :param valueClass: fully qualified classname of value Writable class - (e.g. "org.apache.hadoop.io.LongWritable") - :param keyConverter: (None by default) - :param valueConverter: (None by default) - :param conf: Hadoop configuration, passed in as a dict - (None by default) - :param batchSize: The number of Python objects represented as a single - Java object. (default 0, choose batchSize automatically) - """ - jconf = self._dictToJavaMap(conf) - jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, path, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, - jconf, batchSize) - return RDD(jrdd, self) - - def hadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter=None, - valueConverter=None, conf=None, batchSize=0): - """ - Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary - Hadoop configuration, which is passed in as a Python dict. - This will be converted into a Configuration in Java. - The mechanism is the same as for sc.sequenceFile. - - :param inputFormatClass: fully qualified classname of Hadoop InputFormat - (e.g. "org.apache.hadoop.mapred.TextInputFormat") - :param keyClass: fully qualified classname of key Writable class - (e.g. "org.apache.hadoop.io.Text") - :param valueClass: fully qualified classname of value Writable class - (e.g. "org.apache.hadoop.io.LongWritable") - :param keyConverter: (None by default) - :param valueConverter: (None by default) - :param conf: Hadoop configuration, passed in as a dict - (None by default) - :param batchSize: The number of Python objects represented as a single - Java object. 
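# The 'old API' variants above take the Hadoop configuration as a plain dict; a
# minimal sketch (the input-dir property name is an assumption about the target
# Hadoop version, and the path is illustrative).
hconf = {"mapreduce.input.fileinputformat.inputdir": "hdfs:///data/example-text"}
old_api = sc.hadoopRDD(
    "org.apache.hadoop.mapred.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf=hconf,
)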
(default 0, choose batchSize automatically) - """ - jconf = self._dictToJavaMap(conf) - jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass, - valueClass, keyConverter, valueConverter, - jconf, batchSize) - return RDD(jrdd, self) - - def _checkpointFile(self, name, input_deserializer): - jrdd = self._jsc.checkpointFile(name) - return RDD(jrdd, self, input_deserializer) - - @ignore_unicode_prefix - def union(self, rdds): - """ - Build the union of a list of RDDs. - - This supports unions() of RDDs with different serialized formats, - although this forces them to be reserialized using the default - serializer: - - >>> path = os.path.join(tempdir, "union-text.txt") - >>> with open(path, "w") as testFile: - ... _ = testFile.write("Hello") - >>> textFile = sc.textFile(path) - >>> textFile.collect() - [u'Hello'] - >>> parallelized = sc.parallelize(["World!"]) - >>> sorted(sc.union([textFile, parallelized]).collect()) - [u'Hello', 'World!'] - """ - first_jrdd_deserializer = rdds[0]._jrdd_deserializer - if any(x._jrdd_deserializer != first_jrdd_deserializer for x in rdds): - rdds = [x._reserialize() for x in rdds] - first = rdds[0]._jrdd - rest = [x._jrdd for x in rdds[1:]] - return RDD(self._jsc.union(first, rest), self, rdds[0]._jrdd_deserializer) - - def broadcast(self, value): - """ - Broadcast a read-only variable to the cluster, returning a - L{Broadcast} - object for reading it in distributed functions. The variable will - be sent to each cluster only once. - """ - return Broadcast(self, value, self._pickled_broadcast_vars) - - def accumulator(self, value, accum_param=None): - """ - Create an L{Accumulator} with the given initial value, using a given - L{AccumulatorParam} helper object to define how to add values of the - data type if provided. Default AccumulatorParams are used for integers - and floating-point numbers if you do not provide one. For other types, - a custom AccumulatorParam can be used. - """ - if accum_param is None: - if isinstance(value, int): - accum_param = accumulators.INT_ACCUMULATOR_PARAM - elif isinstance(value, float): - accum_param = accumulators.FLOAT_ACCUMULATOR_PARAM - elif isinstance(value, complex): - accum_param = accumulators.COMPLEX_ACCUMULATOR_PARAM - else: - raise TypeError("No default accumulator param for type %s" % type(value)) - SparkContext._next_accum_id += 1 - return Accumulator(SparkContext._next_accum_id - 1, value, accum_param) - - def addFile(self, path, recursive=False): - """ - Add a file to be downloaded with this Spark job on every node. - The C{path} passed can be either a local file, a file in HDFS - (or other Hadoop-supported filesystems), or an HTTP, HTTPS or - FTP URI. - - To access the file in Spark jobs, use - L{SparkFiles.get(fileName)} with the - filename to find its download location. - - A directory can be given if the recursive option is set to True. - Currently directories are only supported for Hadoop-supported filesystems. - - .. note:: A path can be added only once. Subsequent additions of the same path are ignored. - - >>> from pyspark import SparkFiles - >>> path = os.path.join(tempdir, "test.txt") - >>> with open(path, "w") as testFile: - ... _ = testFile.write("100") - >>> sc.addFile(path) - >>> def func(iterator): - ... with open(SparkFiles.get("test.txt")) as testFile: - ... fileVal = int(testFile.readline()) - ... 
return [x * fileVal for x in iterator] - >>> sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect() - [100, 200, 300, 400] - """ - self._jsc.sc().addFile(path, recursive) - - def addPyFile(self, path): - """ - Add a .py or .zip dependency for all tasks to be executed on this - SparkContext in the future. The C{path} passed can be either a local - file, a file in HDFS (or other Hadoop-supported filesystems), or an - HTTP, HTTPS or FTP URI. - - .. note:: A path can be added only once. Subsequent additions of the same path are ignored. - """ - self.addFile(path) - (dirname, filename) = os.path.split(path) # dirname may be directory or HDFS/S3 prefix - if filename[-4:].lower() in self.PACKAGE_EXTENSIONS: - self._python_includes.append(filename) - # for tests in local mode - sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename)) - if sys.version > '3': - import importlib - importlib.invalidate_caches() - - def setCheckpointDir(self, dirName): - """ - Set the directory under which RDDs are going to be checkpointed. The - directory must be a HDFS path if running on a cluster. - """ - self._jsc.sc().setCheckpointDir(dirName) - - def _getJavaStorageLevel(self, storageLevel): - """ - Returns a Java StorageLevel based on a pyspark.StorageLevel. - """ - if not isinstance(storageLevel, StorageLevel): - raise Exception("storageLevel must be of type pyspark.StorageLevel") - - newStorageLevel = self._jvm.org.apache.spark.storage.StorageLevel - return newStorageLevel(storageLevel.useDisk, - storageLevel.useMemory, - storageLevel.useOffHeap, - storageLevel.deserialized, - storageLevel.replication) - - def setJobGroup(self, groupId, description, interruptOnCancel=False): - """ - Assigns a group ID to all the jobs started by this thread until the group ID is set to a - different value or cleared. - - Often, a unit of execution in an application consists of multiple Spark actions or jobs. - Application programmers can use this method to group all those jobs together and give a - group description. Once set, the Spark web UI will associate such jobs with this group. - - The application can use L{SparkContext.cancelJobGroup} to cancel all - running jobs in this group. - - >>> import threading - >>> from time import sleep - >>> result = "Not Set" - >>> lock = threading.Lock() - >>> def map_func(x): - ... sleep(100) - ... raise Exception("Task should have been cancelled") - >>> def start_job(x): - ... global result - ... try: - ... sc.setJobGroup("job_to_cancel", "some description") - ... result = sc.parallelize(range(x)).map(map_func).collect() - ... except Exception as e: - ... result = "Cancelled" - ... lock.release() - >>> def stop_job(): - ... sleep(5) - ... sc.cancelJobGroup("job_to_cancel") - >>> suppress = lock.acquire() - >>> suppress = threading.Thread(target=start_job, args=(10,)).start() - >>> suppress = threading.Thread(target=stop_job).start() - >>> suppress = lock.acquire() - >>> print(result) - Cancelled - - If interruptOnCancel is set to true for the job group, then job cancellation will result - in Thread.interrupt() being called on the job's executor threads. This is useful to help - ensure that the tasks are actually stopped in a timely manner, but is off by default due - to HDFS-1208, where HDFS may respond to Thread.interrupt() by marking nodes as dead. 
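# Sketch of the shared variables documented above: a broadcast is read-only on
# executors, an accumulator is add-only from tasks (assumes an existing `sc`).
lookup = sc.broadcast({"a": 1, "b": 2})
hits = sc.accumulator(0)

def score(word):
    hits.add(1)
    return lookup.value.get(word, 0)

print(sc.parallelize(["a", "b", "c"]).map(score).collect(), hits.value)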
- """ - self._jsc.setJobGroup(groupId, description, interruptOnCancel) - - def setLocalProperty(self, key, value): - """ - Set a local property that affects jobs submitted from this thread, such as the - Spark fair scheduler pool. - """ - self._jsc.setLocalProperty(key, value) - - def getLocalProperty(self, key): - """ - Get a local property set in this thread, or null if it is missing. See - L{setLocalProperty} - """ - return self._jsc.getLocalProperty(key) - - def setJobDescription(self, value): - """ - Set a human readable description of the current job. - """ - self._jsc.setJobDescription(value) - - def sparkUser(self): - """ - Get SPARK_USER for user who is running SparkContext. - """ - return self._jsc.sc().sparkUser() - - def cancelJobGroup(self, groupId): - """ - Cancel active jobs for the specified group. See L{SparkContext.setJobGroup} - for more information. - """ - self._jsc.sc().cancelJobGroup(groupId) - - def cancelAllJobs(self): - """ - Cancel all jobs that have been scheduled or are running. - """ - self._jsc.sc().cancelAllJobs() - - def statusTracker(self): - """ - Return :class:`StatusTracker` object - """ - return StatusTracker(self._jsc.statusTracker()) - - def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False): - """ - Executes the given partitionFunc on the specified set of partitions, - returning the result as an array of elements. - - If 'partitions' is not specified, this will run over all partitions. - - >>> myRDD = sc.parallelize(range(6), 3) - >>> sc.runJob(myRDD, lambda part: [x * x for x in part]) - [0, 1, 4, 9, 16, 25] - - >>> myRDD = sc.parallelize(range(6), 3) - >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True) - [0, 1, 16, 25] - """ - if partitions is None: - partitions = range(rdd._jrdd.partitions().size()) - - # Implementation note: This is implemented as a mapPartitions followed - # by runJob() in order to avoid having to pass a Python lambda into - # SparkContext#runJob. 
- mappedRDD = rdd.mapPartitions(partitionFunc) - sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions) - return list(_load_from_socket(sock_info, mappedRDD._jrdd_deserializer)) - - def show_profiles(self): - """ Print the profile stats to stdout """ - if self.profiler_collector is not None: - self.profiler_collector.show_profiles() - else: - raise RuntimeError("'spark.python.profile' configuration must be set " - "to 'true' to enable Python profile.") - - def dump_profiles(self, path): - """ Dump the profile stats into directory `path` - """ - if self.profiler_collector is not None: - self.profiler_collector.dump_profiles(path) - else: - raise RuntimeError("'spark.python.profile' configuration must be set " - "to 'true' to enable Python profile.") - - def getConf(self): - conf = SparkConf() - conf.setAll(self._conf.getAll()) - return conf - - -def _test(): - import atexit - import doctest - import tempfile - globs = globals().copy() - globs['sc'] = SparkContext('local[4]', 'PythonTest') - globs['tempdir'] = tempfile.mkdtemp() - atexit.register(lambda: shutil.rmtree(globs['tempdir'])) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/daemon.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/daemon.py deleted file mode 100644 index ebdd665..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/daemon.py +++ /dev/null @@ -1,195 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import numbers -import os -import signal -import select -import socket -import sys -import traceback -import time -import gc -from errno import EINTR, EAGAIN -from socket import AF_INET, SOCK_STREAM, SOMAXCONN -from signal import SIGHUP, SIGTERM, SIGCHLD, SIG_DFL, SIG_IGN, SIGINT - -from pyspark.worker import main as worker_main -from pyspark.serializers import read_int, write_int, write_with_length, UTF8Deserializer - - -def compute_real_exit_code(exit_code): - # SystemExit's code can be integer or string, but os._exit only accepts integers - if isinstance(exit_code, numbers.Integral): - return exit_code - else: - return 1 - - -def worker(sock, authenticated): - """ - Called by a worker process after the fork(). 
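# runJob() sketch matching its docstring above: apply a function to a subset of
# partitions only (assumes an existing `sc`).
rdd = sc.parallelize(range(6), 3)
print(sc.runJob(rdd, lambda part: [x * x for x in part]))            # all partitions
print(sc.runJob(rdd, lambda part: [x * x for x in part], [0, 2]))    # partitions 0 and 2 only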
- """ - signal.signal(SIGHUP, SIG_DFL) - signal.signal(SIGCHLD, SIG_DFL) - signal.signal(SIGTERM, SIG_DFL) - # restore the handler for SIGINT, - # it's useful for debugging (show the stacktrace before exit) - signal.signal(SIGINT, signal.default_int_handler) - - # Read the socket using fdopen instead of socket.makefile() because the latter - # seems to be very slow; note that we need to dup() the file descriptor because - # otherwise writes also cause a seek that makes us miss data on the read side. - infile = os.fdopen(os.dup(sock.fileno()), "rb", 65536) - outfile = os.fdopen(os.dup(sock.fileno()), "wb", 65536) - - if not authenticated: - client_secret = UTF8Deserializer().loads(infile) - if os.environ["PYTHON_WORKER_FACTORY_SECRET"] == client_secret: - write_with_length("ok".encode("utf-8"), outfile) - outfile.flush() - else: - write_with_length("err".encode("utf-8"), outfile) - outfile.flush() - sock.close() - return 1 - - exit_code = 0 - try: - worker_main(infile, outfile) - except SystemExit as exc: - exit_code = compute_real_exit_code(exc.code) - finally: - try: - outfile.flush() - except Exception: - pass - return exit_code - - -def manager(): - # Create a new process group to corral our children - os.setpgid(0, 0) - - # Create a listening socket on the AF_INET loopback interface - listen_sock = socket.socket(AF_INET, SOCK_STREAM) - listen_sock.bind(('127.0.0.1', 0)) - listen_sock.listen(max(1024, SOMAXCONN)) - listen_host, listen_port = listen_sock.getsockname() - - # re-open stdin/stdout in 'wb' mode - stdin_bin = os.fdopen(sys.stdin.fileno(), 'rb', 4) - stdout_bin = os.fdopen(sys.stdout.fileno(), 'wb', 4) - write_int(listen_port, stdout_bin) - stdout_bin.flush() - - def shutdown(code): - signal.signal(SIGTERM, SIG_DFL) - # Send SIGHUP to notify workers of shutdown - os.kill(0, SIGHUP) - sys.exit(code) - - def handle_sigterm(*args): - shutdown(1) - signal.signal(SIGTERM, handle_sigterm) # Gracefully exit on SIGTERM - signal.signal(SIGHUP, SIG_IGN) # Don't die on SIGHUP - signal.signal(SIGCHLD, SIG_IGN) - - reuse = os.environ.get("SPARK_REUSE_WORKER") - - # Initialization complete - try: - while True: - try: - ready_fds = select.select([0, listen_sock], [], [], 1)[0] - except select.error as ex: - if ex[0] == EINTR: - continue - else: - raise - - if 0 in ready_fds: - try: - worker_pid = read_int(stdin_bin) - except EOFError: - # Spark told us to exit by closing stdin - shutdown(0) - try: - os.kill(worker_pid, signal.SIGKILL) - except OSError: - pass # process already died - - if listen_sock in ready_fds: - try: - sock, _ = listen_sock.accept() - except OSError as e: - if e.errno == EINTR: - continue - raise - - # Launch a worker process - try: - pid = os.fork() - except OSError as e: - if e.errno in (EAGAIN, EINTR): - time.sleep(1) - pid = os.fork() # error here will shutdown daemon - else: - outfile = sock.makefile(mode='wb') - write_int(e.errno, outfile) # Signal that the fork failed - outfile.flush() - outfile.close() - sock.close() - continue - - if pid == 0: - # in child process - listen_sock.close() - try: - # Acknowledge that the fork was successful - outfile = sock.makefile(mode="wb") - write_int(os.getpid(), outfile) - outfile.flush() - outfile.close() - authenticated = False - while True: - code = worker(sock, authenticated) - if code == 0: - authenticated = True - if not reuse or code: - # wait for closing - try: - while sock.recv(1024): - pass - except Exception: - pass - break - gc.collect() - except: - traceback.print_exc() - os._exit(1) - else: - os._exit(0) - else: 
- sock.close() - - finally: - shutdown(1) - - -if __name__ == '__main__': - manager() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/files.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/files.py deleted file mode 100644 index 797573f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/files.py +++ /dev/null @@ -1,59 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os - - -__all__ = ['SparkFiles'] - - -class SparkFiles(object): - - """ - Resolves paths to files added through - L{SparkContext.addFile()}. - - SparkFiles contains only classmethods; users should not create SparkFiles - instances. - """ - - _root_directory = None - _is_running_on_worker = False - _sc = None - - def __init__(self): - raise NotImplementedError("Do not construct SparkFiles objects") - - @classmethod - def get(cls, filename): - """ - Get the absolute path of a file added through C{SparkContext.addFile()}. - """ - path = os.path.join(SparkFiles.getRootDirectory(), filename) - return os.path.abspath(path) - - @classmethod - def getRootDirectory(cls): - """ - Get the root directory that contains files added through - C{SparkContext.addFile()}. - """ - if cls._is_running_on_worker: - return cls._root_directory - else: - # This will have to change if we support multiple SparkContexts: - return cls._sc._jvm.org.apache.spark.SparkFiles.getRootDirectory() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/find_spark_home.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/find_spark_home.py deleted file mode 100755 index 9c4ed46..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/find_spark_home.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This script attempt to determine the correct setting for SPARK_HOME given -# that Spark may have been installed on the system with pip. 
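# A minimal sketch of the SPARK_HOME heuristic this script implements: trust the
# environment variable when set, otherwise probe candidate directories for
# bin/spark-submit (the candidate paths here are hypothetical).
import os

def looks_like_spark_home(path):
    return os.path.isfile(os.path.join(path, "bin/spark-submit")) and (
        os.path.isdir(os.path.join(path, "jars"))
        or os.path.isdir(os.path.join(path, "assembly"))
    )

spark_home = os.environ.get("SPARK_HOME") or next(
    (p for p in ("/opt/spark", "/usr/local/spark") if looks_like_spark_home(p)),
    None,
)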
- -from __future__ import print_function -import os -import sys - - -def _find_spark_home(): - """Find the SPARK_HOME.""" - # If the environment has SPARK_HOME set trust it. - if "SPARK_HOME" in os.environ: - return os.environ["SPARK_HOME"] - - def is_spark_home(path): - """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME""" - return (os.path.isfile(os.path.join(path, "bin/spark-submit")) and - (os.path.isdir(os.path.join(path, "jars")) or - os.path.isdir(os.path.join(path, "assembly")))) - - paths = ["../", os.path.dirname(os.path.realpath(__file__))] - - # Add the path of the PySpark module if it exists - if sys.version < "3": - import imp - try: - module_home = imp.find_module("pyspark")[1] - paths.append(module_home) - # If we are installed in edit mode also look two dirs up - paths.append(os.path.join(module_home, "../../")) - except ImportError: - # Not pip installed no worries - pass - else: - from importlib.util import find_spec - try: - module_home = os.path.dirname(find_spec("pyspark").origin) - paths.append(module_home) - # If we are installed in edit mode also look two dirs up - paths.append(os.path.join(module_home, "../../")) - except ImportError: - # Not pip installed no worries - pass - - # Normalize the paths - paths = [os.path.abspath(p) for p in paths] - - try: - return next(path for path in paths if is_spark_home(path)) - except StopIteration: - print("Could not find valid SPARK_HOME while searching {0}".format(paths), file=sys.stderr) - sys.exit(-1) - -if __name__ == "__main__": - print(_find_spark_home()) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/heapq3.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/heapq3.py deleted file mode 100644 index 37a2914..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/heapq3.py +++ /dev/null @@ -1,890 +0,0 @@ -# -*- encoding: utf-8 -*- -# back ported from CPython 3 -# A. HISTORY OF THE SOFTWARE -# ========================== -# -# Python was created in the early 1990s by Guido van Rossum at Stichting -# Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands -# as a successor of a language called ABC. Guido remains Python's -# principal author, although it includes many contributions from others. -# -# In 1995, Guido continued his work on Python at the Corporation for -# National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) -# in Reston, Virginia where he released several versions of the -# software. -# -# In May 2000, Guido and the Python core development team moved to -# BeOpen.com to form the BeOpen PythonLabs team. In October of the same -# year, the PythonLabs team moved to Digital Creations (now Zope -# Corporation, see http://www.zope.com). In 2001, the Python Software -# Foundation (PSF, see http://www.python.org/psf/) was formed, a -# non-profit organization created specifically to own Python-related -# Intellectual Property. Zope Corporation is a sponsoring member of -# the PSF. -# -# All Python releases are Open Source (see http://www.opensource.org for -# the Open Source Definition). Historically, most, but not all, Python -# releases have also been GPL-compatible; the table below summarizes -# the various releases. -# -# Release Derived Year Owner GPL- -# from compatible? 
(1) -# -# 0.9.0 thru 1.2 1991-1995 CWI yes -# 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes -# 1.6 1.5.2 2000 CNRI no -# 2.0 1.6 2000 BeOpen.com no -# 1.6.1 1.6 2001 CNRI yes (2) -# 2.1 2.0+1.6.1 2001 PSF no -# 2.0.1 2.0+1.6.1 2001 PSF yes -# 2.1.1 2.1+2.0.1 2001 PSF yes -# 2.2 2.1.1 2001 PSF yes -# 2.1.2 2.1.1 2002 PSF yes -# 2.1.3 2.1.2 2002 PSF yes -# 2.2.1 2.2 2002 PSF yes -# 2.2.2 2.2.1 2002 PSF yes -# 2.2.3 2.2.2 2003 PSF yes -# 2.3 2.2.2 2002-2003 PSF yes -# 2.3.1 2.3 2002-2003 PSF yes -# 2.3.2 2.3.1 2002-2003 PSF yes -# 2.3.3 2.3.2 2002-2003 PSF yes -# 2.3.4 2.3.3 2004 PSF yes -# 2.3.5 2.3.4 2005 PSF yes -# 2.4 2.3 2004 PSF yes -# 2.4.1 2.4 2005 PSF yes -# 2.4.2 2.4.1 2005 PSF yes -# 2.4.3 2.4.2 2006 PSF yes -# 2.4.4 2.4.3 2006 PSF yes -# 2.5 2.4 2006 PSF yes -# 2.5.1 2.5 2007 PSF yes -# 2.5.2 2.5.1 2008 PSF yes -# 2.5.3 2.5.2 2008 PSF yes -# 2.6 2.5 2008 PSF yes -# 2.6.1 2.6 2008 PSF yes -# 2.6.2 2.6.1 2009 PSF yes -# 2.6.3 2.6.2 2009 PSF yes -# 2.6.4 2.6.3 2009 PSF yes -# 2.6.5 2.6.4 2010 PSF yes -# 2.7 2.6 2010 PSF yes -# -# Footnotes: -# -# (1) GPL-compatible doesn't mean that we're distributing Python under -# the GPL. All Python licenses, unlike the GPL, let you distribute -# a modified version without making your changes open source. The -# GPL-compatible licenses make it possible to combine Python with -# other software that is released under the GPL; the others don't. -# -# (2) According to Richard Stallman, 1.6.1 is not GPL-compatible, -# because its license has a choice of law clause. According to -# CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 -# is "not incompatible" with the GPL. -# -# Thanks to the many outside volunteers who have worked under Guido's -# direction to make these releases possible. -# -# -# B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON -# =============================================================== -# -# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 -# -------------------------------------------- -# -# 1. This LICENSE AGREEMENT is between the Python Software Foundation -# ("PSF"), and the Individual or Organization ("Licensee") accessing and -# otherwise using this software ("Python") in source or binary form and -# its associated documentation. -# -# 2. Subject to the terms and conditions of this License Agreement, PSF hereby -# grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, -# analyze, test, perform and/or display publicly, prepare derivative works, -# distribute, and otherwise use Python alone or in any derivative version, -# provided, however, that PSF's License Agreement and PSF's notice of copyright, -# i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, -# 2011, 2012, 2013 Python Software Foundation; All Rights Reserved" are retained -# in Python alone or in any derivative version prepared by Licensee. -# -# 3. In the event Licensee prepares a derivative work that is based on -# or incorporates Python or any part thereof, and wants to make -# the derivative work available to others as provided herein, then -# Licensee hereby agrees to include in any such work a brief summary of -# the changes made to Python. -# -# 4. PSF is making Python available to Licensee on an "AS IS" -# basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -# IMPLIED. 
BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND -# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT -# INFRINGE ANY THIRD PARTY RIGHTS. -# -# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -# FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, -# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. -# -# 6. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. -# -# 7. Nothing in this License Agreement shall be deemed to create any -# relationship of agency, partnership, or joint venture between PSF and -# Licensee. This License Agreement does not grant permission to use PSF -# trademarks or trade name in a trademark sense to endorse or promote -# products or services of Licensee, or any third party. -# -# 8. By copying, installing or otherwise using Python, Licensee -# agrees to be bound by the terms and conditions of this License -# Agreement. -# -# -# BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 -# ------------------------------------------- -# -# BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 -# -# 1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an -# office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the -# Individual or Organization ("Licensee") accessing and otherwise using -# this software in source or binary form and its associated -# documentation ("the Software"). -# -# 2. Subject to the terms and conditions of this BeOpen Python License -# Agreement, BeOpen hereby grants Licensee a non-exclusive, -# royalty-free, world-wide license to reproduce, analyze, test, perform -# and/or display publicly, prepare derivative works, distribute, and -# otherwise use the Software alone or in any derivative version, -# provided, however, that the BeOpen Python License is retained in the -# Software, alone or in any derivative version prepared by Licensee. -# -# 3. BeOpen is making the Software available to Licensee on an "AS IS" -# basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND -# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT -# INFRINGE ANY THIRD PARTY RIGHTS. -# -# 4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE -# SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS -# AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY -# DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. -# -# 5. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. -# -# 6. This License Agreement shall be governed by and interpreted in all -# respects by the law of the State of California, excluding conflict of -# law provisions. Nothing in this License Agreement shall be deemed to -# create any relationship of agency, partnership, or joint venture -# between BeOpen and Licensee. This License Agreement does not grant -# permission to use BeOpen trademarks or trade names in a trademark -# sense to endorse or promote products or services of Licensee, or any -# third party. 
As an exception, the "BeOpen Python" logos available at -# http://www.pythonlabs.com/logos.html may be used according to the -# permissions granted on that web page. -# -# 7. By copying, installing or otherwise using the software, Licensee -# agrees to be bound by the terms and conditions of this License -# Agreement. -# -# -# CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 -# --------------------------------------- -# -# 1. This LICENSE AGREEMENT is between the Corporation for National -# Research Initiatives, having an office at 1895 Preston White Drive, -# Reston, VA 20191 ("CNRI"), and the Individual or Organization -# ("Licensee") accessing and otherwise using Python 1.6.1 software in -# source or binary form and its associated documentation. -# -# 2. Subject to the terms and conditions of this License Agreement, CNRI -# hereby grants Licensee a nonexclusive, royalty-free, world-wide -# license to reproduce, analyze, test, perform and/or display publicly, -# prepare derivative works, distribute, and otherwise use Python 1.6.1 -# alone or in any derivative version, provided, however, that CNRI's -# License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) -# 1995-2001 Corporation for National Research Initiatives; All Rights -# Reserved" are retained in Python 1.6.1 alone or in any derivative -# version prepared by Licensee. Alternately, in lieu of CNRI's License -# Agreement, Licensee may substitute the following text (omitting the -# quotes): "Python 1.6.1 is made available subject to the terms and -# conditions in CNRI's License Agreement. This Agreement together with -# Python 1.6.1 may be located on the Internet using the following -# unique, persistent identifier (known as a handle): 1895.22/1013. This -# Agreement may also be obtained from a proxy server on the Internet -# using the following URL: http://hdl.handle.net/1895.22/1013". -# -# 3. In the event Licensee prepares a derivative work that is based on -# or incorporates Python 1.6.1 or any part thereof, and wants to make -# the derivative work available to others as provided herein, then -# Licensee hereby agrees to include in any such work a brief summary of -# the changes made to Python 1.6.1. -# -# 4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" -# basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -# IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND -# DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT -# INFRINGE ANY THIRD PARTY RIGHTS. -# -# 5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -# 1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -# A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, -# OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. -# -# 6. This License Agreement will automatically terminate upon a material -# breach of its terms and conditions. -# -# 7. This License Agreement shall be governed by the federal -# intellectual property law of the United States, including without -# limitation the federal copyright law, and, to the extent such -# U.S. federal law does not apply, by the law of the Commonwealth of -# Virginia, excluding Virginia's conflict of law provisions. 
-# Notwithstanding the foregoing, with regard to derivative works based -# on Python 1.6.1 that incorporate non-separable material that was -# previously distributed under the GNU General Public License (GPL), the -# law of the Commonwealth of Virginia shall govern this License -# Agreement only as to issues arising under or with respect to -# Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this -# License Agreement shall be deemed to create any relationship of -# agency, partnership, or joint venture between CNRI and Licensee. This -# License Agreement does not grant permission to use CNRI trademarks or -# trade name in a trademark sense to endorse or promote products or -# services of Licensee, or any third party. -# -# 8. By clicking on the "ACCEPT" button where indicated, or by copying, -# installing or otherwise using Python 1.6.1, Licensee agrees to be -# bound by the terms and conditions of this License Agreement. -# -# ACCEPT -# -# -# CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 -# -------------------------------------------------- -# -# Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, -# The Netherlands. All rights reserved. -# -# Permission to use, copy, modify, and distribute this software and its -# documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appear in all copies and that -# both that copyright notice and this permission notice appear in -# supporting documentation, and that the name of Stichting Mathematisch -# Centrum or CWI not be used in advertising or publicity pertaining to -# distribution of the software without specific, written prior -# permission. -# -# STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO -# THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -# FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE -# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT -# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -"""Heap queue algorithm (a.k.a. priority queue). - -Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for -all k, counting elements from 0. For the sake of comparison, -non-existing elements are considered to be infinite. The interesting -property of a heap is that a[0] is always its smallest element. - -Usage: - -heap = [] # creates an empty heap -heappush(heap, item) # pushes a new item on the heap -item = heappop(heap) # pops the smallest item from the heap -item = heap[0] # smallest item on the heap without popping it -heapify(x) # transforms list into a heap, in-place, in linear time -item = heapreplace(heap, item) # pops and returns smallest item, and adds - # new item; the heap size is unchanged - -Our API differs from textbook heap algorithms as follows: - -- We use 0-based indexing. This makes the relationship between the - index for a node and the indexes for its children slightly less - obvious, but is more suitable since Python uses 0-based indexing. - -- Our heappop() method returns the smallest item, not the largest. - -These two make it possible to view the heap as a regular Python list -without surprises: heap[0] is the smallest item, and heap.sort() -maintains the heap invariant! 
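# Runnable version of the usage block above; the standard-library heapq module
# exposes the same functions as this back-ported copy.
from heapq import heapify, heappop, heappush

heap = []
for v in (5, 1, 4, 2):
    heappush(heap, v)
smallest = heap[0]          # peek at the smallest item without popping
popped = heappop(heap)      # 1
data = [3, 9, 0]
heapify(data)               # in-place, linear time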
-""" - -# Original code by Kevin O'Connor, augmented by Tim Peters and Raymond Hettinger - -__about__ = """Heap queues - -[explanation by François Pinard] - -Heaps are arrays for which a[k] <= a[2*k+1] and a[k] <= a[2*k+2] for -all k, counting elements from 0. For the sake of comparison, -non-existing elements are considered to be infinite. The interesting -property of a heap is that a[0] is always its smallest element. - -The strange invariant above is meant to be an efficient memory -representation for a tournament. The numbers below are `k', not a[k]: - - 0 - - 1 2 - - 3 4 5 6 - - 7 8 9 10 11 12 13 14 - - 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 - - -In the tree above, each cell `k' is topping `2*k+1' and `2*k+2'. In -an usual binary tournament we see in sports, each cell is the winner -over the two cells it tops, and we can trace the winner down the tree -to see all opponents s/he had. However, in many computer applications -of such tournaments, we do not need to trace the history of a winner. -To be more memory efficient, when a winner is promoted, we try to -replace it by something else at a lower level, and the rule becomes -that a cell and the two cells it tops contain three different items, -but the top cell "wins" over the two topped cells. - -If this heap invariant is protected at all time, index 0 is clearly -the overall winner. The simplest algorithmic way to remove it and -find the "next" winner is to move some loser (let's say cell 30 in the -diagram above) into the 0 position, and then percolate this new 0 down -the tree, exchanging values, until the invariant is re-established. -This is clearly logarithmic on the total number of items in the tree. -By iterating over all items, you get an O(n ln n) sort. - -A nice feature of this sort is that you can efficiently insert new -items while the sort is going on, provided that the inserted items are -not "better" than the last 0'th element you extracted. This is -especially useful in simulation contexts, where the tree holds all -incoming events, and the "win" condition means the smallest scheduled -time. When an event schedule other events for execution, they are -scheduled into the future, so they can easily go into the heap. So, a -heap is a good structure for implementing schedulers (this is what I -used for my MIDI sequencer :-). - -Various structures for implementing schedulers have been extensively -studied, and heaps are good for this, as they are reasonably speedy, -the speed is almost constant, and the worst case is not much different -than the average case. However, there are other representations which -are more efficient overall, yet the worst cases might be terrible. - -Heaps are also very useful in big disk sorts. You most probably all -know that a big sort implies producing "runs" (which are pre-sorted -sequences, which size is usually related to the amount of CPU memory), -followed by a merging passes for these runs, which merging is often -very cleverly organised[1]. It is very important that the initial -sort produces the longest runs possible. Tournaments are a good way -to that. If, using all the memory available to hold a tournament, you -replace and percolate items that happen to fit the current run, you'll -produce runs which are twice the size of the memory for random input, -and much better for input fuzzily ordered. 
- -Moreover, if you output the 0'th item on disk and get an input which -may not fit in the current tournament (because the value "wins" over -the last output value), it cannot fit in the heap, so the size of the -heap decreases. The freed memory could be cleverly reused immediately -for progressively building a second heap, which grows at exactly the -same rate the first heap is melting. When the first heap completely -vanishes, you switch heaps and start a new run. Clever and quite -effective! - -In a word, heaps are useful memory structures to know. I use them in -a few applications, and I think it is good to keep a `heap' module -around. :-) - --------------------- -[1] The disk balancing algorithms which are current, nowadays, are -more annoying than clever, and this is a consequence of the seeking -capabilities of the disks. On devices which cannot seek, like big -tape drives, the story was quite different, and one had to be very -clever to ensure (far in advance) that each tape movement will be the -most effective possible (that is, will best participate at -"progressing" the merge). Some tapes were even able to read -backwards, and this was also used to avoid the rewinding time. -Believe me, real good tape sorts were quite spectacular to watch! -From all times, sorting has always been a Great Art! :-) -""" - -__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge', - 'nlargest', 'nsmallest', 'heappushpop'] - -def heappush(heap, item): - """Push item onto heap, maintaining the heap invariant.""" - heap.append(item) - _siftdown(heap, 0, len(heap)-1) - -def heappop(heap): - """Pop the smallest item off the heap, maintaining the heap invariant.""" - lastelt = heap.pop() # raises appropriate IndexError if heap is empty - if heap: - returnitem = heap[0] - heap[0] = lastelt - _siftup(heap, 0) - return returnitem - return lastelt - -def heapreplace(heap, item): - """Pop and return the current smallest value, and add the new item. - - This is more efficient than heappop() followed by heappush(), and can be - more appropriate when using a fixed-size heap. Note that the value - returned may be larger than item! That constrains reasonable uses of - this routine unless written as part of a conditional replacement: - - if item > heap[0]: - item = heapreplace(heap, item) - """ - returnitem = heap[0] # raises appropriate IndexError if heap is empty - heap[0] = item - _siftup(heap, 0) - return returnitem - -def heappushpop(heap, item): - """Fast version of a heappush followed by a heappop.""" - if heap and heap[0] < item: - item, heap[0] = heap[0], item - _siftup(heap, 0) - return item - -def heapify(x): - """Transform list into a heap, in-place, in O(len(x)) time.""" - n = len(x) - # Transform bottom-up. The largest index there's any point to looking at - # is the largest with a child index in-range, so must have 2*i + 1 < n, - # or i < (n-1)/2. If n is even = 2*j, this is (2*j-1)/2 = j-1/2 so - # j-1 is the largest, which is n//2 - 1. If n is odd = 2*j+1, this is - # (2*j+1-1)/2 = j so j-1 is the largest, and that's again n//2-1. 
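# Sketch of the fixed-size-heap idiom from the heapreplace() docstring above: keep
# only the n largest items seen so far in a min-heap.
import heapq

def keep_n_largest(n, iterable):
    heap = []
    for item in iterable:
        if len(heap) < n:
            heapq.heappush(heap, item)
        elif item > heap[0]:
            heapq.heapreplace(heap, item)   # pop the smallest, push the new item
    return sorted(heap, reverse=True)

print(keep_n_largest(3, [5, 1, 9, 7, 3, 8]))   # [9, 8, 7]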
- for i in reversed(range(n//2)): - _siftup(x, i) - -def _heappop_max(heap): - """Maxheap version of a heappop.""" - lastelt = heap.pop() # raises appropriate IndexError if heap is empty - if heap: - returnitem = heap[0] - heap[0] = lastelt - _siftup_max(heap, 0) - return returnitem - return lastelt - -def _heapreplace_max(heap, item): - """Maxheap version of a heappop followed by a heappush.""" - returnitem = heap[0] # raises appropriate IndexError if heap is empty - heap[0] = item - _siftup_max(heap, 0) - return returnitem - -def _heapify_max(x): - """Transform list into a maxheap, in-place, in O(len(x)) time.""" - n = len(x) - for i in reversed(range(n//2)): - _siftup_max(x, i) - -# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos -# is the index of a leaf with a possibly out-of-order value. Restore the -# heap invariant. -def _siftdown(heap, startpos, pos): - newitem = heap[pos] - # Follow the path to the root, moving parents down until finding a place - # newitem fits. - while pos > startpos: - parentpos = (pos - 1) >> 1 - parent = heap[parentpos] - if newitem < parent: - heap[pos] = parent - pos = parentpos - continue - break - heap[pos] = newitem - -# The child indices of heap index pos are already heaps, and we want to make -# a heap at index pos too. We do this by bubbling the smaller child of -# pos up (and so on with that child's children, etc) until hitting a leaf, -# then using _siftdown to move the oddball originally at index pos into place. -# -# We *could* break out of the loop as soon as we find a pos where newitem <= -# both its children, but turns out that's not a good idea, and despite that -# many books write the algorithm that way. During a heap pop, the last array -# element is sifted in, and that tends to be large, so that comparing it -# against values starting from the root usually doesn't pay (= usually doesn't -# get us out of the loop early). See Knuth, Volume 3, where this is -# explained and quantified in an exercise. -# -# Cutting the # of comparisons is important, since these routines have no -# way to extract "the priority" from an array element, so that intelligence -# is likely to be hiding in custom comparison methods, or in array elements -# storing (priority, record) tuples. Comparisons are thus potentially -# expensive. -# -# On random arrays of length 1000, making this change cut the number of -# comparisons made by heapify() a little, and those made by exhaustive -# heappop() a lot, in accord with theory. Here are typical results from 3 -# runs (3 just to demonstrate how small the variance is): -# -# Compares needed by heapify Compares needed by 1000 heappops -# -------------------------- -------------------------------- -# 1837 cut to 1663 14996 cut to 8680 -# 1855 cut to 1659 14966 cut to 8678 -# 1847 cut to 1660 15024 cut to 8703 -# -# Building the heap by using heappush() 1000 times instead required -# 2198, 2148, and 2219 compares: heapify() is more efficient, when -# you can use it. -# -# The total compares needed by list.sort() on the same lists were 8627, -# 8627, and 8632 (this should be compared to the sum of heapify() and -# heappop() compares): list.sort() is (unsurprisingly!) more efficient -# for sorting. - -def _siftup(heap, pos): - endpos = len(heap) - startpos = pos - newitem = heap[pos] - # Bubble up the smaller child until hitting a leaf. - childpos = 2*pos + 1 # leftmost child position - while childpos < endpos: - # Set childpos to index of smaller child. 
- rightpos = childpos + 1 - if rightpos < endpos and not heap[childpos] < heap[rightpos]: - childpos = rightpos - # Move the smaller child up. - heap[pos] = heap[childpos] - pos = childpos - childpos = 2*pos + 1 - # The leaf at pos is empty now. Put newitem there, and bubble it up - # to its final resting place (by sifting its parents down). - heap[pos] = newitem - _siftdown(heap, startpos, pos) - -def _siftdown_max(heap, startpos, pos): - 'Maxheap variant of _siftdown' - newitem = heap[pos] - # Follow the path to the root, moving parents down until finding a place - # newitem fits. - while pos > startpos: - parentpos = (pos - 1) >> 1 - parent = heap[parentpos] - if parent < newitem: - heap[pos] = parent - pos = parentpos - continue - break - heap[pos] = newitem - -def _siftup_max(heap, pos): - 'Maxheap variant of _siftup' - endpos = len(heap) - startpos = pos - newitem = heap[pos] - # Bubble up the larger child until hitting a leaf. - childpos = 2*pos + 1 # leftmost child position - while childpos < endpos: - # Set childpos to index of larger child. - rightpos = childpos + 1 - if rightpos < endpos and not heap[rightpos] < heap[childpos]: - childpos = rightpos - # Move the larger child up. - heap[pos] = heap[childpos] - pos = childpos - childpos = 2*pos + 1 - # The leaf at pos is empty now. Put newitem there, and bubble it up - # to its final resting place (by sifting its parents down). - heap[pos] = newitem - _siftdown_max(heap, startpos, pos) - -def merge(iterables, key=None, reverse=False): - '''Merge multiple sorted inputs into a single sorted output. - - Similar to sorted(itertools.chain(*iterables)) but returns a generator, - does not pull the data into memory all at once, and assumes that each of - the input streams is already sorted (smallest to largest). - - >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25])) - [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25] - - If *key* is not None, applies a key function to each element to determine - its sort order. 
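# Usage sketch for merge() as documented above, assuming the standard-library
# signature heapq.merge(*iterables, key=None, reverse=False):
import heapq

print(list(heapq.merge([1, 4, 9], [2, 3, 10])))               # [1, 2, 3, 4, 9, 10]
print(list(heapq.merge([9, 5, 1], [8, 4, 2], reverse=True)))  # [9, 8, 5, 4, 2, 1]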
- - >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len)) - ['dog', 'cat', 'fish', 'horse', 'kangaroo'] - - ''' - - h = [] - h_append = h.append - - if reverse: - _heapify = _heapify_max - _heappop = _heappop_max - _heapreplace = _heapreplace_max - direction = -1 - else: - _heapify = heapify - _heappop = heappop - _heapreplace = heapreplace - direction = 1 - - if key is None: - for order, it in enumerate(map(iter, iterables)): - try: - h_append([next(it), order * direction, it]) - except StopIteration: - pass - _heapify(h) - while len(h) > 1: - try: - while True: - value, order, it = s = h[0] - yield value - s[0] = next(it) # raises StopIteration when exhausted - _heapreplace(h, s) # restore heap condition - except StopIteration: - _heappop(h) # remove empty iterator - if h: - # fast case when only a single iterator remains - value, order, it = h[0] - yield value - for value in it: - yield value - return - - for order, it in enumerate(map(iter, iterables)): - try: - value = next(it) - h_append([key(value), order * direction, value, it]) - except StopIteration: - pass - _heapify(h) - while len(h) > 1: - try: - while True: - key_value, order, value, it = s = h[0] - yield value - value = next(it) - s[0] = key(value) - s[2] = value - _heapreplace(h, s) - except StopIteration: - _heappop(h) - if h: - key_value, order, value, it = h[0] - yield value - for value in it: - yield value - - -# Algorithm notes for nlargest() and nsmallest() -# ============================================== -# -# Make a single pass over the data while keeping the k most extreme values -# in a heap. Memory consumption is limited to keeping k values in a list. -# -# Measured performance for random inputs: -# -# number of comparisons -# n inputs k-extreme values (average of 5 trials) % more than min() -# ------------- ---------------- --------------------- ----------------- -# 1,000 100 3,317 231.7% -# 10,000 100 14,046 40.5% -# 100,000 100 105,749 5.7% -# 1,000,000 100 1,007,751 0.8% -# 10,000,000 100 10,009,401 0.1% -# -# Theoretical number of comparisons for k smallest of n random inputs: -# -# Step Comparisons Action -# ---- -------------------------- --------------------------- -# 1 1.66 * k heapify the first k-inputs -# 2 n - k compare remaining elements to top of heap -# 3 k * (1 + lg2(k)) * ln(n/k) replace the topmost value on the heap -# 4 k * lg2(k) - (k/2) final sort of the k most extreme values -# -# Combining and simplifying for a rough estimate gives: -# -# comparisons = n + k * (log(k, 2) * log(n/k) + log(k, 2) + log(n/k)) -# -# Computing the number of comparisons for step 3: -# ----------------------------------------------- -# * For the i-th new value from the iterable, the probability of being in the -# k most extreme values is k/i. For example, the probability of the 101st -# value seen being in the 100 most extreme values is 100/101. -# * If the value is a new extreme value, the cost of inserting it into the -# heap is 1 + log(k, 2). 
-# * The probability times the cost gives: -# (k/i) * (1 + log(k, 2)) -# * Summing across the remaining n-k elements gives: -# sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1)) -# * This reduces to: -# (H(n) - H(k)) * k * (1 + log(k, 2)) -# * Where H(n) is the n-th harmonic number estimated by: -# gamma = 0.5772156649 -# H(n) = log(n, e) + gamma + 1 / (2 * n) -# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence -# * Substituting the H(n) formula: -# comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2) -# -# Worst-case for step 3: -# ---------------------- -# In the worst case, the input data is reversed sorted so that every new element -# must be inserted in the heap: -# -# comparisons = 1.66 * k + log(k, 2) * (n - k) -# -# Alternative Algorithms -# ---------------------- -# Other algorithms were not used because they: -# 1) Took much more auxiliary memory, -# 2) Made multiple passes over the data. -# 3) Made more comparisons in common cases (small k, large n, semi-random input). -# See the more detailed comparison of approach at: -# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest - -def nsmallest(n, iterable, key=None): - """Find the n smallest elements in a dataset. - - Equivalent to: sorted(iterable, key=key)[:n] - """ - - # Short-cut for n==1 is to use min() - if n == 1: - it = iter(iterable) - sentinel = object() - if key is None: - result = min(it, default=sentinel) - else: - result = min(it, default=sentinel, key=key) - return [] if result is sentinel else [result] - - # When n>=size, it's faster to use sorted() - try: - size = len(iterable) - except (TypeError, AttributeError): - pass - else: - if n >= size: - return sorted(iterable, key=key)[:n] - - # When key is none, use simpler decoration - if key is None: - it = iter(iterable) - # put the range(n) first so that zip() doesn't - # consume one too many elements from the iterator - result = [(elem, i) for i, elem in zip(range(n), it)] - if not result: - return result - _heapify_max(result) - top = result[0][0] - order = n - _heapreplace = _heapreplace_max - for elem in it: - if elem < top: - _heapreplace(result, (elem, order)) - top = result[0][0] - order += 1 - result.sort() - return [r[0] for r in result] - - # General case, slowest method - it = iter(iterable) - result = [(key(elem), i, elem) for i, elem in zip(range(n), it)] - if not result: - return result - _heapify_max(result) - top = result[0][0] - order = n - _heapreplace = _heapreplace_max - for elem in it: - k = key(elem) - if k < top: - _heapreplace(result, (k, order, elem)) - top = result[0][0] - order += 1 - result.sort() - return [r[2] for r in result] - -def nlargest(n, iterable, key=None): - """Find the n largest elements in a dataset. 
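# Two small sketches related to the analysis and functions above (assumptions:
# standard-library heapq; log(n/k) in the combined estimate taken as natural log).
import heapq
from math import log, log2

# 1) Rough numeric check of the combined comparison estimate:
#    comparisons ~ n + k * (log2(k) * ln(n/k) + log2(k) + ln(n/k))
n, k = 1_000_000, 100
estimate = n + k * (log2(k) * log(n / k) + log2(k) + log(n / k))
print(round(estimate))  # ~1,007,705, close to the measured 1,007,751 in the table

# 2) Usage of nsmallest/nlargest, including the key= branch handled above:
prices = [("apple", 3.0), ("pear", 1.5), ("plum", 2.25), ("fig", 4.0)]
print(heapq.nsmallest(2, prices, key=lambda p: p[1]))  # [('pear', 1.5), ('plum', 2.25)]
print(heapq.nlargest(1, prices, key=lambda p: p[1]))   # [('fig', 4.0)]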
- - Equivalent to: sorted(iterable, key=key, reverse=True)[:n] - """ - - # Short-cut for n==1 is to use max() - if n == 1: - it = iter(iterable) - sentinel = object() - if key is None: - result = max(it, default=sentinel) - else: - result = max(it, default=sentinel, key=key) - return [] if result is sentinel else [result] - - # When n>=size, it's faster to use sorted() - try: - size = len(iterable) - except (TypeError, AttributeError): - pass - else: - if n >= size: - return sorted(iterable, key=key, reverse=True)[:n] - - # When key is none, use simpler decoration - if key is None: - it = iter(iterable) - result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)] - if not result: - return result - heapify(result) - top = result[0][0] - order = -n - _heapreplace = heapreplace - for elem in it: - if top < elem: - _heapreplace(result, (elem, order)) - top = result[0][0] - order -= 1 - result.sort(reverse=True) - return [r[0] for r in result] - - # General case, slowest method - it = iter(iterable) - result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)] - if not result: - return result - heapify(result) - top = result[0][0] - order = -n - _heapreplace = heapreplace - for elem in it: - k = key(elem) - if top < k: - _heapreplace(result, (k, order, elem)) - top = result[0][0] - order -= 1 - result.sort(reverse=True) - return [r[2] for r in result] - -# If available, use C implementation -try: - from _heapq import * -except ImportError: - pass -try: - from _heapq import _heapreplace_max -except ImportError: - pass -try: - from _heapq import _heapify_max -except ImportError: - pass -try: - from _heapq import _heappop_max -except ImportError: - pass - - -if __name__ == "__main__": - import doctest - import sys - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/java_gateway.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/java_gateway.py deleted file mode 100644 index feb6b7b..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/java_gateway.py +++ /dev/null @@ -1,216 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import atexit -import os -import sys -import select -import signal -import shlex -import shutil -import socket -import platform -import tempfile -import time -from subprocess import Popen, PIPE - -if sys.version >= '3': - xrange = range - -from py4j.java_gateway import java_import, JavaGateway, JavaObject, GatewayParameters -from pyspark.find_spark_home import _find_spark_home -from pyspark.serializers import read_int, write_with_length, UTF8Deserializer -from pyspark.util import _exception_message - - -def launch_gateway(conf=None): - """ - launch jvm gateway - :param conf: spark configuration passed to spark-submit - :return: a JVM gateway - """ - return _launch_gateway(conf) - - -def _launch_gateway(conf=None, insecure=False): - """ - launch jvm gateway - :param conf: spark configuration passed to spark-submit - :param insecure: True to create an insecure gateway; only for testing - :return: a JVM gateway - """ - if insecure and os.environ.get("SPARK_TESTING", "0") != "1": - raise ValueError("creating insecure gateways is only for testing") - if "PYSPARK_GATEWAY_PORT" in os.environ: - gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"]) - gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"] - else: - SPARK_HOME = _find_spark_home() - # Launch the Py4j gateway using Spark's run command so that we pick up the - # proper classpath and settings from spark-env.sh - on_windows = platform.system() == "Windows" - script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit" - command = [os.path.join(SPARK_HOME, script)] - if conf: - for k, v in conf.getAll(): - command += ['--conf', '%s=%s' % (k, v)] - submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell") - if os.environ.get("SPARK_TESTING"): - submit_args = ' '.join([ - "--conf spark.ui.enabled=false", - submit_args - ]) - command = command + shlex.split(submit_args) - - # Create a temporary directory where the gateway server should write the connection - # information. - conn_info_dir = tempfile.mkdtemp() - try: - fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir) - os.close(fd) - os.unlink(conn_info_file) - - env = dict(os.environ) - env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file - if insecure: - env["_PYSPARK_CREATE_INSECURE_GATEWAY"] = "1" - - # Launch the Java gateway. - # We open a pipe to stdin so that the Java gateway can die when the pipe is broken - if not on_windows: - # Don't send ctrl-c / SIGINT to the Java gateway: - def preexec_func(): - signal.signal(signal.SIGINT, signal.SIG_IGN) - proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env) - else: - # preexec_fn not supported on Windows - proc = Popen(command, stdin=PIPE, env=env) - - # Wait for the file to appear, or for the process to exit, whichever happens first. - while not proc.poll() and not os.path.isfile(conn_info_file): - time.sleep(0.1) - - if not os.path.isfile(conn_info_file): - raise Exception("Java gateway process exited before sending its port number") - - with open(conn_info_file, "rb") as info: - gateway_port = read_int(info) - gateway_secret = UTF8Deserializer().loads(info) - finally: - shutil.rmtree(conn_info_dir) - - # In Windows, ensure the Java child processes do not linger after Python has exited. - # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when - # the parent process' stdin sends an EOF). 
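# Generic sketch of the SIGINT-shielding trick used above when spawning the
# gateway process: the child ignores Ctrl-C so only the Python parent handles it.
# (POSIX only; the command and names here are illustrative, not pyspark API.)
import signal
from subprocess import PIPE, Popen

def _ignore_sigint():
    signal.signal(signal.SIGINT, signal.SIG_IGN)

proc = Popen(["sleep", "5"], stdin=PIPE, preexec_fn=_ignore_sigint)
proc.wait()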
In Windows, however, this is not possible - # because java.lang.Process reads directly from the parent process' stdin, contending - # with any opportunity to read an EOF from the parent. Note that this is only best - # effort and will not take effect if the python process is violently terminated. - if on_windows: - # In Windows, the child process here is "spark-submit.cmd", not the JVM itself - # (because the UNIX "exec" command is not available). This means we cannot simply - # call proc.kill(), which kills only the "spark-submit.cmd" process but not the - # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all - # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx) - def killChild(): - Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)]) - atexit.register(killChild) - - # Connect to the gateway - gateway_params = GatewayParameters(port=gateway_port, auto_convert=True) - if not insecure: - gateway_params.auth_token = gateway_secret - gateway = JavaGateway(gateway_parameters=gateway_params) - - # Import the classes used by PySpark - java_import(gateway.jvm, "org.apache.spark.SparkConf") - java_import(gateway.jvm, "org.apache.spark.api.java.*") - java_import(gateway.jvm, "org.apache.spark.api.python.*") - java_import(gateway.jvm, "org.apache.spark.ml.python.*") - java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*") - # TODO(davies): move into sql - java_import(gateway.jvm, "org.apache.spark.sql.*") - java_import(gateway.jvm, "org.apache.spark.sql.api.python.*") - java_import(gateway.jvm, "org.apache.spark.sql.hive.*") - java_import(gateway.jvm, "scala.Tuple2") - - return gateway - - -def _do_server_auth(conn, auth_secret): - """ - Performs the authentication protocol defined by the SocketAuthHelper class on the given - file-like object 'conn'. - """ - write_with_length(auth_secret.encode("utf-8"), conn) - conn.flush() - reply = UTF8Deserializer().loads(conn) - if reply != "ok": - conn.close() - raise Exception("Unexpected reply from iterator server.") - - -def local_connect_and_auth(port, auth_secret): - """ - Connect to local host, authenticate with it, and return a (sockfile,sock) for that connection. - Handles IPV4 & IPV6, does some error handling. - :param port - :param auth_secret - :return: a tuple with (sockfile, sock) - """ - sock = None - errors = [] - # Support for both IPv4 and IPv6. - # On most of IPv6-ready systems, IPv6 will take precedence. - for res in socket.getaddrinfo("127.0.0.1", port, socket.AF_UNSPEC, socket.SOCK_STREAM): - af, socktype, proto, _, sa = res - try: - sock = socket.socket(af, socktype, proto) - sock.settimeout(15) - sock.connect(sa) - sockfile = sock.makefile("rwb", 65536) - _do_server_auth(sockfile, auth_secret) - return (sockfile, sock) - except socket.error as e: - emsg = _exception_message(e) - errors.append("tried to connect to %s, but an error occured: %s" % (sa, emsg)) - sock.close() - sock = None - else: - raise Exception("could not open socket: %s" % errors) - - -def ensure_callback_server_started(gw): - """ - Start callback server if not already started. The callback server is needed if the Java - driver process needs to callback into the Python driver process to execute Python code. 
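# Stand-alone sketch of the getaddrinfo/connect loop that local_connect_and_auth
# uses above: try each resolved local address until one connects
# (standard library only; names are illustrative).
import socket

def connect_local(port, timeout=15):
    errors = []
    for af, socktype, proto, _, sa in socket.getaddrinfo(
            "127.0.0.1", port, socket.AF_UNSPEC, socket.SOCK_STREAM):
        try:
            sock = socket.socket(af, socktype, proto)
            sock.settimeout(timeout)
            sock.connect(sa)
            return sock
        except OSError as exc:
            errors.append("tried %s: %s" % (sa, exc))
    raise OSError("could not open socket: %s" % errors)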
- """ - - # getattr will fallback to JVM, so we cannot test by hasattr() - if "_callback_server" not in gw.__dict__ or gw._callback_server is None: - gw.callback_server_parameters.eager_load = True - gw.callback_server_parameters.daemonize = True - gw.callback_server_parameters.daemonize_connections = True - gw.callback_server_parameters.port = 0 - gw.start_callback_server(gw.callback_server_parameters) - cbport = gw._callback_server.server_socket.getsockname()[1] - gw._callback_server.port = cbport - # gateway with real port - gw._python_proxy_port = gw._callback_server.port - # get the GatewayServer object in JVM by ID - jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client) - # update the port of CallbackClient with real port - jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/join.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/join.py deleted file mode 100644 index c1f5362..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/join.py +++ /dev/null @@ -1,113 +0,0 @@ -""" -Copyright (c) 2011, Douban Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - - * Neither the name of the Douban Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-""" - -from pyspark.resultiterable import ResultIterable -from functools import reduce - - -def _do_python_join(rdd, other, numPartitions, dispatch): - vs = rdd.mapValues(lambda v: (1, v)) - ws = other.mapValues(lambda v: (2, v)) - return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__())) - - -def python_join(rdd, other, numPartitions): - def dispatch(seq): - vbuf, wbuf = [], [] - for (n, v) in seq: - if n == 1: - vbuf.append(v) - elif n == 2: - wbuf.append(v) - return ((v, w) for v in vbuf for w in wbuf) - return _do_python_join(rdd, other, numPartitions, dispatch) - - -def python_right_outer_join(rdd, other, numPartitions): - def dispatch(seq): - vbuf, wbuf = [], [] - for (n, v) in seq: - if n == 1: - vbuf.append(v) - elif n == 2: - wbuf.append(v) - if not vbuf: - vbuf.append(None) - return ((v, w) for v in vbuf for w in wbuf) - return _do_python_join(rdd, other, numPartitions, dispatch) - - -def python_left_outer_join(rdd, other, numPartitions): - def dispatch(seq): - vbuf, wbuf = [], [] - for (n, v) in seq: - if n == 1: - vbuf.append(v) - elif n == 2: - wbuf.append(v) - if not wbuf: - wbuf.append(None) - return ((v, w) for v in vbuf for w in wbuf) - return _do_python_join(rdd, other, numPartitions, dispatch) - - -def python_full_outer_join(rdd, other, numPartitions): - def dispatch(seq): - vbuf, wbuf = [], [] - for (n, v) in seq: - if n == 1: - vbuf.append(v) - elif n == 2: - wbuf.append(v) - if not vbuf: - vbuf.append(None) - if not wbuf: - wbuf.append(None) - return ((v, w) for v in vbuf for w in wbuf) - return _do_python_join(rdd, other, numPartitions, dispatch) - - -def python_cogroup(rdds, numPartitions): - def make_mapper(i): - return lambda v: (i, v) - vrdds = [rdd.mapValues(make_mapper(i)) for i, rdd in enumerate(rdds)] - union_vrdds = reduce(lambda acc, other: acc.union(other), vrdds) - rdd_len = len(vrdds) - - def dispatch(seq): - bufs = [[] for _ in range(rdd_len)] - for n, v in seq: - bufs[n].append(v) - return tuple(ResultIterable(vs) for vs in bufs) - - return union_vrdds.groupByKey(numPartitions).mapValues(dispatch) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/__init__.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/__init__.py deleted file mode 100644 index d99a253..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -DataFrame-based machine learning APIs to let users quickly assemble and configure practical -machine learning pipelines. 
-""" -from pyspark.ml.base import Estimator, Model, Transformer, UnaryTransformer -from pyspark.ml.pipeline import Pipeline, PipelineModel -from pyspark.ml import classification, clustering, evaluation, feature, fpm, \ - image, pipeline, recommendation, regression, stat, tuning, util, linalg, param - -__all__ = [ - "Transformer", "UnaryTransformer", "Estimator", "Model", "Pipeline", "PipelineModel", - "classification", "clustering", "evaluation", "feature", "fpm", "image", - "recommendation", "regression", "stat", "tuning", "util", "linalg", "param", -] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/base.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/base.py deleted file mode 100644 index d4470b5..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/base.py +++ /dev/null @@ -1,237 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from abc import ABCMeta, abstractmethod - -import copy -import threading - -from pyspark import since -from pyspark.ml.param.shared import * -from pyspark.ml.common import inherit_doc -from pyspark.sql.functions import udf -from pyspark.sql.types import StructField, StructType - - -class _FitMultipleIterator(object): - """ - Used by default implementation of Estimator.fitMultiple to produce models in a thread safe - iterator. This class handles the simple case of fitMultiple where each param map should be - fit independently. - - :param fitSingleModel: Function: (int => Model) which fits an estimator to a dataset. - `fitSingleModel` may be called up to `numModels` times, with a unique index each time. - Each call to `fitSingleModel` with an index should return the Model associated with - that index. - :param numModel: Number of models this iterator should produce. - - See Estimator.fitMultiple for more info. - """ - def __init__(self, fitSingleModel, numModels): - """ - - """ - self.fitSingleModel = fitSingleModel - self.numModel = numModels - self.counter = 0 - self.lock = threading.Lock() - - def __iter__(self): - return self - - def __next__(self): - with self.lock: - index = self.counter - if index >= self.numModel: - raise StopIteration("No models remaining.") - self.counter += 1 - return index, self.fitSingleModel(index) - - def next(self): - """For python2 compatibility.""" - return self.__next__() - - -@inherit_doc -class Estimator(Params): - """ - Abstract class for estimators that fit models to data. - - .. versionadded:: 1.3.0 - """ - - __metaclass__ = ABCMeta - - @abstractmethod - def _fit(self, dataset): - """ - Fits a model to the input dataset. This is called by the default implementation of fit. 
- - :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame` - :returns: fitted model - """ - raise NotImplementedError() - - @since("2.3.0") - def fitMultiple(self, dataset, paramMaps): - """ - Fits a model to the input dataset for each param map in `paramMaps`. - - :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame`. - :param paramMaps: A Sequence of param maps. - :return: A thread safe iterable which contains one model for each param map. Each - call to `next(modelIterator)` will return `(index, model)` where model was fit - using `paramMaps[index]`. `index` values may not be sequential. - - .. note:: DeveloperApi - .. note:: Experimental - """ - estimator = self.copy() - - def fitSingleModel(index): - return estimator.fit(dataset, paramMaps[index]) - - return _FitMultipleIterator(fitSingleModel, len(paramMaps)) - - @since("1.3.0") - def fit(self, dataset, params=None): - """ - Fits a model to the input dataset with optional parameters. - - :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame` - :param params: an optional param map that overrides embedded params. If a list/tuple of - param maps is given, this calls fit on each param map and returns a list of - models. - :returns: fitted model(s) - """ - if params is None: - params = dict() - if isinstance(params, (list, tuple)): - models = [None] * len(params) - for index, model in self.fitMultiple(dataset, params): - models[index] = model - return models - elif isinstance(params, dict): - if params: - return self.copy(params)._fit(dataset) - else: - return self._fit(dataset) - else: - raise ValueError("Params must be either a param map or a list/tuple of param maps, " - "but got %s." % type(params)) - - -@inherit_doc -class Transformer(Params): - """ - Abstract class for transformers that transform one dataset into another. - - .. versionadded:: 1.3.0 - """ - - __metaclass__ = ABCMeta - - @abstractmethod - def _transform(self, dataset): - """ - Transforms the input dataset. - - :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame` - :returns: transformed dataset - """ - raise NotImplementedError() - - @since("1.3.0") - def transform(self, dataset, params=None): - """ - Transforms the input dataset with optional parameters. - - :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame` - :param params: an optional param map that overrides embedded params. - :returns: transformed dataset - """ - if params is None: - params = dict() - if isinstance(params, dict): - if params: - return self.copy(params)._transform(dataset) - else: - return self._transform(dataset) - else: - raise ValueError("Params must be a param map but got %s." % type(params)) - - -@inherit_doc -class Model(Transformer): - """ - Abstract class for models that are fitted by estimators. - - .. versionadded:: 1.4.0 - """ - - __metaclass__ = ABCMeta - - -@inherit_doc -class UnaryTransformer(HasInputCol, HasOutputCol, Transformer): - """ - Abstract class for transformers that take one input column, apply transformation, - and output the result as a new column. - - .. versionadded:: 2.3.0 - """ - - @abstractmethod - def createTransformFunc(self): - """ - Creates the transform function using the given param map. The input param map already takes - account of the embedded param map. So the param values should be determined - solely by the input param map. 
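# Toy sketch of the params dispatch in Estimator.fit above: a dict of overrides
# fits one model, a list/tuple of dicts fits one model per param map
# (no Spark involved; everything here is illustrative).
def fit(dataset, params=None):
    def fit_one(overrides):
        return ("model", sorted(overrides.items()))
    if params is None:
        params = {}
    if isinstance(params, (list, tuple)):
        return [fit_one(p) for p in params]
    if isinstance(params, dict):
        return fit_one(params)
    raise ValueError("Params must be a param map or a list/tuple of param maps, "
                     "but got %s." % type(params))

print(fit("df", [{"maxIter": 5}, {"maxIter": 10}]))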
- """ - raise NotImplementedError() - - @abstractmethod - def outputDataType(self): - """ - Returns the data type of the output column. - """ - raise NotImplementedError() - - @abstractmethod - def validateInputType(self, inputType): - """ - Validates the input type. Throw an exception if it is invalid. - """ - raise NotImplementedError() - - def transformSchema(self, schema): - inputType = schema[self.getInputCol()].dataType - self.validateInputType(inputType) - if self.getOutputCol() in schema.names: - raise ValueError("Output column %s already exists." % self.getOutputCol()) - outputFields = copy.copy(schema.fields) - outputFields.append(StructField(self.getOutputCol(), - self.outputDataType(), - nullable=False)) - return StructType(outputFields) - - def _transform(self, dataset): - self.transformSchema(dataset.schema) - transformUDF = udf(self.createTransformFunc(), self.outputDataType()) - transformedDataset = dataset.withColumn(self.getOutputCol(), - transformUDF(dataset[self.getInputCol()])) - return transformedDataset diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/classification.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/classification.py deleted file mode 100644 index ce02851..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/classification.py +++ /dev/null @@ -1,2090 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import operator -import sys -from multiprocessing.pool import ThreadPool - -from pyspark import since, keyword_only -from pyspark.ml import Estimator, Model -from pyspark.ml.param.shared import * -from pyspark.ml.regression import DecisionTreeModel, DecisionTreeRegressionModel, \ - RandomForestParams, TreeEnsembleModel, TreeEnsembleParams -from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams -from pyspark.ml.wrapper import JavaWrapper -from pyspark.ml.common import inherit_doc, _java2py, _py2java -from pyspark.sql import DataFrame -from pyspark.sql.functions import udf, when -from pyspark.sql.types import ArrayType, DoubleType -from pyspark.storagelevel import StorageLevel - -__all__ = ['LinearSVC', 'LinearSVCModel', - 'LogisticRegression', 'LogisticRegressionModel', - 'LogisticRegressionSummary', 'LogisticRegressionTrainingSummary', - 'BinaryLogisticRegressionSummary', 'BinaryLogisticRegressionTrainingSummary', - 'DecisionTreeClassifier', 'DecisionTreeClassificationModel', - 'GBTClassifier', 'GBTClassificationModel', - 'RandomForestClassifier', 'RandomForestClassificationModel', - 'NaiveBayes', 'NaiveBayesModel', - 'MultilayerPerceptronClassifier', 'MultilayerPerceptronClassificationModel', - 'OneVsRest', 'OneVsRestModel'] - - -@inherit_doc -class JavaClassificationModel(JavaPredictionModel): - """ - (Private) Java Model produced by a ``Classifier``. - Classes are indexed {0, 1, ..., numClasses - 1}. - To be mixed in with class:`pyspark.ml.JavaModel` - """ - - @property - @since("2.1.0") - def numClasses(self): - """ - Number of classes (values which the label can take). - """ - return self._call_java("numClasses") - - -@inherit_doc -class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - HasRegParam, HasTol, HasRawPredictionCol, HasFitIntercept, HasStandardization, - HasWeightCol, HasAggregationDepth, HasThreshold, JavaMLWritable, JavaMLReadable): - """ - .. note:: Experimental - - `Linear SVM Classifier `_ - - This binary classifier optimizes the Hinge Loss using the OWLQN optimizer. - Only supports L2 regularization currently. - - >>> from pyspark.sql import Row - >>> from pyspark.ml.linalg import Vectors - >>> df = sc.parallelize([ - ... Row(label=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), - ... Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() - >>> svm = LinearSVC(maxIter=5, regParam=0.01) - >>> model = svm.fit(df) - >>> model.coefficients - DenseVector([0.0, -0.2792, -0.1833]) - >>> model.intercept - 1.0206118982229047 - >>> model.numClasses - 2 - >>> model.numFeatures - 3 - >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, -1.0, -1.0))]).toDF() - >>> result = model.transform(test0).head() - >>> result.prediction - 1.0 - >>> result.rawPrediction - DenseVector([-1.4831, 1.4831]) - >>> svm_path = temp_path + "/svm" - >>> svm.save(svm_path) - >>> svm2 = LinearSVC.load(svm_path) - >>> svm2.getMaxIter() - 5 - >>> model_path = temp_path + "/svm_model" - >>> model.save(model_path) - >>> model2 = LinearSVCModel.load(model_path) - >>> model.coefficients[0] == model2.coefficients[0] - True - >>> model.intercept == model2.intercept - True - - .. versionadded:: 2.2.0 - """ - - threshold = Param(Params._dummy(), "threshold", - "The threshold in binary classification applied to the linear model" - " prediction. 
This threshold can be any real number, where Inf will make" - " all predictions 0.0 and -Inf will make all predictions 1.0.", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", - fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, - aggregationDepth=2): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \ - fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \ - aggregationDepth=2): - """ - super(LinearSVC, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.classification.LinearSVC", self.uid) - self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True, - standardization=True, threshold=0.0, aggregationDepth=2) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.2.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", - fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, - aggregationDepth=2): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \ - fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \ - aggregationDepth=2): - Sets params for Linear SVM Classifier. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return LinearSVCModel(java_model) - - -class LinearSVCModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): - """ - .. note:: Experimental - - Model fitted by LinearSVC. - - .. versionadded:: 2.2.0 - """ - - @property - @since("2.2.0") - def coefficients(self): - """ - Model coefficients of Linear SVM Classifier. - """ - return self._call_java("coefficients") - - @property - @since("2.2.0") - def intercept(self): - """ - Model intercept of Linear SVM Classifier. - """ - return self._call_java("intercept") - - -@inherit_doc -class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol, - HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, - HasWeightCol, HasAggregationDepth, JavaMLWritable, JavaMLReadable): - """ - Logistic regression. - This class supports multinomial logistic (softmax) and binomial logistic regression. - - >>> from pyspark.sql import Row - >>> from pyspark.ml.linalg import Vectors - >>> bdf = sc.parallelize([ - ... Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)), - ... Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)), - ... Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)), - ... Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))]).toDF() - >>> blor = LogisticRegression(regParam=0.01, weightCol="weight") - >>> blorModel = blor.fit(bdf) - >>> blorModel.coefficients - DenseVector([-1.080..., -0.646...]) - >>> blorModel.intercept - 3.112... 
- >>> data_path = "data/mllib/sample_multiclass_classification_data.txt" - >>> mdf = spark.read.format("libsvm").load(data_path) - >>> mlor = LogisticRegression(regParam=0.1, elasticNetParam=1.0, family="multinomial") - >>> mlorModel = mlor.fit(mdf) - >>> mlorModel.coefficientMatrix - SparseMatrix(3, 4, [0, 1, 2, 3], [3, 2, 1], [1.87..., -2.75..., -0.50...], 1) - >>> mlorModel.interceptVector - DenseVector([0.04..., -0.42..., 0.37...]) - >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 1.0))]).toDF() - >>> result = blorModel.transform(test0).head() - >>> result.prediction - 1.0 - >>> result.probability - DenseVector([0.02..., 0.97...]) - >>> result.rawPrediction - DenseVector([-3.54..., 3.54...]) - >>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF() - >>> blorModel.transform(test1).head().prediction - 1.0 - >>> blor.setParams("vector") - Traceback (most recent call last): - ... - TypeError: Method setParams forces keyword arguments. - >>> lr_path = temp_path + "/lr" - >>> blor.save(lr_path) - >>> lr2 = LogisticRegression.load(lr_path) - >>> lr2.getRegParam() - 0.01 - >>> model_path = temp_path + "/lr_model" - >>> blorModel.save(model_path) - >>> model2 = LogisticRegressionModel.load(model_path) - >>> blorModel.coefficients[0] == model2.coefficients[0] - True - >>> blorModel.intercept == model2.intercept - True - >>> model2 - LogisticRegressionModel: uid = ..., numClasses = 2, numFeatures = 2 - - .. versionadded:: 1.3.0 - """ - - threshold = Param(Params._dummy(), "threshold", - "Threshold in binary classification prediction, in range [0, 1]." + - " If threshold and thresholds are both set, they must match." + - "e.g. if threshold is p, then thresholds must be equal to [1-p, p].", - typeConverter=TypeConverters.toFloat) - - family = Param(Params._dummy(), "family", - "The name of family which is a description of the label distribution to " + - "be used in the model. Supported options: auto, binomial, multinomial", - typeConverter=TypeConverters.toString) - - lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients", - "The lower bounds on coefficients if fitting under bound " - "constrained optimization. The bound matrix must be " - "compatible with the shape " - "(1, number of features) for binomial regression, or " - "(number of classes, number of features) " - "for multinomial regression.", - typeConverter=TypeConverters.toMatrix) - - upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients", - "The upper bounds on coefficients if fitting under bound " - "constrained optimization. The bound matrix must be " - "compatible with the shape " - "(1, number of features) for binomial regression, or " - "(number of classes, number of features) " - "for multinomial regression.", - typeConverter=TypeConverters.toMatrix) - - lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts", - "The lower bounds on intercepts if fitting under bound " - "constrained optimization. The bounds vector size must be" - "equal with 1 for binomial regression, or the number of" - "lasses for multinomial regression.", - typeConverter=TypeConverters.toVector) - - upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts", - "The upper bounds on intercepts if fitting under bound " - "constrained optimization. 
The bound vector size must be " - "equal with 1 for binomial regression, or the number of " - "classes for multinomial regression.", - typeConverter=TypeConverters.toVector) - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - threshold=0.5, thresholds=None, probabilityCol="probability", - rawPredictionCol="rawPrediction", standardization=True, weightCol=None, - aggregationDepth=2, family="auto", - lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, - lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): - - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - threshold=0.5, thresholds=None, probabilityCol="probability", \ - rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ - aggregationDepth=2, family="auto", \ - lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ - lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): - If the threshold and thresholds Params are both set, they must be equivalent. - """ - super(LogisticRegression, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.classification.LogisticRegression", self.uid) - self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto") - kwargs = self._input_kwargs - self.setParams(**kwargs) - self._checkThresholdConsistency() - - @keyword_only - @since("1.3.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - threshold=0.5, thresholds=None, probabilityCol="probability", - rawPredictionCol="rawPrediction", standardization=True, weightCol=None, - aggregationDepth=2, family="auto", - lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, - lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - threshold=0.5, thresholds=None, probabilityCol="probability", \ - rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ - aggregationDepth=2, family="auto", \ - lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ - lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): - Sets params for logistic regression. - If the threshold and thresholds Params are both set, they must be equivalent. - """ - kwargs = self._input_kwargs - self._set(**kwargs) - self._checkThresholdConsistency() - return self - - def _create_model(self, java_model): - return LogisticRegressionModel(java_model) - - @since("1.4.0") - def setThreshold(self, value): - """ - Sets the value of :py:attr:`threshold`. - Clears value of :py:attr:`thresholds` if it has been set. - """ - self._set(threshold=value) - self._clear(self.thresholds) - return self - - @since("1.4.0") - def getThreshold(self): - """ - Get threshold for binary classification. - - If :py:attr:`thresholds` is set with length 2 (i.e., binary classification), - this returns the equivalent threshold: - :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`. - Otherwise, returns :py:attr:`threshold` if set or its default value if unset. 
- """ - self._checkThresholdConsistency() - if self.isSet(self.thresholds): - ts = self.getOrDefault(self.thresholds) - if len(ts) != 2: - raise ValueError("Logistic Regression getThreshold only applies to" + - " binary classification, but thresholds has length != 2." + - " thresholds: " + ",".join(ts)) - return 1.0/(1.0 + ts[0]/ts[1]) - else: - return self.getOrDefault(self.threshold) - - @since("1.5.0") - def setThresholds(self, value): - """ - Sets the value of :py:attr:`thresholds`. - Clears value of :py:attr:`threshold` if it has been set. - """ - self._set(thresholds=value) - self._clear(self.threshold) - return self - - @since("1.5.0") - def getThresholds(self): - """ - If :py:attr:`thresholds` is set, return its value. - Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for binary - classification: (1-threshold, threshold). - If neither are set, throw an error. - """ - self._checkThresholdConsistency() - if not self.isSet(self.thresholds) and self.isSet(self.threshold): - t = self.getOrDefault(self.threshold) - return [1.0-t, t] - else: - return self.getOrDefault(self.thresholds) - - def _checkThresholdConsistency(self): - if self.isSet(self.threshold) and self.isSet(self.thresholds): - ts = self.getOrDefault(self.thresholds) - if len(ts) != 2: - raise ValueError("Logistic Regression getThreshold only applies to" + - " binary classification, but thresholds has length != 2." + - " thresholds: {0}".format(str(ts))) - t = 1.0/(1.0 + ts[0]/ts[1]) - t2 = self.getOrDefault(self.threshold) - if abs(t2 - t) >= 1E-5: - raise ValueError("Logistic Regression getThreshold found inconsistent values for" + - " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) - - @since("2.1.0") - def setFamily(self, value): - """ - Sets the value of :py:attr:`family`. - """ - return self._set(family=value) - - @since("2.1.0") - def getFamily(self): - """ - Gets the value of :py:attr:`family` or its default value. 
- """ - return self.getOrDefault(self.family) - - @since("2.3.0") - def setLowerBoundsOnCoefficients(self, value): - """ - Sets the value of :py:attr:`lowerBoundsOnCoefficients` - """ - return self._set(lowerBoundsOnCoefficients=value) - - @since("2.3.0") - def getLowerBoundsOnCoefficients(self): - """ - Gets the value of :py:attr:`lowerBoundsOnCoefficients` - """ - return self.getOrDefault(self.lowerBoundsOnCoefficients) - - @since("2.3.0") - def setUpperBoundsOnCoefficients(self, value): - """ - Sets the value of :py:attr:`upperBoundsOnCoefficients` - """ - return self._set(upperBoundsOnCoefficients=value) - - @since("2.3.0") - def getUpperBoundsOnCoefficients(self): - """ - Gets the value of :py:attr:`upperBoundsOnCoefficients` - """ - return self.getOrDefault(self.upperBoundsOnCoefficients) - - @since("2.3.0") - def setLowerBoundsOnIntercepts(self, value): - """ - Sets the value of :py:attr:`lowerBoundsOnIntercepts` - """ - return self._set(lowerBoundsOnIntercepts=value) - - @since("2.3.0") - def getLowerBoundsOnIntercepts(self): - """ - Gets the value of :py:attr:`lowerBoundsOnIntercepts` - """ - return self.getOrDefault(self.lowerBoundsOnIntercepts) - - @since("2.3.0") - def setUpperBoundsOnIntercepts(self, value): - """ - Sets the value of :py:attr:`upperBoundsOnIntercepts` - """ - return self._set(upperBoundsOnIntercepts=value) - - @since("2.3.0") - def getUpperBoundsOnIntercepts(self): - """ - Gets the value of :py:attr:`upperBoundsOnIntercepts` - """ - return self.getOrDefault(self.upperBoundsOnIntercepts) - - -class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): - """ - Model fitted by LogisticRegression. - - .. versionadded:: 1.3.0 - """ - - @property - @since("2.0.0") - def coefficients(self): - """ - Model coefficients of binomial logistic regression. - An exception is thrown in the case of multinomial logistic regression. - """ - return self._call_java("coefficients") - - @property - @since("1.4.0") - def intercept(self): - """ - Model intercept of binomial logistic regression. - An exception is thrown in the case of multinomial logistic regression. - """ - return self._call_java("intercept") - - @property - @since("2.1.0") - def coefficientMatrix(self): - """ - Model coefficients. - """ - return self._call_java("coefficientMatrix") - - @property - @since("2.1.0") - def interceptVector(self): - """ - Model intercept. - """ - return self._call_java("interceptVector") - - @property - @since("2.0.0") - def summary(self): - """ - Gets summary (e.g. accuracy/precision/recall, objective history, total iterations) of model - trained on the training set. An exception is thrown if `trainingSummary is None`. - """ - if self.hasSummary: - java_lrt_summary = self._call_java("summary") - if self.numClasses <= 2: - return BinaryLogisticRegressionTrainingSummary(java_lrt_summary) - else: - return LogisticRegressionTrainingSummary(java_lrt_summary) - else: - raise RuntimeError("No training summary available for this %s" % - self.__class__.__name__) - - @property - @since("2.0.0") - def hasSummary(self): - """ - Indicates whether a training summary exists for this model - instance. - """ - return self._call_java("hasSummary") - - @since("2.0.0") - def evaluate(self, dataset): - """ - Evaluates the model on a test dataset. 
- - :param dataset: - Test dataset to evaluate model on, where dataset is an - instance of :py:class:`pyspark.sql.DataFrame` - """ - if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) - java_blr_summary = self._call_java("evaluate", dataset) - return BinaryLogisticRegressionSummary(java_blr_summary) - - def __repr__(self): - return self._call_java("toString") - - -class LogisticRegressionSummary(JavaWrapper): - """ - .. note:: Experimental - - Abstraction for Logistic Regression Results for a given model. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def predictions(self): - """ - Dataframe outputted by the model's `transform` method. - """ - return self._call_java("predictions") - - @property - @since("2.0.0") - def probabilityCol(self): - """ - Field in "predictions" which gives the probability - of each class as a vector. - """ - return self._call_java("probabilityCol") - - @property - @since("2.3.0") - def predictionCol(self): - """ - Field in "predictions" which gives the prediction of each class. - """ - return self._call_java("predictionCol") - - @property - @since("2.0.0") - def labelCol(self): - """ - Field in "predictions" which gives the true label of each - instance. - """ - return self._call_java("labelCol") - - @property - @since("2.0.0") - def featuresCol(self): - """ - Field in "predictions" which gives the features of each instance - as a vector. - """ - return self._call_java("featuresCol") - - @property - @since("2.3.0") - def labels(self): - """ - Returns the sequence of labels in ascending order. This order matches the order used - in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. - - Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the - training set is missing a label, then all of the arrays over labels - (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the - expected numClasses. - """ - return self._call_java("labels") - - @property - @since("2.3.0") - def truePositiveRateByLabel(self): - """ - Returns true positive rate for each label (category). - """ - return self._call_java("truePositiveRateByLabel") - - @property - @since("2.3.0") - def falsePositiveRateByLabel(self): - """ - Returns false positive rate for each label (category). - """ - return self._call_java("falsePositiveRateByLabel") - - @property - @since("2.3.0") - def precisionByLabel(self): - """ - Returns precision for each label (category). - """ - return self._call_java("precisionByLabel") - - @property - @since("2.3.0") - def recallByLabel(self): - """ - Returns recall for each label (category). - """ - return self._call_java("recallByLabel") - - @since("2.3.0") - def fMeasureByLabel(self, beta=1.0): - """ - Returns f-measure for each label (category). - """ - return self._call_java("fMeasureByLabel", beta) - - @property - @since("2.3.0") - def accuracy(self): - """ - Returns accuracy. - (equals to the total number of correctly classified instances - out of the total number of instances.) - """ - return self._call_java("accuracy") - - @property - @since("2.3.0") - def weightedTruePositiveRate(self): - """ - Returns weighted true positive rate. - (equals to precision, recall and f-measure) - """ - return self._call_java("weightedTruePositiveRate") - - @property - @since("2.3.0") - def weightedFalsePositiveRate(self): - """ - Returns weighted false positive rate. 
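# Sketch of reading the per-label metrics exposed above (assumes `summary` is
# a LogisticRegressionSummary obtained from a fitted model).
def per_label_report(summary):
    rows = zip(summary.labels,
               summary.precisionByLabel,
               summary.recallByLabel,
               summary.fMeasureByLabel(beta=1.0))
    for label, precision, recall, f1 in rows:
        print("label=%s precision=%.3f recall=%.3f f1=%.3f"
              % (label, precision, recall, f1))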
- """ - return self._call_java("weightedFalsePositiveRate") - - @property - @since("2.3.0") - def weightedRecall(self): - """ - Returns weighted averaged recall. - (equals to precision, recall and f-measure) - """ - return self._call_java("weightedRecall") - - @property - @since("2.3.0") - def weightedPrecision(self): - """ - Returns weighted averaged precision. - """ - return self._call_java("weightedPrecision") - - @since("2.3.0") - def weightedFMeasure(self, beta=1.0): - """ - Returns weighted averaged f-measure. - """ - return self._call_java("weightedFMeasure", beta) - - -@inherit_doc -class LogisticRegressionTrainingSummary(LogisticRegressionSummary): - """ - .. note:: Experimental - - Abstraction for multinomial Logistic Regression Training results. - Currently, the training summary ignores the training weights except - for the objective trace. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def objectiveHistory(self): - """ - Objective function (scaled loss + regularization) at each - iteration. - """ - return self._call_java("objectiveHistory") - - @property - @since("2.0.0") - def totalIterations(self): - """ - Number of training iterations until termination. - """ - return self._call_java("totalIterations") - - -@inherit_doc -class BinaryLogisticRegressionSummary(LogisticRegressionSummary): - """ - .. note:: Experimental - - Binary Logistic regression results for a given model. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def roc(self): - """ - Returns the receiver operating characteristic (ROC) curve, - which is a Dataframe having two fields (FPR, TPR) with - (0.0, 0.0) prepended and (1.0, 1.0) appended to it. - - .. seealso:: `Wikipedia reference - `_ - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("roc") - - @property - @since("2.0.0") - def areaUnderROC(self): - """ - Computes the area under the receiver operating characteristic - (ROC) curve. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("areaUnderROC") - - @property - @since("2.0.0") - def pr(self): - """ - Returns the precision-recall curve, which is a Dataframe - containing two fields recall, precision with (0.0, 1.0) prepended - to it. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("pr") - - @property - @since("2.0.0") - def fMeasureByThreshold(self): - """ - Returns a dataframe with two fields (threshold, F-Measure) curve - with beta = 1.0. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("fMeasureByThreshold") - - @property - @since("2.0.0") - def precisionByThreshold(self): - """ - Returns a dataframe with two fields (threshold, precision) curve. - Every possible probability obtained in transforming the dataset - are used as thresholds used in calculating the precision. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. 
- """ - return self._call_java("precisionByThreshold") - - @property - @since("2.0.0") - def recallByThreshold(self): - """ - Returns a dataframe with two fields (threshold, recall) curve. - Every possible probability obtained in transforming the dataset - are used as thresholds used in calculating the recall. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("recallByThreshold") - - -@inherit_doc -class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, - LogisticRegressionTrainingSummary): - """ - .. note:: Experimental - - Binary Logistic regression training results for a given model. - - .. versionadded:: 2.0.0 - """ - pass - - -class TreeClassifierParams(object): - """ - Private class to track supported impurity measures. - - .. versionadded:: 1.4.0 - """ - supportedImpurities = ["entropy", "gini"] - - impurity = Param(Params._dummy(), "impurity", - "Criterion used for information gain calculation (case-insensitive). " + - "Supported options: " + - ", ".join(supportedImpurities), typeConverter=TypeConverters.toString) - - def __init__(self): - super(TreeClassifierParams, self).__init__() - - @since("1.6.0") - def setImpurity(self, value): - """ - Sets the value of :py:attr:`impurity`. - """ - return self._set(impurity=value) - - @since("1.6.0") - def getImpurity(self): - """ - Gets the value of impurity or its default value. - """ - return self.getOrDefault(self.impurity) - - -class GBTParams(TreeEnsembleParams): - """ - Private class to track supported GBT params. - - .. versionadded:: 1.4.0 - """ - supportedLossTypes = ["logistic"] - - -@inherit_doc -class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasProbabilityCol, HasRawPredictionCol, DecisionTreeParams, - TreeClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable, - JavaMLReadable): - """ - `Decision tree `_ - learning algorithm for classification. - It supports both binary and multiclass labels, as well as both continuous and categorical - features. - - >>> from pyspark.ml.linalg import Vectors - >>> from pyspark.ml.feature import StringIndexer - >>> df = spark.createDataFrame([ - ... (1.0, Vectors.dense(1.0)), - ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") - >>> si_model = stringIndexer.fit(df) - >>> td = si_model.transform(df) - >>> dt = DecisionTreeClassifier(maxDepth=2, labelCol="indexed") - >>> model = dt.fit(td) - >>> model.numNodes - 3 - >>> model.depth - 1 - >>> model.featureImportances - SparseVector(1, {0: 1.0}) - >>> model.numFeatures - 1 - >>> model.numClasses - 2 - >>> print(model.toDebugString) - DecisionTreeClassificationModel (uid=...) of depth 1 with 3 nodes... 
- >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> result = model.transform(test0).head() - >>> result.prediction - 0.0 - >>> result.probability - DenseVector([1.0, 0.0]) - >>> result.rawPrediction - DenseVector([1.0, 0.0]) - >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) - >>> model.transform(test1).head().prediction - 1.0 - - >>> dtc_path = temp_path + "/dtc" - >>> dt.save(dtc_path) - >>> dt2 = DecisionTreeClassifier.load(dtc_path) - >>> dt2.getMaxDepth() - 2 - >>> model_path = temp_path + "/dtc_model" - >>> model.save(model_path) - >>> model2 = DecisionTreeClassificationModel.load(model_path) - >>> model.featureImportances == model2.featureImportances - True - - .. versionadded:: 1.4.0 - """ - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - probabilityCol="probability", rawPredictionCol="rawPrediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", - seed=None): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - probabilityCol="probability", rawPredictionCol="rawPrediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ - seed=None) - """ - super(DecisionTreeClassifier, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.classification.DecisionTreeClassifier", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="gini") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - probabilityCol="probability", rawPredictionCol="rawPrediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="gini", seed=None): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - probabilityCol="probability", rawPredictionCol="rawPrediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ - seed=None) - Sets params for the DecisionTreeClassifier. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return DecisionTreeClassificationModel(java_model) - - -@inherit_doc -class DecisionTreeClassificationModel(DecisionTreeModel, JavaClassificationModel, JavaMLWritable, - JavaMLReadable): - """ - Model fitted by DecisionTreeClassifier. - - .. versionadded:: 1.4.0 - """ - - @property - @since("2.0.0") - def featureImportances(self): - """ - Estimate of the importance of each feature. - - This generalizes the idea of "Gini" importance to other losses, - following the explanation of Gini importance from "Random Forests" documentation - by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. - - This feature importance is calculated as follows: - - importance(feature j) = sum (over nodes which split on feature j) of the gain, - where gain is scaled by the number of instances passing through node - - Normalize importances for tree to sum to 1. - - .. 
note:: Feature importance for single decision trees can have high variance due to - correlated predictor variables. Consider using a :py:class:`RandomForestClassifier` - to determine feature importance instead. - """ - return self._call_java("featureImportances") - - -@inherit_doc -class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, - HasRawPredictionCol, HasProbabilityCol, - RandomForestParams, TreeClassifierParams, HasCheckpointInterval, - JavaMLWritable, JavaMLReadable): - """ - `Random Forest `_ - learning algorithm for classification. - It supports both binary and multiclass labels, as well as both continuous and categorical - features. - - >>> import numpy - >>> from numpy import allclose - >>> from pyspark.ml.linalg import Vectors - >>> from pyspark.ml.feature import StringIndexer - >>> df = spark.createDataFrame([ - ... (1.0, Vectors.dense(1.0)), - ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") - >>> si_model = stringIndexer.fit(df) - >>> td = si_model.transform(df) - >>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42) - >>> model = rf.fit(td) - >>> model.featureImportances - SparseVector(1, {0: 1.0}) - >>> allclose(model.treeWeights, [1.0, 1.0, 1.0]) - True - >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> result = model.transform(test0).head() - >>> result.prediction - 0.0 - >>> numpy.argmax(result.probability) - 0 - >>> numpy.argmax(result.rawPrediction) - 0 - >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) - >>> model.transform(test1).head().prediction - 1.0 - >>> model.trees - [DecisionTreeClassificationModel (uid=...) of depth..., DecisionTreeClassificationModel...] - >>> rfc_path = temp_path + "/rfc" - >>> rf.save(rfc_path) - >>> rf2 = RandomForestClassifier.load(rfc_path) - >>> rf2.getNumTrees() - 3 - >>> model_path = temp_path + "/rfc_model" - >>> model.save(model_path) - >>> model2 = RandomForestClassificationModel.load(model_path) - >>> model.featureImportances == model2.featureImportances - True - - .. 
versionadded:: 1.4.0 - """ - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - probabilityCol="probability", rawPredictionCol="rawPrediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", - numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - probabilityCol="probability", rawPredictionCol="rawPrediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ - numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0) - """ - super(RandomForestClassifier, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.classification.RandomForestClassifier", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="gini", numTrees=20, featureSubsetStrategy="auto", - subsamplingRate=1.0) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - probabilityCol="probability", rawPredictionCol="rawPrediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, - impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - probabilityCol="probability", rawPredictionCol="rawPrediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \ - impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0) - Sets params for linear classification. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return RandomForestClassificationModel(java_model) - - @since("2.4.0") - def setFeatureSubsetStrategy(self, value): - """ - Sets the value of :py:attr:`featureSubsetStrategy`. - """ - return self._set(featureSubsetStrategy=value) - - -class RandomForestClassificationModel(TreeEnsembleModel, JavaClassificationModel, JavaMLWritable, - JavaMLReadable): - """ - Model fitted by RandomForestClassifier. - - .. versionadded:: 1.4.0 - """ - - @property - @since("2.0.0") - def featureImportances(self): - """ - Estimate of the importance of each feature. - - Each feature's importance is the average of its importance across all trees in the ensemble - The importance vector is normalized to sum to 1. This method is suggested by Hastie et al. - (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.) - and follows the implementation from scikit-learn. - - .. seealso:: :py:attr:`DecisionTreeClassificationModel.featureImportances` - """ - return self._call_java("featureImportances") - - @property - @since("2.0.0") - def trees(self): - """Trees in this ensemble. 
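An illustrative sketch of the aggregation described above — ensemble importances as the normalized average of per-tree importances (assumes an active SparkSession available as spark; the tiny dataset is only for demonstration):

import numpy as np
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
    (0.0, Vectors.dense([0.0, 1.0])),
    (1.0, Vectors.dense([1.0, 0.0]))], ["label", "features"])

model = RandomForestClassifier(numTrees=3, maxDepth=2, seed=42).fit(df)

print(model.featureImportances)     # ensemble-level importance vector

# Average the per-tree importance vectors and renormalize; per the
# description above this is expected to agree with the ensemble vector.
per_tree = np.array([t.featureImportances.toArray() for t in model.trees])
avg = per_tree.mean(axis=0)
print(avg / avg.sum())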
Warning: These have null parent Estimators.""" - return [DecisionTreeClassificationModel(m) for m in list(self._call_java("trees"))] - - -@inherit_doc -class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, - JavaMLReadable): - """ - `Gradient-Boosted Trees (GBTs) `_ - learning algorithm for classification. - It supports binary labels, as well as both continuous and categorical features. - - The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999. - - Notes on Gradient Boosting vs. TreeBoost: - - This implementation is for Stochastic Gradient Boosting, not for TreeBoost. - - Both algorithms learn tree ensembles by minimizing loss functions. - - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes - based on the loss function, whereas the original gradient boosting method does not. - - We expect to implement TreeBoost in the future: - `SPARK-4240 `_ - - .. note:: Multiclass labels are not currently supported. - - >>> from numpy import allclose - >>> from pyspark.ml.linalg import Vectors - >>> from pyspark.ml.feature import StringIndexer - >>> df = spark.createDataFrame([ - ... (1.0, Vectors.dense(1.0)), - ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") - >>> si_model = stringIndexer.fit(df) - >>> td = si_model.transform(df) - >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42) - >>> gbt.getFeatureSubsetStrategy() - 'all' - >>> model = gbt.fit(td) - >>> model.featureImportances - SparseVector(1, {0: 1.0}) - >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) - True - >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> model.transform(test0).head().prediction - 0.0 - >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) - >>> model.transform(test1).head().prediction - 1.0 - >>> model.totalNumNodes - 15 - >>> print(model.toDebugString) - GBTClassificationModel (uid=...)...with 5 trees... - >>> gbtc_path = temp_path + "gbtc" - >>> gbt.save(gbtc_path) - >>> gbt2 = GBTClassifier.load(gbtc_path) - >>> gbt2.getMaxDepth() - 2 - >>> model_path = temp_path + "gbtc_model" - >>> model.save(model_path) - >>> model2 = GBTClassificationModel.load(model_path) - >>> model.featureImportances == model2.featureImportances - True - >>> model.treeWeights == model2.treeWeights - True - >>> model.trees - [DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...] - >>> validation = spark.createDataFrame([(0.0, Vectors.dense(-1.0),)], - ... ["indexed", "features"]) - >>> model.evaluateEachIteration(validation) - [0.25..., 0.23..., 0.21..., 0.19..., 0.18...] - >>> model.numClasses - 2 - - .. versionadded:: 1.4.0 - """ - - lossType = Param(Params._dummy(), "lossType", - "Loss function which GBT tries to minimize (case-insensitive). " + - "Supported options: " + ", ".join(GBTParams.supportedLossTypes), - typeConverter=TypeConverters.toString) - - stepSize = Param(Params._dummy(), "stepSize", - "Step size (a.k.a. 
learning rate) in interval (0, 1] for shrinking " + - "the contribution of each estimator.", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", - maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, - featureSubsetStrategy="all"): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ - featureSubsetStrategy="all") - """ - super(GBTClassifier, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.classification.GBTClassifier", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0, - featureSubsetStrategy="all") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, - featureSubsetStrategy="all"): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ - featureSubsetStrategy="all") - Sets params for Gradient Boosted Tree Classification. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return GBTClassificationModel(java_model) - - @since("1.4.0") - def setLossType(self, value): - """ - Sets the value of :py:attr:`lossType`. - """ - return self._set(lossType=value) - - @since("1.4.0") - def getLossType(self): - """ - Gets the value of lossType or its default value. - """ - return self.getOrDefault(self.lossType) - - @since("2.4.0") - def setFeatureSubsetStrategy(self, value): - """ - Sets the value of :py:attr:`featureSubsetStrategy`. - """ - return self._set(featureSubsetStrategy=value) - - -class GBTClassificationModel(TreeEnsembleModel, JavaClassificationModel, JavaMLWritable, - JavaMLReadable): - """ - Model fitted by GBTClassifier. - - .. versionadded:: 1.4.0 - """ - - @property - @since("2.0.0") - def featureImportances(self): - """ - Estimate of the importance of each feature. - - Each feature's importance is the average of its importance across all trees in the ensemble - The importance vector is normalized to sum to 1. This method is suggested by Hastie et al. - (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.) - and follows the implementation from scikit-learn. - - .. seealso:: :py:attr:`DecisionTreeClassificationModel.featureImportances` - """ - return self._call_java("featureImportances") - - @property - @since("2.0.0") - def trees(self): - """Trees in this ensemble. 
Warning: These have null parent Estimators.""" - return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] - - @since("2.4.0") - def evaluateEachIteration(self, dataset): - """ - Method to compute error or loss for every iteration of gradient boosting. - - :param dataset: - Test dataset to evaluate model on, where dataset is an - instance of :py:class:`pyspark.sql.DataFrame` - """ - return self._call_java("evaluateEachIteration", dataset) - - -@inherit_doc -class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol, - HasRawPredictionCol, HasThresholds, HasWeightCol, JavaMLWritable, JavaMLReadable): - """ - Naive Bayes Classifiers. - It supports both Multinomial and Bernoulli NB. `Multinomial NB - `_ - can handle finitely supported discrete data. For example, by converting documents into - TF-IDF vectors, it can be used for document classification. By making every vector a - binary (0/1) data, it can also be used as `Bernoulli NB - `_. - The input feature values must be nonnegative. - - >>> from pyspark.sql import Row - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])), - ... Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])), - ... Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))]) - >>> nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight") - >>> model = nb.fit(df) - >>> model.pi - DenseVector([-0.81..., -0.58...]) - >>> model.theta - DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1) - >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() - >>> result = model.transform(test0).head() - >>> result.prediction - 1.0 - >>> result.probability - DenseVector([0.32..., 0.67...]) - >>> result.rawPrediction - DenseVector([-1.72..., -0.99...]) - >>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF() - >>> model.transform(test1).head().prediction - 1.0 - >>> nb_path = temp_path + "/nb" - >>> nb.save(nb_path) - >>> nb2 = NaiveBayes.load(nb_path) - >>> nb2.getSmoothing() - 1.0 - >>> model_path = temp_path + "/nb_model" - >>> model.save(model_path) - >>> model2 = NaiveBayesModel.load(model_path) - >>> model.pi == model2.pi - True - >>> model.theta == model2.theta - True - >>> nb = nb.setThresholds([0.01, 10.00]) - >>> model3 = nb.fit(df) - >>> result = model3.transform(test0).head() - >>> result.prediction - 0.0 - - .. versionadded:: 1.5.0 - """ - - smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " + - "default is 1.0", typeConverter=TypeConverters.toFloat) - modelType = Param(Params._dummy(), "modelType", "The model type which is a string " + - "(case-sensitive). 
Supported options: multinomial (default) and bernoulli.", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, - modelType="multinomial", thresholds=None, weightCol=None): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, \ - modelType="multinomial", thresholds=None, weightCol=None) - """ - super(NaiveBayes, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.classification.NaiveBayes", self.uid) - self._setDefault(smoothing=1.0, modelType="multinomial") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.5.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, - modelType="multinomial", thresholds=None, weightCol=None): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - probabilityCol="probability", rawPredictionCol="rawPrediction", smoothing=1.0, \ - modelType="multinomial", thresholds=None, weightCol=None) - Sets params for Naive Bayes. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return NaiveBayesModel(java_model) - - @since("1.5.0") - def setSmoothing(self, value): - """ - Sets the value of :py:attr:`smoothing`. - """ - return self._set(smoothing=value) - - @since("1.5.0") - def getSmoothing(self): - """ - Gets the value of smoothing or its default value. - """ - return self.getOrDefault(self.smoothing) - - @since("1.5.0") - def setModelType(self, value): - """ - Sets the value of :py:attr:`modelType`. - """ - return self._set(modelType=value) - - @since("1.5.0") - def getModelType(self): - """ - Gets the value of modelType or its default value. - """ - return self.getOrDefault(self.modelType) - - -class NaiveBayesModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): - """ - Model fitted by NaiveBayes. - - .. versionadded:: 1.5.0 - """ - - @property - @since("2.0.0") - def pi(self): - """ - log of class priors. - """ - return self._call_java("pi") - - @property - @since("2.0.0") - def theta(self): - """ - log of class conditional probabilities. - """ - return self._call_java("theta") - - -@inherit_doc -class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasMaxIter, HasTol, HasSeed, HasStepSize, HasSolver, - JavaMLWritable, JavaMLReadable, HasProbabilityCol, - HasRawPredictionCol): - """ - Classifier trainer based on the Multilayer Perceptron. - Each layer has sigmoid activation function, output layer has softmax. - Number of inputs has to be equal to the size of feature vectors. - Number of outputs has to be equal to the total number of labels. - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... (0.0, Vectors.dense([0.0, 0.0])), - ... (1.0, Vectors.dense([0.0, 1.0])), - ... (1.0, Vectors.dense([1.0, 0.0])), - ... (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"]) - >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 2, 2], blockSize=1, seed=123) - >>> model = mlp.fit(df) - >>> model.layers - [2, 2, 2] - >>> model.weights.size - 12 - >>> testDF = spark.createDataFrame([ - ... 
(Vectors.dense([1.0, 0.0]),), - ... (Vectors.dense([0.0, 0.0]),)], ["features"]) - >>> model.transform(testDF).select("features", "prediction").show() - +---------+----------+ - | features|prediction| - +---------+----------+ - |[1.0,0.0]| 1.0| - |[0.0,0.0]| 0.0| - +---------+----------+ - ... - >>> mlp_path = temp_path + "/mlp" - >>> mlp.save(mlp_path) - >>> mlp2 = MultilayerPerceptronClassifier.load(mlp_path) - >>> mlp2.getBlockSize() - 1 - >>> model_path = temp_path + "/mlp_model" - >>> model.save(model_path) - >>> model2 = MultilayerPerceptronClassificationModel.load(model_path) - >>> model.layers == model2.layers - True - >>> model.weights == model2.weights - True - >>> mlp2 = mlp2.setInitialWeights(list(range(0, 12))) - >>> model3 = mlp2.fit(df) - >>> model3.weights != model2.weights - True - >>> model3.layers == model.layers - True - - .. versionadded:: 1.6.0 - """ - - layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " + - "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " + - "neurons and output layer of 10 neurons.", - typeConverter=TypeConverters.toListInt) - blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " + - "matrices. Data is stacked within partitions. If block size is more than " + - "remaining data in a partition then it is adjusted to the size of this " + - "data. Recommended size is between 10 and 1000, default is 128.", - typeConverter=TypeConverters.toInt) - solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + - "options: l-bfgs, gd.", typeConverter=TypeConverters.toString) - initialWeights = Param(Params._dummy(), "initialWeights", "The initial weights of the model.", - typeConverter=TypeConverters.toVector) - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, - solver="l-bfgs", initialWeights=None, probabilityCol="probability", - rawPredictionCol="rawPrediction"): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \ - solver="l-bfgs", initialWeights=None, probabilityCol="probability", \ - rawPredictionCol="rawPrediction") - """ - super(MultilayerPerceptronClassifier, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid) - self._setDefault(maxIter=100, tol=1E-6, blockSize=128, stepSize=0.03, solver="l-bfgs") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.6.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, - solver="l-bfgs", initialWeights=None, probabilityCol="probability", - rawPredictionCol="rawPrediction"): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, \ - solver="l-bfgs", initialWeights=None, probabilityCol="probability", \ - rawPredictionCol="rawPrediction"): - Sets params for MultilayerPerceptronClassifier. 
- """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return MultilayerPerceptronClassificationModel(java_model) - - @since("1.6.0") - def setLayers(self, value): - """ - Sets the value of :py:attr:`layers`. - """ - return self._set(layers=value) - - @since("1.6.0") - def getLayers(self): - """ - Gets the value of layers or its default value. - """ - return self.getOrDefault(self.layers) - - @since("1.6.0") - def setBlockSize(self, value): - """ - Sets the value of :py:attr:`blockSize`. - """ - return self._set(blockSize=value) - - @since("1.6.0") - def getBlockSize(self): - """ - Gets the value of blockSize or its default value. - """ - return self.getOrDefault(self.blockSize) - - @since("2.0.0") - def setStepSize(self, value): - """ - Sets the value of :py:attr:`stepSize`. - """ - return self._set(stepSize=value) - - @since("2.0.0") - def getStepSize(self): - """ - Gets the value of stepSize or its default value. - """ - return self.getOrDefault(self.stepSize) - - @since("2.0.0") - def setInitialWeights(self, value): - """ - Sets the value of :py:attr:`initialWeights`. - """ - return self._set(initialWeights=value) - - @since("2.0.0") - def getInitialWeights(self): - """ - Gets the value of initialWeights or its default value. - """ - return self.getOrDefault(self.initialWeights) - - -class MultilayerPerceptronClassificationModel(JavaModel, JavaClassificationModel, JavaMLWritable, - JavaMLReadable): - """ - Model fitted by MultilayerPerceptronClassifier. - - .. versionadded:: 1.6.0 - """ - - @property - @since("1.6.0") - def layers(self): - """ - array of layer sizes including input and output layers. - """ - return self._call_java("javaLayers") - - @property - @since("2.0.0") - def weights(self): - """ - the weights of layers. - """ - return self._call_java("weights") - - -class OneVsRestParams(HasFeaturesCol, HasLabelCol, HasWeightCol, HasPredictionCol): - """ - Parameters for OneVsRest and OneVsRestModel. - """ - - classifier = Param(Params._dummy(), "classifier", "base binary classifier") - - @since("2.0.0") - def setClassifier(self, value): - """ - Sets the value of :py:attr:`classifier`. - - .. note:: Only LogisticRegression and NaiveBayes are supported now. - """ - return self._set(classifier=value) - - @since("2.0.0") - def getClassifier(self): - """ - Gets the value of classifier or its default value. - """ - return self.getOrDefault(self.classifier) - - -@inherit_doc -class OneVsRest(Estimator, OneVsRestParams, HasParallelism, JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Reduction of Multiclass Classification to Binary Classification. - Performs reduction using one against all strategy. - For a multiclass classification with k classes, train k models (one per class). - Each example is scored against all k models and the model with highest score - is picked to label the example. 
- - >>> from pyspark.sql import Row - >>> from pyspark.ml.linalg import Vectors - >>> data_path = "data/mllib/sample_multiclass_classification_data.txt" - >>> df = spark.read.format("libsvm").load(data_path) - >>> lr = LogisticRegression(regParam=0.01) - >>> ovr = OneVsRest(classifier=lr) - >>> model = ovr.fit(df) - >>> model.models[0].coefficients - DenseVector([0.5..., -1.0..., 3.4..., 4.2...]) - >>> model.models[1].coefficients - DenseVector([-2.1..., 3.1..., -2.6..., -2.3...]) - >>> model.models[2].coefficients - DenseVector([0.3..., -3.4..., 1.0..., -1.1...]) - >>> [x.intercept for x in model.models] - [-2.7..., -2.5..., -1.3...] - >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 0.0, 1.0, 1.0))]).toDF() - >>> model.transform(test0).head().prediction - 0.0 - >>> test1 = sc.parallelize([Row(features=Vectors.sparse(4, [0], [1.0]))]).toDF() - >>> model.transform(test1).head().prediction - 2.0 - >>> test2 = sc.parallelize([Row(features=Vectors.dense(0.5, 0.4, 0.3, 0.2))]).toDF() - >>> model.transform(test2).head().prediction - 0.0 - >>> model_path = temp_path + "/ovr_model" - >>> model.save(model_path) - >>> model2 = OneVsRestModel.load(model_path) - >>> model2.transform(test0).head().prediction - 0.0 - - .. versionadded:: 2.0.0 - """ - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - classifier=None, weightCol=None, parallelism=1): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - classifier=None, weightCol=None, parallelism=1): - """ - super(OneVsRest, self).__init__() - self._setDefault(parallelism=1) - kwargs = self._input_kwargs - self._set(**kwargs) - - @keyword_only - @since("2.0.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - classifier=None, weightCol=None, parallelism=1): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - classifier=None, weightCol=None, parallelism=1): - Sets params for OneVsRest. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _fit(self, dataset): - labelCol = self.getLabelCol() - featuresCol = self.getFeaturesCol() - predictionCol = self.getPredictionCol() - classifier = self.getClassifier() - assert isinstance(classifier, HasRawPredictionCol),\ - "Classifier %s doesn't extend from HasRawPredictionCol." % type(classifier) - - numClasses = int(dataset.agg({labelCol: "max"}).head()["max("+labelCol+")"]) + 1 - - weightCol = None - if (self.isDefined(self.weightCol) and self.getWeightCol()): - if isinstance(classifier, HasWeightCol): - weightCol = self.getWeightCol() - else: - warnings.warn("weightCol is ignored, " - "as it is not supported by {} now.".format(classifier)) - - if weightCol: - multiclassLabeled = dataset.select(labelCol, featuresCol, weightCol) - else: - multiclassLabeled = dataset.select(labelCol, featuresCol) - - # persist if underlying dataset is not persistent. 
- handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False) - if handlePersistence: - multiclassLabeled.persist(StorageLevel.MEMORY_AND_DISK) - - def trainSingleClass(index): - binaryLabelCol = "mc2b$" + str(index) - trainingDataset = multiclassLabeled.withColumn( - binaryLabelCol, - when(multiclassLabeled[labelCol] == float(index), 1.0).otherwise(0.0)) - paramMap = dict([(classifier.labelCol, binaryLabelCol), - (classifier.featuresCol, featuresCol), - (classifier.predictionCol, predictionCol)]) - if weightCol: - paramMap[classifier.weightCol] = weightCol - return classifier.fit(trainingDataset, paramMap) - - pool = ThreadPool(processes=min(self.getParallelism(), numClasses)) - - models = pool.map(trainSingleClass, range(numClasses)) - - if handlePersistence: - multiclassLabeled.unpersist() - - return self._copyValues(OneVsRestModel(models=models)) - - @since("2.0.0") - def copy(self, extra=None): - """ - Creates a copy of this instance with a randomly generated uid - and some extra params. This creates a deep copy of the embedded paramMap, - and copies the embedded and extra parameters over. - - :param extra: Extra parameters to copy to the new instance - :return: Copy of this instance - """ - if extra is None: - extra = dict() - newOvr = Params.copy(self, extra) - if self.isSet(self.classifier): - newOvr.setClassifier(self.getClassifier().copy(extra)) - return newOvr - - @classmethod - def _from_java(cls, java_stage): - """ - Given a Java OneVsRest, create and return a Python wrapper of it. - Used for ML persistence. - """ - featuresCol = java_stage.getFeaturesCol() - labelCol = java_stage.getLabelCol() - predictionCol = java_stage.getPredictionCol() - classifier = JavaParams._from_java(java_stage.getClassifier()) - parallelism = java_stage.getParallelism() - py_stage = cls(featuresCol=featuresCol, labelCol=labelCol, predictionCol=predictionCol, - classifier=classifier, parallelism=parallelism) - py_stage._resetUid(java_stage.uid()) - return py_stage - - def _to_java(self): - """ - Transfer this instance to a Java OneVsRest. Used for ML persistence. - - :return: Java object equivalent to this instance. - """ - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.classification.OneVsRest", - self.uid) - _java_obj.setClassifier(self.getClassifier()._to_java()) - _java_obj.setParallelism(self.getParallelism()) - _java_obj.setFeaturesCol(self.getFeaturesCol()) - _java_obj.setLabelCol(self.getLabelCol()) - _java_obj.setPredictionCol(self.getPredictionCol()) - return _java_obj - - def _make_java_param_pair(self, param, value): - """ - Makes a Java param pair. - """ - sc = SparkContext._active_spark_context - param = self._resolveParam(param) - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.classification.OneVsRest", - self.uid) - java_param = _java_obj.getParam(param.name) - if isinstance(value, JavaParams): - # used in the case of an estimator having another estimator as a parameter - # the reason why this is not in _py2java in common.py is that importing - # Estimator and Model in common.py results in a circular import with inherit_doc - java_value = value._to_java() - else: - java_value = _py2java(sc, value) - return java_param.w(java_value) - - def _transfer_param_map_to_java(self, pyParamMap): - """ - Transforms a Python ParamMap into a Java ParamMap. 
- """ - paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap") - for param in self.params: - if param in pyParamMap: - pair = self._make_java_param_pair(param, pyParamMap[param]) - paramMap.put([pair]) - return paramMap - - def _transfer_param_map_from_java(self, javaParamMap): - """ - Transforms a Java ParamMap into a Python ParamMap. - """ - sc = SparkContext._active_spark_context - paramMap = dict() - for pair in javaParamMap.toList(): - param = pair.param() - if self.hasParam(str(param.name())): - if param.name() == "classifier": - paramMap[self.getParam(param.name())] = JavaParams._from_java(pair.value()) - else: - paramMap[self.getParam(param.name())] = _java2py(sc, pair.value()) - return paramMap - - -class OneVsRestModel(Model, OneVsRestParams, JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Model fitted by OneVsRest. - This stores the models resulting from training k binary classifiers: one for each class. - Each example is scored against all k models, and the model with the highest score - is picked to label the example. - - .. versionadded:: 2.0.0 - """ - - def __init__(self, models): - super(OneVsRestModel, self).__init__() - self.models = models - java_models = [model._to_java() for model in self.models] - sc = SparkContext._active_spark_context - java_models_array = JavaWrapper._new_java_array(java_models, - sc._gateway.jvm.org.apache.spark.ml - .classification.ClassificationModel) - # TODO: need to set metadata - metadata = JavaParams._new_java_obj("org.apache.spark.sql.types.Metadata") - self._java_obj = \ - JavaParams._new_java_obj("org.apache.spark.ml.classification.OneVsRestModel", - self.uid, metadata.empty(), java_models_array) - - def _transform(self, dataset): - # determine the input columns: these need to be passed through - origCols = dataset.columns - - # add an accumulator column to store predictions of all the models - accColName = "mbc$acc" + str(uuid.uuid4()) - initUDF = udf(lambda _: [], ArrayType(DoubleType())) - newDataset = dataset.withColumn(accColName, initUDF(dataset[origCols[0]])) - - # persist if underlying dataset is not persistent. 
- handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False) - if handlePersistence: - newDataset.persist(StorageLevel.MEMORY_AND_DISK) - - # update the accumulator column with the result of prediction of models - aggregatedDataset = newDataset - for index, model in enumerate(self.models): - rawPredictionCol = model._call_java("getRawPredictionCol") - columns = origCols + [rawPredictionCol, accColName] - - # add temporary column to store intermediate scores and update - tmpColName = "mbc$tmp" + str(uuid.uuid4()) - updateUDF = udf( - lambda predictions, prediction: predictions + [prediction.tolist()[1]], - ArrayType(DoubleType())) - transformedDataset = model.transform(aggregatedDataset).select(*columns) - updatedDataset = transformedDataset.withColumn( - tmpColName, - updateUDF(transformedDataset[accColName], transformedDataset[rawPredictionCol])) - newColumns = origCols + [tmpColName] - - # switch out the intermediate column with the accumulator column - aggregatedDataset = updatedDataset\ - .select(*newColumns).withColumnRenamed(tmpColName, accColName) - - if handlePersistence: - newDataset.unpersist() - - # output the index of the classifier with highest confidence as prediction - labelUDF = udf( - lambda predictions: float(max(enumerate(predictions), key=operator.itemgetter(1))[0]), - DoubleType()) - - # output label and label metadata as prediction - return aggregatedDataset.withColumn( - self.getPredictionCol(), labelUDF(aggregatedDataset[accColName])).drop(accColName) - - @since("2.0.0") - def copy(self, extra=None): - """ - Creates a copy of this instance with a randomly generated uid - and some extra params. This creates a deep copy of the embedded paramMap, - and copies the embedded and extra parameters over. - - :param extra: Extra parameters to copy to the new instance - :return: Copy of this instance - """ - if extra is None: - extra = dict() - newModel = Params.copy(self, extra) - newModel.models = [model.copy(extra) for model in self.models] - return newModel - - @classmethod - def _from_java(cls, java_stage): - """ - Given a Java OneVsRestModel, create and return a Python wrapper of it. - Used for ML persistence. - """ - featuresCol = java_stage.getFeaturesCol() - labelCol = java_stage.getLabelCol() - predictionCol = java_stage.getPredictionCol() - classifier = JavaParams._from_java(java_stage.getClassifier()) - models = [JavaParams._from_java(model) for model in java_stage.models()] - py_stage = cls(models=models).setPredictionCol(predictionCol).setLabelCol(labelCol)\ - .setFeaturesCol(featuresCol).setClassifier(classifier) - py_stage._resetUid(java_stage.uid()) - return py_stage - - def _to_java(self): - """ - Transfer this instance to a Java OneVsRestModel. Used for ML persistence. - - :return: Java object equivalent to this instance. 
- """ - sc = SparkContext._active_spark_context - java_models = [model._to_java() for model in self.models] - java_models_array = JavaWrapper._new_java_array( - java_models, sc._gateway.jvm.org.apache.spark.ml.classification.ClassificationModel) - metadata = JavaParams._new_java_obj("org.apache.spark.sql.types.Metadata") - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.classification.OneVsRestModel", - self.uid, metadata.empty(), java_models_array) - _java_obj.set("classifier", self.getClassifier()._to_java()) - _java_obj.set("featuresCol", self.getFeaturesCol()) - _java_obj.set("labelCol", self.getLabelCol()) - _java_obj.set("predictionCol", self.getPredictionCol()) - return _java_obj - - -if __name__ == "__main__": - import doctest - import pyspark.ml.classification - from pyspark.sql import SparkSession - globs = pyspark.ml.classification.__dict__.copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("ml.classification tests")\ - .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc - globs['spark'] = spark - import tempfile - temp_path = tempfile.mkdtemp() - globs['temp_path'] = temp_path - try: - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - finally: - from shutil import rmtree - try: - rmtree(temp_path) - except OSError: - pass - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/clustering.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/clustering.py deleted file mode 100644 index 5ef4e76..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/clustering.py +++ /dev/null @@ -1,1404 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import warnings - -from pyspark import since, keyword_only -from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaWrapper -from pyspark.ml.param.shared import * -from pyspark.ml.common import inherit_doc -from pyspark.sql import DataFrame - -__all__ = ['BisectingKMeans', 'BisectingKMeansModel', 'BisectingKMeansSummary', - 'KMeans', 'KMeansModel', - 'GaussianMixture', 'GaussianMixtureModel', 'GaussianMixtureSummary', - 'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel', 'PowerIterationClustering'] - - -class ClusteringSummary(JavaWrapper): - """ - .. note:: Experimental - - Clustering results for a given model. - - .. versionadded:: 2.1.0 - """ - - @property - @since("2.1.0") - def predictionCol(self): - """ - Name for column of predicted clusters in `predictions`. 
- """ - return self._call_java("predictionCol") - - @property - @since("2.1.0") - def predictions(self): - """ - DataFrame produced by the model's `transform` method. - """ - return self._call_java("predictions") - - @property - @since("2.1.0") - def featuresCol(self): - """ - Name for column of features in `predictions`. - """ - return self._call_java("featuresCol") - - @property - @since("2.1.0") - def k(self): - """ - The number of clusters the model was trained with. - """ - return self._call_java("k") - - @property - @since("2.1.0") - def cluster(self): - """ - DataFrame of predicted cluster centers for each training data point. - """ - return self._call_java("cluster") - - @property - @since("2.1.0") - def clusterSizes(self): - """ - Size of (number of data points in) each cluster. - """ - return self._call_java("clusterSizes") - - @property - @since("2.4.0") - def numIter(self): - """ - Number of iterations. - """ - return self._call_java("numIter") - - -class GaussianMixtureModel(JavaModel, JavaMLWritable, JavaMLReadable): - """ - Model fitted by GaussianMixture. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def weights(self): - """ - Weight for each Gaussian distribution in the mixture. - This is a multinomial probability distribution over the k Gaussians, - where weights[i] is the weight for Gaussian i, and weights sum to 1. - """ - return self._call_java("weights") - - @property - @since("2.0.0") - def gaussiansDF(self): - """ - Retrieve Gaussian distributions as a DataFrame. - Each row represents a Gaussian Distribution. - The DataFrame has two columns: mean (Vector) and cov (Matrix). - """ - return self._call_java("gaussiansDF") - - @property - @since("2.1.0") - def hasSummary(self): - """ - Indicates whether a training summary exists for this model - instance. - """ - return self._call_java("hasSummary") - - @property - @since("2.1.0") - def summary(self): - """ - Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the - training set. An exception is thrown if no summary exists. - """ - if self.hasSummary: - return GaussianMixtureSummary(self._call_java("summary")) - else: - raise RuntimeError("No training summary available for this %s" % - self.__class__.__name__) - - -@inherit_doc -class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed, - HasProbabilityCol, JavaMLWritable, JavaMLReadable): - """ - GaussianMixture clustering. - This class performs expectation maximization for multivariate Gaussian - Mixture Models (GMMs). A GMM represents a composite distribution of - independent Gaussian distributions with associated "mixing" weights - specifying each's contribution to the composite. - - Given a set of sample points, this class will maximize the log-likelihood - for a mixture of k Gaussians, iterating until the log-likelihood changes by - less than convergenceTol, or until it has reached the max number of iterations. - While this process is generally guaranteed to converge, it is not guaranteed - to find a global optimum. - - .. note:: For high-dimensional data (with many features), this algorithm may perform poorly. - This is due to high-dimensional data (a) making it difficult to cluster at all - (based on statistical/theoretical arguments) and (b) numerical issues with - Gaussian distributions. - - >>> from pyspark.ml.linalg import Vectors - - >>> data = [(Vectors.dense([-0.1, -0.05 ]),), - ... (Vectors.dense([-0.01, -0.1]),), - ... (Vectors.dense([0.9, 0.8]),), - ... 
(Vectors.dense([0.75, 0.935]),), - ... (Vectors.dense([-0.83, -0.68]),), - ... (Vectors.dense([-0.91, -0.76]),)] - >>> df = spark.createDataFrame(data, ["features"]) - >>> gm = GaussianMixture(k=3, tol=0.0001, - ... maxIter=10, seed=10) - >>> model = gm.fit(df) - >>> model.hasSummary - True - >>> summary = model.summary - >>> summary.k - 3 - >>> summary.clusterSizes - [2, 2, 2] - >>> summary.logLikelihood - 8.14636... - >>> weights = model.weights - >>> len(weights) - 3 - >>> model.gaussiansDF.select("mean").head() - Row(mean=DenseVector([0.825, 0.8675])) - >>> model.gaussiansDF.select("cov").head() - Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False)) - >>> transformed = model.transform(df).select("features", "prediction") - >>> rows = transformed.collect() - >>> rows[4].prediction == rows[5].prediction - True - >>> rows[2].prediction == rows[3].prediction - True - >>> gmm_path = temp_path + "/gmm" - >>> gm.save(gmm_path) - >>> gm2 = GaussianMixture.load(gmm_path) - >>> gm2.getK() - 3 - >>> model_path = temp_path + "/gmm_model" - >>> model.save(model_path) - >>> model2 = GaussianMixtureModel.load(model_path) - >>> model2.hasSummary - False - >>> model2.weights == model.weights - True - >>> model2.gaussiansDF.select("mean").head() - Row(mean=DenseVector([0.825, 0.8675])) - >>> model2.gaussiansDF.select("cov").head() - Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False)) - - .. versionadded:: 2.0.0 - """ - - k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. " + - "Must be > 1.", typeConverter=TypeConverters.toInt) - - @keyword_only - def __init__(self, featuresCol="features", predictionCol="prediction", k=2, - probabilityCol="probability", tol=0.01, maxIter=100, seed=None): - """ - __init__(self, featuresCol="features", predictionCol="prediction", k=2, \ - probabilityCol="probability", tol=0.01, maxIter=100, seed=None) - """ - super(GaussianMixture, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.GaussianMixture", - self.uid) - self._setDefault(k=2, tol=0.01, maxIter=100) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - def _create_model(self, java_model): - return GaussianMixtureModel(java_model) - - @keyword_only - @since("2.0.0") - def setParams(self, featuresCol="features", predictionCol="prediction", k=2, - probabilityCol="probability", tol=0.01, maxIter=100, seed=None): - """ - setParams(self, featuresCol="features", predictionCol="prediction", k=2, \ - probabilityCol="probability", tol=0.01, maxIter=100, seed=None) - - Sets params for GaussianMixture. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.0.0") - def setK(self, value): - """ - Sets the value of :py:attr:`k`. - """ - return self._set(k=value) - - @since("2.0.0") - def getK(self): - """ - Gets the value of `k` - """ - return self.getOrDefault(self.k) - - -class GaussianMixtureSummary(ClusteringSummary): - """ - .. note:: Experimental - - Gaussian mixture clustering results for a given model. - - .. versionadded:: 2.1.0 - """ - - @property - @since("2.1.0") - def probabilityCol(self): - """ - Name for column of predicted probability of each cluster in `predictions`. - """ - return self._call_java("probabilityCol") - - @property - @since("2.1.0") - def probability(self): - """ - DataFrame of probabilities of each cluster for each training data point. 
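A small sketch of reading the soft cluster assignments (responsibilities) discussed above, both from the transformed output and from the summary (assumes an active SparkSession available as spark):

from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
    (Vectors.dense([-0.1, -0.05]),), (Vectors.dense([-0.01, -0.1]),),
    (Vectors.dense([0.9, 0.8]),), (Vectors.dense([0.75, 0.935]),)], ["features"])

model = GaussianMixture(k=2, seed=10).fit(df)

# Hard assignments plus the per-cluster probabilities for each row.
model.transform(df).select("prediction", "probability").show(truncate=False)

# The same responsibilities for the training data, exposed via the summary.
model.summary.probability.show(truncate=False)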
- """ - return self._call_java("probability") - - @property - @since("2.2.0") - def logLikelihood(self): - """ - Total log-likelihood for this model on the given data. - """ - return self._call_java("logLikelihood") - - -class KMeansSummary(ClusteringSummary): - """ - .. note:: Experimental - - Summary of KMeans. - - .. versionadded:: 2.1.0 - """ - - @property - @since("2.4.0") - def trainingCost(self): - """ - K-means cost (sum of squared distances to the nearest centroid for all points in the - training dataset). This is equivalent to sklearn's inertia. - """ - return self._call_java("trainingCost") - - -class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable): - """ - Model fitted by KMeans. - - .. versionadded:: 1.5.0 - """ - - @since("1.5.0") - def clusterCenters(self): - """Get the cluster centers, represented as a list of NumPy arrays.""" - return [c.toArray() for c in self._call_java("clusterCenters")] - - @since("2.0.0") - def computeCost(self, dataset): - """ - Return the K-means cost (sum of squared distances of points to their nearest center) - for this model on the given data. - - ..note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead. - You can also get the cost on the training dataset in the summary. - """ - warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator " - "instead. You can also get the cost on the training dataset in the summary.", - DeprecationWarning) - return self._call_java("computeCost", dataset) - - @property - @since("2.1.0") - def hasSummary(self): - """ - Indicates whether a training summary exists for this model instance. - """ - return self._call_java("hasSummary") - - @property - @since("2.1.0") - def summary(self): - """ - Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the - training set. An exception is thrown if no summary exists. - """ - if self.hasSummary: - return KMeansSummary(self._call_java("summary")) - else: - raise RuntimeError("No training summary available for this %s" % - self.__class__.__name__) - - -@inherit_doc -class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol, HasMaxIter, - HasTol, HasSeed, JavaMLWritable, JavaMLReadable): - """ - K-means clustering with a k-means++ like initialization mode - (the k-means|| algorithm by Bahmani et al). - - >>> from pyspark.ml.linalg import Vectors - >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), - ... (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)] - >>> df = spark.createDataFrame(data, ["features"]) - >>> kmeans = KMeans(k=2, seed=1) - >>> model = kmeans.fit(df) - >>> centers = model.clusterCenters() - >>> len(centers) - 2 - >>> model.computeCost(df) - 2.000... - >>> transformed = model.transform(df).select("features", "prediction") - >>> rows = transformed.collect() - >>> rows[0].prediction == rows[1].prediction - True - >>> rows[2].prediction == rows[3].prediction - True - >>> model.hasSummary - True - >>> summary = model.summary - >>> summary.k - 2 - >>> summary.clusterSizes - [2, 2] - >>> summary.trainingCost - 2.000... 
- >>> kmeans_path = temp_path + "/kmeans" - >>> kmeans.save(kmeans_path) - >>> kmeans2 = KMeans.load(kmeans_path) - >>> kmeans2.getK() - 2 - >>> model_path = temp_path + "/kmeans_model" - >>> model.save(model_path) - >>> model2 = KMeansModel.load(model_path) - >>> model2.hasSummary - False - >>> model.clusterCenters()[0] == model2.clusterCenters()[0] - array([ True, True], dtype=bool) - >>> model.clusterCenters()[1] == model2.clusterCenters()[1] - array([ True, True], dtype=bool) - - .. versionadded:: 1.5.0 - """ - - k = Param(Params._dummy(), "k", "The number of clusters to create. Must be > 1.", - typeConverter=TypeConverters.toInt) - initMode = Param(Params._dummy(), "initMode", - "The initialization algorithm. This can be either \"random\" to " + - "choose random points as initial cluster centers, or \"k-means||\" " + - "to use a parallel variant of k-means++", - typeConverter=TypeConverters.toString) - initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " + - "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt) - - @keyword_only - def __init__(self, featuresCol="features", predictionCol="prediction", k=2, - initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, - distanceMeasure="euclidean"): - """ - __init__(self, featuresCol="features", predictionCol="prediction", k=2, \ - initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, \ - distanceMeasure="euclidean") - """ - super(KMeans, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid) - self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, - distanceMeasure="euclidean") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - def _create_model(self, java_model): - return KMeansModel(java_model) - - @keyword_only - @since("1.5.0") - def setParams(self, featuresCol="features", predictionCol="prediction", k=2, - initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, - distanceMeasure="euclidean"): - """ - setParams(self, featuresCol="features", predictionCol="prediction", k=2, \ - initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None, \ - distanceMeasure="euclidean") - - Sets params for KMeans. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.5.0") - def setK(self, value): - """ - Sets the value of :py:attr:`k`. - """ - return self._set(k=value) - - @since("1.5.0") - def getK(self): - """ - Gets the value of `k` - """ - return self.getOrDefault(self.k) - - @since("1.5.0") - def setInitMode(self, value): - """ - Sets the value of :py:attr:`initMode`. - """ - return self._set(initMode=value) - - @since("1.5.0") - def getInitMode(self): - """ - Gets the value of `initMode` - """ - return self.getOrDefault(self.initMode) - - @since("1.5.0") - def setInitSteps(self, value): - """ - Sets the value of :py:attr:`initSteps`. - """ - return self._set(initSteps=value) - - @since("1.5.0") - def getInitSteps(self): - """ - Gets the value of `initSteps` - """ - return self.getOrDefault(self.initSteps) - - @since("2.4.0") - def setDistanceMeasure(self, value): - """ - Sets the value of :py:attr:`distanceMeasure`. - """ - return self._set(distanceMeasure=value) - - @since("2.4.0") - def getDistanceMeasure(self): - """ - Gets the value of `distanceMeasure` - """ - return self.getOrDefault(self.distanceMeasure) - - -class BisectingKMeansModel(JavaModel, JavaMLWritable, JavaMLReadable): - """ - Model fitted by BisectingKMeans. - - .. 
versionadded:: 2.0.0 - """ - - @since("2.0.0") - def clusterCenters(self): - """Get the cluster centers, represented as a list of NumPy arrays.""" - return [c.toArray() for c in self._call_java("clusterCenters")] - - @since("2.0.0") - def computeCost(self, dataset): - """ - Computes the sum of squared distances between the input points - and their corresponding cluster centers. - """ - return self._call_java("computeCost", dataset) - - @property - @since("2.1.0") - def hasSummary(self): - """ - Indicates whether a training summary exists for this model instance. - """ - return self._call_java("hasSummary") - - @property - @since("2.1.0") - def summary(self): - """ - Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the - training set. An exception is thrown if no summary exists. - """ - if self.hasSummary: - return BisectingKMeansSummary(self._call_java("summary")) - else: - raise RuntimeError("No training summary available for this %s" % - self.__class__.__name__) - - -@inherit_doc -class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol, - HasMaxIter, HasSeed, JavaMLWritable, JavaMLReadable): - """ - A bisecting k-means algorithm based on the paper "A comparison of document clustering - techniques" by Steinbach, Karypis, and Kumar, with modification to fit Spark. - The algorithm starts from a single cluster that contains all points. - Iteratively it finds divisible clusters on the bottom level and bisects each of them using - k-means, until there are `k` leaf clusters in total or no leaf clusters are divisible. - The bisecting steps of clusters on the same level are grouped together to increase parallelism. - If bisecting all divisible clusters on the bottom level would result more than `k` leaf - clusters, larger clusters get higher priority. - - >>> from pyspark.ml.linalg import Vectors - >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), - ... (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)] - >>> df = spark.createDataFrame(data, ["features"]) - >>> bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0) - >>> model = bkm.fit(df) - >>> centers = model.clusterCenters() - >>> len(centers) - 2 - >>> model.computeCost(df) - 2.000... - >>> model.hasSummary - True - >>> summary = model.summary - >>> summary.k - 2 - >>> summary.clusterSizes - [2, 2] - >>> transformed = model.transform(df).select("features", "prediction") - >>> rows = transformed.collect() - >>> rows[0].prediction == rows[1].prediction - True - >>> rows[2].prediction == rows[3].prediction - True - >>> bkm_path = temp_path + "/bkm" - >>> bkm.save(bkm_path) - >>> bkm2 = BisectingKMeans.load(bkm_path) - >>> bkm2.getK() - 2 - >>> bkm2.getDistanceMeasure() - 'euclidean' - >>> model_path = temp_path + "/bkm_model" - >>> model.save(model_path) - >>> model2 = BisectingKMeansModel.load(model_path) - >>> model2.hasSummary - False - >>> model.clusterCenters()[0] == model2.clusterCenters()[0] - array([ True, True], dtype=bool) - >>> model.clusterCenters()[1] == model2.clusterCenters()[1] - array([ True, True], dtype=bool) - - .. versionadded:: 2.0.0 - """ - - k = Param(Params._dummy(), "k", "The desired number of leaf clusters. 
Must be > 1.", - typeConverter=TypeConverters.toInt) - minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize", - "The minimum number of points (if >= 1.0) or the minimum " + - "proportion of points (if < 1.0) of a divisible cluster.", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, featuresCol="features", predictionCol="prediction", maxIter=20, - seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"): - """ - __init__(self, featuresCol="features", predictionCol="prediction", maxIter=20, \ - seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean") - """ - super(BisectingKMeans, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.BisectingKMeans", - self.uid) - self._setDefault(maxIter=20, k=4, minDivisibleClusterSize=1.0) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.0.0") - def setParams(self, featuresCol="features", predictionCol="prediction", maxIter=20, - seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean"): - """ - setParams(self, featuresCol="features", predictionCol="prediction", maxIter=20, \ - seed=None, k=4, minDivisibleClusterSize=1.0, distanceMeasure="euclidean") - Sets params for BisectingKMeans. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.0.0") - def setK(self, value): - """ - Sets the value of :py:attr:`k`. - """ - return self._set(k=value) - - @since("2.0.0") - def getK(self): - """ - Gets the value of `k` or its default value. - """ - return self.getOrDefault(self.k) - - @since("2.0.0") - def setMinDivisibleClusterSize(self, value): - """ - Sets the value of :py:attr:`minDivisibleClusterSize`. - """ - return self._set(minDivisibleClusterSize=value) - - @since("2.0.0") - def getMinDivisibleClusterSize(self): - """ - Gets the value of `minDivisibleClusterSize` or its default value. - """ - return self.getOrDefault(self.minDivisibleClusterSize) - - @since("2.4.0") - def setDistanceMeasure(self, value): - """ - Sets the value of :py:attr:`distanceMeasure`. - """ - return self._set(distanceMeasure=value) - - @since("2.4.0") - def getDistanceMeasure(self): - """ - Gets the value of `distanceMeasure` or its default value. - """ - return self.getOrDefault(self.distanceMeasure) - - def _create_model(self, java_model): - return BisectingKMeansModel(java_model) - - -class BisectingKMeansSummary(ClusteringSummary): - """ - .. note:: Experimental - - Bisecting KMeans clustering results for a given model. - - .. versionadded:: 2.1.0 - """ - pass - - -@inherit_doc -class LDAModel(JavaModel): - """ - Latent Dirichlet Allocation (LDA) model. - This abstraction permits for different underlying representations, - including local and distributed data structures. - - .. versionadded:: 2.0.0 - """ - - @since("2.0.0") - def isDistributed(self): - """ - Indicates whether this instance is of type DistributedLDAModel - """ - return self._call_java("isDistributed") - - @since("2.0.0") - def vocabSize(self): - """Vocabulary size (number of terms or words in the vocabulary)""" - return self._call_java("vocabSize") - - @since("2.0.0") - def topicsMatrix(self): - """ - Inferred topics, where each topic is represented by a distribution over terms. - This is a matrix of size vocabSize x k, where each column is a topic. - No guarantees are given about the ordering of the topics. 
- - WARNING: If this model is actually a :py:class:`DistributedLDAModel` instance produced by - the Expectation-Maximization ("em") `optimizer`, then this method could involve - collecting a large amount of data to the driver (on the order of vocabSize x k). - """ - return self._call_java("topicsMatrix") - - @since("2.0.0") - def logLikelihood(self, dataset): - """ - Calculates a lower bound on the log likelihood of the entire corpus. - See Equation (16) in the Online LDA paper (Hoffman et al., 2010). - - WARNING: If this model is an instance of :py:class:`DistributedLDAModel` (produced when - :py:attr:`optimizer` is set to "em"), this involves collecting a large - :py:func:`topicsMatrix` to the driver. This implementation may be changed in the future. - """ - return self._call_java("logLikelihood", dataset) - - @since("2.0.0") - def logPerplexity(self, dataset): - """ - Calculate an upper bound on perplexity. (Lower is better.) - See Equation (16) in the Online LDA paper (Hoffman et al., 2010). - - WARNING: If this model is an instance of :py:class:`DistributedLDAModel` (produced when - :py:attr:`optimizer` is set to "em"), this involves collecting a large - :py:func:`topicsMatrix` to the driver. This implementation may be changed in the future. - """ - return self._call_java("logPerplexity", dataset) - - @since("2.0.0") - def describeTopics(self, maxTermsPerTopic=10): - """ - Return the topics described by their top-weighted terms. - """ - return self._call_java("describeTopics", maxTermsPerTopic) - - @since("2.0.0") - def estimatedDocConcentration(self): - """ - Value for :py:attr:`LDA.docConcentration` estimated from data. - If Online LDA was used and :py:attr:`LDA.optimizeDocConcentration` was set to false, - then this returns the fixed (given) value for the :py:attr:`LDA.docConcentration` parameter. - """ - return self._call_java("estimatedDocConcentration") - - -@inherit_doc -class DistributedLDAModel(LDAModel, JavaMLReadable, JavaMLWritable): - """ - Distributed model fitted by :py:class:`LDA`. - This type of model is currently only produced by Expectation-Maximization (EM). - - This model stores the inferred topics, the full training dataset, and the topic distribution - for each training document. - - .. versionadded:: 2.0.0 - """ - - @since("2.0.0") - def toLocal(self): - """ - Convert this distributed model to a local representation. This discards info about the - training dataset. - - WARNING: This involves collecting a large :py:func:`topicsMatrix` to the driver. - """ - model = LocalLDAModel(self._call_java("toLocal")) - - # SPARK-10931: Temporary fix to be removed once LDAModel defines Params - model._create_params_from_java() - model._transfer_params_from_java() - - return model - - @since("2.0.0") - def trainingLogLikelihood(self): - """ - Log likelihood of the observed tokens in the training set, - given the current parameter estimates: - log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters) - - Notes: - - This excludes the prior; for that, use :py:func:`logPrior`. - - Even with :py:func:`logPrior`, this is NOT the same as the data log likelihood given - the hyperparameters. - - This is computed from the topic distributions computed during training. If you call - :py:func:`logLikelihood` on the same training dataset, the topic distributions - will be computed again, possibly giving different results. 
- """ - return self._call_java("trainingLogLikelihood") - - @since("2.0.0") - def logPrior(self): - """ - Log probability of the current parameter estimate: - log P(topics, topic distributions for docs | alpha, eta) - """ - return self._call_java("logPrior") - - @since("2.0.0") - def getCheckpointFiles(self): - """ - If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may - be saved checkpoint files. This method is provided so that users can manage those files. - - .. note:: Removing the checkpoints can cause failures if a partition is lost and is needed - by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up - the checkpoints when this model and derivative data go out of scope. - - :return List of checkpoint files from training - """ - return self._call_java("getCheckpointFiles") - - -@inherit_doc -class LocalLDAModel(LDAModel, JavaMLReadable, JavaMLWritable): - """ - Local (non-distributed) model fitted by :py:class:`LDA`. - This model stores the inferred topics only; it does not store info about the training dataset. - - .. versionadded:: 2.0.0 - """ - pass - - -@inherit_doc -class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval, - JavaMLReadable, JavaMLWritable): - """ - Latent Dirichlet Allocation (LDA), a topic model designed for text documents. - - Terminology: - - - "term" = "word": an element of the vocabulary - - "token": instance of a term appearing in a document - - "topic": multinomial distribution over terms representing some concept - - "document": one piece of text, corresponding to one row in the input data - - Original LDA paper (journal version): - Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003. - - Input data (featuresCol): - LDA is given a collection of documents as input data, via the featuresCol parameter. - Each document is specified as a :py:class:`Vector` of length vocabSize, where each entry is the - count for the corresponding term (word) in the document. Feature transformers such as - :py:class:`pyspark.ml.feature.Tokenizer` and :py:class:`pyspark.ml.feature.CountVectorizer` - can be useful for converting text to word count vectors. - - >>> from pyspark.ml.linalg import Vectors, SparseVector - >>> from pyspark.ml.clustering import LDA - >>> df = spark.createDataFrame([[1, Vectors.dense([0.0, 1.0])], - ... [2, SparseVector(2, {0: 1.0})],], ["id", "features"]) - >>> lda = LDA(k=2, seed=1, optimizer="em") - >>> model = lda.fit(df) - >>> model.isDistributed() - True - >>> localModel = model.toLocal() - >>> localModel.isDistributed() - False - >>> model.vocabSize() - 2 - >>> model.describeTopics().show() - +-----+-----------+--------------------+ - |topic|termIndices| termWeights| - +-----+-----------+--------------------+ - | 0| [1, 0]|[0.50401530077160...| - | 1| [0, 1]|[0.50401530077160...| - +-----+-----------+--------------------+ - ... - >>> model.topicsMatrix() - DenseMatrix(2, 2, [0.496, 0.504, 0.504, 0.496], 0) - >>> lda_path = temp_path + "/lda" - >>> lda.save(lda_path) - >>> sameLDA = LDA.load(lda_path) - >>> distributed_model_path = temp_path + "/lda_distributed_model" - >>> model.save(distributed_model_path) - >>> sameModel = DistributedLDAModel.load(distributed_model_path) - >>> local_model_path = temp_path + "/lda_local_model" - >>> localModel.save(local_model_path) - >>> sameLocalModel = LocalLDAModel.load(local_model_path) - - .. versionadded:: 2.0.0 - """ - - k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. 
Must be > 1.", - typeConverter=TypeConverters.toInt) - optimizer = Param(Params._dummy(), "optimizer", - "Optimizer or inference algorithm used to estimate the LDA model. " - "Supported: online, em", typeConverter=TypeConverters.toString) - learningOffset = Param(Params._dummy(), "learningOffset", - "A (positive) learning parameter that downweights early iterations." - " Larger values make early iterations count less", - typeConverter=TypeConverters.toFloat) - learningDecay = Param(Params._dummy(), "learningDecay", "Learning rate, set as an" - "exponential decay rate. This should be between (0.5, 1.0] to " - "guarantee asymptotic convergence.", typeConverter=TypeConverters.toFloat) - subsamplingRate = Param(Params._dummy(), "subsamplingRate", - "Fraction of the corpus to be sampled and used in each iteration " - "of mini-batch gradient descent, in range (0, 1].", - typeConverter=TypeConverters.toFloat) - optimizeDocConcentration = Param(Params._dummy(), "optimizeDocConcentration", - "Indicates whether the docConcentration (Dirichlet parameter " - "for document-topic distribution) will be optimized during " - "training.", typeConverter=TypeConverters.toBoolean) - docConcentration = Param(Params._dummy(), "docConcentration", - "Concentration parameter (commonly named \"alpha\") for the " - "prior placed on documents' distributions over topics (\"theta\").", - typeConverter=TypeConverters.toListFloat) - topicConcentration = Param(Params._dummy(), "topicConcentration", - "Concentration parameter (commonly named \"beta\" or \"eta\") for " - "the prior placed on topic' distributions over terms.", - typeConverter=TypeConverters.toFloat) - topicDistributionCol = Param(Params._dummy(), "topicDistributionCol", - "Output column with estimates of the topic mixture distribution " - "for each document (often called \"theta\" in the literature). " - "Returns a vector of zeros for an empty document.", - typeConverter=TypeConverters.toString) - keepLastCheckpoint = Param(Params._dummy(), "keepLastCheckpoint", - "(For EM optimizer) If using checkpointing, this indicates whether" - " to keep the last checkpoint. If false, then the checkpoint will be" - " deleted. 
Deleting the checkpoint can cause failures if a data" - " partition is lost, so set this bit with care.", - TypeConverters.toBoolean) - - @keyword_only - def __init__(self, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10, - k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51, - subsamplingRate=0.05, optimizeDocConcentration=True, - docConcentration=None, topicConcentration=None, - topicDistributionCol="topicDistribution", keepLastCheckpoint=True): - """ - __init__(self, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10,\ - k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\ - subsamplingRate=0.05, optimizeDocConcentration=True,\ - docConcentration=None, topicConcentration=None,\ - topicDistributionCol="topicDistribution", keepLastCheckpoint=True) - """ - super(LDA, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.LDA", self.uid) - self._setDefault(maxIter=20, checkpointInterval=10, - k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51, - subsamplingRate=0.05, optimizeDocConcentration=True, - topicDistributionCol="topicDistribution", keepLastCheckpoint=True) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - def _create_model(self, java_model): - if self.getOptimizer() == "em": - return DistributedLDAModel(java_model) - else: - return LocalLDAModel(java_model) - - @keyword_only - @since("2.0.0") - def setParams(self, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10, - k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51, - subsamplingRate=0.05, optimizeDocConcentration=True, - docConcentration=None, topicConcentration=None, - topicDistributionCol="topicDistribution", keepLastCheckpoint=True): - """ - setParams(self, featuresCol="features", maxIter=20, seed=None, checkpointInterval=10,\ - k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,\ - subsamplingRate=0.05, optimizeDocConcentration=True,\ - docConcentration=None, topicConcentration=None,\ - topicDistributionCol="topicDistribution", keepLastCheckpoint=True) - - Sets params for LDA. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.0.0") - def setK(self, value): - """ - Sets the value of :py:attr:`k`. - - >>> algo = LDA().setK(10) - >>> algo.getK() - 10 - """ - return self._set(k=value) - - @since("2.0.0") - def getK(self): - """ - Gets the value of :py:attr:`k` or its default value. - """ - return self.getOrDefault(self.k) - - @since("2.0.0") - def setOptimizer(self, value): - """ - Sets the value of :py:attr:`optimizer`. - Currently only support 'em' and 'online'. - - >>> algo = LDA().setOptimizer("em") - >>> algo.getOptimizer() - 'em' - """ - return self._set(optimizer=value) - - @since("2.0.0") - def getOptimizer(self): - """ - Gets the value of :py:attr:`optimizer` or its default value. - """ - return self.getOrDefault(self.optimizer) - - @since("2.0.0") - def setLearningOffset(self, value): - """ - Sets the value of :py:attr:`learningOffset`. - - >>> algo = LDA().setLearningOffset(100) - >>> algo.getLearningOffset() - 100.0 - """ - return self._set(learningOffset=value) - - @since("2.0.0") - def getLearningOffset(self): - """ - Gets the value of :py:attr:`learningOffset` or its default value. - """ - return self.getOrDefault(self.learningOffset) - - @since("2.0.0") - def setLearningDecay(self, value): - """ - Sets the value of :py:attr:`learningDecay`. 
- - >>> algo = LDA().setLearningDecay(0.1) - >>> algo.getLearningDecay() - 0.1... - """ - return self._set(learningDecay=value) - - @since("2.0.0") - def getLearningDecay(self): - """ - Gets the value of :py:attr:`learningDecay` or its default value. - """ - return self.getOrDefault(self.learningDecay) - - @since("2.0.0") - def setSubsamplingRate(self, value): - """ - Sets the value of :py:attr:`subsamplingRate`. - - >>> algo = LDA().setSubsamplingRate(0.1) - >>> algo.getSubsamplingRate() - 0.1... - """ - return self._set(subsamplingRate=value) - - @since("2.0.0") - def getSubsamplingRate(self): - """ - Gets the value of :py:attr:`subsamplingRate` or its default value. - """ - return self.getOrDefault(self.subsamplingRate) - - @since("2.0.0") - def setOptimizeDocConcentration(self, value): - """ - Sets the value of :py:attr:`optimizeDocConcentration`. - - >>> algo = LDA().setOptimizeDocConcentration(True) - >>> algo.getOptimizeDocConcentration() - True - """ - return self._set(optimizeDocConcentration=value) - - @since("2.0.0") - def getOptimizeDocConcentration(self): - """ - Gets the value of :py:attr:`optimizeDocConcentration` or its default value. - """ - return self.getOrDefault(self.optimizeDocConcentration) - - @since("2.0.0") - def setDocConcentration(self, value): - """ - Sets the value of :py:attr:`docConcentration`. - - >>> algo = LDA().setDocConcentration([0.1, 0.2]) - >>> algo.getDocConcentration() - [0.1..., 0.2...] - """ - return self._set(docConcentration=value) - - @since("2.0.0") - def getDocConcentration(self): - """ - Gets the value of :py:attr:`docConcentration` or its default value. - """ - return self.getOrDefault(self.docConcentration) - - @since("2.0.0") - def setTopicConcentration(self, value): - """ - Sets the value of :py:attr:`topicConcentration`. - - >>> algo = LDA().setTopicConcentration(0.5) - >>> algo.getTopicConcentration() - 0.5... - """ - return self._set(topicConcentration=value) - - @since("2.0.0") - def getTopicConcentration(self): - """ - Gets the value of :py:attr:`topicConcentration` or its default value. - """ - return self.getOrDefault(self.topicConcentration) - - @since("2.0.0") - def setTopicDistributionCol(self, value): - """ - Sets the value of :py:attr:`topicDistributionCol`. - - >>> algo = LDA().setTopicDistributionCol("topicDistributionCol") - >>> algo.getTopicDistributionCol() - 'topicDistributionCol' - """ - return self._set(topicDistributionCol=value) - - @since("2.0.0") - def getTopicDistributionCol(self): - """ - Gets the value of :py:attr:`topicDistributionCol` or its default value. - """ - return self.getOrDefault(self.topicDistributionCol) - - @since("2.0.0") - def setKeepLastCheckpoint(self, value): - """ - Sets the value of :py:attr:`keepLastCheckpoint`. - - >>> algo = LDA().setKeepLastCheckpoint(False) - >>> algo.getKeepLastCheckpoint() - False - """ - return self._set(keepLastCheckpoint=value) - - @since("2.0.0") - def getKeepLastCheckpoint(self): - """ - Gets the value of :py:attr:`keepLastCheckpoint` or its default value. - """ - return self.getOrDefault(self.keepLastCheckpoint) - - -@inherit_doc -class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReadable, - JavaMLWritable): - """ - .. note:: Experimental - - Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by - `Lin and Cohen `_. From the abstract: - PIC finds a very low-dimensional embedding of a dataset using truncated power - iteration on a normalized pair-wise similarity matrix of the data. 
- - This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method - to run the PowerIterationClustering algorithm. - - .. seealso:: `Wikipedia on Spectral clustering - `_ - - >>> data = [(1, 0, 0.5), - ... (2, 0, 0.5), (2, 1, 0.7), - ... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), - ... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), - ... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)] - >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight") - >>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight") - >>> assignments = pic.assignClusters(df) - >>> assignments.sort(assignments.id).show(truncate=False) - +---+-------+ - |id |cluster| - +---+-------+ - |0 |1 | - |1 |1 | - |2 |1 | - |3 |1 | - |4 |1 | - |5 |0 | - +---+-------+ - ... - >>> pic_path = temp_path + "/pic" - >>> pic.save(pic_path) - >>> pic2 = PowerIterationClustering.load(pic_path) - >>> pic2.getK() - 2 - >>> pic2.getMaxIter() - 40 - - .. versionadded:: 2.4.0 - """ - - k = Param(Params._dummy(), "k", - "The number of clusters to create. Must be > 1.", - typeConverter=TypeConverters.toInt) - initMode = Param(Params._dummy(), "initMode", - "The initialization algorithm. This can be either " + - "'random' to use a random vector as vertex properties, or 'degree' to use " + - "a normalized sum of similarities with other vertices. Supported options: " + - "'random' and 'degree'.", - typeConverter=TypeConverters.toString) - srcCol = Param(Params._dummy(), "srcCol", - "Name of the input column for source vertex IDs.", - typeConverter=TypeConverters.toString) - dstCol = Param(Params._dummy(), "dstCol", - "Name of the input column for destination vertex IDs.", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst", - weightCol=None): - """ - __init__(self, k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst",\ - weightCol=None) - """ - super(PowerIterationClustering, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.clustering.PowerIterationClustering", self.uid) - self._setDefault(k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.4.0") - def setParams(self, k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst", - weightCol=None): - """ - setParams(self, k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst",\ - weightCol=None) - Sets params for PowerIterationClustering. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.4.0") - def setK(self, value): - """ - Sets the value of :py:attr:`k`. - """ - return self._set(k=value) - - @since("2.4.0") - def getK(self): - """ - Gets the value of :py:attr:`k` or its default value. - """ - return self.getOrDefault(self.k) - - @since("2.4.0") - def setInitMode(self, value): - """ - Sets the value of :py:attr:`initMode`. - """ - return self._set(initMode=value) - - @since("2.4.0") - def getInitMode(self): - """ - Gets the value of :py:attr:`initMode` or its default value. - """ - return self.getOrDefault(self.initMode) - - @since("2.4.0") - def setSrcCol(self, value): - """ - Sets the value of :py:attr:`srcCol`. - """ - return self._set(srcCol=value) - - @since("2.4.0") - def getSrcCol(self): - """ - Gets the value of :py:attr:`srcCol` or its default value. 
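# --- Editor's illustrative sketch (not part of the removed vendored file) ---
# Running PowerIterationClustering on a small affinity graph through
# assignClusters (PIC is not an Estimator/Transformer, as noted above).
# Assumes an active SparkSession `spark`; the edge weights are made up.
from pyspark.ml.clustering import PowerIterationClustering

edges = spark.createDataFrame(
    [(0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0), (3, 4, 1.0), (4, 0, 0.1)],
    ["src", "dst", "weight"])
pic = PowerIterationClustering(k=2, maxIter=20, weightCol="weight")
assignments = pic.assignClusters(edges)   # DataFrame with columns id, cluster
assignments.orderBy("id").show()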
- """ - return self.getOrDefault(self.srcCol) - - @since("2.4.0") - def setDstCol(self, value): - """ - Sets the value of :py:attr:`dstCol`. - """ - return self._set(dstCol=value) - - @since("2.4.0") - def getDstCol(self): - """ - Gets the value of :py:attr:`dstCol` or its default value. - """ - return self.getOrDefault(self.dstCol) - - @since("2.4.0") - def assignClusters(self, dataset): - """ - Run the PIC algorithm and returns a cluster assignment for each input vertex. - - :param dataset: - A dataset with columns src, dst, weight representing the affinity matrix, - which is the matrix A in the PIC paper. Suppose the src column value is i, - the dst column value is j, the weight column value is similarity s,,ij,, - which must be nonnegative. This is a symmetric matrix and hence - s,,ij,, = s,,ji,,. For any (i, j) with nonzero similarity, there should be - either (i, j, s,,ij,,) or (j, i, s,,ji,,) in the input. Rows with i = j are - ignored, because we assume s,,ij,, = 0.0. - - :return: - A dataset that contains columns of vertex id and the corresponding cluster for - the id. The schema of it will be: - - id: Long - - cluster: Int - - .. versionadded:: 2.4.0 - """ - self._transfer_params_to_java() - jdf = self._java_obj.assignClusters(dataset._jdf) - return DataFrame(jdf, dataset.sql_ctx) - - -if __name__ == "__main__": - import doctest - import numpy - import pyspark.ml.clustering - from pyspark.sql import SparkSession - try: - # Numpy 1.14+ changed it's string format. - numpy.set_printoptions(legacy='1.13') - except TypeError: - pass - globs = pyspark.ml.clustering.__dict__.copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("ml.clustering tests")\ - .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc - globs['spark'] = spark - import tempfile - temp_path = tempfile.mkdtemp() - globs['temp_path'] = temp_path - try: - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - finally: - from shutil import rmtree - try: - rmtree(temp_path) - except OSError: - pass - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/common.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/common.py deleted file mode 100644 index 387c5d7..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/common.py +++ /dev/null @@ -1,138 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import sys -if sys.version >= '3': - long = int - unicode = str - -import py4j.protocol -from py4j.protocol import Py4JJavaError -from py4j.java_gateway import JavaObject -from py4j.java_collections import JavaArray, JavaList - -from pyspark import RDD, SparkContext -from pyspark.serializers import PickleSerializer, AutoBatchedSerializer -from pyspark.sql import DataFrame, SQLContext - -# Hack for support float('inf') in Py4j -_old_smart_decode = py4j.protocol.smart_decode - -_float_str_mapping = { - 'nan': 'NaN', - 'inf': 'Infinity', - '-inf': '-Infinity', -} - - -def _new_smart_decode(obj): - if isinstance(obj, float): - s = str(obj) - return _float_str_mapping.get(s, s) - return _old_smart_decode(obj) - -py4j.protocol.smart_decode = _new_smart_decode - - -_picklable_classes = [ - 'SparseVector', - 'DenseVector', - 'SparseMatrix', - 'DenseMatrix', -] - - -# this will call the ML version of pythonToJava() -def _to_java_object_rdd(rdd): - """ Return an JavaRDD of Object by unpickling - - It will convert each Python object into Java object by Pyrolite, whenever the - RDD is serialized in batch or not. - """ - rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer())) - return rdd.ctx._jvm.org.apache.spark.ml.python.MLSerDe.pythonToJava(rdd._jrdd, True) - - -def _py2java(sc, obj): - """ Convert Python object into Java """ - if isinstance(obj, RDD): - obj = _to_java_object_rdd(obj) - elif isinstance(obj, DataFrame): - obj = obj._jdf - elif isinstance(obj, SparkContext): - obj = obj._jsc - elif isinstance(obj, list): - obj = [_py2java(sc, x) for x in obj] - elif isinstance(obj, JavaObject): - pass - elif isinstance(obj, (int, long, float, bool, bytes, unicode)): - pass - else: - data = bytearray(PickleSerializer().dumps(obj)) - obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data) - return obj - - -def _java2py(sc, r, encoding="bytes"): - if isinstance(r, JavaObject): - clsName = r.getClass().getSimpleName() - # convert RDD into JavaRDD - if clsName != 'JavaRDD' and clsName.endswith("RDD"): - r = r.toJavaRDD() - clsName = 'JavaRDD' - - if clsName == 'JavaRDD': - jrdd = sc._jvm.org.apache.spark.ml.python.MLSerDe.javaToPython(r) - return RDD(jrdd, sc) - - if clsName == 'Dataset': - return DataFrame(r, SQLContext.getOrCreate(sc)) - - if clsName in _picklable_classes: - r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r) - elif isinstance(r, (JavaArray, JavaList)): - try: - r = sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(r) - except Py4JJavaError: - pass # not pickable - - if isinstance(r, (bytearray, bytes)): - r = PickleSerializer().loads(bytes(r), encoding=encoding) - return r - - -def callJavaFunc(sc, func, *args): - """ Call Java Function """ - args = [_py2java(sc, a) for a in args] - return _java2py(sc, func(*args)) - - -def inherit_doc(cls): - """ - A decorator that makes a class inherit documentation from its parents. 
- """ - for name, func in vars(cls).items(): - # only inherit docstring for public functions - if name.startswith("_"): - continue - if not func.__doc__: - for parent in cls.__bases__: - parent_func = getattr(parent, name, None) - if parent_func and getattr(parent_func, "__doc__", None): - func.__doc__ = parent_func.__doc__ - break - return cls diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/evaluation.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/evaluation.py deleted file mode 100644 index 8eaf076..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/evaluation.py +++ /dev/null @@ -1,450 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -from abc import abstractmethod, ABCMeta - -from pyspark import since, keyword_only -from pyspark.ml.wrapper import JavaParams -from pyspark.ml.param import Param, Params, TypeConverters -from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasRawPredictionCol, \ - HasFeaturesCol -from pyspark.ml.common import inherit_doc -from pyspark.ml.util import JavaMLReadable, JavaMLWritable - -__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator', - 'MulticlassClassificationEvaluator', 'ClusteringEvaluator'] - - -@inherit_doc -class Evaluator(Params): - """ - Base class for evaluators that compute metrics from predictions. - - .. versionadded:: 1.4.0 - """ - - __metaclass__ = ABCMeta - - @abstractmethod - def _evaluate(self, dataset): - """ - Evaluates the output. - - :param dataset: a dataset that contains labels/observations and - predictions - :return: metric - """ - raise NotImplementedError() - - @since("1.4.0") - def evaluate(self, dataset, params=None): - """ - Evaluates the output with optional parameters. - - :param dataset: a dataset that contains labels/observations and - predictions - :param params: an optional param map that overrides embedded - params - :return: metric - """ - if params is None: - params = dict() - if isinstance(params, dict): - if params: - return self.copy(params)._evaluate(dataset) - else: - return self._evaluate(dataset) - else: - raise ValueError("Params must be a param map but got %s." % type(params)) - - @since("1.5.0") - def isLargerBetter(self): - """ - Indicates whether the metric returned by :py:meth:`evaluate` should be maximized - (True, default) or minimized (False). - A given evaluator may support multiple metrics which may be maximized or minimized. - """ - return True - - -@inherit_doc -class JavaEvaluator(JavaParams, Evaluator): - """ - Base class for :py:class:`Evaluator`s that wrap Java/Scala - implementations. - """ - - __metaclass__ = ABCMeta - - def _evaluate(self, dataset): - """ - Evaluates the output. - :param dataset: a dataset that contains labels/observations and predictions. 
- :return: evaluation metric - """ - self._transfer_params_to_java() - return self._java_obj.evaluate(dataset._jdf) - - def isLargerBetter(self): - self._transfer_params_to_java() - return self._java_obj.isLargerBetter() - - -@inherit_doc -class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, - JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Evaluator for binary classification, which expects two input columns: rawPrediction and label. - The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label - 1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities). - - >>> from pyspark.ml.linalg import Vectors - >>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]), - ... [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)]) - >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"]) - ... - >>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw") - >>> evaluator.evaluate(dataset) - 0.70... - >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"}) - 0.83... - >>> bce_path = temp_path + "/bce" - >>> evaluator.save(bce_path) - >>> evaluator2 = BinaryClassificationEvaluator.load(bce_path) - >>> str(evaluator2.getRawPredictionCol()) - 'raw' - - .. versionadded:: 1.4.0 - """ - - metricName = Param(Params._dummy(), "metricName", - "metric name in evaluation (areaUnderROC|areaUnderPR)", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, rawPredictionCol="rawPrediction", labelCol="label", - metricName="areaUnderROC"): - """ - __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \ - metricName="areaUnderROC") - """ - super(BinaryClassificationEvaluator, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid) - self._setDefault(metricName="areaUnderROC") - kwargs = self._input_kwargs - self._set(**kwargs) - - @since("1.4.0") - def setMetricName(self, value): - """ - Sets the value of :py:attr:`metricName`. - """ - return self._set(metricName=value) - - @since("1.4.0") - def getMetricName(self): - """ - Gets the value of metricName or its default value. - """ - return self.getOrDefault(self.metricName) - - @keyword_only - @since("1.4.0") - def setParams(self, rawPredictionCol="rawPrediction", labelCol="label", - metricName="areaUnderROC"): - """ - setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \ - metricName="areaUnderROC") - Sets params for binary classification evaluator. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - -@inherit_doc -class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, - JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Evaluator for Regression, which expects two input - columns: prediction and label. - - >>> scoreAndLabels = [(-28.98343821, -27.0), (20.21491975, 21.5), - ... (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)] - >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"]) - ... - >>> evaluator = RegressionEvaluator(predictionCol="raw") - >>> evaluator.evaluate(dataset) - 2.842... - >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"}) - 0.993... - >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"}) - 2.649... 
- >>> re_path = temp_path + "/re" - >>> evaluator.save(re_path) - >>> evaluator2 = RegressionEvaluator.load(re_path) - >>> str(evaluator2.getPredictionCol()) - 'raw' - - .. versionadded:: 1.4.0 - """ - metricName = Param(Params._dummy(), "metricName", - """metric name in evaluation - one of: - rmse - root mean squared error (default) - mse - mean squared error - r2 - r^2 metric - mae - mean absolute error.""", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, predictionCol="prediction", labelCol="label", - metricName="rmse"): - """ - __init__(self, predictionCol="prediction", labelCol="label", \ - metricName="rmse") - """ - super(RegressionEvaluator, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid) - self._setDefault(metricName="rmse") - kwargs = self._input_kwargs - self._set(**kwargs) - - @since("1.4.0") - def setMetricName(self, value): - """ - Sets the value of :py:attr:`metricName`. - """ - return self._set(metricName=value) - - @since("1.4.0") - def getMetricName(self): - """ - Gets the value of metricName or its default value. - """ - return self.getOrDefault(self.metricName) - - @keyword_only - @since("1.4.0") - def setParams(self, predictionCol="prediction", labelCol="label", - metricName="rmse"): - """ - setParams(self, predictionCol="prediction", labelCol="label", \ - metricName="rmse") - Sets params for regression evaluator. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - -@inherit_doc -class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, - JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Evaluator for Multiclass Classification, which expects two input - columns: prediction and label. - - >>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), - ... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)] - >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"]) - ... - >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") - >>> evaluator.evaluate(dataset) - 0.66... - >>> evaluator.evaluate(dataset, {evaluator.metricName: "accuracy"}) - 0.66... - >>> mce_path = temp_path + "/mce" - >>> evaluator.save(mce_path) - >>> evaluator2 = MulticlassClassificationEvaluator.load(mce_path) - >>> str(evaluator2.getPredictionCol()) - 'prediction' - - .. versionadded:: 1.5.0 - """ - metricName = Param(Params._dummy(), "metricName", - "metric name in evaluation " - "(f1|weightedPrecision|weightedRecall|accuracy)", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, predictionCol="prediction", labelCol="label", - metricName="f1"): - """ - __init__(self, predictionCol="prediction", labelCol="label", \ - metricName="f1") - """ - super(MulticlassClassificationEvaluator, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid) - self._setDefault(metricName="f1") - kwargs = self._input_kwargs - self._set(**kwargs) - - @since("1.5.0") - def setMetricName(self, value): - """ - Sets the value of :py:attr:`metricName`. - """ - return self._set(metricName=value) - - @since("1.5.0") - def getMetricName(self): - """ - Gets the value of metricName or its default value. 
- """ - return self.getOrDefault(self.metricName) - - @keyword_only - @since("1.5.0") - def setParams(self, predictionCol="prediction", labelCol="label", - metricName="f1"): - """ - setParams(self, predictionCol="prediction", labelCol="label", \ - metricName="f1") - Sets params for multiclass classification evaluator. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - -@inherit_doc -class ClusteringEvaluator(JavaEvaluator, HasPredictionCol, HasFeaturesCol, - JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Evaluator for Clustering results, which expects two input - columns: prediction and features. The metric computes the Silhouette - measure using the squared Euclidean distance. - - The Silhouette is a measure for the validation of the consistency - within clusters. It ranges between 1 and -1, where a value close to - 1 means that the points in a cluster are close to the other points - in the same cluster and far from the points of the other clusters. - - >>> from pyspark.ml.linalg import Vectors - >>> featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]), - ... [([0.0, 0.5], 0.0), ([0.5, 0.0], 0.0), ([10.0, 11.0], 1.0), - ... ([10.5, 11.5], 1.0), ([1.0, 1.0], 0.0), ([8.0, 6.0], 1.0)]) - >>> dataset = spark.createDataFrame(featureAndPredictions, ["features", "prediction"]) - ... - >>> evaluator = ClusteringEvaluator(predictionCol="prediction") - >>> evaluator.evaluate(dataset) - 0.9079... - >>> ce_path = temp_path + "/ce" - >>> evaluator.save(ce_path) - >>> evaluator2 = ClusteringEvaluator.load(ce_path) - >>> str(evaluator2.getPredictionCol()) - 'prediction' - - .. versionadded:: 2.3.0 - """ - metricName = Param(Params._dummy(), "metricName", - "metric name in evaluation (silhouette)", - typeConverter=TypeConverters.toString) - distanceMeasure = Param(Params._dummy(), "distanceMeasure", "The distance measure. " + - "Supported options: 'squaredEuclidean' and 'cosine'.", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, predictionCol="prediction", featuresCol="features", - metricName="silhouette", distanceMeasure="squaredEuclidean"): - """ - __init__(self, predictionCol="prediction", featuresCol="features", \ - metricName="silhouette", distanceMeasure="squaredEuclidean") - """ - super(ClusteringEvaluator, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.evaluation.ClusteringEvaluator", self.uid) - self._setDefault(metricName="silhouette", distanceMeasure="squaredEuclidean") - kwargs = self._input_kwargs - self._set(**kwargs) - - @since("2.3.0") - def setMetricName(self, value): - """ - Sets the value of :py:attr:`metricName`. - """ - return self._set(metricName=value) - - @since("2.3.0") - def getMetricName(self): - """ - Gets the value of metricName or its default value. - """ - return self.getOrDefault(self.metricName) - - @keyword_only - @since("2.3.0") - def setParams(self, predictionCol="prediction", featuresCol="features", - metricName="silhouette", distanceMeasure="squaredEuclidean"): - """ - setParams(self, predictionCol="prediction", featuresCol="features", \ - metricName="silhouette", distanceMeasure="squaredEuclidean") - Sets params for clustering evaluator. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.4.0") - def setDistanceMeasure(self, value): - """ - Sets the value of :py:attr:`distanceMeasure`. 
- """ - return self._set(distanceMeasure=value) - - @since("2.4.0") - def getDistanceMeasure(self): - """ - Gets the value of `distanceMeasure` - """ - return self.getOrDefault(self.distanceMeasure) - - -if __name__ == "__main__": - import doctest - import tempfile - import pyspark.ml.evaluation - from pyspark.sql import SparkSession - globs = pyspark.ml.evaluation.__dict__.copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("ml.evaluation tests")\ - .getOrCreate() - globs['spark'] = spark - temp_path = tempfile.mkdtemp() - globs['temp_path'] = temp_path - try: - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - finally: - from shutil import rmtree - try: - rmtree(temp_path) - except OSError: - pass - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/feature.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/feature.py deleted file mode 100755 index bc4f4c9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/feature.py +++ /dev/null @@ -1,3901 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import sys -if sys.version > '3': - basestring = str - -from pyspark import since, keyword_only, SparkContext -from pyspark.rdd import ignore_unicode_prefix -from pyspark.ml.linalg import _convert_to_vector -from pyspark.ml.param.shared import * -from pyspark.ml.util import JavaMLReadable, JavaMLWritable -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaTransformer, _jvm -from pyspark.ml.common import inherit_doc - -__all__ = ['Binarizer', - 'BucketedRandomProjectionLSH', 'BucketedRandomProjectionLSHModel', - 'Bucketizer', - 'ChiSqSelector', 'ChiSqSelectorModel', - 'CountVectorizer', 'CountVectorizerModel', - 'DCT', - 'ElementwiseProduct', - 'FeatureHasher', - 'HashingTF', - 'IDF', 'IDFModel', - 'Imputer', 'ImputerModel', - 'IndexToString', - 'MaxAbsScaler', 'MaxAbsScalerModel', - 'MinHashLSH', 'MinHashLSHModel', - 'MinMaxScaler', 'MinMaxScalerModel', - 'NGram', - 'Normalizer', - 'OneHotEncoder', - 'OneHotEncoderEstimator', 'OneHotEncoderModel', - 'PCA', 'PCAModel', - 'PolynomialExpansion', - 'QuantileDiscretizer', - 'RegexTokenizer', - 'RFormula', 'RFormulaModel', - 'SQLTransformer', - 'StandardScaler', 'StandardScalerModel', - 'StopWordsRemover', - 'StringIndexer', 'StringIndexerModel', - 'Tokenizer', - 'VectorAssembler', - 'VectorIndexer', 'VectorIndexerModel', - 'VectorSizeHint', - 'VectorSlicer', - 'Word2Vec', 'Word2VecModel'] - - -@inherit_doc -class Binarizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - Binarize a column of continuous features given a threshold. - - >>> df = spark.createDataFrame([(0.5,)], ["values"]) - >>> binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features") - >>> binarizer.transform(df).head().features - 0.0 - >>> binarizer.setParams(outputCol="freqs").transform(df).head().freqs - 0.0 - >>> params = {binarizer.threshold: -0.5, binarizer.outputCol: "vector"} - >>> binarizer.transform(df, params).head().vector - 1.0 - >>> binarizerPath = temp_path + "/binarizer" - >>> binarizer.save(binarizerPath) - >>> loadedBinarizer = Binarizer.load(binarizerPath) - >>> loadedBinarizer.getThreshold() == binarizer.getThreshold() - True - - .. versionadded:: 1.4.0 - """ - - threshold = Param(Params._dummy(), "threshold", - "threshold in binary classification prediction, in range [0, 1]", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, threshold=0.0, inputCol=None, outputCol=None): - """ - __init__(self, threshold=0.0, inputCol=None, outputCol=None) - """ - super(Binarizer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid) - self._setDefault(threshold=0.0) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, threshold=0.0, inputCol=None, outputCol=None): - """ - setParams(self, threshold=0.0, inputCol=None, outputCol=None) - Sets params for this Binarizer. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setThreshold(self, value): - """ - Sets the value of :py:attr:`threshold`. - """ - return self._set(threshold=value) - - @since("1.4.0") - def getThreshold(self): - """ - Gets the value of threshold or its default value. - """ - return self.getOrDefault(self.threshold) - - -class LSHParams(Params): - """ - Mixin for Locality Sensitive Hashing (LSH) algorithm parameters. 
- """ - - numHashTables = Param(Params._dummy(), "numHashTables", "number of hash tables, where " + - "increasing number of hash tables lowers the false negative rate, " + - "and decreasing it improves the running performance.", - typeConverter=TypeConverters.toInt) - - def __init__(self): - super(LSHParams, self).__init__() - - def setNumHashTables(self, value): - """ - Sets the value of :py:attr:`numHashTables`. - """ - return self._set(numHashTables=value) - - def getNumHashTables(self): - """ - Gets the value of numHashTables or its default value. - """ - return self.getOrDefault(self.numHashTables) - - -class LSHModel(JavaModel): - """ - Mixin for Locality Sensitive Hashing (LSH) models. - """ - - def approxNearestNeighbors(self, dataset, key, numNearestNeighbors, distCol="distCol"): - """ - Given a large dataset and an item, approximately find at most k items which have the - closest distance to the item. If the :py:attr:`outputCol` is missing, the method will - transform the data; if the :py:attr:`outputCol` exists, it will use that. This allows - caching of the transformed data when necessary. - - .. note:: This method is experimental and will likely change behavior in the next release. - - :param dataset: The dataset to search for nearest neighbors of the key. - :param key: Feature vector representing the item to search for. - :param numNearestNeighbors: The maximum number of nearest neighbors. - :param distCol: Output column for storing the distance between each result row and the key. - Use "distCol" as default value if it's not specified. - :return: A dataset containing at most k items closest to the key. A column "distCol" is - added to show the distance between each row and the key. - """ - return self._call_java("approxNearestNeighbors", dataset, key, numNearestNeighbors, - distCol) - - def approxSimilarityJoin(self, datasetA, datasetB, threshold, distCol="distCol"): - """ - Join two datasets to approximately find all pairs of rows whose distance are smaller than - the threshold. If the :py:attr:`outputCol` is missing, the method will transform the data; - if the :py:attr:`outputCol` exists, it will use that. This allows caching of the - transformed data when necessary. - - :param datasetA: One of the datasets to join. - :param datasetB: Another dataset to join. - :param threshold: The threshold for the distance of row pairs. - :param distCol: Output column for storing the distance between each pair of rows. Use - "distCol" as default value if it's not specified. - :return: A joined dataset containing pairs of rows. The original rows are in columns - "datasetA" and "datasetB", and a column "distCol" is added to show the distance - between each pair. - """ - threshold = TypeConverters.toFloat(threshold) - return self._call_java("approxSimilarityJoin", datasetA, datasetB, threshold, distCol) - - -@inherit_doc -class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed, - JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - LSH class for Euclidean distance metrics. - The input is dense or sparse vectors, each of which represents a point in the Euclidean - distance space. The output will be vectors of configurable dimension. Hash values in the same - dimension are calculated by the same hash function. - - .. seealso:: `Stable Distributions - `_ - .. 
seealso:: `Hashing for Similarity Search: A Survey `_ - - >>> from pyspark.ml.linalg import Vectors - >>> from pyspark.sql.functions import col - >>> data = [(0, Vectors.dense([-1.0, -1.0 ]),), - ... (1, Vectors.dense([-1.0, 1.0 ]),), - ... (2, Vectors.dense([1.0, -1.0 ]),), - ... (3, Vectors.dense([1.0, 1.0]),)] - >>> df = spark.createDataFrame(data, ["id", "features"]) - >>> brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", - ... seed=12345, bucketLength=1.0) - >>> model = brp.fit(df) - >>> model.transform(df).head() - Row(id=0, features=DenseVector([-1.0, -1.0]), hashes=[DenseVector([-1.0])]) - >>> data2 = [(4, Vectors.dense([2.0, 2.0 ]),), - ... (5, Vectors.dense([2.0, 3.0 ]),), - ... (6, Vectors.dense([3.0, 2.0 ]),), - ... (7, Vectors.dense([3.0, 3.0]),)] - >>> df2 = spark.createDataFrame(data2, ["id", "features"]) - >>> model.approxNearestNeighbors(df2, Vectors.dense([1.0, 2.0]), 1).collect() - [Row(id=4, features=DenseVector([2.0, 2.0]), hashes=[DenseVector([1.0])], distCol=1.0)] - >>> model.approxSimilarityJoin(df, df2, 3.0, distCol="EuclideanDistance").select( - ... col("datasetA.id").alias("idA"), - ... col("datasetB.id").alias("idB"), - ... col("EuclideanDistance")).show() - +---+---+-----------------+ - |idA|idB|EuclideanDistance| - +---+---+-----------------+ - | 3| 6| 2.23606797749979| - +---+---+-----------------+ - ... - >>> model.approxSimilarityJoin(df, df2, 3, distCol="EuclideanDistance").select( - ... col("datasetA.id").alias("idA"), - ... col("datasetB.id").alias("idB"), - ... col("EuclideanDistance")).show() - +---+---+-----------------+ - |idA|idB|EuclideanDistance| - +---+---+-----------------+ - | 3| 6| 2.23606797749979| - +---+---+-----------------+ - ... - >>> brpPath = temp_path + "/brp" - >>> brp.save(brpPath) - >>> brp2 = BucketedRandomProjectionLSH.load(brpPath) - >>> brp2.getBucketLength() == brp.getBucketLength() - True - >>> modelPath = temp_path + "/brp-model" - >>> model.save(modelPath) - >>> model2 = BucketedRandomProjectionLSHModel.load(modelPath) - >>> model.transform(df).head().hashes == model2.transform(df).head().hashes - True - - .. versionadded:: 2.2.0 - """ - - bucketLength = Param(Params._dummy(), "bucketLength", "the length of each hash bucket, " + - "a larger bucket lowers the false negative rate.", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, seed=None, numHashTables=1, - bucketLength=None): - """ - __init__(self, inputCol=None, outputCol=None, seed=None, numHashTables=1, \ - bucketLength=None) - """ - super(BucketedRandomProjectionLSH, self).__init__() - self._java_obj = \ - self._new_java_obj("org.apache.spark.ml.feature.BucketedRandomProjectionLSH", self.uid) - self._setDefault(numHashTables=1) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.2.0") - def setParams(self, inputCol=None, outputCol=None, seed=None, numHashTables=1, - bucketLength=None): - """ - setParams(self, inputCol=None, outputCol=None, seed=None, numHashTables=1, \ - bucketLength=None) - Sets params for this BucketedRandomProjectionLSH. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.2.0") - def setBucketLength(self, value): - """ - Sets the value of :py:attr:`bucketLength`. - """ - return self._set(bucketLength=value) - - @since("2.2.0") - def getBucketLength(self): - """ - Gets the value of bucketLength or its default value. 
- """ - return self.getOrDefault(self.bucketLength) - - def _create_model(self, java_model): - return BucketedRandomProjectionLSHModel(java_model) - - -class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): - r""" - .. note:: Experimental - - Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are - stored. The vectors are normalized to be unit vectors and each vector is used in a hash - function: :math:`h_i(x) = floor(r_i \cdot x / bucketLength)` where :math:`r_i` is the - i-th random unit vector. The number of buckets will be `(max L2 norm of input vectors) / - bucketLength`. - - .. versionadded:: 2.2.0 - """ - - -@inherit_doc -class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid, - JavaMLReadable, JavaMLWritable): - """ - Maps a column of continuous features to a column of feature buckets. - - >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)] - >>> df = spark.createDataFrame(values, ["values"]) - >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")], - ... inputCol="values", outputCol="buckets") - >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect() - >>> len(bucketed) - 6 - >>> bucketed[0].buckets - 0.0 - >>> bucketed[1].buckets - 0.0 - >>> bucketed[2].buckets - 1.0 - >>> bucketed[3].buckets - 2.0 - >>> bucketizer.setParams(outputCol="b").transform(df).head().b - 0.0 - >>> bucketizerPath = temp_path + "/bucketizer" - >>> bucketizer.save(bucketizerPath) - >>> loadedBucketizer = Bucketizer.load(bucketizerPath) - >>> loadedBucketizer.getSplits() == bucketizer.getSplits() - True - >>> bucketed = bucketizer.setHandleInvalid("skip").transform(df).collect() - >>> len(bucketed) - 4 - - .. versionadded:: 1.4.0 - """ - - splits = \ - Param(Params._dummy(), "splits", - "Split points for mapping continuous features into buckets. With n+1 splits, " + - "there are n buckets. A bucket defined by splits x,y holds values in the " + - "range [x,y) except the last bucket, which also includes y. The splits " + - "should be of length >= 3 and strictly increasing. Values at -inf, inf must be " + - "explicitly provided to cover all Double values; otherwise, values outside the " + - "splits specified will be treated as errors.", - typeConverter=TypeConverters.toListFloat) - - handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " + - "Options are 'skip' (filter out rows with invalid values), " + - "'error' (throw an error), or 'keep' (keep invalid values in a special " + - "additional bucket).", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"): - """ - __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error") - """ - super(Bucketizer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid) - self._setDefault(handleInvalid="error") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"): - """ - setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error") - Sets params for this Bucketizer. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setSplits(self, value): - """ - Sets the value of :py:attr:`splits`. 
- """ - return self._set(splits=value) - - @since("1.4.0") - def getSplits(self): - """ - Gets the value of threshold or its default value. - """ - return self.getOrDefault(self.splits) - - -class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol): - """ - Params for :py:attr:`CountVectorizer` and :py:attr:`CountVectorizerModel`. - """ - - minTF = Param( - Params._dummy(), "minTF", "Filter to ignore rare words in" + - " a document. For each document, terms with frequency/count less than the given" + - " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" + - " times the term must appear in the document); if this is a double in [0,1), then this " + - "specifies a fraction (out of the document's token count). Note that the parameter is " + - "only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0", - typeConverter=TypeConverters.toFloat) - minDF = Param( - Params._dummy(), "minDF", "Specifies the minimum number of" + - " different documents a term must appear in to be included in the vocabulary." + - " If this is an integer >= 1, this specifies the number of documents the term must" + - " appear in; if this is a double in [0,1), then this specifies the fraction of documents." + - " Default 1.0", typeConverter=TypeConverters.toFloat) - maxDF = Param( - Params._dummy(), "maxDF", "Specifies the maximum number of" + - " different documents a term could appear in to be included in the vocabulary." + - " A term that appears more than the threshold will be ignored. If this is an" + - " integer >= 1, this specifies the maximum number of documents the term could appear in;" + - " if this is a double in [0,1), then this specifies the maximum" + - " fraction of documents the term could appear in." + - " Default (2^63) - 1", typeConverter=TypeConverters.toFloat) - vocabSize = Param( - Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.", - typeConverter=TypeConverters.toInt) - binary = Param( - Params._dummy(), "binary", "Binary toggle to control the output vector values." + - " If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful" + - " for discrete probabilistic models that model binary events rather than integer counts." + - " Default False", typeConverter=TypeConverters.toBoolean) - - def __init__(self, *args): - super(_CountVectorizerParams, self).__init__(*args) - self._setDefault(minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False) - - @since("1.6.0") - def getMinTF(self): - """ - Gets the value of minTF or its default value. - """ - return self.getOrDefault(self.minTF) - - @since("1.6.0") - def getMinDF(self): - """ - Gets the value of minDF or its default value. - """ - return self.getOrDefault(self.minDF) - - @since("2.4.0") - def getMaxDF(self): - """ - Gets the value of maxDF or its default value. - """ - return self.getOrDefault(self.maxDF) - - @since("1.6.0") - def getVocabSize(self): - """ - Gets the value of vocabSize or its default value. - """ - return self.getOrDefault(self.vocabSize) - - @since("2.0.0") - def getBinary(self): - """ - Gets the value of binary or its default value. - """ - return self.getOrDefault(self.binary) - - -@inherit_doc -class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, JavaMLWritable): - """ - Extracts a vocabulary from document collections and generates a :py:attr:`CountVectorizerModel`. - - >>> df = spark.createDataFrame( - ... 
[(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])], - ... ["label", "raw"]) - >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors") - >>> model = cv.fit(df) - >>> model.transform(df).show(truncate=False) - +-----+---------------+-------------------------+ - |label|raw |vectors | - +-----+---------------+-------------------------+ - |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])| - |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])| - +-----+---------------+-------------------------+ - ... - >>> sorted(model.vocabulary) == ['a', 'b', 'c'] - True - >>> countVectorizerPath = temp_path + "/count-vectorizer" - >>> cv.save(countVectorizerPath) - >>> loadedCv = CountVectorizer.load(countVectorizerPath) - >>> loadedCv.getMinDF() == cv.getMinDF() - True - >>> loadedCv.getMinTF() == cv.getMinTF() - True - >>> loadedCv.getVocabSize() == cv.getVocabSize() - True - >>> modelPath = temp_path + "/count-vectorizer-model" - >>> model.save(modelPath) - >>> loadedModel = CountVectorizerModel.load(modelPath) - >>> loadedModel.vocabulary == model.vocabulary - True - >>> fromVocabModel = CountVectorizerModel.from_vocabulary(["a", "b", "c"], - ... inputCol="raw", outputCol="vectors") - >>> fromVocabModel.transform(df).show(truncate=False) - +-----+---------------+-------------------------+ - |label|raw |vectors | - +-----+---------------+-------------------------+ - |0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])| - |1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])| - +-----+---------------+-------------------------+ - ... - - .. versionadded:: 1.6.0 - """ - - @keyword_only - def __init__(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False, - inputCol=None, outputCol=None): - """ - __init__(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,\ - inputCol=None,outputCol=None) - """ - super(CountVectorizer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer", - self.uid) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.6.0") - def setParams(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False, - inputCol=None, outputCol=None): - """ - setParams(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,\ - inputCol=None, outputCol=None) - Set the params for the CountVectorizer - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.6.0") - def setMinTF(self, value): - """ - Sets the value of :py:attr:`minTF`. - """ - return self._set(minTF=value) - - @since("1.6.0") - def setMinDF(self, value): - """ - Sets the value of :py:attr:`minDF`. - """ - return self._set(minDF=value) - - @since("2.4.0") - def setMaxDF(self, value): - """ - Sets the value of :py:attr:`maxDF`. - """ - return self._set(maxDF=value) - - @since("1.6.0") - def setVocabSize(self, value): - """ - Sets the value of :py:attr:`vocabSize`. - """ - return self._set(vocabSize=value) - - @since("2.0.0") - def setBinary(self, value): - """ - Sets the value of :py:attr:`binary`. - """ - return self._set(binary=value) - - def _create_model(self, java_model): - return CountVectorizerModel(java_model) - - -@inherit_doc -class CountVectorizerModel(JavaModel, _CountVectorizerParams, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`CountVectorizer`. - - .. 
versionadded:: 1.6.0 - """ - - @classmethod - @since("2.4.0") - def from_vocabulary(cls, vocabulary, inputCol, outputCol=None, minTF=None, binary=None): - """ - Construct the model directly from a vocabulary list of strings, - requires an active SparkContext. - """ - sc = SparkContext._active_spark_context - java_class = sc._gateway.jvm.java.lang.String - jvocab = CountVectorizerModel._new_java_array(vocabulary, java_class) - model = CountVectorizerModel._create_from_java_class( - "org.apache.spark.ml.feature.CountVectorizerModel", jvocab) - model.setInputCol(inputCol) - if outputCol is not None: - model.setOutputCol(outputCol) - if minTF is not None: - model.setMinTF(minTF) - if binary is not None: - model.setBinary(binary) - model._set(vocabSize=len(vocabulary)) - return model - - @property - @since("1.6.0") - def vocabulary(self): - """ - An array of terms in the vocabulary. - """ - return self._call_java("vocabulary") - - @since("2.4.0") - def setMinTF(self, value): - """ - Sets the value of :py:attr:`minTF`. - """ - return self._set(minTF=value) - - @since("2.4.0") - def setBinary(self, value): - """ - Sets the value of :py:attr:`binary`. - """ - return self._set(binary=value) - - -@inherit_doc -class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - A feature transformer that takes the 1D discrete cosine transform - of a real vector. No zero padding is performed on the input vector. - It returns a real vector of the same length representing the DCT. - The return vector is scaled such that the transform matrix is - unitary (aka scaled DCT-II). - - .. seealso:: `More information on Wikipedia - `_. - - >>> from pyspark.ml.linalg import Vectors - >>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"]) - >>> dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec") - >>> df2 = dct.transform(df1) - >>> df2.head().resultVec - DenseVector([10.969..., -0.707..., -2.041...]) - >>> df3 = DCT(inverse=True, inputCol="resultVec", outputCol="origVec").transform(df2) - >>> df3.head().origVec - DenseVector([5.0, 8.0, 6.0]) - >>> dctPath = temp_path + "/dct" - >>> dct.save(dctPath) - >>> loadedDtc = DCT.load(dctPath) - >>> loadedDtc.getInverse() - False - - .. versionadded:: 1.6.0 - """ - - inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " + - "default False.", typeConverter=TypeConverters.toBoolean) - - @keyword_only - def __init__(self, inverse=False, inputCol=None, outputCol=None): - """ - __init__(self, inverse=False, inputCol=None, outputCol=None) - """ - super(DCT, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid) - self._setDefault(inverse=False) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.6.0") - def setParams(self, inverse=False, inputCol=None, outputCol=None): - """ - setParams(self, inverse=False, inputCol=None, outputCol=None) - Sets params for this DCT. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.6.0") - def setInverse(self, value): - """ - Sets the value of :py:attr:`inverse`. - """ - return self._set(inverse=value) - - @since("1.6.0") - def getInverse(self): - """ - Gets the value of inverse or its default value. 
- """ - return self.getOrDefault(self.inverse) - - -@inherit_doc -class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, - JavaMLWritable): - """ - Outputs the Hadamard product (i.e., the element-wise product) of each input vector - with a provided "weight" vector. In other words, it scales each column of the dataset - by a scalar multiplier. - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"]) - >>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]), - ... inputCol="values", outputCol="eprod") - >>> ep.transform(df).head().eprod - DenseVector([2.0, 2.0, 9.0]) - >>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod - DenseVector([4.0, 3.0, 15.0]) - >>> elementwiseProductPath = temp_path + "/elementwise-product" - >>> ep.save(elementwiseProductPath) - >>> loadedEp = ElementwiseProduct.load(elementwiseProductPath) - >>> loadedEp.getScalingVec() == ep.getScalingVec() - True - - .. versionadded:: 1.5.0 - """ - - scalingVec = Param(Params._dummy(), "scalingVec", "Vector for hadamard product.", - typeConverter=TypeConverters.toVector) - - @keyword_only - def __init__(self, scalingVec=None, inputCol=None, outputCol=None): - """ - __init__(self, scalingVec=None, inputCol=None, outputCol=None) - """ - super(ElementwiseProduct, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct", - self.uid) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.5.0") - def setParams(self, scalingVec=None, inputCol=None, outputCol=None): - """ - setParams(self, scalingVec=None, inputCol=None, outputCol=None) - Sets params for this ElementwiseProduct. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.0.0") - def setScalingVec(self, value): - """ - Sets the value of :py:attr:`scalingVec`. - """ - return self._set(scalingVec=value) - - @since("2.0.0") - def getScalingVec(self): - """ - Gets the value of scalingVec or its default value. - """ - return self.getOrDefault(self.scalingVec) - - -@inherit_doc -class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable, - JavaMLWritable): - """ - .. note:: Experimental - - Feature hashing projects a set of categorical or numerical features into a feature vector of - specified dimension (typically substantially smaller than that of the original feature - space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing) - to map features to indices in the feature vector. - - The FeatureHasher transformer operates on multiple columns. Each column may contain either - numeric or categorical features. Behavior and handling of column data types is as follows: - - * Numeric columns: - For numeric features, the hash value of the column name is used to map the - feature value to its index in the feature vector. By default, numeric features - are not treated as categorical (even when they are integers). To treat them - as categorical, specify the relevant columns in `categoricalCols`. - - * String columns: - For categorical features, the hash value of the string "column_name=value" - is used to map to the vector index, with an indicator value of `1.0`. - Thus, categorical features are "one-hot" encoded - (similarly to using :py:class:`OneHotEncoder` with `dropLast=false`). - - * Boolean columns: - Boolean values are treated in the same way as string columns. 
That is, - boolean features are represented as "column_name=true" or "column_name=false", - with an indicator value of `1.0`. - - Null (missing) values are ignored (implicitly zero in the resulting feature vector). - - Since a simple modulo is used to transform the hash function to a vector index, - it is advisable to use a power of two as the `numFeatures` parameter; - otherwise the features will not be mapped evenly to the vector indices. - - >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")] - >>> cols = ["real", "bool", "stringNum", "string"] - >>> df = spark.createDataFrame(data, cols) - >>> hasher = FeatureHasher(inputCols=cols, outputCol="features") - >>> hasher.transform(df).head().features - SparseVector(262144, {174475: 2.0, 247670: 1.0, 257907: 1.0, 262126: 1.0}) - >>> hasher.setCategoricalCols(["real"]).transform(df).head().features - SparseVector(262144, {171257: 1.0, 247670: 1.0, 257907: 1.0, 262126: 1.0}) - >>> hasherPath = temp_path + "/hasher" - >>> hasher.save(hasherPath) - >>> loadedHasher = FeatureHasher.load(hasherPath) - >>> loadedHasher.getNumFeatures() == hasher.getNumFeatures() - True - >>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features - True - - .. versionadded:: 2.3.0 - """ - - categoricalCols = Param(Params._dummy(), "categoricalCols", - "numeric columns to treat as categorical", - typeConverter=TypeConverters.toListString) - - @keyword_only - def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None): - """ - __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None) - """ - super(FeatureHasher, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid) - self._setDefault(numFeatures=1 << 18) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.3.0") - def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None): - """ - setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None) - Sets params for this FeatureHasher. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.3.0") - def setCategoricalCols(self, value): - """ - Sets the value of :py:attr:`categoricalCols`. - """ - return self._set(categoricalCols=value) - - @since("2.3.0") - def getCategoricalCols(self): - """ - Gets the value of binary or its default value. - """ - return self.getOrDefault(self.categoricalCols) - - -@inherit_doc -class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable, - JavaMLWritable): - """ - Maps a sequence of terms to their term frequencies using the hashing trick. - Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32) - to calculate the hash code value for the term object. - Since a simple modulo is used to transform the hash function to a column index, - it is advisable to use a power of two as the numFeatures parameter; - otherwise the features will not be mapped evenly to the columns. 
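As a quick illustration of the HashingTF behaviour described just above (MurmurHash3 term hashing into a power-of-two numFeatures space), here is a minimal sketch; it assumes a local SparkSession and a PySpark release matching this vendored 2.x copy, and the sample tokens are made up:

from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF

spark = SparkSession.builder.master("local[1]").appName("hashingtf-sketch").getOrCreate()
df = spark.createDataFrame([(["a", "b", "b", "c"],)], ["words"])
# Keep numFeatures a power of two so terms map evenly onto the vector indices.
tf = HashingTF(numFeatures=16, inputCol="words", outputCol="tf")
tf.transform(df).show(truncate=False)
spark.stop()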
- - >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["words"]) - >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features") - >>> hashingTF.transform(df).head().features - SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0}) - >>> hashingTF.setParams(outputCol="freqs").transform(df).head().freqs - SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0}) - >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"} - >>> hashingTF.transform(df, params).head().vector - SparseVector(5, {0: 1.0, 1: 1.0, 2: 1.0}) - >>> hashingTFPath = temp_path + "/hashing-tf" - >>> hashingTF.save(hashingTFPath) - >>> loadedHashingTF = HashingTF.load(hashingTFPath) - >>> loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures() - True - - .. versionadded:: 1.3.0 - """ - - binary = Param(Params._dummy(), "binary", "If True, all non zero counts are set to 1. " + - "This is useful for discrete probabilistic models that model binary events " + - "rather than integer counts. Default False.", - typeConverter=TypeConverters.toBoolean) - - @keyword_only - def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None): - """ - __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None) - """ - super(HashingTF, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.HashingTF", self.uid) - self._setDefault(numFeatures=1 << 18, binary=False) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.3.0") - def setParams(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None): - """ - setParams(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None) - Sets params for this HashingTF. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.0.0") - def setBinary(self, value): - """ - Sets the value of :py:attr:`binary`. - """ - return self._set(binary=value) - - @since("2.0.0") - def getBinary(self): - """ - Gets the value of binary or its default value. - """ - return self.getOrDefault(self.binary) - - -@inherit_doc -class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - Compute the Inverse Document Frequency (IDF) given a collection of documents. - - >>> from pyspark.ml.linalg import DenseVector - >>> df = spark.createDataFrame([(DenseVector([1.0, 2.0]),), - ... (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"]) - >>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf") - >>> model = idf.fit(df) - >>> model.idf - DenseVector([0.0, 0.0]) - >>> model.transform(df).head().idf - DenseVector([0.0, 0.0]) - >>> idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs - DenseVector([0.0, 0.0]) - >>> params = {idf.minDocFreq: 1, idf.outputCol: "vector"} - >>> idf.fit(df, params).transform(df).head().vector - DenseVector([0.2877, 0.0]) - >>> idfPath = temp_path + "/idf" - >>> idf.save(idfPath) - >>> loadedIdf = IDF.load(idfPath) - >>> loadedIdf.getMinDocFreq() == idf.getMinDocFreq() - True - >>> modelPath = temp_path + "/idf-model" - >>> model.save(modelPath) - >>> loadedModel = IDFModel.load(modelPath) - >>> loadedModel.transform(df).head().idf == model.transform(df).head().idf - True - - .. 
versionadded:: 1.4.0 - """ - - minDocFreq = Param(Params._dummy(), "minDocFreq", - "minimum number of documents in which a term should appear for filtering", - typeConverter=TypeConverters.toInt) - - @keyword_only - def __init__(self, minDocFreq=0, inputCol=None, outputCol=None): - """ - __init__(self, minDocFreq=0, inputCol=None, outputCol=None) - """ - super(IDF, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid) - self._setDefault(minDocFreq=0) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, minDocFreq=0, inputCol=None, outputCol=None): - """ - setParams(self, minDocFreq=0, inputCol=None, outputCol=None) - Sets params for this IDF. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setMinDocFreq(self, value): - """ - Sets the value of :py:attr:`minDocFreq`. - """ - return self._set(minDocFreq=value) - - @since("1.4.0") - def getMinDocFreq(self): - """ - Gets the value of minDocFreq or its default value. - """ - return self.getOrDefault(self.minDocFreq) - - def _create_model(self, java_model): - return IDFModel(java_model) - - -class IDFModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`IDF`. - - .. versionadded:: 1.4.0 - """ - - @property - @since("2.0.0") - def idf(self): - """ - Returns the IDF vector. - """ - return self._call_java("idf") - - -@inherit_doc -class Imputer(JavaEstimator, HasInputCols, JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Imputation estimator for completing missing values, either using the mean or the median - of the columns in which the missing values are located. The input columns should be of - DoubleType or FloatType. Currently Imputer does not support categorical features and - possibly creates incorrect values for a categorical feature. - - Note that the mean/median value is computed after filtering out missing values. - All Null values in the input columns are treated as missing, and so are also imputed. For - computing median, :py:meth:`pyspark.sql.DataFrame.approxQuantile` is used with a - relative error of `0.001`. - - >>> df = spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")), (float("nan"), 3.0), - ... (4.0, 4.0), (5.0, 5.0)], ["a", "b"]) - >>> imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"]) - >>> model = imputer.fit(df) - >>> model.surrogateDF.show() - +---+---+ - | a| b| - +---+---+ - |3.0|4.0| - +---+---+ - ... - >>> model.transform(df).show() - +---+---+-----+-----+ - | a| b|out_a|out_b| - +---+---+-----+-----+ - |1.0|NaN| 1.0| 4.0| - |2.0|NaN| 2.0| 4.0| - |NaN|3.0| 3.0| 3.0| - ... - >>> imputer.setStrategy("median").setMissingValue(1.0).fit(df).transform(df).show() - +---+---+-----+-----+ - | a| b|out_a|out_b| - +---+---+-----+-----+ - |1.0|NaN| 4.0| NaN| - ... - >>> imputerPath = temp_path + "/imputer" - >>> imputer.save(imputerPath) - >>> loadedImputer = Imputer.load(imputerPath) - >>> loadedImputer.getStrategy() == imputer.getStrategy() - True - >>> loadedImputer.getMissingValue() - 1.0 - >>> modelPath = temp_path + "/imputer-model" - >>> model.save(modelPath) - >>> loadedModel = ImputerModel.load(modelPath) - >>> loadedModel.transform(df).head().out_a == model.transform(df).head().out_a - True - - .. 
versionadded:: 2.2.0 - """ - - outputCols = Param(Params._dummy(), "outputCols", - "output column names.", typeConverter=TypeConverters.toListString) - - strategy = Param(Params._dummy(), "strategy", - "strategy for imputation. If mean, then replace missing values using the mean " - "value of the feature. If median, then replace missing values using the " - "median value of the feature.", - typeConverter=TypeConverters.toString) - - missingValue = Param(Params._dummy(), "missingValue", - "The placeholder for the missing values. All occurrences of missingValue " - "will be imputed.", typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, strategy="mean", missingValue=float("nan"), inputCols=None, - outputCols=None): - """ - __init__(self, strategy="mean", missingValue=float("nan"), inputCols=None, \ - outputCols=None): - """ - super(Imputer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Imputer", self.uid) - self._setDefault(strategy="mean", missingValue=float("nan")) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.2.0") - def setParams(self, strategy="mean", missingValue=float("nan"), inputCols=None, - outputCols=None): - """ - setParams(self, strategy="mean", missingValue=float("nan"), inputCols=None, \ - outputCols=None) - Sets params for this Imputer. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.2.0") - def setOutputCols(self, value): - """ - Sets the value of :py:attr:`outputCols`. - """ - return self._set(outputCols=value) - - @since("2.2.0") - def getOutputCols(self): - """ - Gets the value of :py:attr:`outputCols` or its default value. - """ - return self.getOrDefault(self.outputCols) - - @since("2.2.0") - def setStrategy(self, value): - """ - Sets the value of :py:attr:`strategy`. - """ - return self._set(strategy=value) - - @since("2.2.0") - def getStrategy(self): - """ - Gets the value of :py:attr:`strategy` or its default value. - """ - return self.getOrDefault(self.strategy) - - @since("2.2.0") - def setMissingValue(self, value): - """ - Sets the value of :py:attr:`missingValue`. - """ - return self._set(missingValue=value) - - @since("2.2.0") - def getMissingValue(self): - """ - Gets the value of :py:attr:`missingValue` or its default value. - """ - return self.getOrDefault(self.missingValue) - - def _create_model(self, java_model): - return ImputerModel(java_model) - - -class ImputerModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Model fitted by :py:class:`Imputer`. - - .. versionadded:: 2.2.0 - """ - - @property - @since("2.2.0") - def surrogateDF(self): - """ - Returns a DataFrame containing inputCols and their corresponding surrogates, - which are used to replace the missing values in the input DataFrame. - """ - return self._call_java("surrogateDF") - - -@inherit_doc -class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - Rescale each feature individually to range [-1, 1] by dividing through the largest maximum - absolute value in each feature. It does not shift/center the data, and thus does not destroy - any sparsity. 
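To make the MaxAbsScaler description above concrete, here is a minimal sketch under the same assumptions (local SparkSession, the PySpark 2.x API vendored here); the example vectors are made up:

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import MaxAbsScaler

spark = SparkSession.builder.master("local[1]").appName("maxabs-sketch").getOrCreate()
df = spark.createDataFrame([(Vectors.dense([-8.0, 1.0]),),
                            (Vectors.dense([4.0, 10.0]),)], ["features"])
model = MaxAbsScaler(inputCol="features", outputCol="scaled").fit(df)
# Each column is divided by its largest absolute value (8.0 and 10.0 here),
# so the output stays in [-1, 1] without shifting or densifying the data.
model.transform(df).show(truncate=False)
spark.stop()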
- - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"]) - >>> maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled") - >>> model = maScaler.fit(df) - >>> model.transform(df).show() - +-----+------+ - | a|scaled| - +-----+------+ - |[1.0]| [0.5]| - |[2.0]| [1.0]| - +-----+------+ - ... - >>> scalerPath = temp_path + "/max-abs-scaler" - >>> maScaler.save(scalerPath) - >>> loadedMAScaler = MaxAbsScaler.load(scalerPath) - >>> loadedMAScaler.getInputCol() == maScaler.getInputCol() - True - >>> loadedMAScaler.getOutputCol() == maScaler.getOutputCol() - True - >>> modelPath = temp_path + "/max-abs-scaler-model" - >>> model.save(modelPath) - >>> loadedModel = MaxAbsScalerModel.load(modelPath) - >>> loadedModel.maxAbs == model.maxAbs - True - - .. versionadded:: 2.0.0 - """ - - @keyword_only - def __init__(self, inputCol=None, outputCol=None): - """ - __init__(self, inputCol=None, outputCol=None) - """ - super(MaxAbsScaler, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MaxAbsScaler", self.uid) - self._setDefault() - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.0.0") - def setParams(self, inputCol=None, outputCol=None): - """ - setParams(self, inputCol=None, outputCol=None) - Sets params for this MaxAbsScaler. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return MaxAbsScalerModel(java_model) - - -class MaxAbsScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`MaxAbsScaler`. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def maxAbs(self): - """ - Max Abs vector. - """ - return self._call_java("maxAbs") - - -@inherit_doc -class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed, - JavaMLReadable, JavaMLWritable): - - """ - .. note:: Experimental - - LSH class for Jaccard distance. - The input can be dense or sparse vectors, but it is more efficient if it is sparse. - For example, `Vectors.sparse(10, [(2, 1.0), (3, 1.0), (5, 1.0)])` means there are 10 elements - in the space. This set contains elements 2, 3, and 5. Also, any input vector must have at - least 1 non-zero index, and all non-zero values are treated as binary "1" values. - - .. seealso:: `Wikipedia on MinHash `_ - - >>> from pyspark.ml.linalg import Vectors - >>> from pyspark.sql.functions import col - >>> data = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),), - ... (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),), - ... (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)] - >>> df = spark.createDataFrame(data, ["id", "features"]) - >>> mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345) - >>> model = mh.fit(df) - >>> model.transform(df).head() - Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([6179668... - >>> data2 = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),), - ... (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),), - ... (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)] - >>> df2 = spark.createDataFrame(data2, ["id", "features"]) - >>> key = Vectors.sparse(6, [1, 2], [1.0, 1.0]) - >>> model.approxNearestNeighbors(df2, key, 1).collect() - [Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([6179668... - >>> model.approxSimilarityJoin(df, df2, 0.6, distCol="JaccardDistance").select( - ... col("datasetA.id").alias("idA"), - ... 
col("datasetB.id").alias("idB"), - ... col("JaccardDistance")).show() - +---+---+---------------+ - |idA|idB|JaccardDistance| - +---+---+---------------+ - | 0| 5| 0.5| - | 1| 4| 0.5| - +---+---+---------------+ - ... - >>> mhPath = temp_path + "/mh" - >>> mh.save(mhPath) - >>> mh2 = MinHashLSH.load(mhPath) - >>> mh2.getOutputCol() == mh.getOutputCol() - True - >>> modelPath = temp_path + "/mh-model" - >>> model.save(modelPath) - >>> model2 = MinHashLSHModel.load(modelPath) - >>> model.transform(df).head().hashes == model2.transform(df).head().hashes - True - - .. versionadded:: 2.2.0 - """ - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, seed=None, numHashTables=1): - """ - __init__(self, inputCol=None, outputCol=None, seed=None, numHashTables=1) - """ - super(MinHashLSH, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinHashLSH", self.uid) - self._setDefault(numHashTables=1) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.2.0") - def setParams(self, inputCol=None, outputCol=None, seed=None, numHashTables=1): - """ - setParams(self, inputCol=None, outputCol=None, seed=None, numHashTables=1) - Sets params for this MinHashLSH. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return MinHashLSHModel(java_model) - - -class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): - r""" - .. note:: Experimental - - Model produced by :py:class:`MinHashLSH`, where where multiple hash functions are stored. Each - hash function is picked from the following family of hash functions, where :math:`a_i` and - :math:`b_i` are randomly chosen integers less than prime: - :math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise - independent according to the reference. - - .. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear - permutations." Electronic Journal of Combinatorics 7 (2000): R26. - - .. versionadded:: 2.2.0 - """ - - -@inherit_doc -class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - Rescale each feature individually to a common range [min, max] linearly using column summary - statistics, which is also known as min-max normalization or Rescaling. The rescaled value for - feature E is calculated as, - - Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min - - For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min) - - .. note:: Since zero values will probably be transformed to non-zero values, output of the - transformer will be DenseVector even for sparse input. - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) - >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled") - >>> model = mmScaler.fit(df) - >>> model.originalMin - DenseVector([0.0]) - >>> model.originalMax - DenseVector([2.0]) - >>> model.transform(df).show() - +-----+------+ - | a|scaled| - +-----+------+ - |[0.0]| [0.0]| - |[2.0]| [1.0]| - +-----+------+ - ... 
- >>> minMaxScalerPath = temp_path + "/min-max-scaler" - >>> mmScaler.save(minMaxScalerPath) - >>> loadedMMScaler = MinMaxScaler.load(minMaxScalerPath) - >>> loadedMMScaler.getMin() == mmScaler.getMin() - True - >>> loadedMMScaler.getMax() == mmScaler.getMax() - True - >>> modelPath = temp_path + "/min-max-scaler-model" - >>> model.save(modelPath) - >>> loadedModel = MinMaxScalerModel.load(modelPath) - >>> loadedModel.originalMin == model.originalMin - True - >>> loadedModel.originalMax == model.originalMax - True - - .. versionadded:: 1.6.0 - """ - - min = Param(Params._dummy(), "min", "Lower bound of the output feature range", - typeConverter=TypeConverters.toFloat) - max = Param(Params._dummy(), "max", "Upper bound of the output feature range", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): - """ - __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None) - """ - super(MinMaxScaler, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid) - self._setDefault(min=0.0, max=1.0) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.6.0") - def setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None): - """ - setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None) - Sets params for this MinMaxScaler. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.6.0") - def setMin(self, value): - """ - Sets the value of :py:attr:`min`. - """ - return self._set(min=value) - - @since("1.6.0") - def getMin(self): - """ - Gets the value of min or its default value. - """ - return self.getOrDefault(self.min) - - @since("1.6.0") - def setMax(self, value): - """ - Sets the value of :py:attr:`max`. - """ - return self._set(max=value) - - @since("1.6.0") - def getMax(self): - """ - Gets the value of max or its default value. - """ - return self.getOrDefault(self.max) - - def _create_model(self, java_model): - return MinMaxScalerModel(java_model) - - -class MinMaxScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`MinMaxScaler`. - - .. versionadded:: 1.6.0 - """ - - @property - @since("2.0.0") - def originalMin(self): - """ - Min value for each original column during fitting. - """ - return self._call_java("originalMin") - - @property - @since("2.0.0") - def originalMax(self): - """ - Max value for each original column during fitting. - """ - return self._call_java("originalMax") - - -@inherit_doc -@ignore_unicode_prefix -class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - A feature transformer that converts the input array of strings into an array of n-grams. Null - values in the input array are ignored. - It returns an array of n-grams where each n-gram is represented by a space-separated string of - words. - When the input is empty, an empty array is returned. - When the input array length is less than n (number of elements per n-gram), no n-grams are - returned. 
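A minimal sketch of the NGram transformer described above, under the same assumptions (local SparkSession, vendored PySpark 2.x API); the token list is made up:

from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram

spark = SparkSession.builder.master("local[1]").appName("ngram-sketch").getOrCreate()
df = spark.createDataFrame([(["to", "be", "or", "not", "to", "be"],)], ["tokens"])
# n=2 emits space-joined bigrams; inputs shorter than n yield an empty array.
bigrams = NGram(n=2, inputCol="tokens", outputCol="bigrams")
bigrams.transform(df).show(truncate=False)
spark.stop()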
- - >>> df = spark.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])]) - >>> ngram = NGram(n=2, inputCol="inputTokens", outputCol="nGrams") - >>> ngram.transform(df).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b', u'b c', u'c d', u'd e']) - >>> # Change n-gram length - >>> ngram.setParams(n=4).transform(df).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e']) - >>> # Temporarily modify output column. - >>> ngram.transform(df, {ngram.outputCol: "output"}).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], output=[u'a b c d', u'b c d e']) - >>> ngram.transform(df).head() - Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e']) - >>> # Must use keyword arguments to specify params. - >>> ngram.setParams("text") - Traceback (most recent call last): - ... - TypeError: Method setParams forces keyword arguments. - >>> ngramPath = temp_path + "/ngram" - >>> ngram.save(ngramPath) - >>> loadedNGram = NGram.load(ngramPath) - >>> loadedNGram.getN() == ngram.getN() - True - - .. versionadded:: 1.5.0 - """ - - n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)", - typeConverter=TypeConverters.toInt) - - @keyword_only - def __init__(self, n=2, inputCol=None, outputCol=None): - """ - __init__(self, n=2, inputCol=None, outputCol=None) - """ - super(NGram, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid) - self._setDefault(n=2) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.5.0") - def setParams(self, n=2, inputCol=None, outputCol=None): - """ - setParams(self, n=2, inputCol=None, outputCol=None) - Sets params for this NGram. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.5.0") - def setN(self, value): - """ - Sets the value of :py:attr:`n`. - """ - return self._set(n=value) - - @since("1.5.0") - def getN(self): - """ - Gets the value of n or its default value. - """ - return self.getOrDefault(self.n) - - -@inherit_doc -class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - Normalize a vector to have unit norm using the given p-norm. - - >>> from pyspark.ml.linalg import Vectors - >>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0}) - >>> df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"]) - >>> normalizer = Normalizer(p=2.0, inputCol="dense", outputCol="features") - >>> normalizer.transform(df).head().features - DenseVector([0.6, -0.8]) - >>> normalizer.setParams(inputCol="sparse", outputCol="freqs").transform(df).head().freqs - SparseVector(4, {1: 0.8, 3: 0.6}) - >>> params = {normalizer.p: 1.0, normalizer.inputCol: "dense", normalizer.outputCol: "vector"} - >>> normalizer.transform(df, params).head().vector - DenseVector([0.4286, -0.5714]) - >>> normalizerPath = temp_path + "/normalizer" - >>> normalizer.save(normalizerPath) - >>> loadedNormalizer = Normalizer.load(normalizerPath) - >>> loadedNormalizer.getP() == normalizer.getP() - True - - .. 
versionadded:: 1.4.0 - """ - - p = Param(Params._dummy(), "p", "the p norm value.", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, p=2.0, inputCol=None, outputCol=None): - """ - __init__(self, p=2.0, inputCol=None, outputCol=None) - """ - super(Normalizer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid) - self._setDefault(p=2.0) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, p=2.0, inputCol=None, outputCol=None): - """ - setParams(self, p=2.0, inputCol=None, outputCol=None) - Sets params for this Normalizer. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setP(self, value): - """ - Sets the value of :py:attr:`p`. - """ - return self._set(p=value) - - @since("1.4.0") - def getP(self): - """ - Gets the value of p or its default value. - """ - return self.getOrDefault(self.p) - - -@inherit_doc -class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - A one-hot encoder that maps a column of category indices to a - column of binary vectors, with at most a single one-value per row - that indicates the input category index. - For example with 5 categories, an input value of 2.0 would map to - an output vector of `[0.0, 0.0, 1.0, 0.0]`. - The last category is not included by default (configurable via - :py:attr:`dropLast`) because it makes the vector entries sum up to - one, and hence linearly dependent. - So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. - - .. note:: This is different from scikit-learn's OneHotEncoder, - which keeps all categories. The output vectors are sparse. - - .. note:: Deprecated in 2.3.0. :py:class:`OneHotEncoderEstimator` will be renamed to - :py:class:`OneHotEncoder` and this :py:class:`OneHotEncoder` will be removed in 3.0.0. - - .. seealso:: - - :py:class:`StringIndexer` for converting categorical values into - category indices - - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") - >>> model = stringIndexer.fit(stringIndDf) - >>> td = model.transform(stringIndDf) - >>> encoder = OneHotEncoder(inputCol="indexed", outputCol="features") - >>> encoder.transform(td).head().features - SparseVector(2, {0: 1.0}) - >>> encoder.setParams(outputCol="freqs").transform(td).head().freqs - SparseVector(2, {0: 1.0}) - >>> params = {encoder.dropLast: False, encoder.outputCol: "test"} - >>> encoder.transform(td, params).head().test - SparseVector(3, {0: 1.0}) - >>> onehotEncoderPath = temp_path + "/onehot-encoder" - >>> encoder.save(onehotEncoderPath) - >>> loadedEncoder = OneHotEncoder.load(onehotEncoderPath) - >>> loadedEncoder.getDropLast() == encoder.getDropLast() - True - - .. 
versionadded:: 1.4.0 - """ - - dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", - typeConverter=TypeConverters.toBoolean) - - @keyword_only - def __init__(self, dropLast=True, inputCol=None, outputCol=None): - """ - __init__(self, dropLast=True, inputCol=None, outputCol=None) - """ - super(OneHotEncoder, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid) - self._setDefault(dropLast=True) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, dropLast=True, inputCol=None, outputCol=None): - """ - setParams(self, dropLast=True, inputCol=None, outputCol=None) - Sets params for this OneHotEncoder. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setDropLast(self, value): - """ - Sets the value of :py:attr:`dropLast`. - """ - return self._set(dropLast=value) - - @since("1.4.0") - def getDropLast(self): - """ - Gets the value of dropLast or its default value. - """ - return self.getOrDefault(self.dropLast) - - -@inherit_doc -class OneHotEncoderEstimator(JavaEstimator, HasInputCols, HasOutputCols, HasHandleInvalid, - JavaMLReadable, JavaMLWritable): - """ - A one-hot encoder that maps a column of category indices to a column of binary vectors, with - at most a single one-value per row that indicates the input category index. - For example with 5 categories, an input value of 2.0 would map to an output vector of - `[0.0, 0.0, 1.0, 0.0]`. - The last category is not included by default (configurable via `dropLast`), - because it makes the vector entries sum up to one, and hence linearly dependent. - So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. - - Note: This is different from scikit-learn's OneHotEncoder, which keeps all categories. - The output vectors are sparse. - - When `handleInvalid` is configured to 'keep', an extra "category" indicating invalid values is - added as last category. So when `dropLast` is true, invalid values are encoded as all-zeros - vector. - - Note: When encoding multi-column by using `inputCols` and `outputCols` params, input/output - cols come in pairs, specified by the order in the arrays, and each pair is treated - independently. - - See `StringIndexer` for converting categorical values into category indices - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"]) - >>> ohe = OneHotEncoderEstimator(inputCols=["input"], outputCols=["output"]) - >>> model = ohe.fit(df) - >>> model.transform(df).head().output - SparseVector(2, {0: 1.0}) - >>> ohePath = temp_path + "/oheEstimator" - >>> ohe.save(ohePath) - >>> loadedOHE = OneHotEncoderEstimator.load(ohePath) - >>> loadedOHE.getInputCols() == ohe.getInputCols() - True - >>> modelPath = temp_path + "/ohe-model" - >>> model.save(modelPath) - >>> loadedModel = OneHotEncoderModel.load(modelPath) - >>> loadedModel.categorySizes == model.categorySizes - True - - .. versionadded:: 2.3.0 - """ - - handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data during " + - "transform(). Options are 'keep' (invalid data presented as an extra " + - "categorical feature) or error (throw an error). 
Note that this Param " + - "is only used during transform; during fitting, invalid data will " + - "result in an error.", - typeConverter=TypeConverters.toString) - - dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", - typeConverter=TypeConverters.toBoolean) - - @keyword_only - def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True): - """ - __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True) - """ - super(OneHotEncoderEstimator, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.feature.OneHotEncoderEstimator", self.uid) - self._setDefault(handleInvalid="error", dropLast=True) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.3.0") - def setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True): - """ - setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True) - Sets params for this OneHotEncoderEstimator. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.3.0") - def setDropLast(self, value): - """ - Sets the value of :py:attr:`dropLast`. - """ - return self._set(dropLast=value) - - @since("2.3.0") - def getDropLast(self): - """ - Gets the value of dropLast or its default value. - """ - return self.getOrDefault(self.dropLast) - - def _create_model(self, java_model): - return OneHotEncoderModel(java_model) - - -class OneHotEncoderModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`OneHotEncoderEstimator`. - - .. versionadded:: 2.3.0 - """ - - @property - @since("2.3.0") - def categorySizes(self): - """ - Original number of categories for each feature being encoded. - The array contains one value for each input column, in order. - """ - return self._call_java("categorySizes") - - -@inherit_doc -class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, - JavaMLWritable): - """ - Perform feature expansion in a polynomial space. As said in `wikipedia of Polynomial Expansion - `_, "In mathematics, an - expansion of a product of sums expresses it as a sum of products by using the fact that - multiplication distributes over addition". Take a 2-variable feature vector as an example: - `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`. - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"]) - >>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded") - >>> px.transform(df).head().expanded - DenseVector([0.5, 0.25, 2.0, 1.0, 4.0]) - >>> px.setParams(outputCol="test").transform(df).head().test - DenseVector([0.5, 0.25, 2.0, 1.0, 4.0]) - >>> polyExpansionPath = temp_path + "/poly-expansion" - >>> px.save(polyExpansionPath) - >>> loadedPx = PolynomialExpansion.load(polyExpansionPath) - >>> loadedPx.getDegree() == px.getDegree() - True - - .. 
versionadded:: 1.4.0 - """ - - degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)", - typeConverter=TypeConverters.toInt) - - @keyword_only - def __init__(self, degree=2, inputCol=None, outputCol=None): - """ - __init__(self, degree=2, inputCol=None, outputCol=None) - """ - super(PolynomialExpansion, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.feature.PolynomialExpansion", self.uid) - self._setDefault(degree=2) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, degree=2, inputCol=None, outputCol=None): - """ - setParams(self, degree=2, inputCol=None, outputCol=None) - Sets params for this PolynomialExpansion. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setDegree(self, value): - """ - Sets the value of :py:attr:`degree`. - """ - return self._set(degree=value) - - @since("1.4.0") - def getDegree(self): - """ - Gets the value of degree or its default value. - """ - return self.getOrDefault(self.degree) - - -@inherit_doc -class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, - JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned - categorical features. The number of bins can be set using the :py:attr:`numBuckets` parameter. - It is possible that the number of buckets used will be less than this value, for example, if - there are too few distinct values of the input to create enough distinct quantiles. - - NaN handling: Note also that - QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user - can also choose to either keep or remove NaN values within the dataset by setting - :py:attr:`handleInvalid` parameter. If the user chooses to keep NaN values, they will be - handled specially and placed into their own bucket, for example, if 4 buckets are used, then - non-NaN data will be put into buckets[0-3], but NaNs will be counted in a special bucket[4]. - - Algorithm: The bin ranges are chosen using an approximate algorithm (see the documentation for - :py:meth:`~.DataFrameStatFunctions.approxQuantile` for a detailed description). - The precision of the approximation can be controlled with the - :py:attr:`relativeError` parameter. - The lower and upper bin bounds will be `-Infinity` and `+Infinity`, covering all real values. - - >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)] - >>> df = spark.createDataFrame(values, ["values"]) - >>> qds = QuantileDiscretizer(numBuckets=2, - ... inputCol="values", outputCol="buckets", relativeError=0.01, handleInvalid="error") - >>> qds.getRelativeError() - 0.01 - >>> bucketizer = qds.fit(df) - >>> qds.setHandleInvalid("keep").fit(df).transform(df).count() - 6 - >>> qds.setHandleInvalid("skip").fit(df).transform(df).count() - 4 - >>> splits = bucketizer.getSplits() - >>> splits[0] - -inf - >>> print("%2.1f" % round(splits[1], 1)) - 0.4 - >>> bucketed = bucketizer.transform(df).head() - >>> bucketed.buckets - 0.0 - >>> quantileDiscretizerPath = temp_path + "/quantile-discretizer" - >>> qds.save(quantileDiscretizerPath) - >>> loadedQds = QuantileDiscretizer.load(quantileDiscretizerPath) - >>> loadedQds.getNumBuckets() == qds.getNumBuckets() - True - - .. 
versionadded:: 2.0.0 - """ - - numBuckets = Param(Params._dummy(), "numBuckets", - "Maximum number of buckets (quantiles, or " + - "categories) into which data points are grouped. Must be >= 2.", - typeConverter=TypeConverters.toInt) - - relativeError = Param(Params._dummy(), "relativeError", "The relative target precision for " + - "the approximate quantile algorithm used to generate buckets. " + - "Must be in the range [0, 1].", - typeConverter=TypeConverters.toFloat) - - handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " + - "Options are skip (filter out rows with invalid values), " + - "error (throw an error), or keep (keep invalid values in a special " + - "additional bucket).", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, - handleInvalid="error"): - """ - __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \ - handleInvalid="error") - """ - super(QuantileDiscretizer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer", - self.uid) - self._setDefault(numBuckets=2, relativeError=0.001, handleInvalid="error") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.0.0") - def setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, - handleInvalid="error"): - """ - setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \ - handleInvalid="error") - Set the params for the QuantileDiscretizer - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.0.0") - def setNumBuckets(self, value): - """ - Sets the value of :py:attr:`numBuckets`. - """ - return self._set(numBuckets=value) - - @since("2.0.0") - def getNumBuckets(self): - """ - Gets the value of numBuckets or its default value. - """ - return self.getOrDefault(self.numBuckets) - - @since("2.0.0") - def setRelativeError(self, value): - """ - Sets the value of :py:attr:`relativeError`. - """ - return self._set(relativeError=value) - - @since("2.0.0") - def getRelativeError(self): - """ - Gets the value of relativeError or its default value. - """ - return self.getOrDefault(self.relativeError) - - def _create_model(self, java_model): - """ - Private method to convert the java_model to a Python model. - """ - return Bucketizer(splits=list(java_model.getSplits()), - inputCol=self.getInputCol(), - outputCol=self.getOutputCol(), - handleInvalid=self.getHandleInvalid()) - - -@inherit_doc -@ignore_unicode_prefix -class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - A regex based tokenizer that extracts tokens either by using the - provided regex pattern (in Java dialect) to split the text - (default) or repeatedly matching the regex (if gaps is false). - Optional parameters also allow filtering tokens using a minimal - length. - It returns an array of strings that can be empty. - - >>> df = spark.createDataFrame([("A B c",)], ["text"]) - >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words") - >>> reTokenizer.transform(df).head() - Row(text=u'A B c', words=[u'a', u'b', u'c']) - >>> # Change a parameter. - >>> reTokenizer.setParams(outputCol="tokens").transform(df).head() - Row(text=u'A B c', tokens=[u'a', u'b', u'c']) - >>> # Temporarily modify a parameter. 
- >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head() - Row(text=u'A B c', words=[u'a', u'b', u'c']) - >>> reTokenizer.transform(df).head() - Row(text=u'A B c', tokens=[u'a', u'b', u'c']) - >>> # Must use keyword arguments to specify params. - >>> reTokenizer.setParams("text") - Traceback (most recent call last): - ... - TypeError: Method setParams forces keyword arguments. - >>> regexTokenizerPath = temp_path + "/regex-tokenizer" - >>> reTokenizer.save(regexTokenizerPath) - >>> loadedReTokenizer = RegexTokenizer.load(regexTokenizerPath) - >>> loadedReTokenizer.getMinTokenLength() == reTokenizer.getMinTokenLength() - True - >>> loadedReTokenizer.getGaps() == reTokenizer.getGaps() - True - - .. versionadded:: 1.4.0 - """ - - minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)", - typeConverter=TypeConverters.toInt) - gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens " + - "(False)") - pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing", - typeConverter=TypeConverters.toString) - toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " + - "lowercase before tokenizing", typeConverter=TypeConverters.toBoolean) - - @keyword_only - def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, - outputCol=None, toLowercase=True): - """ - __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, \ - outputCol=None, toLowercase=True) - """ - super(RegexTokenizer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid) - self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+", toLowercase=True) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, - outputCol=None, toLowercase=True): - """ - setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, \ - outputCol=None, toLowercase=True) - Sets params for this RegexTokenizer. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setMinTokenLength(self, value): - """ - Sets the value of :py:attr:`minTokenLength`. - """ - return self._set(minTokenLength=value) - - @since("1.4.0") - def getMinTokenLength(self): - """ - Gets the value of minTokenLength or its default value. - """ - return self.getOrDefault(self.minTokenLength) - - @since("1.4.0") - def setGaps(self, value): - """ - Sets the value of :py:attr:`gaps`. - """ - return self._set(gaps=value) - - @since("1.4.0") - def getGaps(self): - """ - Gets the value of gaps or its default value. - """ - return self.getOrDefault(self.gaps) - - @since("1.4.0") - def setPattern(self, value): - """ - Sets the value of :py:attr:`pattern`. - """ - return self._set(pattern=value) - - @since("1.4.0") - def getPattern(self): - """ - Gets the value of pattern or its default value. - """ - return self.getOrDefault(self.pattern) - - @since("2.0.0") - def setToLowercase(self, value): - """ - Sets the value of :py:attr:`toLowercase`. - """ - return self._set(toLowercase=value) - - @since("2.0.0") - def getToLowercase(self): - """ - Gets the value of toLowercase or its default value. 
- """ - return self.getOrDefault(self.toLowercase) - - -@inherit_doc -class SQLTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable): - """ - Implements the transforms which are defined by SQL statement. - Currently we only support SQL syntax like 'SELECT ... FROM __THIS__' - where '__THIS__' represents the underlying table of the input dataset. - - >>> df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"]) - >>> sqlTrans = SQLTransformer( - ... statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") - >>> sqlTrans.transform(df).head() - Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0) - >>> sqlTransformerPath = temp_path + "/sql-transformer" - >>> sqlTrans.save(sqlTransformerPath) - >>> loadedSqlTrans = SQLTransformer.load(sqlTransformerPath) - >>> loadedSqlTrans.getStatement() == sqlTrans.getStatement() - True - - .. versionadded:: 1.6.0 - """ - - statement = Param(Params._dummy(), "statement", "SQL statement", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, statement=None): - """ - __init__(self, statement=None) - """ - super(SQLTransformer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.6.0") - def setParams(self, statement=None): - """ - setParams(self, statement=None) - Sets params for this SQLTransformer. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.6.0") - def setStatement(self, value): - """ - Sets the value of :py:attr:`statement`. - """ - return self._set(statement=value) - - @since("1.6.0") - def getStatement(self): - """ - Gets the value of statement or its default value. - """ - return self.getOrDefault(self.statement) - - -@inherit_doc -class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - Standardizes features by removing the mean and scaling to unit variance using column summary - statistics on the samples in the training set. - - The "unit std" is computed using the `corrected sample standard deviation \ - `_, - which is computed as the square root of the unbiased sample variance. - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) - >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled") - >>> model = standardScaler.fit(df) - >>> model.mean - DenseVector([1.0]) - >>> model.std - DenseVector([1.4142]) - >>> model.transform(df).collect()[1].scaled - DenseVector([1.4142]) - >>> standardScalerPath = temp_path + "/standard-scaler" - >>> standardScaler.save(standardScalerPath) - >>> loadedStandardScaler = StandardScaler.load(standardScalerPath) - >>> loadedStandardScaler.getWithMean() == standardScaler.getWithMean() - True - >>> loadedStandardScaler.getWithStd() == standardScaler.getWithStd() - True - >>> modelPath = temp_path + "/standard-scaler-model" - >>> model.save(modelPath) - >>> loadedModel = StandardScalerModel.load(modelPath) - >>> loadedModel.std == model.std - True - >>> loadedModel.mean == model.mean - True - - .. 
versionadded:: 1.4.0 - """ - - withMean = Param(Params._dummy(), "withMean", "Center data with mean", - typeConverter=TypeConverters.toBoolean) - withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation", - typeConverter=TypeConverters.toBoolean) - - @keyword_only - def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): - """ - __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None) - """ - super(StandardScaler, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid) - self._setDefault(withMean=False, withStd=True) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, withMean=False, withStd=True, inputCol=None, outputCol=None): - """ - setParams(self, withMean=False, withStd=True, inputCol=None, outputCol=None) - Sets params for this StandardScaler. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setWithMean(self, value): - """ - Sets the value of :py:attr:`withMean`. - """ - return self._set(withMean=value) - - @since("1.4.0") - def getWithMean(self): - """ - Gets the value of withMean or its default value. - """ - return self.getOrDefault(self.withMean) - - @since("1.4.0") - def setWithStd(self, value): - """ - Sets the value of :py:attr:`withStd`. - """ - return self._set(withStd=value) - - @since("1.4.0") - def getWithStd(self): - """ - Gets the value of withStd or its default value. - """ - return self.getOrDefault(self.withStd) - - def _create_model(self, java_model): - return StandardScalerModel(java_model) - - -class StandardScalerModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`StandardScaler`. - - .. versionadded:: 1.4.0 - """ - - @property - @since("2.0.0") - def std(self): - """ - Standard deviation of the StandardScalerModel. - """ - return self._call_java("std") - - @property - @since("2.0.0") - def mean(self): - """ - Mean of the StandardScalerModel. - """ - return self._call_java("mean") - - -class _StringIndexerParams(JavaParams, HasHandleInvalid, HasInputCol, HasOutputCol): - """ - Params for :py:attr:`StringIndexer` and :py:attr:`StringIndexerModel`. - """ - - stringOrderType = Param(Params._dummy(), "stringOrderType", - "How to order labels of string column. The first label after " + - "ordering is assigned an index of 0. Supported options: " + - "frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc.", - typeConverter=TypeConverters.toString) - - handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid data (unseen " + - "or NULL values) in features and label column of string type. " + - "Options are 'skip' (filter out rows with invalid data), " + - "error (throw an error), or 'keep' (put invalid data " + - "in a special additional bucket, at index numLabels).", - typeConverter=TypeConverters.toString) - - def __init__(self, *args): - super(_StringIndexerParams, self).__init__(*args) - self._setDefault(handleInvalid="error", stringOrderType="frequencyDesc") - - @since("2.3.0") - def getStringOrderType(self): - """ - Gets the value of :py:attr:`stringOrderType` or its default value 'frequencyDesc'. - """ - return self.getOrDefault(self.stringOrderType) - - -@inherit_doc -class StringIndexer(JavaEstimator, _StringIndexerParams, JavaMLReadable, JavaMLWritable): - """ - A label indexer that maps a string column of labels to an ML column of label indices. 
- If the input column is numeric, we cast it to string and index the string values. - The indices are in [0, numLabels). By default, this is ordered by label frequencies - so the most frequent label gets index 0. The ordering behavior is controlled by - setting :py:attr:`stringOrderType`. Its default value is 'frequencyDesc'. - - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error", - ... stringOrderType="frequencyDesc") - >>> model = stringIndexer.fit(stringIndDf) - >>> td = model.transform(stringIndDf) - >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), - ... key=lambda x: x[0]) - [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] - >>> inverter = IndexToString(inputCol="indexed", outputCol="label2", labels=model.labels) - >>> itd = inverter.transform(td) - >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]), - ... key=lambda x: x[0]) - [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')] - >>> stringIndexerPath = temp_path + "/string-indexer" - >>> stringIndexer.save(stringIndexerPath) - >>> loadedIndexer = StringIndexer.load(stringIndexerPath) - >>> loadedIndexer.getHandleInvalid() == stringIndexer.getHandleInvalid() - True - >>> modelPath = temp_path + "/string-indexer-model" - >>> model.save(modelPath) - >>> loadedModel = StringIndexerModel.load(modelPath) - >>> loadedModel.labels == model.labels - True - >>> indexToStringPath = temp_path + "/index-to-string" - >>> inverter.save(indexToStringPath) - >>> loadedInverter = IndexToString.load(indexToStringPath) - >>> loadedInverter.getLabels() == inverter.getLabels() - True - >>> stringIndexer.getStringOrderType() - 'frequencyDesc' - >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error", - ... stringOrderType="alphabetDesc") - >>> model = stringIndexer.fit(stringIndDf) - >>> td = model.transform(stringIndDf) - >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), - ... key=lambda x: x[0]) - [(0, 2.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 2.0), (5, 0.0)] - >>> fromlabelsModel = StringIndexerModel.from_labels(["a", "b", "c"], - ... inputCol="label", outputCol="indexed", handleInvalid="error") - >>> result = fromlabelsModel.transform(stringIndDf) - >>> sorted(set([(i[0], i[1]) for i in result.select(result.id, result.indexed).collect()]), - ... key=lambda x: x[0]) - [(0, 0.0), (1, 1.0), (2, 2.0), (3, 0.0), (4, 0.0), (5, 2.0)] - - .. versionadded:: 1.4.0 - """ - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, handleInvalid="error", - stringOrderType="frequencyDesc"): - """ - __init__(self, inputCol=None, outputCol=None, handleInvalid="error", \ - stringOrderType="frequencyDesc") - """ - super(StringIndexer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, inputCol=None, outputCol=None, handleInvalid="error", - stringOrderType="frequencyDesc"): - """ - setParams(self, inputCol=None, outputCol=None, handleInvalid="error", \ - stringOrderType="frequencyDesc") - Sets params for this StringIndexer. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return StringIndexerModel(java_model) - - @since("2.3.0") - def setStringOrderType(self, value): - """ - Sets the value of :py:attr:`stringOrderType`. 
- """ - return self._set(stringOrderType=value) - - -class StringIndexerModel(JavaModel, _StringIndexerParams, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`StringIndexer`. - - .. versionadded:: 1.4.0 - """ - - @classmethod - @since("2.4.0") - def from_labels(cls, labels, inputCol, outputCol=None, handleInvalid=None): - """ - Construct the model directly from an array of label strings, - requires an active SparkContext. - """ - sc = SparkContext._active_spark_context - java_class = sc._gateway.jvm.java.lang.String - jlabels = StringIndexerModel._new_java_array(labels, java_class) - model = StringIndexerModel._create_from_java_class( - "org.apache.spark.ml.feature.StringIndexerModel", jlabels) - model.setInputCol(inputCol) - if outputCol is not None: - model.setOutputCol(outputCol) - if handleInvalid is not None: - model.setHandleInvalid(handleInvalid) - return model - - @property - @since("1.5.0") - def labels(self): - """ - Ordered list of labels, corresponding to indices to be assigned. - """ - return self._call_java("labels") - - @since("2.4.0") - def setHandleInvalid(self, value): - """ - Sets the value of :py:attr:`handleInvalid`. - """ - return self._set(handleInvalid=value) - - -@inherit_doc -class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - A :py:class:`Transformer` that maps a column of indices back to a new column of - corresponding string values. - The index-string mapping is either from the ML attributes of the input column, - or from user-supplied labels (which take precedence over ML attributes). - See L{StringIndexer} for converting strings into indices. - - .. versionadded:: 1.6.0 - """ - - labels = Param(Params._dummy(), "labels", - "Optional array of labels specifying index-string mapping." + - " If not provided or if empty, then metadata from inputCol is used instead.", - typeConverter=TypeConverters.toListString) - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, labels=None): - """ - __init__(self, inputCol=None, outputCol=None, labels=None) - """ - super(IndexToString, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", - self.uid) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.6.0") - def setParams(self, inputCol=None, outputCol=None, labels=None): - """ - setParams(self, inputCol=None, outputCol=None, labels=None) - Sets params for this IndexToString. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.6.0") - def setLabels(self, value): - """ - Sets the value of :py:attr:`labels`. - """ - return self._set(labels=value) - - @since("1.6.0") - def getLabels(self): - """ - Gets the value of :py:attr:`labels` or its default value. - """ - return self.getOrDefault(self.labels) - - -class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - A feature transformer that filters out stop words from input. - - .. note:: null values from input array are preserved unless adding null to stopWords explicitly. 
- - >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["text"]) - >>> remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"]) - >>> remover.transform(df).head().words == ['a', 'c'] - True - >>> stopWordsRemoverPath = temp_path + "/stopwords-remover" - >>> remover.save(stopWordsRemoverPath) - >>> loadedRemover = StopWordsRemover.load(stopWordsRemoverPath) - >>> loadedRemover.getStopWords() == remover.getStopWords() - True - >>> loadedRemover.getCaseSensitive() == remover.getCaseSensitive() - True - - .. versionadded:: 1.6.0 - """ - - stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out", - typeConverter=TypeConverters.toListString) - caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + - "comparison over the stop words", typeConverter=TypeConverters.toBoolean) - locale = Param(Params._dummy(), "locale", "locale of the input. ignored when case sensitive " + - "is true", typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, - locale=None): - """ - __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=false, \ - locale=None) - """ - super(StopWordsRemover, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", - self.uid) - self._setDefault(stopWords=StopWordsRemover.loadDefaultStopWords("english"), - caseSensitive=False, locale=self._java_obj.getLocale()) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.6.0") - def setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, - locale=None): - """ - setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=false, \ - locale=None) - Sets params for this StopWordRemover. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.6.0") - def setStopWords(self, value): - """ - Sets the value of :py:attr:`stopWords`. - """ - return self._set(stopWords=value) - - @since("1.6.0") - def getStopWords(self): - """ - Gets the value of :py:attr:`stopWords` or its default value. - """ - return self.getOrDefault(self.stopWords) - - @since("1.6.0") - def setCaseSensitive(self, value): - """ - Sets the value of :py:attr:`caseSensitive`. - """ - return self._set(caseSensitive=value) - - @since("1.6.0") - def getCaseSensitive(self): - """ - Gets the value of :py:attr:`caseSensitive` or its default value. - """ - return self.getOrDefault(self.caseSensitive) - - @since("2.4.0") - def setLocale(self, value): - """ - Sets the value of :py:attr:`locale`. - """ - return self._set(locale=value) - - @since("2.4.0") - def getLocale(self): - """ - Gets the value of :py:attr:`locale`. - """ - return self.getOrDefault(self.locale) - - @staticmethod - @since("2.0.0") - def loadDefaultStopWords(language): - """ - Loads the default stop words for the given language. - Supported languages: danish, dutch, english, finnish, french, german, hungarian, - italian, norwegian, portuguese, russian, spanish, swedish, turkish - """ - stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover - return list(stopWordsObj.loadDefaultStopWords(language)) - - -@inherit_doc -@ignore_unicode_prefix -class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - A tokenizer that converts the input string to lowercase and then - splits it by white spaces. 
- - >>> df = spark.createDataFrame([("a b c",)], ["text"]) - >>> tokenizer = Tokenizer(inputCol="text", outputCol="words") - >>> tokenizer.transform(df).head() - Row(text=u'a b c', words=[u'a', u'b', u'c']) - >>> # Change a parameter. - >>> tokenizer.setParams(outputCol="tokens").transform(df).head() - Row(text=u'a b c', tokens=[u'a', u'b', u'c']) - >>> # Temporarily modify a parameter. - >>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head() - Row(text=u'a b c', words=[u'a', u'b', u'c']) - >>> tokenizer.transform(df).head() - Row(text=u'a b c', tokens=[u'a', u'b', u'c']) - >>> # Must use keyword arguments to specify params. - >>> tokenizer.setParams("text") - Traceback (most recent call last): - ... - TypeError: Method setParams forces keyword arguments. - >>> tokenizerPath = temp_path + "/tokenizer" - >>> tokenizer.save(tokenizerPath) - >>> loadedTokenizer = Tokenizer.load(tokenizerPath) - >>> loadedTokenizer.transform(df).head().tokens == tokenizer.transform(df).head().tokens - True - - .. versionadded:: 1.3.0 - """ - - @keyword_only - def __init__(self, inputCol=None, outputCol=None): - """ - __init__(self, inputCol=None, outputCol=None) - """ - super(Tokenizer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Tokenizer", self.uid) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.3.0") - def setParams(self, inputCol=None, outputCol=None): - """ - setParams(self, inputCol=None, outputCol=None) - Sets params for this Tokenizer. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - -@inherit_doc -class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol, HasHandleInvalid, JavaMLReadable, - JavaMLWritable): - """ - A feature transformer that merges multiple columns into a vector column. - - >>> df = spark.createDataFrame([(1, 0, 3)], ["a", "b", "c"]) - >>> vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features") - >>> vecAssembler.transform(df).head().features - DenseVector([1.0, 0.0, 3.0]) - >>> vecAssembler.setParams(outputCol="freqs").transform(df).head().freqs - DenseVector([1.0, 0.0, 3.0]) - >>> params = {vecAssembler.inputCols: ["b", "a"], vecAssembler.outputCol: "vector"} - >>> vecAssembler.transform(df, params).head().vector - DenseVector([0.0, 1.0]) - >>> vectorAssemblerPath = temp_path + "/vector-assembler" - >>> vecAssembler.save(vectorAssemblerPath) - >>> loadedAssembler = VectorAssembler.load(vectorAssemblerPath) - >>> loadedAssembler.transform(df).head().freqs == vecAssembler.transform(df).head().freqs - True - >>> dfWithNullsAndNaNs = spark.createDataFrame( - ... [(1.0, 2.0, None), (3.0, float("nan"), 4.0), (5.0, 6.0, 7.0)], ["a", "b", "c"]) - >>> vecAssembler2 = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features", - ... handleInvalid="keep") - >>> vecAssembler2.transform(dfWithNullsAndNaNs).show() - +---+---+----+-------------+ - | a| b| c| features| - +---+---+----+-------------+ - |1.0|2.0|null|[1.0,2.0,NaN]| - |3.0|NaN| 4.0|[3.0,NaN,4.0]| - |5.0|6.0| 7.0|[5.0,6.0,7.0]| - +---+---+----+-------------+ - ... - >>> vecAssembler2.setParams(handleInvalid="skip").transform(dfWithNullsAndNaNs).show() - +---+---+---+-------------+ - | a| b| c| features| - +---+---+---+-------------+ - |5.0|6.0|7.0|[5.0,6.0,7.0]| - +---+---+---+-------------+ - ... - - .. versionadded:: 1.4.0 - """ - - handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data (NULL " + - "and NaN values). 
Options are 'skip' (filter out rows with invalid " + - "data), 'error' (throw an error), or 'keep' (return relevant number " + - "of NaN in the output). Column lengths are taken from the size of ML " + - "Attribute Group, which can be set using `VectorSizeHint` in a " + - "pipeline before `VectorAssembler`. Column lengths can also be " + - "inferred from first rows of the data since it is safe to do so but " + - "only in case of 'error' or 'skip').", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, inputCols=None, outputCol=None, handleInvalid="error"): - """ - __init__(self, inputCols=None, outputCol=None, handleInvalid="error") - """ - super(VectorAssembler, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorAssembler", self.uid) - self._setDefault(handleInvalid="error") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, inputCols=None, outputCol=None, handleInvalid="error"): - """ - setParams(self, inputCols=None, outputCol=None, handleInvalid="error") - Sets params for this VectorAssembler. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - -@inherit_doc -class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid, JavaMLReadable, - JavaMLWritable): - """ - Class for indexing categorical feature columns in a dataset of `Vector`. - - This has 2 usage modes: - - Automatically identify categorical features (default behavior) - - This helps process a dataset of unknown vectors into a dataset with some continuous - features and some categorical features. The choice between continuous and categorical - is based upon a maxCategories parameter. - - Set maxCategories to the maximum number of categorical any categorical feature should - have. - - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}. - If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1}, - and feature 1 will be declared continuous. - - Index all features, if all features are categorical - - If maxCategories is set to be very large, then this will build an index of unique - values for all features. - - Warning: This can cause problems if features are continuous since this will collect ALL - unique values to the driver. - - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}. - If maxCategories >= 3, then both features will be declared categorical. - - This returns a model which can transform categorical features to use 0-based indices. - - Index stability: - - This is not guaranteed to choose the same category index across multiple runs. - - If a categorical feature includes value 0, then this is guaranteed to map value 0 to - index 0. This maintains vector sparsity. - - More stability may be added in the future. - - TODO: Future extensions: The following functionality is planned for the future: - - Preserve metadata in transform; if a feature's metadata is already present, - do not recompute. - - Specify certain features to not index, either via a parameter or via existing metadata. - - Add warning if a categorical feature has only 1 category. - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),), - ... 
(Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"]) - >>> indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed") - >>> model = indexer.fit(df) - >>> model.transform(df).head().indexed - DenseVector([1.0, 0.0]) - >>> model.numFeatures - 2 - >>> model.categoryMaps - {0: {0.0: 0, -1.0: 1}} - >>> indexer.setParams(outputCol="test").fit(df).transform(df).collect()[1].test - DenseVector([0.0, 1.0]) - >>> params = {indexer.maxCategories: 3, indexer.outputCol: "vector"} - >>> model2 = indexer.fit(df, params) - >>> model2.transform(df).head().vector - DenseVector([1.0, 0.0]) - >>> vectorIndexerPath = temp_path + "/vector-indexer" - >>> indexer.save(vectorIndexerPath) - >>> loadedIndexer = VectorIndexer.load(vectorIndexerPath) - >>> loadedIndexer.getMaxCategories() == indexer.getMaxCategories() - True - >>> modelPath = temp_path + "/vector-indexer-model" - >>> model.save(modelPath) - >>> loadedModel = VectorIndexerModel.load(modelPath) - >>> loadedModel.numFeatures == model.numFeatures - True - >>> loadedModel.categoryMaps == model.categoryMaps - True - >>> dfWithInvalid = spark.createDataFrame([(Vectors.dense([3.0, 1.0]),)], ["a"]) - >>> indexer.getHandleInvalid() - 'error' - >>> model3 = indexer.setHandleInvalid("skip").fit(df) - >>> model3.transform(dfWithInvalid).count() - 0 - >>> model4 = indexer.setParams(handleInvalid="keep", outputCol="indexed").fit(df) - >>> model4.transform(dfWithInvalid).head().indexed - DenseVector([2.0, 1.0]) - - .. versionadded:: 1.4.0 - """ - - maxCategories = Param(Params._dummy(), "maxCategories", - "Threshold for the number of values a categorical feature can take " + - "(>= 2). If a feature is found to have > maxCategories values, then " + - "it is declared continuous.", typeConverter=TypeConverters.toInt) - - handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data " + - "(unseen labels or NULL values). Options are 'skip' (filter out " + - "rows with invalid data), 'error' (throw an error), or 'keep' (put " + - "invalid data in a special additional bucket, at index of the number " + - "of categories of the feature).", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, maxCategories=20, inputCol=None, outputCol=None, handleInvalid="error"): - """ - __init__(self, maxCategories=20, inputCol=None, outputCol=None, handleInvalid="error") - """ - super(VectorIndexer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid) - self._setDefault(maxCategories=20, handleInvalid="error") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, maxCategories=20, inputCol=None, outputCol=None, handleInvalid="error"): - """ - setParams(self, maxCategories=20, inputCol=None, outputCol=None, handleInvalid="error") - Sets params for this VectorIndexer. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setMaxCategories(self, value): - """ - Sets the value of :py:attr:`maxCategories`. - """ - return self._set(maxCategories=value) - - @since("1.4.0") - def getMaxCategories(self): - """ - Gets the value of maxCategories or its default value. - """ - return self.getOrDefault(self.maxCategories) - - def _create_model(self, java_model): - return VectorIndexerModel(java_model) - - -class VectorIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`VectorIndexer`. 
- - Transform categorical features to use 0-based indices instead of their original values. - - Categorical features are mapped to indices. - - Continuous features (columns) are left unchanged. - - This also appends metadata to the output column, marking features as Numeric (continuous), - Nominal (categorical), or Binary (either continuous or categorical). - Non-ML metadata is not carried over from the input to the output column. - - This maintains vector sparsity. - - .. versionadded:: 1.4.0 - """ - - @property - @since("1.4.0") - def numFeatures(self): - """ - Number of features, i.e., length of Vectors which this transforms. - """ - return self._call_java("numFeatures") - - @property - @since("1.4.0") - def categoryMaps(self): - """ - Feature value index. Keys are categorical feature indices (column indices). - Values are maps from original features values to 0-based category indices. - If a feature is not in this map, it is treated as continuous. - """ - return self._call_java("javaCategoryMaps") - - -@inherit_doc -class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - This class takes a feature vector and outputs a new feature vector with a subarray - of the original features. - - The subset of features can be specified with either indices (`setIndices()`) - or names (`setNames()`). At least one feature must be selected. Duplicate features - are not allowed, so there can be no overlap between selected indices and names. - - The output vector will order features with the selected indices first (in the order given), - followed by the selected names (in the order given). - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),), - ... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),), - ... (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"]) - >>> vs = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4]) - >>> vs.transform(df).head().sliced - DenseVector([2.3, 1.0]) - >>> vectorSlicerPath = temp_path + "/vector-slicer" - >>> vs.save(vectorSlicerPath) - >>> loadedVs = VectorSlicer.load(vectorSlicerPath) - >>> loadedVs.getIndices() == vs.getIndices() - True - >>> loadedVs.getNames() == vs.getNames() - True - - .. versionadded:: 1.6.0 - """ - - indices = Param(Params._dummy(), "indices", "An array of indices to select features from " + - "a vector column. There can be no overlap with names.", - typeConverter=TypeConverters.toListInt) - names = Param(Params._dummy(), "names", "An array of feature names to select features from " + - "a vector column. These names must be specified by ML " + - "org.apache.spark.ml.attribute.Attribute. There can be no overlap with " + - "indices.", typeConverter=TypeConverters.toListString) - - @keyword_only - def __init__(self, inputCol=None, outputCol=None, indices=None, names=None): - """ - __init__(self, inputCol=None, outputCol=None, indices=None, names=None) - """ - super(VectorSlicer, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid) - self._setDefault(indices=[], names=[]) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.6.0") - def setParams(self, inputCol=None, outputCol=None, indices=None, names=None): - """ - setParams(self, inputCol=None, outputCol=None, indices=None, names=None): - Sets params for this VectorSlicer. 
- """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.6.0") - def setIndices(self, value): - """ - Sets the value of :py:attr:`indices`. - """ - return self._set(indices=value) - - @since("1.6.0") - def getIndices(self): - """ - Gets the value of indices or its default value. - """ - return self.getOrDefault(self.indices) - - @since("1.6.0") - def setNames(self, value): - """ - Sets the value of :py:attr:`names`. - """ - return self._set(names=value) - - @since("1.6.0") - def getNames(self): - """ - Gets the value of names or its default value. - """ - return self.getOrDefault(self.names) - - -@inherit_doc -@ignore_unicode_prefix -class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCol, - JavaMLReadable, JavaMLWritable): - """ - Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further - natural language processing or machine learning process. - - >>> sent = ("a b " * 100 + "a c " * 10).split(" ") - >>> doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"]) - >>> word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model") - >>> model = word2Vec.fit(doc) - >>> model.getVectors().show() - +----+--------------------+ - |word| vector| - +----+--------------------+ - | a|[0.09461779892444...| - | b|[1.15474212169647...| - | c|[-0.3794820010662...| - +----+--------------------+ - ... - >>> model.findSynonymsArray("a", 2) - [(u'b', 0.25053444504737854), (u'c', -0.6980510950088501)] - >>> from pyspark.sql.functions import format_number as fmt - >>> model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show() - +----+----------+ - |word|similarity| - +----+----------+ - | b| 0.25053| - | c| -0.69805| - +----+----------+ - ... - >>> model.transform(doc).head().model - DenseVector([0.5524, -0.4995, -0.3599, 0.0241, 0.3461]) - >>> word2vecPath = temp_path + "/word2vec" - >>> word2Vec.save(word2vecPath) - >>> loadedWord2Vec = Word2Vec.load(word2vecPath) - >>> loadedWord2Vec.getVectorSize() == word2Vec.getVectorSize() - True - >>> loadedWord2Vec.getNumPartitions() == word2Vec.getNumPartitions() - True - >>> loadedWord2Vec.getMinCount() == word2Vec.getMinCount() - True - >>> modelPath = temp_path + "/word2vec-model" - >>> model.save(modelPath) - >>> loadedModel = Word2VecModel.load(modelPath) - >>> loadedModel.getVectors().first().word == model.getVectors().first().word - True - >>> loadedModel.getVectors().first().vector == model.getVectors().first().vector - True - - .. versionadded:: 1.4.0 - """ - - vectorSize = Param(Params._dummy(), "vectorSize", - "the dimension of codes after transforming from words", - typeConverter=TypeConverters.toInt) - numPartitions = Param(Params._dummy(), "numPartitions", - "number of partitions for sentences of words", - typeConverter=TypeConverters.toInt) - minCount = Param(Params._dummy(), "minCount", - "the minimum number of times a token must appear to be included in the " + - "word2vec model's vocabulary", typeConverter=TypeConverters.toInt) - windowSize = Param(Params._dummy(), "windowSize", - "the window size (context words from [-window, window]). Default value is 5", - typeConverter=TypeConverters.toInt) - maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", - "Maximum length (in words) of each sentence in the input data. 
" + - "Any sentence longer than this threshold will " + - "be divided into chunks up to the size.", - typeConverter=TypeConverters.toInt) - - @keyword_only - def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000): - """ - __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \ - seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000) - """ - super(Word2Vec, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) - self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - windowSize=5, maxSentenceLength=1000) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000): - """ - setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, \ - inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000) - Sets params for this Word2Vec. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setVectorSize(self, value): - """ - Sets the value of :py:attr:`vectorSize`. - """ - return self._set(vectorSize=value) - - @since("1.4.0") - def getVectorSize(self): - """ - Gets the value of vectorSize or its default value. - """ - return self.getOrDefault(self.vectorSize) - - @since("1.4.0") - def setNumPartitions(self, value): - """ - Sets the value of :py:attr:`numPartitions`. - """ - return self._set(numPartitions=value) - - @since("1.4.0") - def getNumPartitions(self): - """ - Gets the value of numPartitions or its default value. - """ - return self.getOrDefault(self.numPartitions) - - @since("1.4.0") - def setMinCount(self, value): - """ - Sets the value of :py:attr:`minCount`. - """ - return self._set(minCount=value) - - @since("1.4.0") - def getMinCount(self): - """ - Gets the value of minCount or its default value. - """ - return self.getOrDefault(self.minCount) - - @since("2.0.0") - def setWindowSize(self, value): - """ - Sets the value of :py:attr:`windowSize`. - """ - return self._set(windowSize=value) - - @since("2.0.0") - def getWindowSize(self): - """ - Gets the value of windowSize or its default value. - """ - return self.getOrDefault(self.windowSize) - - @since("2.0.0") - def setMaxSentenceLength(self, value): - """ - Sets the value of :py:attr:`maxSentenceLength`. - """ - return self._set(maxSentenceLength=value) - - @since("2.0.0") - def getMaxSentenceLength(self): - """ - Gets the value of maxSentenceLength or its default value. - """ - return self.getOrDefault(self.maxSentenceLength) - - def _create_model(self, java_model): - return Word2VecModel(java_model) - - -class Word2VecModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`Word2Vec`. - - .. versionadded:: 1.4.0 - """ - - @since("1.5.0") - def getVectors(self): - """ - Returns the vector representation of the words as a dataframe - with two fields, word and vector. - """ - return self._call_java("getVectors") - - @since("1.5.0") - def findSynonyms(self, word, num): - """ - Find "num" number of words closest in similarity to "word". - word can be a string or vector representation. - Returns a dataframe with two fields word and similarity (which - gives the cosine similarity). 
- """ - if not isinstance(word, basestring): - word = _convert_to_vector(word) - return self._call_java("findSynonyms", word, num) - - @since("2.3.0") - def findSynonymsArray(self, word, num): - """ - Find "num" number of words closest in similarity to "word". - word can be a string or vector representation. - Returns an array with two fields word and similarity (which - gives the cosine similarity). - """ - if not isinstance(word, basestring): - word = _convert_to_vector(word) - tuples = self._java_obj.findSynonymsArray(word, num) - return list(map(lambda st: (st._1(), st._2()), list(tuples))) - - -@inherit_doc -class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): - """ - PCA trains a model to project vectors to a lower dimensional space of the - top :py:attr:`k` principal components. - - >>> from pyspark.ml.linalg import Vectors - >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), - ... (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), - ... (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)] - >>> df = spark.createDataFrame(data,["features"]) - >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features") - >>> model = pca.fit(df) - >>> model.transform(df).collect()[0].pca_features - DenseVector([1.648..., -4.013...]) - >>> model.explainedVariance - DenseVector([0.794..., 0.205...]) - >>> pcaPath = temp_path + "/pca" - >>> pca.save(pcaPath) - >>> loadedPca = PCA.load(pcaPath) - >>> loadedPca.getK() == pca.getK() - True - >>> modelPath = temp_path + "/pca-model" - >>> model.save(modelPath) - >>> loadedModel = PCAModel.load(modelPath) - >>> loadedModel.pc == model.pc - True - >>> loadedModel.explainedVariance == model.explainedVariance - True - - .. versionadded:: 1.5.0 - """ - - k = Param(Params._dummy(), "k", "the number of principal components", - typeConverter=TypeConverters.toInt) - - @keyword_only - def __init__(self, k=None, inputCol=None, outputCol=None): - """ - __init__(self, k=None, inputCol=None, outputCol=None) - """ - super(PCA, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.5.0") - def setParams(self, k=None, inputCol=None, outputCol=None): - """ - setParams(self, k=None, inputCol=None, outputCol=None) - Set params for this PCA. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.5.0") - def setK(self, value): - """ - Sets the value of :py:attr:`k`. - """ - return self._set(k=value) - - @since("1.5.0") - def getK(self): - """ - Gets the value of k or its default value. - """ - return self.getOrDefault(self.k) - - def _create_model(self, java_model): - return PCAModel(java_model) - - -class PCAModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space. - - .. versionadded:: 1.5.0 - """ - - @property - @since("2.0.0") - def pc(self): - """ - Returns a principal components Matrix. - Each column is one principal component. - """ - return self._call_java("pc") - - @property - @since("2.0.0") - def explainedVariance(self): - """ - Returns a vector of proportions of variance - explained by each principal component. - """ - return self._call_java("explainedVariance") - - -@inherit_doc -class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, HasHandleInvalid, - JavaMLReadable, JavaMLWritable): - """ - .. 
note:: Experimental - - Implements the transforms required for fitting a dataset against an - R model formula. Currently we support a limited subset of the R - operators, including '~', '.', ':', '+', and '-'. Also see the `R formula docs - `_. - - >>> df = spark.createDataFrame([ - ... (1.0, 1.0, "a"), - ... (0.0, 2.0, "b"), - ... (0.0, 0.0, "a") - ... ], ["y", "x", "s"]) - >>> rf = RFormula(formula="y ~ x + s") - >>> model = rf.fit(df) - >>> model.transform(df).show() - +---+---+---+---------+-----+ - | y| x| s| features|label| - +---+---+---+---------+-----+ - |1.0|1.0| a|[1.0,1.0]| 1.0| - |0.0|2.0| b|[2.0,0.0]| 0.0| - |0.0|0.0| a|[0.0,1.0]| 0.0| - +---+---+---+---------+-----+ - ... - >>> rf.fit(df, {rf.formula: "y ~ . - s"}).transform(df).show() - +---+---+---+--------+-----+ - | y| x| s|features|label| - +---+---+---+--------+-----+ - |1.0|1.0| a| [1.0]| 1.0| - |0.0|2.0| b| [2.0]| 0.0| - |0.0|0.0| a| [0.0]| 0.0| - +---+---+---+--------+-----+ - ... - >>> rFormulaPath = temp_path + "/rFormula" - >>> rf.save(rFormulaPath) - >>> loadedRF = RFormula.load(rFormulaPath) - >>> loadedRF.getFormula() == rf.getFormula() - True - >>> loadedRF.getFeaturesCol() == rf.getFeaturesCol() - True - >>> loadedRF.getLabelCol() == rf.getLabelCol() - True - >>> loadedRF.getHandleInvalid() == rf.getHandleInvalid() - True - >>> str(loadedRF) - 'RFormula(y ~ x + s) (uid=...)' - >>> modelPath = temp_path + "/rFormulaModel" - >>> model.save(modelPath) - >>> loadedModel = RFormulaModel.load(modelPath) - >>> loadedModel.uid == model.uid - True - >>> loadedModel.transform(df).show() - +---+---+---+---------+-----+ - | y| x| s| features|label| - +---+---+---+---------+-----+ - |1.0|1.0| a|[1.0,1.0]| 1.0| - |0.0|2.0| b|[2.0,0.0]| 0.0| - |0.0|0.0| a|[0.0,1.0]| 0.0| - +---+---+---+---------+-----+ - ... - >>> str(loadedModel) - 'RFormulaModel(ResolvedRFormula(label=y, terms=[x,s], hasIntercept=true)) (uid=...)' - - .. versionadded:: 1.5.0 - """ - - formula = Param(Params._dummy(), "formula", "R model formula", - typeConverter=TypeConverters.toString) - - forceIndexLabel = Param(Params._dummy(), "forceIndexLabel", - "Force to index label whether it is numeric or string", - typeConverter=TypeConverters.toBoolean) - - stringIndexerOrderType = Param(Params._dummy(), "stringIndexerOrderType", - "How to order categories of a string feature column used by " + - "StringIndexer. The last category after ordering is dropped " + - "when encoding strings. Supported options: frequencyDesc, " + - "frequencyAsc, alphabetDesc, alphabetAsc. The default value " + - "is frequencyDesc. When the ordering is set to alphabetDesc, " + - "RFormula drops the same category as R when encoding strings.", - typeConverter=TypeConverters.toString) - - handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. 
" + - "Options are 'skip' (filter out rows with invalid values), " + - "'error' (throw an error), or 'keep' (put invalid data in a special " + - "additional bucket, at index numLabels).", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, formula=None, featuresCol="features", labelCol="label", - forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", - handleInvalid="error"): - """ - __init__(self, formula=None, featuresCol="features", labelCol="label", \ - forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", \ - handleInvalid="error") - """ - super(RFormula, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) - self._setDefault(forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", - handleInvalid="error") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.5.0") - def setParams(self, formula=None, featuresCol="features", labelCol="label", - forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", - handleInvalid="error"): - """ - setParams(self, formula=None, featuresCol="features", labelCol="label", \ - forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", \ - handleInvalid="error") - Sets params for RFormula. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.5.0") - def setFormula(self, value): - """ - Sets the value of :py:attr:`formula`. - """ - return self._set(formula=value) - - @since("1.5.0") - def getFormula(self): - """ - Gets the value of :py:attr:`formula`. - """ - return self.getOrDefault(self.formula) - - @since("2.1.0") - def setForceIndexLabel(self, value): - """ - Sets the value of :py:attr:`forceIndexLabel`. - """ - return self._set(forceIndexLabel=value) - - @since("2.1.0") - def getForceIndexLabel(self): - """ - Gets the value of :py:attr:`forceIndexLabel`. - """ - return self.getOrDefault(self.forceIndexLabel) - - @since("2.3.0") - def setStringIndexerOrderType(self, value): - """ - Sets the value of :py:attr:`stringIndexerOrderType`. - """ - return self._set(stringIndexerOrderType=value) - - @since("2.3.0") - def getStringIndexerOrderType(self): - """ - Gets the value of :py:attr:`stringIndexerOrderType` or its default value 'frequencyDesc'. - """ - return self.getOrDefault(self.stringIndexerOrderType) - - def _create_model(self, java_model): - return RFormulaModel(java_model) - - def __str__(self): - formulaStr = self.getFormula() if self.isDefined(self.formula) else "" - return "RFormula(%s) (uid=%s)" % (formulaStr, self.uid) - - -class RFormulaModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Model fitted by :py:class:`RFormula`. Fitting is required to determine the - factor levels of formula terms. - - .. versionadded:: 1.5.0 - """ - - def __str__(self): - resolvedFormula = self._call_java("resolvedFormula") - return "RFormulaModel(%s) (uid=%s)" % (resolvedFormula, self.uid) - - -@inherit_doc -class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, JavaMLReadable, - JavaMLWritable): - """ - .. note:: Experimental - - Chi-Squared feature selection, which selects categorical features to use for predicting a - categorical label. - The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`, - `fdr`, `fwe`. - - * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. - - * `percentile` is similar but chooses a fraction of all features - instead of a fixed number. 
- - * `fpr` chooses all features whose p-values are below a threshold, - thus controlling the false positive rate of selection. - - * `fdr` uses the `Benjamini-Hochberg procedure `_ - to choose all features whose false discovery rate is below a threshold. - - * `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by - 1/numFeatures, thus controlling the family-wise error rate of selection. - - By default, the selection method is `numTopFeatures`, with the default number of top features - set to 50. - - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame( - ... [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0), - ... (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0), - ... (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)], - ... ["features", "label"]) - >>> selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures") - >>> model = selector.fit(df) - >>> model.transform(df).head().selectedFeatures - DenseVector([18.0]) - >>> model.selectedFeatures - [2] - >>> chiSqSelectorPath = temp_path + "/chi-sq-selector" - >>> selector.save(chiSqSelectorPath) - >>> loadedSelector = ChiSqSelector.load(chiSqSelectorPath) - >>> loadedSelector.getNumTopFeatures() == selector.getNumTopFeatures() - True - >>> modelPath = temp_path + "/chi-sq-selector-model" - >>> model.save(modelPath) - >>> loadedModel = ChiSqSelectorModel.load(modelPath) - >>> loadedModel.selectedFeatures == model.selectedFeatures - True - - .. versionadded:: 2.0.0 - """ - - selectorType = Param(Params._dummy(), "selectorType", - "The selector type of the ChisqSelector. " + - "Supported options: numTopFeatures (default), percentile, fpr, fdr, fwe.", - typeConverter=TypeConverters.toString) - - numTopFeatures = \ - Param(Params._dummy(), "numTopFeatures", - "Number of features that selector will select, ordered by ascending p-value. 
" + - "If the number of features is < numTopFeatures, then this will select " + - "all features.", typeConverter=TypeConverters.toInt) - - percentile = Param(Params._dummy(), "percentile", "Percentile of features that selector " + - "will select, ordered by ascending p-value.", - typeConverter=TypeConverters.toFloat) - - fpr = Param(Params._dummy(), "fpr", "The highest p-value for features to be kept.", - typeConverter=TypeConverters.toFloat) - - fdr = Param(Params._dummy(), "fdr", "The upper bound of the expected false discovery rate.", - typeConverter=TypeConverters.toFloat) - - fwe = Param(Params._dummy(), "fwe", "The upper bound of the expected family-wise error rate.", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, - labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05, - fdr=0.05, fwe=0.05): - """ - __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, \ - labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05, \ - fdr=0.05, fwe=0.05) - """ - super(ChiSqSelector, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid) - self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, - fpr=0.05, fdr=0.05, fwe=0.05) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.0.0") - def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, - labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05, - fdr=0.05, fwe=0.05): - """ - setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, \ - labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05, \ - fdr=0.05, fwe=0.05) - Sets params for this ChiSqSelector. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.1.0") - def setSelectorType(self, value): - """ - Sets the value of :py:attr:`selectorType`. - """ - return self._set(selectorType=value) - - @since("2.1.0") - def getSelectorType(self): - """ - Gets the value of selectorType or its default value. - """ - return self.getOrDefault(self.selectorType) - - @since("2.0.0") - def setNumTopFeatures(self, value): - """ - Sets the value of :py:attr:`numTopFeatures`. - Only applicable when selectorType = "numTopFeatures". - """ - return self._set(numTopFeatures=value) - - @since("2.0.0") - def getNumTopFeatures(self): - """ - Gets the value of numTopFeatures or its default value. - """ - return self.getOrDefault(self.numTopFeatures) - - @since("2.1.0") - def setPercentile(self, value): - """ - Sets the value of :py:attr:`percentile`. - Only applicable when selectorType = "percentile". - """ - return self._set(percentile=value) - - @since("2.1.0") - def getPercentile(self): - """ - Gets the value of percentile or its default value. - """ - return self.getOrDefault(self.percentile) - - @since("2.1.0") - def setFpr(self, value): - """ - Sets the value of :py:attr:`fpr`. - Only applicable when selectorType = "fpr". - """ - return self._set(fpr=value) - - @since("2.1.0") - def getFpr(self): - """ - Gets the value of fpr or its default value. - """ - return self.getOrDefault(self.fpr) - - @since("2.2.0") - def setFdr(self, value): - """ - Sets the value of :py:attr:`fdr`. - Only applicable when selectorType = "fdr". - """ - return self._set(fdr=value) - - @since("2.2.0") - def getFdr(self): - """ - Gets the value of fdr or its default value. 
- """ - return self.getOrDefault(self.fdr) - - @since("2.2.0") - def setFwe(self, value): - """ - Sets the value of :py:attr:`fwe`. - Only applicable when selectorType = "fwe". - """ - return self._set(fwe=value) - - @since("2.2.0") - def getFwe(self): - """ - Gets the value of fwe or its default value. - """ - return self.getOrDefault(self.fwe) - - def _create_model(self, java_model): - return ChiSqSelectorModel(java_model) - - -class ChiSqSelectorModel(JavaModel, JavaMLReadable, JavaMLWritable): - """ - .. note:: Experimental - - Model fitted by :py:class:`ChiSqSelector`. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def selectedFeatures(self): - """ - List of indices to select (filter). - """ - return self._call_java("selectedFeatures") - - -@inherit_doc -class VectorSizeHint(JavaTransformer, HasInputCol, HasHandleInvalid, JavaMLReadable, - JavaMLWritable): - """ - .. note:: Experimental - - A feature transformer that adds size information to the metadata of a vector column. - VectorAssembler needs size information for its input columns and cannot be used on streaming - dataframes without this metadata. - - .. note:: VectorSizeHint modifies `inputCol` to include size metadata and does not have an - outputCol. - - >>> from pyspark.ml.linalg import Vectors - >>> from pyspark.ml import Pipeline, PipelineModel - >>> data = [(Vectors.dense([1., 2., 3.]), 4.)] - >>> df = spark.createDataFrame(data, ["vector", "float"]) - >>> - >>> sizeHint = VectorSizeHint(inputCol="vector", size=3, handleInvalid="skip") - >>> vecAssembler = VectorAssembler(inputCols=["vector", "float"], outputCol="assembled") - >>> pipeline = Pipeline(stages=[sizeHint, vecAssembler]) - >>> - >>> pipelineModel = pipeline.fit(df) - >>> pipelineModel.transform(df).head().assembled - DenseVector([1.0, 2.0, 3.0, 4.0]) - >>> vectorSizeHintPath = temp_path + "/vector-size-hint-pipeline" - >>> pipelineModel.save(vectorSizeHintPath) - >>> loadedPipeline = PipelineModel.load(vectorSizeHintPath) - >>> loaded = loadedPipeline.transform(df).head().assembled - >>> expected = pipelineModel.transform(df).head().assembled - >>> loaded == expected - True - - .. versionadded:: 2.3.0 - """ - - size = Param(Params._dummy(), "size", "Size of vectors in column.", - typeConverter=TypeConverters.toInt) - - handleInvalid = Param(Params._dummy(), "handleInvalid", - "How to handle invalid vectors in inputCol. Invalid vectors include " - "nulls and vectors with the wrong size. The options are `skip` (filter " - "out rows with invalid vectors), `error` (throw an error) and " - "`optimistic` (do not check the vector size, and keep all rows). " - "`error` by default.", - TypeConverters.toString) - - @keyword_only - def __init__(self, inputCol=None, size=None, handleInvalid="error"): - """ - __init__(self, inputCol=None, size=None, handleInvalid="error") - """ - super(VectorSizeHint, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSizeHint", self.uid) - self._setDefault(handleInvalid="error") - self.setParams(**self._input_kwargs) - - @keyword_only - @since("2.3.0") - def setParams(self, inputCol=None, size=None, handleInvalid="error"): - """ - setParams(self, inputCol=None, size=None, handleInvalid="error") - Sets params for this VectorSizeHint. 
- """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.3.0") - def getSize(self): - """ Gets size param, the size of vectors in `inputCol`.""" - return self.getOrDefault(self.size) - - @since("2.3.0") - def setSize(self, value): - """ Sets size param, the size of vectors in `inputCol`.""" - return self._set(size=value) - - -if __name__ == "__main__": - import doctest - import tempfile - - import pyspark.ml.feature - from pyspark.sql import Row, SparkSession - - globs = globals().copy() - features = pyspark.ml.feature.__dict__.copy() - globs.update(features) - - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("ml.feature tests")\ - .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc - globs['spark'] = spark - testData = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="b"), - Row(id=2, label="c"), Row(id=3, label="a"), - Row(id=4, label="a"), Row(id=5, label="c")], 2) - globs['stringIndDf'] = spark.createDataFrame(testData) - temp_path = tempfile.mkdtemp() - globs['temp_path'] = temp_path - try: - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - finally: - from shutil import rmtree - try: - rmtree(temp_path) - except OSError: - pass - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/fpm.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/fpm.py deleted file mode 100644 index 886ad84..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/fpm.py +++ /dev/null @@ -1,349 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from pyspark import keyword_only, since -from pyspark.sql import DataFrame -from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, _jvm -from pyspark.ml.param.shared import * - -__all__ = ["FPGrowth", "FPGrowthModel", "PrefixSpan"] - - -class HasMinSupport(Params): - """ - Mixin for param minSupport. - """ - - minSupport = Param( - Params._dummy(), - "minSupport", - "Minimal support level of the frequent pattern. [0.0, 1.0]. " + - "Any pattern that appears more than (minSupport * size-of-the-dataset) " + - "times will be output in the frequent itemsets.", - typeConverter=TypeConverters.toFloat) - - def setMinSupport(self, value): - """ - Sets the value of :py:attr:`minSupport`. - """ - return self._set(minSupport=value) - - def getMinSupport(self): - """ - Gets the value of minSupport or its default value. - """ - return self.getOrDefault(self.minSupport) - - -class HasNumPartitions(Params): - """ - Mixin for param numPartitions: Number of partitions (at least 1) used by parallel FP-growth. 
- """ - - numPartitions = Param( - Params._dummy(), - "numPartitions", - "Number of partitions (at least 1) used by parallel FP-growth. " + - "By default the param is not set, " + - "and partition number of the input dataset is used.", - typeConverter=TypeConverters.toInt) - - def setNumPartitions(self, value): - """ - Sets the value of :py:attr:`numPartitions`. - """ - return self._set(numPartitions=value) - - def getNumPartitions(self): - """ - Gets the value of :py:attr:`numPartitions` or its default value. - """ - return self.getOrDefault(self.numPartitions) - - -class HasMinConfidence(Params): - """ - Mixin for param minConfidence. - """ - - minConfidence = Param( - Params._dummy(), - "minConfidence", - "Minimal confidence for generating Association Rule. [0.0, 1.0]. " + - "minConfidence will not affect the mining for frequent itemsets, " + - "but will affect the association rules generation.", - typeConverter=TypeConverters.toFloat) - - def setMinConfidence(self, value): - """ - Sets the value of :py:attr:`minConfidence`. - """ - return self._set(minConfidence=value) - - def getMinConfidence(self): - """ - Gets the value of minConfidence or its default value. - """ - return self.getOrDefault(self.minConfidence) - - -class HasItemsCol(Params): - """ - Mixin for param itemsCol: items column name. - """ - - itemsCol = Param(Params._dummy(), "itemsCol", - "items column name", typeConverter=TypeConverters.toString) - - def setItemsCol(self, value): - """ - Sets the value of :py:attr:`itemsCol`. - """ - return self._set(itemsCol=value) - - def getItemsCol(self): - """ - Gets the value of itemsCol or its default value. - """ - return self.getOrDefault(self.itemsCol) - - -class FPGrowthModel(JavaModel, JavaMLWritable, JavaMLReadable): - """ - .. note:: Experimental - - Model fitted by FPGrowth. - - .. versionadded:: 2.2.0 - """ - @property - @since("2.2.0") - def freqItemsets(self): - """ - DataFrame with two columns: - * `items` - Itemset of the same type as the input column. - * `freq` - Frequency of the itemset (`LongType`). - """ - return self._call_java("freqItemsets") - - @property - @since("2.2.0") - def associationRules(self): - """ - DataFrame with four columns: - * `antecedent` - Array of the same type as the input column. - * `consequent` - Array of the same type as the input column. - * `confidence` - Confidence for the rule (`DoubleType`). - * `lift` - Lift for the rule (`DoubleType`). - """ - return self._call_java("associationRules") - - -class FPGrowth(JavaEstimator, HasItemsCol, HasPredictionCol, - HasMinSupport, HasNumPartitions, HasMinConfidence, - JavaMLWritable, JavaMLReadable): - - r""" - .. note:: Experimental - - A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in - Li et al., PFP: Parallel FP-Growth for Query Recommendation [LI2008]_. - PFP distributes computation in such a way that each worker executes an - independent group of mining tasks. The FP-Growth algorithm is described in - Han et al., Mining frequent patterns without candidate generation [HAN2000]_ - - .. [LI2008] http://dx.doi.org/10.1145/1454008.1454027 - .. [HAN2000] http://dx.doi.org/10.1145/335191.335372 - - .. note:: null values in the feature column are ignored during fit(). - .. note:: Internally `transform` `collects` and `broadcasts` association rules. - - >>> from pyspark.sql.functions import split - >>> data = (spark.read - ... .text("data/mllib/sample_fpgrowth.txt") - ... 
.select(split("value", "\s+").alias("items"))) - >>> data.show(truncate=False) - +------------------------+ - |items | - +------------------------+ - |[r, z, h, k, p] | - |[z, y, x, w, v, u, t, s]| - |[s, x, o, n, r] | - |[x, z, y, m, t, s, q, e]| - |[z] | - |[x, z, y, r, q, t, p] | - +------------------------+ - >>> fp = FPGrowth(minSupport=0.2, minConfidence=0.7) - >>> fpm = fp.fit(data) - >>> fpm.freqItemsets.show(5) - +---------+----+ - | items|freq| - +---------+----+ - | [s]| 3| - | [s, x]| 3| - |[s, x, z]| 2| - | [s, z]| 2| - | [r]| 3| - +---------+----+ - only showing top 5 rows - >>> fpm.associationRules.show(5) - +----------+----------+----------+ - |antecedent|consequent|confidence| - +----------+----------+----------+ - | [t, s]| [y]| 1.0| - | [t, s]| [x]| 1.0| - | [t, s]| [z]| 1.0| - | [p]| [r]| 1.0| - | [p]| [z]| 1.0| - +----------+----------+----------+ - only showing top 5 rows - >>> new_data = spark.createDataFrame([(["t", "s"], )], ["items"]) - >>> sorted(fpm.transform(new_data).first().prediction) - ['x', 'y', 'z'] - - .. versionadded:: 2.2.0 - """ - @keyword_only - def __init__(self, minSupport=0.3, minConfidence=0.8, itemsCol="items", - predictionCol="prediction", numPartitions=None): - """ - __init__(self, minSupport=0.3, minConfidence=0.8, itemsCol="items", \ - predictionCol="prediction", numPartitions=None) - """ - super(FPGrowth, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.fpm.FPGrowth", self.uid) - self._setDefault(minSupport=0.3, minConfidence=0.8, - itemsCol="items", predictionCol="prediction") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.2.0") - def setParams(self, minSupport=0.3, minConfidence=0.8, itemsCol="items", - predictionCol="prediction", numPartitions=None): - """ - setParams(self, minSupport=0.3, minConfidence=0.8, itemsCol="items", \ - predictionCol="prediction", numPartitions=None) - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return FPGrowthModel(java_model) - - -class PrefixSpan(JavaParams): - """ - .. note:: Experimental - - A parallel PrefixSpan algorithm to mine frequent sequential patterns. - The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: Mining Sequential Patterns - Efficiently by Prefix-Projected Pattern Growth - (see here). - This class is not yet an Estimator/Transformer, use :py:func:`findFrequentSequentialPatterns` - method to run the PrefixSpan algorithm. - - @see Sequential Pattern Mining - (Wikipedia) - .. versionadded:: 2.4.0 - - """ - - minSupport = Param(Params._dummy(), "minSupport", "The minimal support level of the " + - "sequential pattern. Sequential pattern that appears more than " + - "(minSupport * size-of-the-dataset) times will be output. Must be >= 0.", - typeConverter=TypeConverters.toFloat) - - maxPatternLength = Param(Params._dummy(), "maxPatternLength", - "The maximal length of the sequential pattern. Must be > 0.", - typeConverter=TypeConverters.toInt) - - maxLocalProjDBSize = Param(Params._dummy(), "maxLocalProjDBSize", - "The maximum number of items (including delimiters used in the " + - "internal storage format) allowed in a projected database before " + - "local processing. If a projected database exceeds this size, " + - "another iteration of distributed prefix growth is run. 
" + - "Must be > 0.", - typeConverter=TypeConverters.toInt) - - sequenceCol = Param(Params._dummy(), "sequenceCol", "The name of the sequence column in " + - "dataset, rows with nulls in this column are ignored.", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, - sequenceCol="sequence"): - """ - __init__(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, \ - sequenceCol="sequence") - """ - super(PrefixSpan, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.fpm.PrefixSpan", self.uid) - self._setDefault(minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, - sequenceCol="sequence") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("2.4.0") - def setParams(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, - sequenceCol="sequence"): - """ - setParams(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, \ - sequenceCol="sequence") - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.4.0") - def findFrequentSequentialPatterns(self, dataset): - """ - .. note:: Experimental - - Finds the complete set of frequent sequential patterns in the input sequences of itemsets. - - :param dataset: A dataframe containing a sequence column which is - `ArrayType(ArrayType(T))` type, T is the item type for the input dataset. - :return: A `DataFrame` that contains columns of sequence and corresponding frequency. - The schema of it will be: - - `sequence: ArrayType(ArrayType(T))` (T is the item type) - - `freq: Long` - - >>> from pyspark.ml.fpm import PrefixSpan - >>> from pyspark.sql import Row - >>> df = sc.parallelize([Row(sequence=[[1, 2], [3]]), - ... Row(sequence=[[1], [3, 2], [1, 2]]), - ... Row(sequence=[[1, 2], [5]]), - ... Row(sequence=[[6]])]).toDF() - >>> prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5) - >>> prefixSpan.findFrequentSequentialPatterns(df).sort("sequence").show(truncate=False) - +----------+----+ - |sequence |freq| - +----------+----+ - |[[1]] |3 | - |[[1], [3]]|2 | - |[[1, 2]] |3 | - |[[2]] |3 | - |[[3]] |2 | - +----------+----+ - - .. versionadded:: 2.4.0 - """ - self._transfer_params_to_java() - jdf = self._java_obj.findFrequentSequentialPatterns(dataset._jdf) - return DataFrame(jdf, dataset.sql_ctx) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/image.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/image.py deleted file mode 100644 index a1aacea..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/image.py +++ /dev/null @@ -1,273 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -.. 
attribute:: ImageSchema - - An attribute of this module that contains the instance of :class:`_ImageSchema`. - -.. autoclass:: _ImageSchema - :members: -""" - -import sys -import warnings - -import numpy as np -from distutils.version import LooseVersion - -from pyspark import SparkContext -from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string -from pyspark.sql import DataFrame, SparkSession - -__all__ = ["ImageSchema"] - - -class _ImageSchema(object): - """ - Internal class for `pyspark.ml.image.ImageSchema` attribute. Meant to be private and - not to be instantized. Use `pyspark.ml.image.ImageSchema` attribute to access the - APIs of this class. - """ - - def __init__(self): - self._imageSchema = None - self._ocvTypes = None - self._columnSchema = None - self._imageFields = None - self._undefinedImageType = None - - @property - def imageSchema(self): - """ - Returns the image schema. - - :return: a :class:`StructType` with a single column of images - named "image" (nullable) and having the same type returned by :meth:`columnSchema`. - - .. versionadded:: 2.3.0 - """ - - if self._imageSchema is None: - ctx = SparkContext._active_spark_context - jschema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageSchema() - self._imageSchema = _parse_datatype_json_string(jschema.json()) - return self._imageSchema - - @property - def ocvTypes(self): - """ - Returns the OpenCV type mapping supported. - - :return: a dictionary containing the OpenCV type mapping supported. - - .. versionadded:: 2.3.0 - """ - - if self._ocvTypes is None: - ctx = SparkContext._active_spark_context - self._ocvTypes = dict(ctx._jvm.org.apache.spark.ml.image.ImageSchema.javaOcvTypes()) - return self._ocvTypes - - @property - def columnSchema(self): - """ - Returns the schema for the image column. - - :return: a :class:`StructType` for image column, - ``struct``. - - .. versionadded:: 2.4.0 - """ - - if self._columnSchema is None: - ctx = SparkContext._active_spark_context - jschema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.columnSchema() - self._columnSchema = _parse_datatype_json_string(jschema.json()) - return self._columnSchema - - @property - def imageFields(self): - """ - Returns field names of image columns. - - :return: a list of field names. - - .. versionadded:: 2.3.0 - """ - - if self._imageFields is None: - ctx = SparkContext._active_spark_context - self._imageFields = list(ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageFields()) - return self._imageFields - - @property - def undefinedImageType(self): - """ - Returns the name of undefined image type for the invalid image. - - .. versionadded:: 2.3.0 - """ - - if self._undefinedImageType is None: - ctx = SparkContext._active_spark_context - self._undefinedImageType = \ - ctx._jvm.org.apache.spark.ml.image.ImageSchema.undefinedImageType() - return self._undefinedImageType - - def toNDArray(self, image): - """ - Converts an image to an array with metadata. - - :param `Row` image: A row that contains the image to be converted. It should - have the attributes specified in `ImageSchema.imageSchema`. - :return: a `numpy.ndarray` that is an image. - - .. versionadded:: 2.3.0 - """ - - if not isinstance(image, Row): - raise TypeError( - "image argument should be pyspark.sql.types.Row; however, " - "it got [%s]." % type(image)) - - if any(not hasattr(image, f) for f in self.imageFields): - raise ValueError( - "image argument should have attributes specified in " - "ImageSchema.imageSchema [%s]." 
% ", ".join(self.imageFields)) - - height = image.height - width = image.width - nChannels = image.nChannels - return np.ndarray( - shape=(height, width, nChannels), - dtype=np.uint8, - buffer=image.data, - strides=(width * nChannels, nChannels, 1)) - - def toImage(self, array, origin=""): - """ - Converts an array with metadata to a two-dimensional image. - - :param `numpy.ndarray` array: The array to convert to image. - :param str origin: Path to the image, optional. - :return: a :class:`Row` that is a two dimensional image. - - .. versionadded:: 2.3.0 - """ - - if not isinstance(array, np.ndarray): - raise TypeError( - "array argument should be numpy.ndarray; however, it got [%s]." % type(array)) - - if array.ndim != 3: - raise ValueError("Invalid array shape") - - height, width, nChannels = array.shape - ocvTypes = ImageSchema.ocvTypes - if nChannels == 1: - mode = ocvTypes["CV_8UC1"] - elif nChannels == 3: - mode = ocvTypes["CV_8UC3"] - elif nChannels == 4: - mode = ocvTypes["CV_8UC4"] - else: - raise ValueError("Invalid number of channels") - - # Running `bytearray(numpy.array([1]))` fails in specific Python versions - # with a specific Numpy version, for example in Python 3.6.0 and NumPy 1.13.3. - # Here, it avoids it by converting it to bytes. - if LooseVersion(np.__version__) >= LooseVersion('1.9'): - data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes()) - else: - # Numpy prior to 1.9 don't have `tobytes` method. - data = bytearray(array.astype(dtype=np.uint8).ravel()) - - # Creating new Row with _create_row(), because Row(name = value, ... ) - # orders fields by name, which conflicts with expected schema order - # when the new DataFrame is created by UDF - return _create_row(self.imageFields, - [origin, height, width, nChannels, mode, data]) - - def readImages(self, path, recursive=False, numPartitions=-1, - dropImageFailures=False, sampleRatio=1.0, seed=0): - """ - Reads the directory of images from the local or remote source. - - .. note:: If multiple jobs are run in parallel with different sampleRatio or recursive flag, - there may be a race condition where one job overwrites the hadoop configs of another. - - .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but - potentially non-deterministic. - - .. note:: Deprecated in 2.4.0. Use `spark.read.format("image").load(path)` instead and - this `readImages` will be removed in 3.0.0. - - :param str path: Path to the image directory. - :param bool recursive: Recursive search flag. - :param int numPartitions: Number of DataFrame partitions. - :param bool dropImageFailures: Drop the files that are not valid images. - :param float sampleRatio: Fraction of the images loaded. - :param int seed: Random number seed. - :return: a :class:`DataFrame` with a single column of "images", - see ImageSchema for details. - - >>> df = ImageSchema.readImages('data/mllib/images/origin/kittens', recursive=True) - >>> df.count() - 5 - - .. versionadded:: 2.3.0 - """ - warnings.warn("`ImageSchema.readImage` is deprecated. " + - "Use `spark.read.format(\"image\").load(path)` instead.", DeprecationWarning) - spark = SparkSession.builder.getOrCreate() - image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema - jsession = spark._jsparkSession - jresult = image_schema.readImages(path, jsession, recursive, numPartitions, - dropImageFailures, float(sampleRatio), seed) - return DataFrame(jresult, spark._wrapped) - - -ImageSchema = _ImageSchema() - - -# Monkey patch to disallow instantiation of this class. 
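# Illustrative round trip for the two converters above: a minimal sketch that
# assumes NumPy and a local SparkSession (the schema and ocvTypes lookups go
# through the JVM), with throwaway demo values.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.image import ImageSchema

spark = SparkSession.builder.master("local[1]").appName("image-demo").getOrCreate()
arr = np.zeros((2, 2, 3), dtype=np.uint8)         # height=2, width=2, 3 channels
row = ImageSchema.toImage(arr, origin="demo")     # Row(origin, height, width, nChannels, mode, data)
back = ImageSchema.toNDArray(row)                 # recovers the original array
assert back.shape == (2, 2, 3) and back.dtype == np.uint8
spark.stop()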
-def _disallow_instance(_): - raise RuntimeError("Creating instance of _ImageSchema class is disallowed.") -_ImageSchema.__init__ = _disallow_instance - - -def _test(): - import doctest - import pyspark.ml.image - globs = pyspark.ml.image.__dict__.copy() - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("ml.image tests")\ - .getOrCreate() - globs['spark'] = spark - - (failure_count, test_count) = doctest.testmod( - pyspark.ml.image, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/linalg/__init__.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/linalg/__init__.py deleted file mode 100644 index 9da9836..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/linalg/__init__.py +++ /dev/null @@ -1,1173 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -MLlib utilities for linear algebra. For dense vectors, MLlib -uses the NumPy C{array} type, so you can simply pass NumPy arrays -around. For sparse vectors, users can construct a L{SparseVector} -object from MLlib or pass SciPy C{scipy.sparse} column vectors if -SciPy is available in their environment. -""" - -import sys -import array -import struct - -if sys.version >= '3': - basestring = str - xrange = range - import copyreg as copy_reg - long = int -else: - from itertools import izip as zip - import copy_reg - -import numpy as np - -from pyspark import since -from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ - IntegerType, ByteType, BooleanType - - -__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', - 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices'] - - -if sys.version_info[:2] == (2, 7): - # speed up pickling array in Python 2.7 - def fast_pickle_array(ar): - return array.array, (ar.typecode, ar.tostring()) - copy_reg.pickle(array.array, fast_pickle_array) - - -# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, -# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. - -try: - import scipy.sparse - _have_scipy = True -except: - # No SciPy in environment, but that's okay - _have_scipy = False - - -def _convert_to_vector(l): - if isinstance(l, Vector): - return l - elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): - return DenseVector(l) - elif _have_scipy and scipy.sparse.issparse(l): - assert l.shape[1] == 1, "Expected column vector" - # Make sure the converted csc_matrix has sorted indices. 
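        # Illustrative aside (assumes SciPy is installed): a 3x1 column vector such as
        # scipy.sparse.csc_matrix([[0.], [3.], [0.]]) comes out of this branch as
        # SparseVector(3, [1], [3.0]).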
- csc = l.tocsc() - if not csc.has_sorted_indices: - csc.sort_indices() - return SparseVector(l.shape[0], csc.indices, csc.data) - else: - raise TypeError("Cannot convert type %s into Vector" % type(l)) - - -def _vector_size(v): - """ - Returns the size of the vector. - - >>> _vector_size([1., 2., 3.]) - 3 - >>> _vector_size((1., 2., 3.)) - 3 - >>> _vector_size(array.array('d', [1., 2., 3.])) - 3 - >>> _vector_size(np.zeros(3)) - 3 - >>> _vector_size(np.zeros((3, 1))) - 3 - >>> _vector_size(np.zeros((1, 3))) - Traceback (most recent call last): - ... - ValueError: Cannot treat an ndarray of shape (1, 3) as a vector - """ - if isinstance(v, Vector): - return len(v) - elif type(v) in (array.array, list, tuple, xrange): - return len(v) - elif type(v) == np.ndarray: - if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): - return len(v) - else: - raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape)) - elif _have_scipy and scipy.sparse.issparse(v): - assert v.shape[1] == 1, "Expected column vector" - return v.shape[0] - else: - raise TypeError("Cannot treat type %s as a vector" % type(v)) - - -def _format_float(f, digits=4): - s = str(round(f, digits)) - if '.' in s: - s = s[:s.index('.') + 1 + digits] - return s - - -def _format_float_list(l): - return [_format_float(x) for x in l] - - -def _double_to_long_bits(value): - if np.isnan(value): - value = float('nan') - # pack double into 64 bits, then unpack as long int - return struct.unpack('Q', struct.pack('d', value))[0] - - -class VectorUDT(UserDefinedType): - """ - SQL user-defined type (UDT) for Vector. - """ - - @classmethod - def sqlType(cls): - return StructType([ - StructField("type", ByteType(), False), - StructField("size", IntegerType(), True), - StructField("indices", ArrayType(IntegerType(), False), True), - StructField("values", ArrayType(DoubleType(), False), True)]) - - @classmethod - def module(cls): - return "pyspark.ml.linalg" - - @classmethod - def scalaUDT(cls): - return "org.apache.spark.ml.linalg.VectorUDT" - - def serialize(self, obj): - if isinstance(obj, SparseVector): - indices = [int(i) for i in obj.indices] - values = [float(v) for v in obj.values] - return (0, obj.size, indices, values) - elif isinstance(obj, DenseVector): - values = [float(v) for v in obj] - return (1, None, None, values) - else: - raise TypeError("cannot serialize %r of type %r" % (obj, type(obj))) - - def deserialize(self, datum): - assert len(datum) == 4, \ - "VectorUDT.deserialize given row with length %d but requires 4" % len(datum) - tpe = datum[0] - if tpe == 0: - return SparseVector(datum[1], datum[2], datum[3]) - elif tpe == 1: - return DenseVector(datum[3]) - else: - raise ValueError("do not recognize type %r" % tpe) - - def simpleString(self): - return "vector" - - -class MatrixUDT(UserDefinedType): - """ - SQL user-defined type (UDT) for Matrix. 
- """ - - @classmethod - def sqlType(cls): - return StructType([ - StructField("type", ByteType(), False), - StructField("numRows", IntegerType(), False), - StructField("numCols", IntegerType(), False), - StructField("colPtrs", ArrayType(IntegerType(), False), True), - StructField("rowIndices", ArrayType(IntegerType(), False), True), - StructField("values", ArrayType(DoubleType(), False), True), - StructField("isTransposed", BooleanType(), False)]) - - @classmethod - def module(cls): - return "pyspark.ml.linalg" - - @classmethod - def scalaUDT(cls): - return "org.apache.spark.ml.linalg.MatrixUDT" - - def serialize(self, obj): - if isinstance(obj, SparseMatrix): - colPtrs = [int(i) for i in obj.colPtrs] - rowIndices = [int(i) for i in obj.rowIndices] - values = [float(v) for v in obj.values] - return (0, obj.numRows, obj.numCols, colPtrs, - rowIndices, values, bool(obj.isTransposed)) - elif isinstance(obj, DenseMatrix): - values = [float(v) for v in obj.values] - return (1, obj.numRows, obj.numCols, None, None, values, - bool(obj.isTransposed)) - else: - raise TypeError("cannot serialize type %r" % (type(obj))) - - def deserialize(self, datum): - assert len(datum) == 7, \ - "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum) - tpe = datum[0] - if tpe == 0: - return SparseMatrix(*datum[1:]) - elif tpe == 1: - return DenseMatrix(datum[1], datum[2], datum[5], datum[6]) - else: - raise ValueError("do not recognize type %r" % tpe) - - def simpleString(self): - return "matrix" - - -class Vector(object): - - __UDT__ = VectorUDT() - - """ - Abstract class for DenseVector and SparseVector - """ - def toArray(self): - """ - Convert the vector into an numpy.ndarray - - :return: numpy.ndarray - """ - raise NotImplementedError - - -class DenseVector(Vector): - """ - A dense vector represented by a value array. We use numpy array for - storage and arithmetics will be delegated to the underlying numpy - array. - - >>> v = Vectors.dense([1.0, 2.0]) - >>> u = Vectors.dense([3.0, 4.0]) - >>> v + u - DenseVector([4.0, 6.0]) - >>> 2 - v - DenseVector([1.0, 0.0]) - >>> v / 2 - DenseVector([0.5, 1.0]) - >>> v * u - DenseVector([3.0, 8.0]) - >>> u / v - DenseVector([3.0, 2.0]) - >>> u % 2 - DenseVector([1.0, 0.0]) - >>> -v - DenseVector([-1.0, -2.0]) - """ - def __init__(self, ar): - if isinstance(ar, bytes): - ar = np.frombuffer(ar, dtype=np.float64) - elif not isinstance(ar, np.ndarray): - ar = np.array(ar, dtype=np.float64) - if ar.dtype != np.float64: - ar = ar.astype(np.float64) - self.array = ar - - def __reduce__(self): - return DenseVector, (self.array.tostring(),) - - def numNonzeros(self): - """ - Number of nonzero elements. This scans all active values and count non zeros - """ - return np.count_nonzero(self.array) - - def norm(self, p): - """ - Calculates the norm of a DenseVector. - - >>> a = DenseVector([0, -1, 2, -3]) - >>> a.norm(2) - 3.7... - >>> a.norm(1) - 6.0 - """ - return np.linalg.norm(self.array, p) - - def dot(self, other): - """ - Compute the dot product of two Vectors. We support - (Numpy array, list, SparseVector, or SciPy sparse) - and a target NumPy array that is either 1- or 2-dimensional. - Equivalent to calling numpy.dot of the two vectors. - - >>> dense = DenseVector(array.array('d', [1., 2.])) - >>> dense.dot(dense) - 5.0 - >>> dense.dot(SparseVector(2, [0, 1], [2., 1.])) - 4.0 - >>> dense.dot(range(1, 3)) - 5.0 - >>> dense.dot(np.array(range(1, 3))) - 5.0 - >>> dense.dot([1.,]) - Traceback (most recent call last): - ... 
- AssertionError: dimension mismatch - >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F')) - array([ 5., 11.]) - >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F')) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - """ - if type(other) == np.ndarray: - if other.ndim > 1: - assert len(self) == other.shape[0], "dimension mismatch" - return np.dot(self.array, other) - elif _have_scipy and scipy.sparse.issparse(other): - assert len(self) == other.shape[0], "dimension mismatch" - return other.transpose().dot(self.toArray()) - else: - assert len(self) == _vector_size(other), "dimension mismatch" - if isinstance(other, SparseVector): - return other.dot(self) - elif isinstance(other, Vector): - return np.dot(self.toArray(), other.toArray()) - else: - return np.dot(self.toArray(), other) - - def squared_distance(self, other): - """ - Squared distance of two Vectors. - - >>> dense1 = DenseVector(array.array('d', [1., 2.])) - >>> dense1.squared_distance(dense1) - 0.0 - >>> dense2 = np.array([2., 1.]) - >>> dense1.squared_distance(dense2) - 2.0 - >>> dense3 = [2., 1.] - >>> dense1.squared_distance(dense3) - 2.0 - >>> sparse1 = SparseVector(2, [0, 1], [2., 1.]) - >>> dense1.squared_distance(sparse1) - 2.0 - >>> dense1.squared_distance([1.,]) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> dense1.squared_distance(SparseVector(1, [0,], [1.,])) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - """ - assert len(self) == _vector_size(other), "dimension mismatch" - if isinstance(other, SparseVector): - return other.squared_distance(self) - elif _have_scipy and scipy.sparse.issparse(other): - return _convert_to_vector(other).squared_distance(self) - - if isinstance(other, Vector): - other = other.toArray() - elif not isinstance(other, np.ndarray): - other = np.array(other) - diff = self.toArray() - other - return np.dot(diff, diff) - - def toArray(self): - """ - Returns an numpy.ndarray - """ - return self.array - - @property - def values(self): - """ - Returns a list of values - """ - return self.array - - def __getitem__(self, item): - return self.array[item] - - def __len__(self): - return len(self.array) - - def __str__(self): - return "[" + ",".join([str(v) for v in self.array]) + "]" - - def __repr__(self): - return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array)) - - def __eq__(self, other): - if isinstance(other, DenseVector): - return np.array_equal(self.array, other.array) - elif isinstance(other, SparseVector): - if len(self) != other.size: - return False - return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) - return False - - def __ne__(self, other): - return not self == other - - def __hash__(self): - size = len(self) - result = 31 + size - nnz = 0 - i = 0 - while i < size and nnz < 128: - if self.array[i] != 0: - result = 31 * result + i - bits = _double_to_long_bits(self.array[i]) - result = 31 * result + (bits ^ (bits >> 32)) - nnz += 1 - i += 1 - return result - - def __getattr__(self, item): - return getattr(self.array, item) - - def __neg__(self): - return DenseVector(-self.array) - - def _delegate(op): - def func(self, other): - if isinstance(other, DenseVector): - other = other.array - return DenseVector(getattr(self.array, op)(other)) - return func - - __add__ = _delegate("__add__") - __sub__ = _delegate("__sub__") - __mul__ = _delegate("__mul__") - __div__ = _delegate("__div__") - __truediv__ = 
_delegate("__truediv__") - __mod__ = _delegate("__mod__") - __radd__ = _delegate("__radd__") - __rsub__ = _delegate("__rsub__") - __rmul__ = _delegate("__rmul__") - __rdiv__ = _delegate("__rdiv__") - __rtruediv__ = _delegate("__rtruediv__") - __rmod__ = _delegate("__rmod__") - - -class SparseVector(Vector): - """ - A simple sparse vector class for passing data to MLlib. Users may - alternatively pass SciPy's {scipy.sparse} data types. - """ - def __init__(self, size, *args): - """ - Create a sparse vector, using either a dictionary, a list of - (index, value) pairs, or two separate arrays of indices and - values (sorted by index). - - :param size: Size of the vector. - :param args: Active entries, as a dictionary {index: value, ...}, - a list of tuples [(index, value), ...], or a list of strictly - increasing indices and a list of corresponding values [index, ...], - [value, ...]. Inactive entries are treated as zeros. - - >>> SparseVector(4, {1: 1.0, 3: 5.5}) - SparseVector(4, {1: 1.0, 3: 5.5}) - >>> SparseVector(4, [(1, 1.0), (3, 5.5)]) - SparseVector(4, {1: 1.0, 3: 5.5}) - >>> SparseVector(4, [1, 3], [1.0, 5.5]) - SparseVector(4, {1: 1.0, 3: 5.5}) - >>> SparseVector(4, {1:1.0, 6:2.0}) - Traceback (most recent call last): - ... - AssertionError: Index 6 is out of the size of vector with size=4 - >>> SparseVector(4, {-1:1.0}) - Traceback (most recent call last): - ... - AssertionError: Contains negative index -1 - """ - self.size = int(size) - """ Size of the vector. """ - assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments" - if len(args) == 1: - pairs = args[0] - if type(pairs) == dict: - pairs = pairs.items() - pairs = sorted(pairs) - self.indices = np.array([p[0] for p in pairs], dtype=np.int32) - """ A list of indices corresponding to active entries. """ - self.values = np.array([p[1] for p in pairs], dtype=np.float64) - """ A list of values corresponding to active entries. """ - else: - if isinstance(args[0], bytes): - assert isinstance(args[1], bytes), "values should be string too" - if args[0]: - self.indices = np.frombuffer(args[0], np.int32) - self.values = np.frombuffer(args[1], np.float64) - else: - # np.frombuffer() doesn't work well with empty string in older version - self.indices = np.array([], dtype=np.int32) - self.values = np.array([], dtype=np.float64) - else: - self.indices = np.array(args[0], dtype=np.int32) - self.values = np.array(args[1], dtype=np.float64) - assert len(self.indices) == len(self.values), "index and value arrays not same length" - for i in xrange(len(self.indices) - 1): - if self.indices[i] >= self.indices[i + 1]: - raise TypeError( - "Indices %s and %s are not strictly increasing" - % (self.indices[i], self.indices[i + 1])) - - if self.indices.size > 0: - assert np.max(self.indices) < self.size, \ - "Index %d is out of the size of vector with size=%d" \ - % (np.max(self.indices), self.size) - assert np.min(self.indices) >= 0, \ - "Contains negative index %d" % (np.min(self.indices)) - - def numNonzeros(self): - """ - Number of nonzero elements. This scans all active values and count non zeros. - """ - return np.count_nonzero(self.values) - - def norm(self, p): - """ - Calculates the norm of a SparseVector. 
- - >>> a = SparseVector(4, [0, 1], [3., -4.]) - >>> a.norm(1) - 7.0 - >>> a.norm(2) - 5.0 - """ - return np.linalg.norm(self.values, p) - - def __reduce__(self): - return ( - SparseVector, - (self.size, self.indices.tostring(), self.values.tostring())) - - def dot(self, other): - """ - Dot product with a SparseVector or 1- or 2-dimensional Numpy array. - - >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) - >>> a.dot(a) - 25.0 - >>> a.dot(array.array('d', [1., 2., 3., 4.])) - 22.0 - >>> b = SparseVector(4, [2], [1.0]) - >>> a.dot(b) - 0.0 - >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]])) - array([ 22., 22.]) - >>> a.dot([1., 2., 3.]) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> a.dot(np.array([1., 2.])) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> a.dot(DenseVector([1., 2.])) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> a.dot(np.zeros((3, 2))) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - """ - - if isinstance(other, np.ndarray): - if other.ndim not in [2, 1]: - raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim) - assert len(self) == other.shape[0], "dimension mismatch" - return np.dot(self.values, other[self.indices]) - - assert len(self) == _vector_size(other), "dimension mismatch" - - if isinstance(other, DenseVector): - return np.dot(other.array[self.indices], self.values) - - elif isinstance(other, SparseVector): - # Find out common indices. - self_cmind = np.in1d(self.indices, other.indices, assume_unique=True) - self_values = self.values[self_cmind] - if self_values.size == 0: - return 0.0 - else: - other_cmind = np.in1d(other.indices, self.indices, assume_unique=True) - return np.dot(self_values, other.values[other_cmind]) - - else: - return self.dot(_convert_to_vector(other)) - - def squared_distance(self, other): - """ - Squared distance from a SparseVector or 1-dimensional NumPy array. - - >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) - >>> a.squared_distance(a) - 0.0 - >>> a.squared_distance(array.array('d', [1., 2., 3., 4.])) - 11.0 - >>> a.squared_distance(np.array([1., 2., 3., 4.])) - 11.0 - >>> b = SparseVector(4, [2], [1.0]) - >>> a.squared_distance(b) - 26.0 - >>> b.squared_distance(a) - 26.0 - >>> b.squared_distance([1., 2.]) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> b.squared_distance(SparseVector(3, [1,], [1.0,])) - Traceback (most recent call last): - ... 
- AssertionError: dimension mismatch - """ - assert len(self) == _vector_size(other), "dimension mismatch" - - if isinstance(other, np.ndarray) or isinstance(other, DenseVector): - if isinstance(other, np.ndarray) and other.ndim != 1: - raise Exception("Cannot call squared_distance with %d-dimensional array" % - other.ndim) - if isinstance(other, DenseVector): - other = other.array - sparse_ind = np.zeros(other.size, dtype=bool) - sparse_ind[self.indices] = True - dist = other[sparse_ind] - self.values - result = np.dot(dist, dist) - - other_ind = other[~sparse_ind] - result += np.dot(other_ind, other_ind) - return result - - elif isinstance(other, SparseVector): - result = 0.0 - i, j = 0, 0 - while i < len(self.indices) and j < len(other.indices): - if self.indices[i] == other.indices[j]: - diff = self.values[i] - other.values[j] - result += diff * diff - i += 1 - j += 1 - elif self.indices[i] < other.indices[j]: - result += self.values[i] * self.values[i] - i += 1 - else: - result += other.values[j] * other.values[j] - j += 1 - while i < len(self.indices): - result += self.values[i] * self.values[i] - i += 1 - while j < len(other.indices): - result += other.values[j] * other.values[j] - j += 1 - return result - else: - return self.squared_distance(_convert_to_vector(other)) - - def toArray(self): - """ - Returns a copy of this SparseVector as a 1-dimensional NumPy array. - """ - arr = np.zeros((self.size,), dtype=np.float64) - arr[self.indices] = self.values - return arr - - def __len__(self): - return self.size - - def __str__(self): - inds = "[" + ",".join([str(i) for i in self.indices]) + "]" - vals = "[" + ",".join([str(v) for v in self.values]) + "]" - return "(" + ",".join((str(self.size), inds, vals)) + ")" - - def __repr__(self): - inds = self.indices - vals = self.values - entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i])) - for i in xrange(len(inds))]) - return "SparseVector({0}, {{{1}}})".format(self.size, entries) - - def __eq__(self, other): - if isinstance(other, SparseVector): - return other.size == self.size and np.array_equal(other.indices, self.indices) \ - and np.array_equal(other.values, self.values) - elif isinstance(other, DenseVector): - if self.size != len(other): - return False - return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) - return False - - def __getitem__(self, index): - inds = self.indices - vals = self.values - if not isinstance(index, int): - raise TypeError( - "Indices must be of type integer, got type %s" % type(index)) - - if index >= self.size or index < -self.size: - raise IndexError("Index %d out of bounds." % index) - if index < 0: - index += self.size - - if (inds.size == 0) or (index > inds.item(-1)): - return 0. - - insert_index = np.searchsorted(inds, index) - row_ind = inds[insert_index] - if row_ind == index: - return vals[insert_index] - return 0. - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - result = 31 + self.size - nnz = 0 - i = 0 - while i < len(self.values) and nnz < 128: - if self.values[i] != 0: - result = 31 * result + int(self.indices[i]) - bits = _double_to_long_bits(self.values[i]) - result = 31 * result + (bits ^ (bits >> 32)) - nnz += 1 - i += 1 - return result - - -class Vectors(object): - - """ - Factory methods for working with vectors. - - .. note:: Dense vectors are simply represented as NumPy array objects, - so there is no need to covert them for use in MLlib. 
For sparse vectors, - the factory methods in this class create an MLlib-compatible type, or users - can pass in SciPy's C{scipy.sparse} column vectors. - """ - - @staticmethod - def sparse(size, *args): - """ - Create a sparse vector, using either a dictionary, a list of - (index, value) pairs, or two separate arrays of indices and - values (sorted by index). - - :param size: Size of the vector. - :param args: Non-zero entries, as a dictionary, list of tuples, - or two sorted lists containing indices and values. - - >>> Vectors.sparse(4, {1: 1.0, 3: 5.5}) - SparseVector(4, {1: 1.0, 3: 5.5}) - >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) - SparseVector(4, {1: 1.0, 3: 5.5}) - >>> Vectors.sparse(4, [1, 3], [1.0, 5.5]) - SparseVector(4, {1: 1.0, 3: 5.5}) - """ - return SparseVector(size, *args) - - @staticmethod - def dense(*elements): - """ - Create a dense vector of 64-bit floats from a Python list or numbers. - - >>> Vectors.dense([1, 2, 3]) - DenseVector([1.0, 2.0, 3.0]) - >>> Vectors.dense(1.0, 2.0) - DenseVector([1.0, 2.0]) - """ - if len(elements) == 1 and not isinstance(elements[0], (float, int, long)): - # it's list, numpy.array or other iterable object. - elements = elements[0] - return DenseVector(elements) - - @staticmethod - def squared_distance(v1, v2): - """ - Squared distance between two vectors. - a and b can be of type SparseVector, DenseVector, np.ndarray - or array.array. - - >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) - >>> b = Vectors.dense([2, 5, 4, 1]) - >>> a.squared_distance(b) - 51.0 - """ - v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2) - return v1.squared_distance(v2) - - @staticmethod - def norm(vector, p): - """ - Find norm of the given vector. - """ - return _convert_to_vector(vector).norm(p) - - @staticmethod - def zeros(size): - return DenseVector(np.zeros(size)) - - @staticmethod - def _equals(v1_indices, v1_values, v2_indices, v2_values): - """ - Check equality between sparse/dense vectors, - v1_indices and v2_indices assume to be strictly increasing. - """ - v1_size = len(v1_values) - v2_size = len(v2_values) - k1 = 0 - k2 = 0 - all_equal = True - while all_equal: - while k1 < v1_size and v1_values[k1] == 0: - k1 += 1 - while k2 < v2_size and v2_values[k2] == 0: - k2 += 1 - - if k1 >= v1_size or k2 >= v2_size: - return k1 >= v1_size and k2 >= v2_size - - all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2] - k1 += 1 - k2 += 1 - return all_equal - - -class Matrix(object): - - __UDT__ = MatrixUDT() - - """ - Represents a local matrix. - """ - def __init__(self, numRows, numCols, isTransposed=False): - self.numRows = numRows - self.numCols = numCols - self.isTransposed = isTransposed - - def toArray(self): - """ - Returns its elements in a NumPy ndarray. - """ - raise NotImplementedError - - @staticmethod - def _convert_to_array(array_like, dtype): - """ - Convert Matrix attributes which are array-like or buffer to array. - """ - if isinstance(array_like, bytes): - return np.frombuffer(array_like, dtype=dtype) - return np.asarray(array_like, dtype=dtype) - - -class DenseMatrix(Matrix): - """ - Column-major dense matrix. 
- """ - def __init__(self, numRows, numCols, values, isTransposed=False): - Matrix.__init__(self, numRows, numCols, isTransposed) - values = self._convert_to_array(values, np.float64) - assert len(values) == numRows * numCols - self.values = values - - def __reduce__(self): - return DenseMatrix, ( - self.numRows, self.numCols, self.values.tostring(), - int(self.isTransposed)) - - def __str__(self): - """ - Pretty printing of a DenseMatrix - - >>> dm = DenseMatrix(2, 2, range(4)) - >>> print(dm) - DenseMatrix([[ 0., 2.], - [ 1., 3.]]) - >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True) - >>> print(dm) - DenseMatrix([[ 0., 1.], - [ 2., 3.]]) - """ - # Inspired by __repr__ in scipy matrices. - array_lines = repr(self.toArray()).splitlines() - - # We need to adjust six spaces which is the difference in number - # of letters between "DenseMatrix" and "array" - x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]]) - return array_lines[0].replace("array", "DenseMatrix") + "\n" + x - - def __repr__(self): - """ - Representation of a DenseMatrix - - >>> dm = DenseMatrix(2, 2, range(4)) - >>> dm - DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False) - """ - # If the number of values are less than seventeen then return as it is. - # Else return first eight values and last eight values. - if len(self.values) < 17: - entries = _format_float_list(self.values) - else: - entries = ( - _format_float_list(self.values[:8]) + - ["..."] + - _format_float_list(self.values[-8:]) - ) - - entries = ", ".join(entries) - return "DenseMatrix({0}, {1}, [{2}], {3})".format( - self.numRows, self.numCols, entries, self.isTransposed) - - def toArray(self): - """ - Return an numpy.ndarray - - >>> m = DenseMatrix(2, 2, range(4)) - >>> m.toArray() - array([[ 0., 2.], - [ 1., 3.]]) - """ - if self.isTransposed: - return np.asfortranarray( - self.values.reshape((self.numRows, self.numCols))) - else: - return self.values.reshape((self.numRows, self.numCols), order='F') - - def toSparse(self): - """Convert to SparseMatrix""" - if self.isTransposed: - values = np.ravel(self.toArray(), order='F') - else: - values = self.values - indices = np.nonzero(values)[0] - colCounts = np.bincount(indices // self.numRows) - colPtrs = np.cumsum(np.hstack( - (0, colCounts, np.zeros(self.numCols - colCounts.size)))) - values = values[indices] - rowIndices = indices % self.numRows - - return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values) - - def __getitem__(self, indices): - i, j = indices - if i < 0 or i >= self.numRows: - raise IndexError("Row index %d is out of range [0, %d)" - % (i, self.numRows)) - if j >= self.numCols or j < 0: - raise IndexError("Column index %d is out of range [0, %d)" - % (j, self.numCols)) - - if self.isTransposed: - return self.values[i * self.numCols + j] - else: - return self.values[i + j * self.numRows] - - def __eq__(self, other): - if (not isinstance(other, DenseMatrix) or - self.numRows != other.numRows or - self.numCols != other.numCols): - return False - - self_values = np.ravel(self.toArray(), order='F') - other_values = np.ravel(other.toArray(), order='F') - return all(self_values == other_values) - - -class SparseMatrix(Matrix): - """Sparse Matrix stored in CSC format.""" - def __init__(self, numRows, numCols, colPtrs, rowIndices, values, - isTransposed=False): - Matrix.__init__(self, numRows, numCols, isTransposed) - self.colPtrs = self._convert_to_array(colPtrs, np.int32) - self.rowIndices = self._convert_to_array(rowIndices, np.int32) - self.values = 
self._convert_to_array(values, np.float64) - - if self.isTransposed: - if self.colPtrs.size != numRows + 1: - raise ValueError("Expected colPtrs of size %d, got %d." - % (numRows + 1, self.colPtrs.size)) - else: - if self.colPtrs.size != numCols + 1: - raise ValueError("Expected colPtrs of size %d, got %d." - % (numCols + 1, self.colPtrs.size)) - if self.rowIndices.size != self.values.size: - raise ValueError("Expected rowIndices of length %d, got %d." - % (self.rowIndices.size, self.values.size)) - - def __str__(self): - """ - Pretty printing of a SparseMatrix - - >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - >>> print(sm1) - 2 X 2 CSCMatrix - (0,0) 2.0 - (1,0) 3.0 - (1,1) 4.0 - >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - >>> print(sm1) - 2 X 2 CSRMatrix - (0,0) 2.0 - (0,1) 3.0 - (1,1) 4.0 - """ - spstr = "{0} X {1} ".format(self.numRows, self.numCols) - if self.isTransposed: - spstr += "CSRMatrix\n" - else: - spstr += "CSCMatrix\n" - - cur_col = 0 - smlist = [] - - # Display first 16 values. - if len(self.values) <= 16: - zipindval = zip(self.rowIndices, self.values) - else: - zipindval = zip(self.rowIndices[:16], self.values[:16]) - for i, (rowInd, value) in enumerate(zipindval): - if self.colPtrs[cur_col + 1] <= i: - cur_col += 1 - if self.isTransposed: - smlist.append('({0},{1}) {2}'.format( - cur_col, rowInd, _format_float(value))) - else: - smlist.append('({0},{1}) {2}'.format( - rowInd, cur_col, _format_float(value))) - spstr += "\n".join(smlist) - - if len(self.values) > 16: - spstr += "\n.." * 2 - return spstr - - def __repr__(self): - """ - Representation of a SparseMatrix - - >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - >>> sm1 - SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False) - """ - rowIndices = list(self.rowIndices) - colPtrs = list(self.colPtrs) - - if len(self.values) <= 16: - values = _format_float_list(self.values) - - else: - values = ( - _format_float_list(self.values[:8]) + - ["..."] + - _format_float_list(self.values[-8:]) - ) - rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:] - - if len(self.colPtrs) > 16: - colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:] - - values = ", ".join(values) - rowIndices = ", ".join([str(ind) for ind in rowIndices]) - colPtrs = ", ".join([str(ptr) for ptr in colPtrs]) - return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format( - self.numRows, self.numCols, colPtrs, rowIndices, - values, self.isTransposed) - - def __reduce__(self): - return SparseMatrix, ( - self.numRows, self.numCols, self.colPtrs.tostring(), - self.rowIndices.tostring(), self.values.tostring(), - int(self.isTransposed)) - - def __getitem__(self, indices): - i, j = indices - if i < 0 or i >= self.numRows: - raise IndexError("Row index %d is out of range [0, %d)" - % (i, self.numRows)) - if j < 0 or j >= self.numCols: - raise IndexError("Column index %d is out of range [0, %d)" - % (j, self.numCols)) - - # If a CSR matrix is given, then the row index should be searched - # for in ColPtrs, and the column index should be searched for in the - # corresponding slice obtained from rowIndices. 
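        # Worked example of the CSC lookup below (illustrative values): for
        # SparseMatrix(2, 2, colPtrs=[0, 1, 2], rowIndices=[0, 1], values=[1.0, 2.0]),
        # sm[0, 0] scans rowIndices[0:1] == [0], finds row 0 and returns 1.0,
        # while sm[1, 0] finds no entry in that slice and falls through to 0.0.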
- if self.isTransposed: - j, i = i, j - - colStart = self.colPtrs[j] - colEnd = self.colPtrs[j + 1] - nz = self.rowIndices[colStart: colEnd] - ind = np.searchsorted(nz, i) + colStart - if ind < colEnd and self.rowIndices[ind] == i: - return self.values[ind] - else: - return 0.0 - - def toArray(self): - """ - Return an numpy.ndarray - """ - A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F') - for k in xrange(self.colPtrs.size - 1): - startptr = self.colPtrs[k] - endptr = self.colPtrs[k + 1] - if self.isTransposed: - A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr] - else: - A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr] - return A - - def toDense(self): - densevals = np.ravel(self.toArray(), order='F') - return DenseMatrix(self.numRows, self.numCols, densevals) - - # TODO: More efficient implementation: - def __eq__(self, other): - return np.all(self.toArray() == other.toArray()) - - -class Matrices(object): - @staticmethod - def dense(numRows, numCols, values): - """ - Create a DenseMatrix - """ - return DenseMatrix(numRows, numCols, values) - - @staticmethod - def sparse(numRows, numCols, colPtrs, rowIndices, values): - """ - Create a SparseMatrix - """ - return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values) - - -def _test(): - import doctest - try: - # Numpy 1.14+ changed it's string format. - np.set_printoptions(legacy='1.13') - except TypeError: - pass - (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS) - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/param/__init__.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/param/__init__.py deleted file mode 100644 index 043c25c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/param/__init__.py +++ /dev/null @@ -1,511 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import array -import sys -if sys.version > '3': - basestring = str - xrange = range - unicode = str - -from abc import ABCMeta -import copy -import numpy as np - -from py4j.java_gateway import JavaObject - -from pyspark.ml.linalg import DenseVector, Vector, Matrix -from pyspark.ml.util import Identifiable - - -__all__ = ['Param', 'Params', 'TypeConverters'] - - -class Param(object): - """ - A param with self-contained documentation. - - .. versionadded:: 1.3.0 - """ - - def __init__(self, parent, name, doc, typeConverter=None): - if not isinstance(parent, Identifiable): - raise TypeError("Parent must be an Identifiable but got type %s." 
% type(parent)) - self.parent = parent.uid - self.name = str(name) - self.doc = str(doc) - self.typeConverter = TypeConverters.identity if typeConverter is None else typeConverter - - def _copy_new_parent(self, parent): - """Copy the current param to a new parent, must be a dummy param.""" - if self.parent == "undefined": - param = copy.copy(self) - param.parent = parent.uid - return param - else: - raise ValueError("Cannot copy from non-dummy parent %s." % parent) - - def __str__(self): - return str(self.parent) + "__" + self.name - - def __repr__(self): - return "Param(parent=%r, name=%r, doc=%r)" % (self.parent, self.name, self.doc) - - def __hash__(self): - return hash(str(self)) - - def __eq__(self, other): - if isinstance(other, Param): - return self.parent == other.parent and self.name == other.name - else: - return False - - -class TypeConverters(object): - """ - .. note:: DeveloperApi - - Factory methods for common type conversion functions for `Param.typeConverter`. - - .. versionadded:: 2.0.0 - """ - - @staticmethod - def _is_numeric(value): - vtype = type(value) - return vtype in [int, float, np.float64, np.int64] or vtype.__name__ == 'long' - - @staticmethod - def _is_integer(value): - return TypeConverters._is_numeric(value) and float(value).is_integer() - - @staticmethod - def _can_convert_to_list(value): - vtype = type(value) - return vtype in [list, np.ndarray, tuple, xrange, array.array] or isinstance(value, Vector) - - @staticmethod - def _can_convert_to_string(value): - vtype = type(value) - return isinstance(value, basestring) or vtype in [np.unicode_, np.string_, np.str_] - - @staticmethod - def identity(value): - """ - Dummy converter that just returns value. - """ - return value - - @staticmethod - def toList(value): - """ - Convert a value to a list, if possible. - """ - if type(value) == list: - return value - elif type(value) in [np.ndarray, tuple, xrange, array.array]: - return list(value) - elif isinstance(value, Vector): - return list(value.toArray()) - else: - raise TypeError("Could not convert %s to list" % value) - - @staticmethod - def toListFloat(value): - """ - Convert a value to list of floats, if possible. - """ - if TypeConverters._can_convert_to_list(value): - value = TypeConverters.toList(value) - if all(map(lambda v: TypeConverters._is_numeric(v), value)): - return [float(v) for v in value] - raise TypeError("Could not convert %s to list of floats" % value) - - @staticmethod - def toListInt(value): - """ - Convert a value to list of ints, if possible. - """ - if TypeConverters._can_convert_to_list(value): - value = TypeConverters.toList(value) - if all(map(lambda v: TypeConverters._is_integer(v), value)): - return [int(v) for v in value] - raise TypeError("Could not convert %s to list of ints" % value) - - @staticmethod - def toListString(value): - """ - Convert a value to list of strings, if possible. - """ - if TypeConverters._can_convert_to_list(value): - value = TypeConverters.toList(value) - if all(map(lambda v: TypeConverters._can_convert_to_string(v), value)): - return [TypeConverters.toString(v) for v in value] - raise TypeError("Could not convert %s to list of strings" % value) - - @staticmethod - def toVector(value): - """ - Convert a value to a MLlib Vector, if possible. 
- """ - if isinstance(value, Vector): - return value - elif TypeConverters._can_convert_to_list(value): - value = TypeConverters.toList(value) - if all(map(lambda v: TypeConverters._is_numeric(v), value)): - return DenseVector(value) - raise TypeError("Could not convert %s to vector" % value) - - @staticmethod - def toMatrix(value): - """ - Convert a value to a MLlib Matrix, if possible. - """ - if isinstance(value, Matrix): - return value - raise TypeError("Could not convert %s to matrix" % value) - - @staticmethod - def toFloat(value): - """ - Convert a value to a float, if possible. - """ - if TypeConverters._is_numeric(value): - return float(value) - else: - raise TypeError("Could not convert %s to float" % value) - - @staticmethod - def toInt(value): - """ - Convert a value to an int, if possible. - """ - if TypeConverters._is_integer(value): - return int(value) - else: - raise TypeError("Could not convert %s to int" % value) - - @staticmethod - def toString(value): - """ - Convert a value to a string, if possible. - """ - if isinstance(value, basestring): - return value - elif type(value) in [np.string_, np.str_]: - return str(value) - elif type(value) == np.unicode_: - return unicode(value) - else: - raise TypeError("Could not convert %s to string type" % type(value)) - - @staticmethod - def toBoolean(value): - """ - Convert a value to a boolean, if possible. - """ - if type(value) == bool: - return value - else: - raise TypeError("Boolean Param requires value of type bool. Found %s." % type(value)) - - -class Params(Identifiable): - """ - Components that take parameters. This also provides an internal - param map to store parameter values attached to the instance. - - .. versionadded:: 1.3.0 - """ - - __metaclass__ = ABCMeta - - def __init__(self): - super(Params, self).__init__() - #: internal param map for user-supplied values param map - self._paramMap = {} - - #: internal param map for default values - self._defaultParamMap = {} - - #: value returned by :py:func:`params` - self._params = None - - # Copy the params from the class to the object - self._copy_params() - - def _copy_params(self): - """ - Copy all params defined on the class to current object. - """ - cls = type(self) - src_name_attrs = [(x, getattr(cls, x)) for x in dir(cls)] - src_params = list(filter(lambda nameAttr: isinstance(nameAttr[1], Param), src_name_attrs)) - for name, param in src_params: - setattr(self, name, param._copy_new_parent(self)) - - @property - def params(self): - """ - Returns all params ordered by name. The default implementation - uses :py:func:`dir` to get all attributes of type - :py:class:`Param`. - """ - if self._params is None: - self._params = list(filter(lambda attr: isinstance(attr, Param), - [getattr(self, x) for x in dir(self) if x != "params" and - not isinstance(getattr(type(self), x, None), property)])) - return self._params - - def explainParam(self, param): - """ - Explains a single param and returns its name, doc, and optional - default value and user-supplied value in a string. 
- """ - param = self._resolveParam(param) - values = [] - if self.isDefined(param): - if param in self._defaultParamMap: - values.append("default: %s" % self._defaultParamMap[param]) - if param in self._paramMap: - values.append("current: %s" % self._paramMap[param]) - else: - values.append("undefined") - valueStr = "(" + ", ".join(values) + ")" - return "%s: %s %s" % (param.name, param.doc, valueStr) - - def explainParams(self): - """ - Returns the documentation of all params with their optionally - default values and user-supplied values. - """ - return "\n".join([self.explainParam(param) for param in self.params]) - - def getParam(self, paramName): - """ - Gets a param by its name. - """ - param = getattr(self, paramName) - if isinstance(param, Param): - return param - else: - raise ValueError("Cannot find param with name %s." % paramName) - - def isSet(self, param): - """ - Checks whether a param is explicitly set by user. - """ - param = self._resolveParam(param) - return param in self._paramMap - - def hasDefault(self, param): - """ - Checks whether a param has a default value. - """ - param = self._resolveParam(param) - return param in self._defaultParamMap - - def isDefined(self, param): - """ - Checks whether a param is explicitly set by user or has - a default value. - """ - return self.isSet(param) or self.hasDefault(param) - - def hasParam(self, paramName): - """ - Tests whether this instance contains a param with a given - (string) name. - """ - if isinstance(paramName, basestring): - p = getattr(self, paramName, None) - return isinstance(p, Param) - else: - raise TypeError("hasParam(): paramName must be a string") - - def getOrDefault(self, param): - """ - Gets the value of a param in the user-supplied param map or its - default value. Raises an error if neither is set. - """ - param = self._resolveParam(param) - if param in self._paramMap: - return self._paramMap[param] - else: - return self._defaultParamMap[param] - - def extractParamMap(self, extra=None): - """ - Extracts the embedded default param values and user-supplied - values, and then merges them with extra values from input into - a flat param map, where the latter value is used if there exist - conflicts, i.e., with ordering: default param values < - user-supplied values < extra. - - :param extra: extra param values - :return: merged param map - """ - if extra is None: - extra = dict() - paramMap = self._defaultParamMap.copy() - paramMap.update(self._paramMap) - paramMap.update(extra) - return paramMap - - def copy(self, extra=None): - """ - Creates a copy of this instance with the same uid and some - extra params. The default implementation creates a - shallow copy using :py:func:`copy.copy`, and then copies the - embedded and extra parameters over and returns the copy. - Subclasses should override this method if the default approach - is not sufficient. - - :param extra: Extra parameters to copy to the new instance - :return: Copy of this instance - """ - if extra is None: - extra = dict() - that = copy.copy(self) - that._paramMap = {} - that._defaultParamMap = {} - return self._copyValues(that, extra) - - def set(self, param, value): - """ - Sets a parameter in the embedded param map. - """ - self._shouldOwn(param) - try: - value = param.typeConverter(value) - except ValueError as e: - raise ValueError('Invalid param value given for param "%s". %s' % (param.name, e)) - self._paramMap[param] = value - - def _shouldOwn(self, param): - """ - Validates that the input param belongs to this Params instance. 
- """ - if not (self.uid == param.parent and self.hasParam(param.name)): - raise ValueError("Param %r does not belong to %r." % (param, self)) - - def _resolveParam(self, param): - """ - Resolves a param and validates the ownership. - - :param param: param name or the param instance, which must - belong to this Params instance - :return: resolved param instance - """ - if isinstance(param, Param): - self._shouldOwn(param) - return param - elif isinstance(param, basestring): - return self.getParam(param) - else: - raise ValueError("Cannot resolve %r as a param." % param) - - @staticmethod - def _dummy(): - """ - Returns a dummy Params instance used as a placeholder to - generate docs. - """ - dummy = Params() - dummy.uid = "undefined" - return dummy - - def _set(self, **kwargs): - """ - Sets user-supplied params. - """ - for param, value in kwargs.items(): - p = getattr(self, param) - if value is not None: - try: - value = p.typeConverter(value) - except TypeError as e: - raise TypeError('Invalid param value given for param "%s". %s' % (p.name, e)) - self._paramMap[p] = value - return self - - def _clear(self, param): - """ - Clears a param from the param map if it has been explicitly set. - """ - if self.isSet(param): - del self._paramMap[param] - - def _setDefault(self, **kwargs): - """ - Sets default params. - """ - for param, value in kwargs.items(): - p = getattr(self, param) - if value is not None and not isinstance(value, JavaObject): - try: - value = p.typeConverter(value) - except TypeError as e: - raise TypeError('Invalid default param value given for param "%s". %s' - % (p.name, e)) - self._defaultParamMap[p] = value - return self - - def _copyValues(self, to, extra=None): - """ - Copies param values from this instance to another instance for - params shared by them. - - :param to: the target instance - :param extra: extra params to be copied - :return: the target instance with param values copied - """ - paramMap = self._paramMap.copy() - if extra is not None: - paramMap.update(extra) - for param in self.params: - # copy default params - if param in self._defaultParamMap and to.hasParam(param.name): - to._defaultParamMap[to.getParam(param.name)] = self._defaultParamMap[param] - # copy explicitly set params - if param in paramMap and to.hasParam(param.name): - to._set(**{param.name: paramMap[param]}) - return to - - def _resetUid(self, newUid): - """ - Changes the uid of this instance. This updates both - the stored uid and the parent uid of params and param maps. - This is used by persistence (loading). 
- :param newUid: new uid to use, which is converted to unicode - :return: same instance, but with the uid and Param.parent values - updated, including within param maps - """ - newUid = unicode(newUid) - self.uid = newUid - newDefaultParamMap = dict() - newParamMap = dict() - for param in self.params: - newParam = copy.copy(param) - newParam.parent = newUid - if param in self._defaultParamMap: - newDefaultParamMap[newParam] = self._defaultParamMap[param] - if param in self._paramMap: - newParamMap[newParam] = self._paramMap[param] - param.parent = newUid - self._defaultParamMap = newDefaultParamMap - self._paramMap = newParamMap - return self diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/param/_shared_params_code_gen.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/param/_shared_params_code_gen.py deleted file mode 100644 index e45ba84..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/param/_shared_params_code_gen.py +++ /dev/null @@ -1,215 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -header = """# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#""" - -# Code generator for shared params (shared.py). 
Run under this folder with: -# python _shared_params_code_gen.py > shared.py - - -def _gen_param_header(name, doc, defaultValueStr, typeConverter): - """ - Generates the header part for shared variables - - :param name: param name - :param doc: param doc - """ - template = '''class Has$Name(Params): - """ - Mixin for param $name: $doc - """ - - $name = Param(Params._dummy(), "$name", "$doc", typeConverter=$typeConverter) - - def __init__(self): - super(Has$Name, self).__init__()''' - - if defaultValueStr is not None: - template += ''' - self._setDefault($name=$defaultValueStr)''' - - Name = name[0].upper() + name[1:] - if typeConverter is None: - typeConverter = str(None) - return template \ - .replace("$name", name) \ - .replace("$Name", Name) \ - .replace("$doc", doc) \ - .replace("$defaultValueStr", str(defaultValueStr)) \ - .replace("$typeConverter", typeConverter) - - -def _gen_param_code(name, doc, defaultValueStr): - """ - Generates Python code for a shared param class. - - :param name: param name - :param doc: param doc - :param defaultValueStr: string representation of the default value - :return: code string - """ - # TODO: How to correctly inherit instance attributes? - template = ''' - def set$Name(self, value): - """ - Sets the value of :py:attr:`$name`. - """ - return self._set($name=value) - - def get$Name(self): - """ - Gets the value of $name or its default value. - """ - return self.getOrDefault(self.$name)''' - - Name = name[0].upper() + name[1:] - return template \ - .replace("$name", name) \ - .replace("$Name", Name) \ - .replace("$doc", doc) \ - .replace("$defaultValueStr", str(defaultValueStr)) - -if __name__ == "__main__": - print(header) - print("\n# DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py.\n") - print("from pyspark.ml.param import *\n\n") - shared = [ - ("maxIter", "max number of iterations (>= 0).", None, "TypeConverters.toInt"), - ("regParam", "regularization parameter (>= 0).", None, "TypeConverters.toFloat"), - ("featuresCol", "features column name.", "'features'", "TypeConverters.toString"), - ("labelCol", "label column name.", "'label'", "TypeConverters.toString"), - ("predictionCol", "prediction column name.", "'prediction'", "TypeConverters.toString"), - ("probabilityCol", "Column name for predicted class conditional probabilities. " + - "Note: Not all models output well-calibrated probability estimates! These probabilities " + - "should be treated as confidences, not precise probabilities.", "'probability'", - "TypeConverters.toString"), - ("rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", "'rawPrediction'", - "TypeConverters.toString"), - ("inputCol", "input column name.", None, "TypeConverters.toString"), - ("inputCols", "input column names.", None, "TypeConverters.toListString"), - ("outputCol", "output column name.", "self.uid + '__output'", "TypeConverters.toString"), - ("outputCols", "output column names.", None, "TypeConverters.toListString"), - ("numFeatures", "number of features.", None, "TypeConverters.toInt"), - ("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + - "E.g. 10 means that the cache will get checkpointed every 10 iterations. 
Note: " + - "this setting will be ignored if the checkpoint directory is not set in the SparkContext.", - None, "TypeConverters.toInt"), - ("seed", "random seed.", "hash(type(self).__name__)", "TypeConverters.toInt"), - ("tol", "the convergence tolerance for iterative algorithms (>= 0).", None, - "TypeConverters.toFloat"), - ("stepSize", "Step size to be used for each iteration of optimization (>= 0).", None, - "TypeConverters.toFloat"), - ("handleInvalid", "how to handle invalid entries. Options are skip (which will filter " + - "out rows with bad values), or error (which will throw an error). More options may be " + - "added later.", None, "TypeConverters.toString"), - ("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + - "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0", - "TypeConverters.toFloat"), - ("fitIntercept", "whether to fit an intercept term.", "True", "TypeConverters.toBoolean"), - ("standardization", "whether to standardize the training features before fitting the " + - "model.", "True", "TypeConverters.toBoolean"), - ("thresholds", "Thresholds in multi-class classification to adjust the probability of " + - "predicting each class. Array must have length equal to the number of classes, with " + - "values > 0, excepting that at most one value may be 0. " + - "The class with largest value p/t is predicted, where p is the original " + - "probability of that class and t is the class's threshold.", None, - "TypeConverters.toListFloat"), - ("threshold", "threshold in binary classification prediction, in range [0, 1]", - "0.5", "TypeConverters.toFloat"), - ("weightCol", "weight column name. If this is not set or empty, we treat " + - "all instance weights as 1.0.", None, "TypeConverters.toString"), - ("solver", "the solver algorithm for optimization. If this is not set or empty, " + - "default value is 'auto'.", "'auto'", "TypeConverters.toString"), - ("varianceCol", "column name for the biased sample variance of prediction.", - None, "TypeConverters.toString"), - ("aggregationDepth", "suggested depth for treeAggregate (>= 2).", "2", - "TypeConverters.toInt"), - ("parallelism", "the number of threads to use when running parallel algorithms (>= 1).", - "1", "TypeConverters.toInt"), - ("collectSubModels", "Param for whether to collect a list of sub-models trained during " + - "tuning. If set to false, then only the single best sub-model will be available after " + - "fitting. If set to true, then all sub-models will be available. Warning: For large " + - "models, collecting all sub-models can cause OOMs on the Spark driver.", - "False", "TypeConverters.toBoolean"), - ("loss", "the loss function to be optimized.", None, "TypeConverters.toString"), - ("distanceMeasure", "the distance measure. Supported options: 'euclidean' and 'cosine'.", - "'euclidean'", "TypeConverters.toString")] - - code = [] - for name, doc, defaultValueStr, typeConverter in shared: - param_code = _gen_param_header(name, doc, defaultValueStr, typeConverter) - code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr)) - - decisionTreeParams = [ - ("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " + - "depth 1 means 1 internal node + 2 leaf nodes.", "TypeConverters.toInt"), - ("maxBins", "Max number of bins for" + - " discretizing continuous features. 
Must be >=2 and >= number of categories for any" + - " categorical feature.", "TypeConverters.toInt"), - ("minInstancesPerNode", "Minimum number of instances each child must have after split. " + - "If a split causes the left or right child to have fewer than minInstancesPerNode, the " + - "split will be discarded as invalid. Should be >= 1.", "TypeConverters.toInt"), - ("minInfoGain", "Minimum information gain for a split to be considered at a tree node.", - "TypeConverters.toFloat"), - ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation. If too small," + - " then 1 node will be split per iteration, and its aggregates may exceed this size.", - "TypeConverters.toInt"), - ("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " + - "instances with nodes. If true, the algorithm will cache node IDs for each instance. " + - "Caching can speed up training of deeper trees. Users can set how often should the " + - "cache be checkpointed or disable it by setting checkpointInterval.", - "TypeConverters.toBoolean")] - - decisionTreeCode = '''class DecisionTreeParams(Params): - """ - Mixin for Decision Tree parameters. - """ - - $dummyPlaceHolders - - def __init__(self): - super(DecisionTreeParams, self).__init__()''' - dtParamMethods = "" - dummyPlaceholders = "" - paramTemplate = """$name = Param($owner, "$name", "$doc", typeConverter=$typeConverterStr)""" - for name, doc, typeConverterStr in decisionTreeParams: - if typeConverterStr is None: - typeConverterStr = str(None) - variable = paramTemplate.replace("$name", name).replace("$doc", doc) \ - .replace("$typeConverterStr", typeConverterStr) - dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n " - dtParamMethods += _gen_param_code(name, doc, None) + "\n" - code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + "\n" + - dtParamMethods) - print("\n\n\n".join(code)) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/param/shared.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/param/shared.py deleted file mode 100644 index 618f5bf..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/param/shared.py +++ /dev/null @@ -1,816 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py. - -from pyspark.ml.param import * - - -class HasMaxIter(Params): - """ - Mixin for param maxIter: max number of iterations (>= 0). - """ - - maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 0).", typeConverter=TypeConverters.toInt) - - def __init__(self): - super(HasMaxIter, self).__init__() - - def setMaxIter(self, value): - """ - Sets the value of :py:attr:`maxIter`. 
- """ - return self._set(maxIter=value) - - def getMaxIter(self): - """ - Gets the value of maxIter or its default value. - """ - return self.getOrDefault(self.maxIter) - - -class HasRegParam(Params): - """ - Mixin for param regParam: regularization parameter (>= 0). - """ - - regParam = Param(Params._dummy(), "regParam", "regularization parameter (>= 0).", typeConverter=TypeConverters.toFloat) - - def __init__(self): - super(HasRegParam, self).__init__() - - def setRegParam(self, value): - """ - Sets the value of :py:attr:`regParam`. - """ - return self._set(regParam=value) - - def getRegParam(self): - """ - Gets the value of regParam or its default value. - """ - return self.getOrDefault(self.regParam) - - -class HasFeaturesCol(Params): - """ - Mixin for param featuresCol: features column name. - """ - - featuresCol = Param(Params._dummy(), "featuresCol", "features column name.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasFeaturesCol, self).__init__() - self._setDefault(featuresCol='features') - - def setFeaturesCol(self, value): - """ - Sets the value of :py:attr:`featuresCol`. - """ - return self._set(featuresCol=value) - - def getFeaturesCol(self): - """ - Gets the value of featuresCol or its default value. - """ - return self.getOrDefault(self.featuresCol) - - -class HasLabelCol(Params): - """ - Mixin for param labelCol: label column name. - """ - - labelCol = Param(Params._dummy(), "labelCol", "label column name.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasLabelCol, self).__init__() - self._setDefault(labelCol='label') - - def setLabelCol(self, value): - """ - Sets the value of :py:attr:`labelCol`. - """ - return self._set(labelCol=value) - - def getLabelCol(self): - """ - Gets the value of labelCol or its default value. - """ - return self.getOrDefault(self.labelCol) - - -class HasPredictionCol(Params): - """ - Mixin for param predictionCol: prediction column name. - """ - - predictionCol = Param(Params._dummy(), "predictionCol", "prediction column name.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasPredictionCol, self).__init__() - self._setDefault(predictionCol='prediction') - - def setPredictionCol(self, value): - """ - Sets the value of :py:attr:`predictionCol`. - """ - return self._set(predictionCol=value) - - def getPredictionCol(self): - """ - Gets the value of predictionCol or its default value. - """ - return self.getOrDefault(self.predictionCol) - - -class HasProbabilityCol(Params): - """ - Mixin for param probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. - """ - - probabilityCol = Param(Params._dummy(), "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasProbabilityCol, self).__init__() - self._setDefault(probabilityCol='probability') - - def setProbabilityCol(self, value): - """ - Sets the value of :py:attr:`probabilityCol`. - """ - return self._set(probabilityCol=value) - - def getProbabilityCol(self): - """ - Gets the value of probabilityCol or its default value. 
- """ - return self.getOrDefault(self.probabilityCol) - - -class HasRawPredictionCol(Params): - """ - Mixin for param rawPredictionCol: raw prediction (a.k.a. confidence) column name. - """ - - rawPredictionCol = Param(Params._dummy(), "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasRawPredictionCol, self).__init__() - self._setDefault(rawPredictionCol='rawPrediction') - - def setRawPredictionCol(self, value): - """ - Sets the value of :py:attr:`rawPredictionCol`. - """ - return self._set(rawPredictionCol=value) - - def getRawPredictionCol(self): - """ - Gets the value of rawPredictionCol or its default value. - """ - return self.getOrDefault(self.rawPredictionCol) - - -class HasInputCol(Params): - """ - Mixin for param inputCol: input column name. - """ - - inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasInputCol, self).__init__() - - def setInputCol(self, value): - """ - Sets the value of :py:attr:`inputCol`. - """ - return self._set(inputCol=value) - - def getInputCol(self): - """ - Gets the value of inputCol or its default value. - """ - return self.getOrDefault(self.inputCol) - - -class HasInputCols(Params): - """ - Mixin for param inputCols: input column names. - """ - - inputCols = Param(Params._dummy(), "inputCols", "input column names.", typeConverter=TypeConverters.toListString) - - def __init__(self): - super(HasInputCols, self).__init__() - - def setInputCols(self, value): - """ - Sets the value of :py:attr:`inputCols`. - """ - return self._set(inputCols=value) - - def getInputCols(self): - """ - Gets the value of inputCols or its default value. - """ - return self.getOrDefault(self.inputCols) - - -class HasOutputCol(Params): - """ - Mixin for param outputCol: output column name. - """ - - outputCol = Param(Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasOutputCol, self).__init__() - self._setDefault(outputCol=self.uid + '__output') - - def setOutputCol(self, value): - """ - Sets the value of :py:attr:`outputCol`. - """ - return self._set(outputCol=value) - - def getOutputCol(self): - """ - Gets the value of outputCol or its default value. - """ - return self.getOrDefault(self.outputCol) - - -class HasOutputCols(Params): - """ - Mixin for param outputCols: output column names. - """ - - outputCols = Param(Params._dummy(), "outputCols", "output column names.", typeConverter=TypeConverters.toListString) - - def __init__(self): - super(HasOutputCols, self).__init__() - - def setOutputCols(self, value): - """ - Sets the value of :py:attr:`outputCols`. - """ - return self._set(outputCols=value) - - def getOutputCols(self): - """ - Gets the value of outputCols or its default value. - """ - return self.getOrDefault(self.outputCols) - - -class HasNumFeatures(Params): - """ - Mixin for param numFeatures: number of features. - """ - - numFeatures = Param(Params._dummy(), "numFeatures", "number of features.", typeConverter=TypeConverters.toInt) - - def __init__(self): - super(HasNumFeatures, self).__init__() - - def setNumFeatures(self, value): - """ - Sets the value of :py:attr:`numFeatures`. - """ - return self._set(numFeatures=value) - - def getNumFeatures(self): - """ - Gets the value of numFeatures or its default value. 
- """ - return self.getOrDefault(self.numFeatures) - - -class HasCheckpointInterval(Params): - """ - Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. - """ - - checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.", typeConverter=TypeConverters.toInt) - - def __init__(self): - super(HasCheckpointInterval, self).__init__() - - def setCheckpointInterval(self, value): - """ - Sets the value of :py:attr:`checkpointInterval`. - """ - return self._set(checkpointInterval=value) - - def getCheckpointInterval(self): - """ - Gets the value of checkpointInterval or its default value. - """ - return self.getOrDefault(self.checkpointInterval) - - -class HasSeed(Params): - """ - Mixin for param seed: random seed. - """ - - seed = Param(Params._dummy(), "seed", "random seed.", typeConverter=TypeConverters.toInt) - - def __init__(self): - super(HasSeed, self).__init__() - self._setDefault(seed=hash(type(self).__name__)) - - def setSeed(self, value): - """ - Sets the value of :py:attr:`seed`. - """ - return self._set(seed=value) - - def getSeed(self): - """ - Gets the value of seed or its default value. - """ - return self.getOrDefault(self.seed) - - -class HasTol(Params): - """ - Mixin for param tol: the convergence tolerance for iterative algorithms (>= 0). - """ - - tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms (>= 0).", typeConverter=TypeConverters.toFloat) - - def __init__(self): - super(HasTol, self).__init__() - - def setTol(self, value): - """ - Sets the value of :py:attr:`tol`. - """ - return self._set(tol=value) - - def getTol(self): - """ - Gets the value of tol or its default value. - """ - return self.getOrDefault(self.tol) - - -class HasStepSize(Params): - """ - Mixin for param stepSize: Step size to be used for each iteration of optimization (>= 0). - """ - - stepSize = Param(Params._dummy(), "stepSize", "Step size to be used for each iteration of optimization (>= 0).", typeConverter=TypeConverters.toFloat) - - def __init__(self): - super(HasStepSize, self).__init__() - - def setStepSize(self, value): - """ - Sets the value of :py:attr:`stepSize`. - """ - return self._set(stepSize=value) - - def getStepSize(self): - """ - Gets the value of stepSize or its default value. - """ - return self.getOrDefault(self.stepSize) - - -class HasHandleInvalid(Params): - """ - Mixin for param handleInvalid: how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an error). More options may be added later. - """ - - handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an error). More options may be added later.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasHandleInvalid, self).__init__() - - def setHandleInvalid(self, value): - """ - Sets the value of :py:attr:`handleInvalid`. 
- """ - return self._set(handleInvalid=value) - - def getHandleInvalid(self): - """ - Gets the value of handleInvalid or its default value. - """ - return self.getOrDefault(self.handleInvalid) - - -class HasElasticNetParam(Params): - """ - Mixin for param elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - """ - - elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", typeConverter=TypeConverters.toFloat) - - def __init__(self): - super(HasElasticNetParam, self).__init__() - self._setDefault(elasticNetParam=0.0) - - def setElasticNetParam(self, value): - """ - Sets the value of :py:attr:`elasticNetParam`. - """ - return self._set(elasticNetParam=value) - - def getElasticNetParam(self): - """ - Gets the value of elasticNetParam or its default value. - """ - return self.getOrDefault(self.elasticNetParam) - - -class HasFitIntercept(Params): - """ - Mixin for param fitIntercept: whether to fit an intercept term. - """ - - fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.", typeConverter=TypeConverters.toBoolean) - - def __init__(self): - super(HasFitIntercept, self).__init__() - self._setDefault(fitIntercept=True) - - def setFitIntercept(self, value): - """ - Sets the value of :py:attr:`fitIntercept`. - """ - return self._set(fitIntercept=value) - - def getFitIntercept(self): - """ - Gets the value of fitIntercept or its default value. - """ - return self.getOrDefault(self.fitIntercept) - - -class HasStandardization(Params): - """ - Mixin for param standardization: whether to standardize the training features before fitting the model. - """ - - standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.", typeConverter=TypeConverters.toBoolean) - - def __init__(self): - super(HasStandardization, self).__init__() - self._setDefault(standardization=True) - - def setStandardization(self, value): - """ - Sets the value of :py:attr:`standardization`. - """ - return self._set(standardization=value) - - def getStandardization(self): - """ - Gets the value of standardization or its default value. - """ - return self.getOrDefault(self.standardization) - - -class HasThresholds(Params): - """ - Mixin for param thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold. - """ - - thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.", typeConverter=TypeConverters.toListFloat) - - def __init__(self): - super(HasThresholds, self).__init__() - - def setThresholds(self, value): - """ - Sets the value of :py:attr:`thresholds`. - """ - return self._set(thresholds=value) - - def getThresholds(self): - """ - Gets the value of thresholds or its default value. 
- """ - return self.getOrDefault(self.thresholds) - - -class HasThreshold(Params): - """ - Mixin for param threshold: threshold in binary classification prediction, in range [0, 1] - """ - - threshold = Param(Params._dummy(), "threshold", "threshold in binary classification prediction, in range [0, 1]", typeConverter=TypeConverters.toFloat) - - def __init__(self): - super(HasThreshold, self).__init__() - self._setDefault(threshold=0.5) - - def setThreshold(self, value): - """ - Sets the value of :py:attr:`threshold`. - """ - return self._set(threshold=value) - - def getThreshold(self): - """ - Gets the value of threshold or its default value. - """ - return self.getOrDefault(self.threshold) - - -class HasWeightCol(Params): - """ - Mixin for param weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. - """ - - weightCol = Param(Params._dummy(), "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasWeightCol, self).__init__() - - def setWeightCol(self, value): - """ - Sets the value of :py:attr:`weightCol`. - """ - return self._set(weightCol=value) - - def getWeightCol(self): - """ - Gets the value of weightCol or its default value. - """ - return self.getOrDefault(self.weightCol) - - -class HasSolver(Params): - """ - Mixin for param solver: the solver algorithm for optimization. If this is not set or empty, default value is 'auto'. - """ - - solver = Param(Params._dummy(), "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasSolver, self).__init__() - self._setDefault(solver='auto') - - def setSolver(self, value): - """ - Sets the value of :py:attr:`solver`. - """ - return self._set(solver=value) - - def getSolver(self): - """ - Gets the value of solver or its default value. - """ - return self.getOrDefault(self.solver) - - -class HasVarianceCol(Params): - """ - Mixin for param varianceCol: column name for the biased sample variance of prediction. - """ - - varianceCol = Param(Params._dummy(), "varianceCol", "column name for the biased sample variance of prediction.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasVarianceCol, self).__init__() - - def setVarianceCol(self, value): - """ - Sets the value of :py:attr:`varianceCol`. - """ - return self._set(varianceCol=value) - - def getVarianceCol(self): - """ - Gets the value of varianceCol or its default value. - """ - return self.getOrDefault(self.varianceCol) - - -class HasAggregationDepth(Params): - """ - Mixin for param aggregationDepth: suggested depth for treeAggregate (>= 2). - """ - - aggregationDepth = Param(Params._dummy(), "aggregationDepth", "suggested depth for treeAggregate (>= 2).", typeConverter=TypeConverters.toInt) - - def __init__(self): - super(HasAggregationDepth, self).__init__() - self._setDefault(aggregationDepth=2) - - def setAggregationDepth(self, value): - """ - Sets the value of :py:attr:`aggregationDepth`. - """ - return self._set(aggregationDepth=value) - - def getAggregationDepth(self): - """ - Gets the value of aggregationDepth or its default value. - """ - return self.getOrDefault(self.aggregationDepth) - - -class HasParallelism(Params): - """ - Mixin for param parallelism: the number of threads to use when running parallel algorithms (>= 1). 
- """ - - parallelism = Param(Params._dummy(), "parallelism", "the number of threads to use when running parallel algorithms (>= 1).", typeConverter=TypeConverters.toInt) - - def __init__(self): - super(HasParallelism, self).__init__() - self._setDefault(parallelism=1) - - def setParallelism(self, value): - """ - Sets the value of :py:attr:`parallelism`. - """ - return self._set(parallelism=value) - - def getParallelism(self): - """ - Gets the value of parallelism or its default value. - """ - return self.getOrDefault(self.parallelism) - - -class HasCollectSubModels(Params): - """ - Mixin for param collectSubModels: Param for whether to collect a list of sub-models trained during tuning. If set to false, then only the single best sub-model will be available after fitting. If set to true, then all sub-models will be available. Warning: For large models, collecting all sub-models can cause OOMs on the Spark driver. - """ - - collectSubModels = Param(Params._dummy(), "collectSubModels", "Param for whether to collect a list of sub-models trained during tuning. If set to false, then only the single best sub-model will be available after fitting. If set to true, then all sub-models will be available. Warning: For large models, collecting all sub-models can cause OOMs on the Spark driver.", typeConverter=TypeConverters.toBoolean) - - def __init__(self): - super(HasCollectSubModels, self).__init__() - self._setDefault(collectSubModels=False) - - def setCollectSubModels(self, value): - """ - Sets the value of :py:attr:`collectSubModels`. - """ - return self._set(collectSubModels=value) - - def getCollectSubModels(self): - """ - Gets the value of collectSubModels or its default value. - """ - return self.getOrDefault(self.collectSubModels) - - -class HasLoss(Params): - """ - Mixin for param loss: the loss function to be optimized. - """ - - loss = Param(Params._dummy(), "loss", "the loss function to be optimized.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasLoss, self).__init__() - - def setLoss(self, value): - """ - Sets the value of :py:attr:`loss`. - """ - return self._set(loss=value) - - def getLoss(self): - """ - Gets the value of loss or its default value. - """ - return self.getOrDefault(self.loss) - - -class DecisionTreeParams(Params): - """ - Mixin for Decision Tree parameters. - """ - - maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", typeConverter=TypeConverters.toInt) - maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.", typeConverter=TypeConverters.toInt) - minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.", typeConverter=TypeConverters.toInt) - minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.", typeConverter=TypeConverters.toFloat) - maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation. 
If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.", typeConverter=TypeConverters.toInt) - cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.", typeConverter=TypeConverters.toBoolean) - - - def __init__(self): - super(DecisionTreeParams, self).__init__() - - def setMaxDepth(self, value): - """ - Sets the value of :py:attr:`maxDepth`. - """ - return self._set(maxDepth=value) - - def getMaxDepth(self): - """ - Gets the value of maxDepth or its default value. - """ - return self.getOrDefault(self.maxDepth) - - def setMaxBins(self, value): - """ - Sets the value of :py:attr:`maxBins`. - """ - return self._set(maxBins=value) - - def getMaxBins(self): - """ - Gets the value of maxBins or its default value. - """ - return self.getOrDefault(self.maxBins) - - def setMinInstancesPerNode(self, value): - """ - Sets the value of :py:attr:`minInstancesPerNode`. - """ - return self._set(minInstancesPerNode=value) - - def getMinInstancesPerNode(self): - """ - Gets the value of minInstancesPerNode or its default value. - """ - return self.getOrDefault(self.minInstancesPerNode) - - def setMinInfoGain(self, value): - """ - Sets the value of :py:attr:`minInfoGain`. - """ - return self._set(minInfoGain=value) - - def getMinInfoGain(self): - """ - Gets the value of minInfoGain or its default value. - """ - return self.getOrDefault(self.minInfoGain) - - def setMaxMemoryInMB(self, value): - """ - Sets the value of :py:attr:`maxMemoryInMB`. - """ - return self._set(maxMemoryInMB=value) - - def getMaxMemoryInMB(self): - """ - Gets the value of maxMemoryInMB or its default value. - """ - return self.getOrDefault(self.maxMemoryInMB) - - def setCacheNodeIds(self, value): - """ - Sets the value of :py:attr:`cacheNodeIds`. - """ - return self._set(cacheNodeIds=value) - - def getCacheNodeIds(self): - """ - Gets the value of cacheNodeIds or its default value. - """ - return self.getOrDefault(self.cacheNodeIds) - - -class HasDistanceMeasure(Params): - """ - Mixin for param distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. - """ - - distanceMeasure = Param(Params._dummy(), "distanceMeasure", "the distance measure. Supported options: 'euclidean' and 'cosine'.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasDistanceMeasure, self).__init__() - self._setDefault(distanceMeasure='euclidean') - - def setDistanceMeasure(self, value): - """ - Sets the value of :py:attr:`distanceMeasure`. - """ - return self._set(distanceMeasure=value) - - def getDistanceMeasure(self): - """ - Gets the value of distanceMeasure or its default value. - """ - return self.getOrDefault(self.distanceMeasure) - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/pipeline.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/pipeline.py deleted file mode 100644 index 0975302..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/pipeline.py +++ /dev/null @@ -1,390 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import os - -if sys.version > '3': - basestring = str - -from pyspark import since, keyword_only, SparkContext -from pyspark.ml.base import Estimator, Model, Transformer -from pyspark.ml.param import Param, Params -from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaParams -from pyspark.ml.common import inherit_doc - - -@inherit_doc -class Pipeline(Estimator, MLReadable, MLWritable): - """ - A simple pipeline, which acts as an estimator. A Pipeline consists - of a sequence of stages, each of which is either an - :py:class:`Estimator` or a :py:class:`Transformer`. When - :py:meth:`Pipeline.fit` is called, the stages are executed in - order. If a stage is an :py:class:`Estimator`, its - :py:meth:`Estimator.fit` method will be called on the input - dataset to fit a model. Then the model, which is a transformer, - will be used to transform the dataset as the input to the next - stage. If a stage is a :py:class:`Transformer`, its - :py:meth:`Transformer.transform` method will be called to produce - the dataset for the next stage. The fitted model from a - :py:class:`Pipeline` is a :py:class:`PipelineModel`, which - consists of fitted models and transformers, corresponding to the - pipeline stages. If stages is an empty list, the pipeline acts as an - identity transformer. - - .. versionadded:: 1.3.0 - """ - - stages = Param(Params._dummy(), "stages", "a list of pipeline stages") - - @keyword_only - def __init__(self, stages=None): - """ - __init__(self, stages=None) - """ - super(Pipeline, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @since("1.3.0") - def setStages(self, value): - """ - Set pipeline stages. - - :param value: a list of transformers or estimators - :return: the pipeline instance - """ - return self._set(stages=value) - - @since("1.3.0") - def getStages(self): - """ - Get pipeline stages. - """ - return self.getOrDefault(self.stages) - - @keyword_only - @since("1.3.0") - def setParams(self, stages=None): - """ - setParams(self, stages=None) - Sets params for Pipeline. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _fit(self, dataset): - stages = self.getStages() - for stage in stages: - if not (isinstance(stage, Estimator) or isinstance(stage, Transformer)): - raise TypeError( - "Cannot recognize a pipeline stage of type %s." 
% type(stage)) - indexOfLastEstimator = -1 - for i, stage in enumerate(stages): - if isinstance(stage, Estimator): - indexOfLastEstimator = i - transformers = [] - for i, stage in enumerate(stages): - if i <= indexOfLastEstimator: - if isinstance(stage, Transformer): - transformers.append(stage) - dataset = stage.transform(dataset) - else: # must be an Estimator - model = stage.fit(dataset) - transformers.append(model) - if i < indexOfLastEstimator: - dataset = model.transform(dataset) - else: - transformers.append(stage) - return PipelineModel(transformers) - - @since("1.4.0") - def copy(self, extra=None): - """ - Creates a copy of this instance. - - :param extra: extra parameters - :returns: new instance - """ - if extra is None: - extra = dict() - that = Params.copy(self, extra) - stages = [stage.copy(extra) for stage in that.getStages()] - return that.setStages(stages) - - @since("2.0.0") - def write(self): - """Returns an MLWriter instance for this ML instance.""" - allStagesAreJava = PipelineSharedReadWrite.checkStagesForJava(self.getStages()) - if allStagesAreJava: - return JavaMLWriter(self) - return PipelineWriter(self) - - @classmethod - @since("2.0.0") - def read(cls): - """Returns an MLReader instance for this class.""" - return PipelineReader(cls) - - @classmethod - def _from_java(cls, java_stage): - """ - Given a Java Pipeline, create and return a Python wrapper of it. - Used for ML persistence. - """ - # Create a new instance of this stage. - py_stage = cls() - # Load information from java_stage to the instance. - py_stages = [JavaParams._from_java(s) for s in java_stage.getStages()] - py_stage.setStages(py_stages) - py_stage._resetUid(java_stage.uid()) - return py_stage - - def _to_java(self): - """ - Transfer this instance to a Java Pipeline. Used for ML persistence. - - :return: Java object equivalent to this instance. 
- """ - - gateway = SparkContext._gateway - cls = SparkContext._jvm.org.apache.spark.ml.PipelineStage - java_stages = gateway.new_array(cls, len(self.getStages())) - for idx, stage in enumerate(self.getStages()): - java_stages[idx] = stage._to_java() - - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.Pipeline", self.uid) - _java_obj.setStages(java_stages) - - return _java_obj - - -@inherit_doc -class PipelineWriter(MLWriter): - """ - (Private) Specialization of :py:class:`MLWriter` for :py:class:`Pipeline` types - """ - - def __init__(self, instance): - super(PipelineWriter, self).__init__() - self.instance = instance - - def saveImpl(self, path): - stages = self.instance.getStages() - PipelineSharedReadWrite.validateStages(stages) - PipelineSharedReadWrite.saveImpl(self.instance, stages, self.sc, path) - - -@inherit_doc -class PipelineReader(MLReader): - """ - (Private) Specialization of :py:class:`MLReader` for :py:class:`Pipeline` types - """ - - def __init__(self, cls): - super(PipelineReader, self).__init__() - self.cls = cls - - def load(self, path): - metadata = DefaultParamsReader.loadMetadata(path, self.sc) - if 'language' not in metadata['paramMap'] or metadata['paramMap']['language'] != 'Python': - return JavaMLReader(self.cls).load(path) - else: - uid, stages = PipelineSharedReadWrite.load(metadata, self.sc, path) - return Pipeline(stages=stages)._resetUid(uid) - - -@inherit_doc -class PipelineModelWriter(MLWriter): - """ - (Private) Specialization of :py:class:`MLWriter` for :py:class:`PipelineModel` types - """ - - def __init__(self, instance): - super(PipelineModelWriter, self).__init__() - self.instance = instance - - def saveImpl(self, path): - stages = self.instance.stages - PipelineSharedReadWrite.validateStages(stages) - PipelineSharedReadWrite.saveImpl(self.instance, stages, self.sc, path) - - -@inherit_doc -class PipelineModelReader(MLReader): - """ - (Private) Specialization of :py:class:`MLReader` for :py:class:`PipelineModel` types - """ - - def __init__(self, cls): - super(PipelineModelReader, self).__init__() - self.cls = cls - - def load(self, path): - metadata = DefaultParamsReader.loadMetadata(path, self.sc) - if 'language' not in metadata['paramMap'] or metadata['paramMap']['language'] != 'Python': - return JavaMLReader(self.cls).load(path) - else: - uid, stages = PipelineSharedReadWrite.load(metadata, self.sc, path) - return PipelineModel(stages=stages)._resetUid(uid) - - -@inherit_doc -class PipelineModel(Model, MLReadable, MLWritable): - """ - Represents a compiled pipeline with transformers and fitted models. - - .. versionadded:: 1.3.0 - """ - - def __init__(self, stages): - super(PipelineModel, self).__init__() - self.stages = stages - - def _transform(self, dataset): - for t in self.stages: - dataset = t.transform(dataset) - return dataset - - @since("1.4.0") - def copy(self, extra=None): - """ - Creates a copy of this instance. 
- - :param extra: extra parameters - :returns: new instance - """ - if extra is None: - extra = dict() - stages = [stage.copy(extra) for stage in self.stages] - return PipelineModel(stages) - - @since("2.0.0") - def write(self): - """Returns an MLWriter instance for this ML instance.""" - allStagesAreJava = PipelineSharedReadWrite.checkStagesForJava(self.stages) - if allStagesAreJava: - return JavaMLWriter(self) - return PipelineModelWriter(self) - - @classmethod - @since("2.0.0") - def read(cls): - """Returns an MLReader instance for this class.""" - return PipelineModelReader(cls) - - @classmethod - def _from_java(cls, java_stage): - """ - Given a Java PipelineModel, create and return a Python wrapper of it. - Used for ML persistence. - """ - # Load information from java_stage to the instance. - py_stages = [JavaParams._from_java(s) for s in java_stage.stages()] - # Create a new instance of this stage. - py_stage = cls(py_stages) - py_stage._resetUid(java_stage.uid()) - return py_stage - - def _to_java(self): - """ - Transfer this instance to a Java PipelineModel. Used for ML persistence. - - :return: Java object equivalent to this instance. - """ - - gateway = SparkContext._gateway - cls = SparkContext._jvm.org.apache.spark.ml.Transformer - java_stages = gateway.new_array(cls, len(self.stages)) - for idx, stage in enumerate(self.stages): - java_stages[idx] = stage._to_java() - - _java_obj =\ - JavaParams._new_java_obj("org.apache.spark.ml.PipelineModel", self.uid, java_stages) - - return _java_obj - - -@inherit_doc -class PipelineSharedReadWrite(): - """ - .. note:: DeveloperApi - - Functions for :py:class:`MLReader` and :py:class:`MLWriter` shared between - :py:class:`Pipeline` and :py:class:`PipelineModel` - - .. versionadded:: 2.3.0 - """ - - @staticmethod - def checkStagesForJava(stages): - return all(isinstance(stage, JavaMLWritable) for stage in stages) - - @staticmethod - def validateStages(stages): - """ - Check that all stages are Writable - """ - for stage in stages: - if not isinstance(stage, MLWritable): - raise ValueError("Pipeline write will fail on this pipeline " + - "because stage %s of type %s is not MLWritable", - stage.uid, type(stage)) - - @staticmethod - def saveImpl(instance, stages, sc, path): - """ - Save metadata and stages for a :py:class:`Pipeline` or :py:class:`PipelineModel` - - save metadata to path/metadata - - save stages to stages/IDX_UID - """ - stageUids = [stage.uid for stage in stages] - jsonParams = {'stageUids': stageUids, 'language': 'Python'} - DefaultParamsWriter.saveMetadata(instance, path, sc, paramMap=jsonParams) - stagesDir = os.path.join(path, "stages") - for index, stage in enumerate(stages): - stage.write().save(PipelineSharedReadWrite - .getStagePath(stage.uid, index, len(stages), stagesDir)) - - @staticmethod - def load(metadata, sc, path): - """ - Load metadata and stages for a :py:class:`Pipeline` or :py:class:`PipelineModel` - - :return: (UID, list of stages) - """ - stagesDir = os.path.join(path, "stages") - stageUids = metadata['paramMap']['stageUids'] - stages = [] - for index, stageUid in enumerate(stageUids): - stagePath = \ - PipelineSharedReadWrite.getStagePath(stageUid, index, len(stageUids), stagesDir) - stage = DefaultParamsReader.loadParamsInstance(stagePath, sc) - stages.append(stage) - return (metadata['uid'], stages) - - @staticmethod - def getStagePath(stageUid, stageIdx, numStages, stagesDir): - """ - Get path for saving the given stage. 
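For orientation, the persistence plumbing deleted above (PipelineWriter/PipelineReader, PipelineModelWriter/PipelineModelReader and PipelineSharedReadWrite) is what backs the ordinary save/load calls on a pipeline. The following is a minimal illustrative sketch, not part of the deleted file; it assumes an active SparkSession bound to `spark` and a writable, hypothetical path `/tmp/pipeline-demo`.

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.classification import LogisticRegression

# Assumes an existing SparkSession called `spark`.
df = spark.createDataFrame(
    [(0, "a b c d spark", 1.0), (1, "b d", 0.0)], ["id", "text", "label"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashing_tf = HashingTF(inputCol="words", outputCol="features")
lr = LogisticRegression(maxIter=5)

# The last stage is an Estimator, so fit() returns a PipelineModel.
pipeline = Pipeline(stages=[tokenizer, hashing_tf, lr])
model = pipeline.fit(df)

# save()/load() route through the writer/reader classes shown above.
model.write().overwrite().save("/tmp/pipeline-demo")
reloaded = PipelineModel.load("/tmp/pipeline-demo")
reloaded.transform(df).select("id", "prediction").show()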
- """ - stageIdxDigits = len(str(numStages)) - stageDir = str(stageIdx).zfill(stageIdxDigits) + "_" + stageUid - stagePath = os.path.join(stagesDir, stageDir) - return stagePath diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/recommendation.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/recommendation.py deleted file mode 100644 index a8eae9b..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/recommendation.py +++ /dev/null @@ -1,485 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys - -from pyspark import since, keyword_only -from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel -from pyspark.ml.param.shared import * -from pyspark.ml.common import inherit_doc - - -__all__ = ['ALS', 'ALSModel'] - - -@inherit_doc -class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, HasRegParam, HasSeed, - JavaMLWritable, JavaMLReadable): - """ - Alternating Least Squares (ALS) matrix factorization. - - ALS attempts to estimate the ratings matrix `R` as the product of - two lower-rank matrices, `X` and `Y`, i.e. `X * Yt = R`. Typically - these approximations are called 'factor' matrices. The general - approach is iterative. During each iteration, one of the factor - matrices is held constant, while the other is solved for using least - squares. The newly-solved factor matrix is then held constant while - solving for the other factor matrix. - - This is a blocked implementation of the ALS factorization algorithm - that groups the two sets of factors (referred to as "users" and - "products") into blocks and reduces communication by only sending - one copy of each user vector to each product block on each - iteration, and only for the product blocks that need that user's - feature vector. This is achieved by pre-computing some information - about the ratings matrix to determine the "out-links" of each user - (which blocks of products it will contribute to) and "in-link" - information for each product (which of the feature vectors it - receives from each user block it will depend on). This allows us to - send only an array of feature vectors between each user block and - product block, and have the product block find the users' ratings - and update the products based on these messages. - - For implicit preference data, the algorithm used is based on - `"Collaborative Filtering for Implicit Feedback Datasets", - `_, adapted for the blocked - approach used here. - - Essentially instead of finding the low-rank approximations to the - rating matrix `R`, this finds the approximations for a preference - matrix `P` where the elements of `P` are 1 if r > 0 and 0 if r <= 0. 
- The ratings then act as 'confidence' values related to strength of - indicated user preferences rather than explicit ratings given to - items. - - >>> df = spark.createDataFrame( - ... [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)], - ... ["user", "item", "rating"]) - >>> als = ALS(rank=10, maxIter=5, seed=0) - >>> model = als.fit(df) - >>> model.rank - 10 - >>> model.userFactors.orderBy("id").collect() - [Row(id=0, features=[...]), Row(id=1, ...), Row(id=2, ...)] - >>> test = spark.createDataFrame([(0, 2), (1, 0), (2, 0)], ["user", "item"]) - >>> predictions = sorted(model.transform(test).collect(), key=lambda r: r[0]) - >>> predictions[0] - Row(user=0, item=2, prediction=-0.13807615637779236) - >>> predictions[1] - Row(user=1, item=0, prediction=2.6258413791656494) - >>> predictions[2] - Row(user=2, item=0, prediction=-1.5018409490585327) - >>> user_recs = model.recommendForAllUsers(3) - >>> user_recs.where(user_recs.user == 0)\ - .select("recommendations.item", "recommendations.rating").collect() - [Row(item=[0, 1, 2], rating=[3.910..., 1.992..., -0.138...])] - >>> item_recs = model.recommendForAllItems(3) - >>> item_recs.where(item_recs.item == 2)\ - .select("recommendations.user", "recommendations.rating").collect() - [Row(user=[2, 1, 0], rating=[4.901..., 3.981..., -0.138...])] - >>> user_subset = df.where(df.user == 2) - >>> user_subset_recs = model.recommendForUserSubset(user_subset, 3) - >>> user_subset_recs.select("recommendations.item", "recommendations.rating").first() - Row(item=[2, 1, 0], rating=[4.901..., 1.056..., -1.501...]) - >>> item_subset = df.where(df.item == 0) - >>> item_subset_recs = model.recommendForItemSubset(item_subset, 3) - >>> item_subset_recs.select("recommendations.user", "recommendations.rating").first() - Row(user=[0, 1, 2], rating=[3.910..., 2.625..., -1.501...]) - >>> als_path = temp_path + "/als" - >>> als.save(als_path) - >>> als2 = ALS.load(als_path) - >>> als.getMaxIter() - 5 - >>> model_path = temp_path + "/als_model" - >>> model.save(model_path) - >>> model2 = ALSModel.load(model_path) - >>> model.rank == model2.rank - True - >>> sorted(model.userFactors.collect()) == sorted(model2.userFactors.collect()) - True - >>> sorted(model.itemFactors.collect()) == sorted(model2.itemFactors.collect()) - True - - .. versionadded:: 1.4.0 - """ - - rank = Param(Params._dummy(), "rank", "rank of the factorization", - typeConverter=TypeConverters.toInt) - numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user blocks", - typeConverter=TypeConverters.toInt) - numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item blocks", - typeConverter=TypeConverters.toInt) - implicitPrefs = Param(Params._dummy(), "implicitPrefs", "whether to use implicit preference", - typeConverter=TypeConverters.toBoolean) - alpha = Param(Params._dummy(), "alpha", "alpha for implicit preference", - typeConverter=TypeConverters.toFloat) - userCol = Param(Params._dummy(), "userCol", "column name for user ids. Ids must be within " + - "the integer value range.", typeConverter=TypeConverters.toString) - itemCol = Param(Params._dummy(), "itemCol", "column name for item ids. 
Ids must be within " + - "the integer value range.", typeConverter=TypeConverters.toString) - ratingCol = Param(Params._dummy(), "ratingCol", "column name for ratings", - typeConverter=TypeConverters.toString) - nonnegative = Param(Params._dummy(), "nonnegative", - "whether to use nonnegative constraint for least squares", - typeConverter=TypeConverters.toBoolean) - intermediateStorageLevel = Param(Params._dummy(), "intermediateStorageLevel", - "StorageLevel for intermediate datasets. Cannot be 'NONE'.", - typeConverter=TypeConverters.toString) - finalStorageLevel = Param(Params._dummy(), "finalStorageLevel", - "StorageLevel for ALS model factors.", - typeConverter=TypeConverters.toString) - coldStartStrategy = Param(Params._dummy(), "coldStartStrategy", "strategy for dealing with " + - "unknown or new users/items at prediction time. This may be useful " + - "in cross-validation or production scenarios, for handling " + - "user/item ids the model has not seen in the training data. " + - "Supported values: 'nan', 'drop'.", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, - implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None, - ratingCol="rating", nonnegative=False, checkpointInterval=10, - intermediateStorageLevel="MEMORY_AND_DISK", - finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan"): - """ - __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, \ - implicitPrefs=false, alpha=1.0, userCol="user", itemCol="item", seed=None, \ - ratingCol="rating", nonnegative=false, checkpointInterval=10, \ - intermediateStorageLevel="MEMORY_AND_DISK", \ - finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan") - """ - super(ALS, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid) - self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, - implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", - ratingCol="rating", nonnegative=False, checkpointInterval=10, - intermediateStorageLevel="MEMORY_AND_DISK", - finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, - implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None, - ratingCol="rating", nonnegative=False, checkpointInterval=10, - intermediateStorageLevel="MEMORY_AND_DISK", - finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan"): - """ - setParams(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, \ - implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None, \ - ratingCol="rating", nonnegative=False, checkpointInterval=10, \ - intermediateStorageLevel="MEMORY_AND_DISK", \ - finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan") - Sets params for ALS. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return ALSModel(java_model) - - @since("1.4.0") - def setRank(self, value): - """ - Sets the value of :py:attr:`rank`. - """ - return self._set(rank=value) - - @since("1.4.0") - def getRank(self): - """ - Gets the value of rank or its default value. 
- """ - return self.getOrDefault(self.rank) - - @since("1.4.0") - def setNumUserBlocks(self, value): - """ - Sets the value of :py:attr:`numUserBlocks`. - """ - return self._set(numUserBlocks=value) - - @since("1.4.0") - def getNumUserBlocks(self): - """ - Gets the value of numUserBlocks or its default value. - """ - return self.getOrDefault(self.numUserBlocks) - - @since("1.4.0") - def setNumItemBlocks(self, value): - """ - Sets the value of :py:attr:`numItemBlocks`. - """ - return self._set(numItemBlocks=value) - - @since("1.4.0") - def getNumItemBlocks(self): - """ - Gets the value of numItemBlocks or its default value. - """ - return self.getOrDefault(self.numItemBlocks) - - @since("1.4.0") - def setNumBlocks(self, value): - """ - Sets both :py:attr:`numUserBlocks` and :py:attr:`numItemBlocks` to the specific value. - """ - self._set(numUserBlocks=value) - return self._set(numItemBlocks=value) - - @since("1.4.0") - def setImplicitPrefs(self, value): - """ - Sets the value of :py:attr:`implicitPrefs`. - """ - return self._set(implicitPrefs=value) - - @since("1.4.0") - def getImplicitPrefs(self): - """ - Gets the value of implicitPrefs or its default value. - """ - return self.getOrDefault(self.implicitPrefs) - - @since("1.4.0") - def setAlpha(self, value): - """ - Sets the value of :py:attr:`alpha`. - """ - return self._set(alpha=value) - - @since("1.4.0") - def getAlpha(self): - """ - Gets the value of alpha or its default value. - """ - return self.getOrDefault(self.alpha) - - @since("1.4.0") - def setUserCol(self, value): - """ - Sets the value of :py:attr:`userCol`. - """ - return self._set(userCol=value) - - @since("1.4.0") - def getUserCol(self): - """ - Gets the value of userCol or its default value. - """ - return self.getOrDefault(self.userCol) - - @since("1.4.0") - def setItemCol(self, value): - """ - Sets the value of :py:attr:`itemCol`. - """ - return self._set(itemCol=value) - - @since("1.4.0") - def getItemCol(self): - """ - Gets the value of itemCol or its default value. - """ - return self.getOrDefault(self.itemCol) - - @since("1.4.0") - def setRatingCol(self, value): - """ - Sets the value of :py:attr:`ratingCol`. - """ - return self._set(ratingCol=value) - - @since("1.4.0") - def getRatingCol(self): - """ - Gets the value of ratingCol or its default value. - """ - return self.getOrDefault(self.ratingCol) - - @since("1.4.0") - def setNonnegative(self, value): - """ - Sets the value of :py:attr:`nonnegative`. - """ - return self._set(nonnegative=value) - - @since("1.4.0") - def getNonnegative(self): - """ - Gets the value of nonnegative or its default value. - """ - return self.getOrDefault(self.nonnegative) - - @since("2.0.0") - def setIntermediateStorageLevel(self, value): - """ - Sets the value of :py:attr:`intermediateStorageLevel`. - """ - return self._set(intermediateStorageLevel=value) - - @since("2.0.0") - def getIntermediateStorageLevel(self): - """ - Gets the value of intermediateStorageLevel or its default value. - """ - return self.getOrDefault(self.intermediateStorageLevel) - - @since("2.0.0") - def setFinalStorageLevel(self, value): - """ - Sets the value of :py:attr:`finalStorageLevel`. - """ - return self._set(finalStorageLevel=value) - - @since("2.0.0") - def getFinalStorageLevel(self): - """ - Gets the value of finalStorageLevel or its default value. - """ - return self.getOrDefault(self.finalStorageLevel) - - @since("2.2.0") - def setColdStartStrategy(self, value): - """ - Sets the value of :py:attr:`coldStartStrategy`. 
- """ - return self._set(coldStartStrategy=value) - - @since("2.2.0") - def getColdStartStrategy(self): - """ - Gets the value of coldStartStrategy or its default value. - """ - return self.getOrDefault(self.coldStartStrategy) - - -class ALSModel(JavaModel, JavaMLWritable, JavaMLReadable): - """ - Model fitted by ALS. - - .. versionadded:: 1.4.0 - """ - - @property - @since("1.4.0") - def rank(self): - """rank of the matrix factorization model""" - return self._call_java("rank") - - @property - @since("1.4.0") - def userFactors(self): - """ - a DataFrame that stores user factors in two columns: `id` and - `features` - """ - return self._call_java("userFactors") - - @property - @since("1.4.0") - def itemFactors(self): - """ - a DataFrame that stores item factors in two columns: `id` and - `features` - """ - return self._call_java("itemFactors") - - @since("2.2.0") - def recommendForAllUsers(self, numItems): - """ - Returns top `numItems` items recommended for each user, for all users. - - :param numItems: max number of recommendations for each user - :return: a DataFrame of (userCol, recommendations), where recommendations are - stored as an array of (itemCol, rating) Rows. - """ - return self._call_java("recommendForAllUsers", numItems) - - @since("2.2.0") - def recommendForAllItems(self, numUsers): - """ - Returns top `numUsers` users recommended for each item, for all items. - - :param numUsers: max number of recommendations for each item - :return: a DataFrame of (itemCol, recommendations), where recommendations are - stored as an array of (userCol, rating) Rows. - """ - return self._call_java("recommendForAllItems", numUsers) - - @since("2.3.0") - def recommendForUserSubset(self, dataset, numItems): - """ - Returns top `numItems` items recommended for each user id in the input data set. Note that - if there are duplicate ids in the input dataset, only one set of recommendations per unique - id will be returned. - - :param dataset: a Dataset containing a column of user ids. The column name must match - `userCol`. - :param numItems: max number of recommendations for each user - :return: a DataFrame of (userCol, recommendations), where recommendations are - stored as an array of (itemCol, rating) Rows. - """ - return self._call_java("recommendForUserSubset", dataset, numItems) - - @since("2.3.0") - def recommendForItemSubset(self, dataset, numUsers): - """ - Returns top `numUsers` users recommended for each item id in the input data set. Note that - if there are duplicate ids in the input dataset, only one set of recommendations per unique - id will be returned. - - :param dataset: a Dataset containing a column of item ids. The column name must match - `itemCol`. - :param numUsers: max number of recommendations for each item - :return: a DataFrame of (itemCol, recommendations), where recommendations are - stored as an array of (userCol, rating) Rows. 
- """ - return self._call_java("recommendForItemSubset", dataset, numUsers) - - -if __name__ == "__main__": - import doctest - import pyspark.ml.recommendation - from pyspark.sql import SparkSession - globs = pyspark.ml.recommendation.__dict__.copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("ml.recommendation tests")\ - .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc - globs['spark'] = spark - import tempfile - temp_path = tempfile.mkdtemp() - globs['temp_path'] = temp_path - try: - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - finally: - from shutil import rmtree - try: - rmtree(temp_path) - except OSError: - pass - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/regression.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/regression.py deleted file mode 100644 index 98f4361..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/regression.py +++ /dev/null @@ -1,1874 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import warnings - -from pyspark import since, keyword_only -from pyspark.ml.param.shared import * -from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper -from pyspark.ml.common import inherit_doc -from pyspark.sql import DataFrame - - -__all__ = ['AFTSurvivalRegression', 'AFTSurvivalRegressionModel', - 'DecisionTreeRegressor', 'DecisionTreeRegressionModel', - 'GBTRegressor', 'GBTRegressionModel', - 'GeneralizedLinearRegression', 'GeneralizedLinearRegressionModel', - 'GeneralizedLinearRegressionSummary', 'GeneralizedLinearRegressionTrainingSummary', - 'IsotonicRegression', 'IsotonicRegressionModel', - 'LinearRegression', 'LinearRegressionModel', - 'LinearRegressionSummary', 'LinearRegressionTrainingSummary', - 'RandomForestRegressor', 'RandomForestRegressionModel'] - - -@inherit_doc -class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept, - HasStandardization, HasSolver, HasWeightCol, HasAggregationDepth, HasLoss, - JavaMLWritable, JavaMLReadable): - """ - Linear regression. - - The learning objective is to minimize the specified loss function, with regularization. - This supports two kinds of loss: - - * squaredError (a.k.a squared loss) - * huber (a hybrid of squared error for relatively small errors and absolute error for \ - relatively large ones, and we estimate the scale parameter from training data) - - This supports multiple types of regularization: - - * none (a.k.a. 
ordinary least squares) - * L2 (ridge regression) - * L1 (Lasso) - * L2 + L1 (elastic net) - - Note: Fitting with huber loss only supports none and L2 regularization. - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... (1.0, 2.0, Vectors.dense(1.0)), - ... (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"]) - >>> lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight") - >>> model = lr.fit(df) - >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> abs(model.transform(test0).head().prediction - (-1.0)) < 0.001 - True - >>> abs(model.coefficients[0] - 1.0) < 0.001 - True - >>> abs(model.intercept - 0.0) < 0.001 - True - >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) - >>> abs(model.transform(test1).head().prediction - 1.0) < 0.001 - True - >>> lr.setParams("vector") - Traceback (most recent call last): - ... - TypeError: Method setParams forces keyword arguments. - >>> lr_path = temp_path + "/lr" - >>> lr.save(lr_path) - >>> lr2 = LinearRegression.load(lr_path) - >>> lr2.getMaxIter() - 5 - >>> model_path = temp_path + "/lr_model" - >>> model.save(model_path) - >>> model2 = LinearRegressionModel.load(model_path) - >>> model.coefficients[0] == model2.coefficients[0] - True - >>> model.intercept == model2.intercept - True - >>> model.numFeatures - 1 - >>> model.write().format("pmml").save(model_path + "_2") - - .. versionadded:: 1.4.0 - """ - - solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + - "options: auto, normal, l-bfgs.", typeConverter=TypeConverters.toString) - - loss = Param(Params._dummy(), "loss", "The loss function to be optimized. Supported " + - "options: squaredError, huber.", typeConverter=TypeConverters.toString) - - epsilon = Param(Params._dummy(), "epsilon", "The shape parameter to control the amount of " + - "robustness. Must be > 1.0. 
Only valid when loss is huber", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - standardization=True, solver="auto", weightCol=None, aggregationDepth=2, - loss="squaredError", epsilon=1.35): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - standardization=True, solver="auto", weightCol=None, aggregationDepth=2, \ - loss="squaredError", epsilon=1.35) - """ - super(LinearRegression, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.regression.LinearRegression", self.uid) - self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, loss="squaredError", epsilon=1.35) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - standardization=True, solver="auto", weightCol=None, aggregationDepth=2, - loss="squaredError", epsilon=1.35): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - standardization=True, solver="auto", weightCol=None, aggregationDepth=2, \ - loss="squaredError", epsilon=1.35) - Sets params for linear regression. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return LinearRegressionModel(java_model) - - @since("2.3.0") - def setEpsilon(self, value): - """ - Sets the value of :py:attr:`epsilon`. - """ - return self._set(epsilon=value) - - @since("2.3.0") - def getEpsilon(self): - """ - Gets the value of epsilon or its default value. - """ - return self.getOrDefault(self.epsilon) - - -class LinearRegressionModel(JavaModel, JavaPredictionModel, GeneralJavaMLWritable, JavaMLReadable): - """ - Model fitted by :class:`LinearRegression`. - - .. versionadded:: 1.4.0 - """ - - @property - @since("2.0.0") - def coefficients(self): - """ - Model coefficients. - """ - return self._call_java("coefficients") - - @property - @since("1.4.0") - def intercept(self): - """ - Model intercept. - """ - return self._call_java("intercept") - - @property - @since("2.3.0") - def scale(self): - r""" - The value by which :math:`\|y - X'w\|` is scaled down when loss is "huber", otherwise 1.0. - """ - return self._call_java("scale") - - @property - @since("2.0.0") - def summary(self): - """ - Gets summary (e.g. residuals, mse, r-squared ) of model on - training set. An exception is thrown if - `trainingSummary is None`. - """ - if self.hasSummary: - java_lrt_summary = self._call_java("summary") - return LinearRegressionTrainingSummary(java_lrt_summary) - else: - raise RuntimeError("No training summary available for this %s" % - self.__class__.__name__) - - @property - @since("2.0.0") - def hasSummary(self): - """ - Indicates whether a training summary exists for this model - instance. - """ - return self._call_java("hasSummary") - - @since("2.0.0") - def evaluate(self, dataset): - """ - Evaluates the model on a test dataset. 
- - :param dataset: - Test dataset to evaluate model on, where dataset is an - instance of :py:class:`pyspark.sql.DataFrame` - """ - if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) - java_lr_summary = self._call_java("evaluate", dataset) - return LinearRegressionSummary(java_lr_summary) - - -class LinearRegressionSummary(JavaWrapper): - """ - .. note:: Experimental - - Linear regression results evaluated on a dataset. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def predictions(self): - """ - Dataframe outputted by the model's `transform` method. - """ - return self._call_java("predictions") - - @property - @since("2.0.0") - def predictionCol(self): - """ - Field in "predictions" which gives the predicted value of - the label at each instance. - """ - return self._call_java("predictionCol") - - @property - @since("2.0.0") - def labelCol(self): - """ - Field in "predictions" which gives the true label of each - instance. - """ - return self._call_java("labelCol") - - @property - @since("2.0.0") - def featuresCol(self): - """ - Field in "predictions" which gives the features of each instance - as a vector. - """ - return self._call_java("featuresCol") - - @property - @since("2.0.0") - def explainedVariance(self): - r""" - Returns the explained variance regression score. - explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}` - - .. seealso:: `Wikipedia explain variation - `_ - - .. note:: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("explainedVariance") - - @property - @since("2.0.0") - def meanAbsoluteError(self): - """ - Returns the mean absolute error, which is a risk function - corresponding to the expected value of the absolute error - loss or l1-norm loss. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("meanAbsoluteError") - - @property - @since("2.0.0") - def meanSquaredError(self): - """ - Returns the mean squared error, which is a risk function - corresponding to the expected value of the squared error - loss or quadratic loss. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("meanSquaredError") - - @property - @since("2.0.0") - def rootMeanSquaredError(self): - """ - Returns the root mean squared error, which is defined as the - square root of the mean squared error. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("rootMeanSquaredError") - - @property - @since("2.0.0") - def r2(self): - """ - Returns R^2, the coefficient of determination. - - .. seealso:: `Wikipedia coefficient of determination - `_ - - .. note:: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("r2") - - @property - @since("2.4.0") - def r2adj(self): - """ - Returns Adjusted R^2, the adjusted coefficient of determination. - - .. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 - `_ - - .. note:: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. 
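To show how the summary fields above are reached in practice, here is a small sketch (not part of the deleted file) assuming a SparkSession bound to `spark`; the "normal" solver is chosen because, as the docstrings note, coefficientStandardErrors, tValues and pValues are only available with it.

from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression

# Assumes an existing SparkSession called `spark`.
df = spark.createDataFrame(
    [(1.0, Vectors.dense(1.0)), (2.0, Vectors.dense(2.0)), (3.1, Vectors.dense(3.0))],
    ["label", "features"])

# regParam=0.0 with the "normal" solver keeps the statistical fields available.
lr = LinearRegression(regParam=0.0, solver="normal")
model = lr.fit(df)

summary = model.summary                      # LinearRegressionTrainingSummary
print("RMSE:", summary.rootMeanSquaredError)
print("r2:", summary.r2)
print("p-values:", summary.pValues)

# evaluate() returns a LinearRegressionSummary for any other DataFrame.
holdout = spark.createDataFrame([(2.0, Vectors.dense(2.0))], ["label", "features"])
print("holdout MAE:", model.evaluate(holdout).meanAbsoluteError)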
This will change in later Spark versions. - """ - return self._call_java("r2adj") - - @property - @since("2.0.0") - def residuals(self): - """ - Residuals (label - predicted value) - """ - return self._call_java("residuals") - - @property - @since("2.0.0") - def numInstances(self): - """ - Number of instances in DataFrame predictions - """ - return self._call_java("numInstances") - - @property - @since("2.2.0") - def degreesOfFreedom(self): - """ - Degrees of freedom. - """ - return self._call_java("degreesOfFreedom") - - @property - @since("2.0.0") - def devianceResiduals(self): - """ - The weighted residuals, the usual residuals rescaled by the - square root of the instance weights. - """ - return self._call_java("devianceResiduals") - - @property - @since("2.0.0") - def coefficientStandardErrors(self): - """ - Standard error of estimated coefficients and intercept. - This value is only available when using the "normal" solver. - - If :py:attr:`LinearRegression.fitIntercept` is set to True, - then the last element returned corresponds to the intercept. - - .. seealso:: :py:attr:`LinearRegression.solver` - """ - return self._call_java("coefficientStandardErrors") - - @property - @since("2.0.0") - def tValues(self): - """ - T-statistic of estimated coefficients and intercept. - This value is only available when using the "normal" solver. - - If :py:attr:`LinearRegression.fitIntercept` is set to True, - then the last element returned corresponds to the intercept. - - .. seealso:: :py:attr:`LinearRegression.solver` - """ - return self._call_java("tValues") - - @property - @since("2.0.0") - def pValues(self): - """ - Two-sided p-value of estimated coefficients and intercept. - This value is only available when using the "normal" solver. - - If :py:attr:`LinearRegression.fitIntercept` is set to True, - then the last element returned corresponds to the intercept. - - .. seealso:: :py:attr:`LinearRegression.solver` - """ - return self._call_java("pValues") - - -@inherit_doc -class LinearRegressionTrainingSummary(LinearRegressionSummary): - """ - .. note:: Experimental - - Linear regression training results. Currently, the training summary ignores the - training weights except for the objective trace. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def objectiveHistory(self): - """ - Objective function (scaled loss + regularization) at each - iteration. - This value is only available when using the "l-bfgs" solver. - - .. seealso:: :py:attr:`LinearRegression.solver` - """ - return self._call_java("objectiveHistory") - - @property - @since("2.0.0") - def totalIterations(self): - """ - Number of training iterations until termination. - This value is only available when using the "l-bfgs" solver. - - .. seealso:: :py:attr:`LinearRegression.solver` - """ - return self._call_java("totalIterations") - - -@inherit_doc -class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasWeightCol, JavaMLWritable, JavaMLReadable): - """ - Currently implemented using parallelized pool adjacent violators algorithm. - Only univariate (single feature) algorithm supported. - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... (1.0, Vectors.dense(1.0)), - ... 
(0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> ir = IsotonicRegression() - >>> model = ir.fit(df) - >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> model.transform(test0).head().prediction - 0.0 - >>> model.boundaries - DenseVector([0.0, 1.0]) - >>> ir_path = temp_path + "/ir" - >>> ir.save(ir_path) - >>> ir2 = IsotonicRegression.load(ir_path) - >>> ir2.getIsotonic() - True - >>> model_path = temp_path + "/ir_model" - >>> model.save(model_path) - >>> model2 = IsotonicRegressionModel.load(model_path) - >>> model.boundaries == model2.boundaries - True - >>> model.predictions == model2.predictions - True - - .. versionadded:: 1.6.0 - """ - - isotonic = \ - Param(Params._dummy(), "isotonic", - "whether the output sequence should be isotonic/increasing (true) or" + - "antitonic/decreasing (false).", typeConverter=TypeConverters.toBoolean) - featureIndex = \ - Param(Params._dummy(), "featureIndex", - "The index of the feature if featuresCol is a vector column, no effect otherwise.", - typeConverter=TypeConverters.toInt) - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - weightCol=None, isotonic=True, featureIndex=0): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - weightCol=None, isotonic=True, featureIndex=0): - """ - super(IsotonicRegression, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.regression.IsotonicRegression", self.uid) - self._setDefault(isotonic=True, featureIndex=0) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - weightCol=None, isotonic=True, featureIndex=0): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - weightCol=None, isotonic=True, featureIndex=0): - Set the params for IsotonicRegression. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return IsotonicRegressionModel(java_model) - - def setIsotonic(self, value): - """ - Sets the value of :py:attr:`isotonic`. - """ - return self._set(isotonic=value) - - def getIsotonic(self): - """ - Gets the value of isotonic or its default value. - """ - return self.getOrDefault(self.isotonic) - - def setFeatureIndex(self, value): - """ - Sets the value of :py:attr:`featureIndex`. - """ - return self._set(featureIndex=value) - - def getFeatureIndex(self): - """ - Gets the value of featureIndex or its default value. - """ - return self.getOrDefault(self.featureIndex) - - -class IsotonicRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): - """ - Model fitted by :class:`IsotonicRegression`. - - .. versionadded:: 1.6.0 - """ - - @property - @since("1.6.0") - def boundaries(self): - """ - Boundaries in increasing order for which predictions are known. - """ - return self._call_java("boundaries") - - @property - @since("1.6.0") - def predictions(self): - """ - Predictions associated with the boundaries at the same index, monotone because of isotonic - regression. - """ - return self._call_java("predictions") - - -class TreeEnsembleParams(DecisionTreeParams): - """ - Mixin for Decision Tree-based ensemble algorithms parameters. 
- """ - - subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " + - "used for learning each decision tree, in range (0, 1].", - typeConverter=TypeConverters.toFloat) - - supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"] - - featureSubsetStrategy = \ - Param(Params._dummy(), "featureSubsetStrategy", - "The number of features to consider for splits at each tree node. Supported " + - "options: 'auto' (choose automatically for task: If numTrees == 1, set to " + - "'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to " + - "'onethird' for regression), 'all' (use all features), 'onethird' (use " + - "1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use " + - "log2(number of features)), 'n' (when n is in the range (0, 1.0], use " + - "n * number of features. When n is in the range (1, number of features), use" + - " n features). default = 'auto'", typeConverter=TypeConverters.toString) - - def __init__(self): - super(TreeEnsembleParams, self).__init__() - - @since("1.4.0") - def setSubsamplingRate(self, value): - """ - Sets the value of :py:attr:`subsamplingRate`. - """ - return self._set(subsamplingRate=value) - - @since("1.4.0") - def getSubsamplingRate(self): - """ - Gets the value of subsamplingRate or its default value. - """ - return self.getOrDefault(self.subsamplingRate) - - @since("1.4.0") - def setFeatureSubsetStrategy(self, value): - """ - Sets the value of :py:attr:`featureSubsetStrategy`. - - .. note:: Deprecated in 2.4.0 and will be removed in 3.0.0. - """ - return self._set(featureSubsetStrategy=value) - - @since("1.4.0") - def getFeatureSubsetStrategy(self): - """ - Gets the value of featureSubsetStrategy or its default value. - """ - return self.getOrDefault(self.featureSubsetStrategy) - - -class TreeRegressorParams(Params): - """ - Private class to track supported impurity measures. - """ - - supportedImpurities = ["variance"] - impurity = Param(Params._dummy(), "impurity", - "Criterion used for information gain calculation (case-insensitive). " + - "Supported options: " + - ", ".join(supportedImpurities), typeConverter=TypeConverters.toString) - - def __init__(self): - super(TreeRegressorParams, self).__init__() - - @since("1.4.0") - def setImpurity(self, value): - """ - Sets the value of :py:attr:`impurity`. - """ - return self._set(impurity=value) - - @since("1.4.0") - def getImpurity(self): - """ - Gets the value of impurity or its default value. - """ - return self.getOrDefault(self.impurity) - - -class RandomForestParams(TreeEnsembleParams): - """ - Private class to track supported random forest parameters. - """ - - numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).", - typeConverter=TypeConverters.toInt) - - def __init__(self): - super(RandomForestParams, self).__init__() - - @since("1.4.0") - def setNumTrees(self, value): - """ - Sets the value of :py:attr:`numTrees`. - """ - return self._set(numTrees=value) - - @since("1.4.0") - def getNumTrees(self): - """ - Gets the value of numTrees or its default value. - """ - return self.getOrDefault(self.numTrees) - - -class GBTParams(TreeEnsembleParams): - """ - Private class to track supported GBT params. 
- """ - supportedLossTypes = ["squared", "absolute"] - - -@inherit_doc -class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval, - HasSeed, JavaMLWritable, JavaMLReadable, HasVarianceCol): - """ - `Decision tree `_ - learning algorithm for regression. - It supports both continuous and categorical features. - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... (1.0, Vectors.dense(1.0)), - ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance") - >>> model = dt.fit(df) - >>> model.depth - 1 - >>> model.numNodes - 3 - >>> model.featureImportances - SparseVector(1, {0: 1.0}) - >>> model.numFeatures - 1 - >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> model.transform(test0).head().prediction - 0.0 - >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) - >>> model.transform(test1).head().prediction - 1.0 - >>> dtr_path = temp_path + "/dtr" - >>> dt.save(dtr_path) - >>> dt2 = DecisionTreeRegressor.load(dtr_path) - >>> dt2.getMaxDepth() - 2 - >>> model_path = temp_path + "/dtr_model" - >>> model.save(model_path) - >>> model2 = DecisionTreeRegressionModel.load(model_path) - >>> model.numNodes == model2.numNodes - True - >>> model.depth == model2.depth - True - >>> model.transform(test1).head().variance - 0.0 - - .. versionadded:: 1.4.0 - """ - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", - seed=None, varianceCol=None): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", seed=None, varianceCol=None) - """ - super(DecisionTreeRegressor, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.regression.DecisionTreeRegressor", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", seed=None, varianceCol=None): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", seed=None, varianceCol=None) - Sets params for the DecisionTreeRegressor. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return DecisionTreeRegressionModel(java_model) - - -@inherit_doc -class DecisionTreeModel(JavaModel, JavaPredictionModel): - """ - Abstraction for Decision Tree models. - - .. 
versionadded:: 1.5.0 - """ - - @property - @since("1.5.0") - def numNodes(self): - """Return number of nodes of the decision tree.""" - return self._call_java("numNodes") - - @property - @since("1.5.0") - def depth(self): - """Return depth of the decision tree.""" - return self._call_java("depth") - - @property - @since("2.0.0") - def toDebugString(self): - """Full description of model.""" - return self._call_java("toDebugString") - - def __repr__(self): - return self._call_java("toString") - - -@inherit_doc -class TreeEnsembleModel(JavaModel): - """ - (private abstraction) - - Represents a tree ensemble model. - """ - - @property - @since("2.0.0") - def trees(self): - """Trees in this ensemble. Warning: These have null parent Estimators.""" - return [DecisionTreeModel(m) for m in list(self._call_java("trees"))] - - @property - @since("2.0.0") - def getNumTrees(self): - """Number of trees in ensemble.""" - return self._call_java("getNumTrees") - - @property - @since("1.5.0") - def treeWeights(self): - """Return the weights for each tree""" - return list(self._call_java("javaTreeWeights")) - - @property - @since("2.0.0") - def totalNumNodes(self): - """Total number of nodes, summed over all trees in the ensemble.""" - return self._call_java("totalNumNodes") - - @property - @since("2.0.0") - def toDebugString(self): - """Full description of model.""" - return self._call_java("toDebugString") - - def __repr__(self): - return self._call_java("toString") - - -@inherit_doc -class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, JavaMLReadable): - """ - Model fitted by :class:`DecisionTreeRegressor`. - - .. versionadded:: 1.4.0 - """ - - @property - @since("2.0.0") - def featureImportances(self): - """ - Estimate of the importance of each feature. - - This generalizes the idea of "Gini" importance to other losses, - following the explanation of Gini importance from "Random Forests" documentation - by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. - - This feature importance is calculated as follows: - - importance(feature j) = sum (over nodes which split on feature j) of the gain, - where gain is scaled by the number of instances passing through node - - Normalize importances for tree to sum to 1. - - .. note:: Feature importance for single decision trees can have high variance due to - correlated predictor variables. Consider using a :py:class:`RandomForestRegressor` - to determine feature importance instead. - """ - return self._call_java("featureImportances") - - -@inherit_doc -class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, - RandomForestParams, TreeRegressorParams, HasCheckpointInterval, - JavaMLWritable, JavaMLReadable): - """ - `Random Forest `_ - learning algorithm for regression. - It supports both continuous and categorical features. - - >>> from numpy import allclose - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... (1.0, Vectors.dense(1.0)), - ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42) - >>> model = rf.fit(df) - >>> model.featureImportances - SparseVector(1, {0: 1.0}) - >>> allclose(model.treeWeights, [1.0, 1.0]) - True - >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> model.transform(test0).head().prediction - 0.0 - >>> model.numFeatures - 1 - >>> model.trees - [DecisionTreeRegressionModel (uid=...) 
of depth..., DecisionTreeRegressionModel...] - >>> model.getNumTrees - 2 - >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) - >>> model.transform(test1).head().prediction - 0.5 - >>> rfr_path = temp_path + "/rfr" - >>> rf.save(rfr_path) - >>> rf2 = RandomForestRegressor.load(rfr_path) - >>> rf2.getNumTrees() - 2 - >>> model_path = temp_path + "/rfr_model" - >>> model.save(model_path) - >>> model2 = RandomForestRegressionModel.load(model_path) - >>> model.featureImportances == model2.featureImportances - True - - .. versionadded:: 1.4.0 - """ - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, - featureSubsetStrategy="auto"): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \ - featureSubsetStrategy="auto") - """ - super(RandomForestRegressor, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.regression.RandomForestRegressor", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", subsamplingRate=1.0, numTrees=20, - featureSubsetStrategy="auto") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, - featureSubsetStrategy="auto"): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \ - featureSubsetStrategy="auto") - Sets params for linear regression. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return RandomForestRegressionModel(java_model) - - @since("2.4.0") - def setFeatureSubsetStrategy(self, value): - """ - Sets the value of :py:attr:`featureSubsetStrategy`. - """ - return self._set(featureSubsetStrategy=value) - - -class RandomForestRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, - JavaMLReadable): - """ - Model fitted by :class:`RandomForestRegressor`. - - .. versionadded:: 1.4.0 - """ - - @property - @since("2.0.0") - def trees(self): - """Trees in this ensemble. Warning: These have null parent Estimators.""" - return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] - - @property - @since("2.0.0") - def featureImportances(self): - """ - Estimate of the importance of each feature. - - Each feature's importance is the average of its importance across all trees in the ensemble - The importance vector is normalized to sum to 1. This method is suggested by Hastie et al. - (Hastie, Tibshirani, Friedman. 
"The Elements of Statistical Learning, 2nd Edition." 2001.) - and follows the implementation from scikit-learn. - - .. seealso:: :py:attr:`DecisionTreeRegressionModel.featureImportances` - """ - return self._call_java("featureImportances") - - -@inherit_doc -class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, - JavaMLReadable, TreeRegressorParams): - """ - `Gradient-Boosted Trees (GBTs) `_ - learning algorithm for regression. - It supports both continuous and categorical features. - - >>> from numpy import allclose - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... (1.0, Vectors.dense(1.0)), - ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42) - >>> print(gbt.getImpurity()) - variance - >>> print(gbt.getFeatureSubsetStrategy()) - all - >>> model = gbt.fit(df) - >>> model.featureImportances - SparseVector(1, {0: 1.0}) - >>> model.numFeatures - 1 - >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) - True - >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) - >>> model.transform(test0).head().prediction - 0.0 - >>> test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) - >>> model.transform(test1).head().prediction - 1.0 - >>> gbtr_path = temp_path + "gbtr" - >>> gbt.save(gbtr_path) - >>> gbt2 = GBTRegressor.load(gbtr_path) - >>> gbt2.getMaxDepth() - 2 - >>> model_path = temp_path + "gbtr_model" - >>> model.save(model_path) - >>> model2 = GBTRegressionModel.load(model_path) - >>> model.featureImportances == model2.featureImportances - True - >>> model.treeWeights == model2.treeWeights - True - >>> model.trees - [DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...] - >>> validation = spark.createDataFrame([(0.0, Vectors.dense(-1.0))], - ... ["label", "features"]) - >>> model.evaluateEachIteration(validation, "squared") - [0.0, 0.0, 0.0, 0.0, 0.0] - - .. versionadded:: 1.4.0 - """ - - lossType = Param(Params._dummy(), "lossType", - "Loss function which GBT tries to minimize (case-insensitive). " + - "Supported options: " + ", ".join(GBTParams.supportedLossTypes), - typeConverter=TypeConverters.toString) - - stepSize = Param(Params._dummy(), "stepSize", - "Step size (a.k.a. 
learning rate) in interval (0, 1] for shrinking " + - "the contribution of each estimator.", - typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, - impurity="variance", featureSubsetStrategy="all"): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, \ - impurity="variance", featureSubsetStrategy="all") - """ - super(GBTRegressor, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, - impurity="variance", featureSubsetStrategy="all") - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, - impuriy="variance", featureSubsetStrategy="all"): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ - maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, \ - impurity="variance", featureSubsetStrategy="all") - Sets params for Gradient Boosted Tree Regression. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return GBTRegressionModel(java_model) - - @since("1.4.0") - def setLossType(self, value): - """ - Sets the value of :py:attr:`lossType`. - """ - return self._set(lossType=value) - - @since("1.4.0") - def getLossType(self): - """ - Gets the value of lossType or its default value. - """ - return self.getOrDefault(self.lossType) - - @since("2.4.0") - def setFeatureSubsetStrategy(self, value): - """ - Sets the value of :py:attr:`featureSubsetStrategy`. - """ - return self._set(featureSubsetStrategy=value) - - -class GBTRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable): - """ - Model fitted by :class:`GBTRegressor`. - - .. versionadded:: 1.4.0 - """ - - @property - @since("2.0.0") - def featureImportances(self): - """ - Estimate of the importance of each feature. - - Each feature's importance is the average of its importance across all trees in the ensemble - The importance vector is normalized to sum to 1. This method is suggested by Hastie et al. - (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.) - and follows the implementation from scikit-learn. - - .. 
seealso:: :py:attr:`DecisionTreeRegressionModel.featureImportances` - """ - return self._call_java("featureImportances") - - @property - @since("2.0.0") - def trees(self): - """Trees in this ensemble. Warning: These have null parent Estimators.""" - return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] - - @since("2.4.0") - def evaluateEachIteration(self, dataset, loss): - """ - Method to compute error or loss for every iteration of gradient boosting. - - :param dataset: - Test dataset to evaluate model on, where dataset is an - instance of :py:class:`pyspark.sql.DataFrame` - :param loss: - The loss function used to compute error. - Supported options: squared, absolute - """ - return self._call_java("evaluateEachIteration", dataset, loss) - - -@inherit_doc -class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasFitIntercept, HasMaxIter, HasTol, HasAggregationDepth, - JavaMLWritable, JavaMLReadable): - """ - .. note:: Experimental - - Accelerated Failure Time (AFT) Model Survival Regression - - Fit a parametric AFT survival regression model based on the Weibull distribution - of the survival time. - - .. seealso:: `AFT Model `_ - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... (1.0, Vectors.dense(1.0), 1.0), - ... (1e-40, Vectors.sparse(1, [], []), 0.0)], ["label", "features", "censor"]) - >>> aftsr = AFTSurvivalRegression() - >>> model = aftsr.fit(df) - >>> model.predict(Vectors.dense(6.3)) - 1.0 - >>> model.predictQuantiles(Vectors.dense(6.3)) - DenseVector([0.0101, 0.0513, 0.1054, 0.2877, 0.6931, 1.3863, 2.3026, 2.9957, 4.6052]) - >>> model.transform(df).show() - +-------+---------+------+----------+ - | label| features|censor|prediction| - +-------+---------+------+----------+ - | 1.0| [1.0]| 1.0| 1.0| - |1.0E-40|(1,[],[])| 0.0| 1.0| - +-------+---------+------+----------+ - ... - >>> aftsr_path = temp_path + "/aftsr" - >>> aftsr.save(aftsr_path) - >>> aftsr2 = AFTSurvivalRegression.load(aftsr_path) - >>> aftsr2.getMaxIter() - 100 - >>> model_path = temp_path + "/aftsr_model" - >>> model.save(model_path) - >>> model2 = AFTSurvivalRegressionModel.load(model_path) - >>> model.coefficients == model2.coefficients - True - >>> model.intercept == model2.intercept - True - >>> model.scale == model2.scale - True - - .. versionadded:: 1.6.0 - """ - - censorCol = Param(Params._dummy(), "censorCol", - "censor column name. The value of this column could be 0 or 1. " + - "If the value is 1, it means the event has occurred i.e. " + - "uncensored; otherwise censored.", typeConverter=TypeConverters.toString) - quantileProbabilities = \ - Param(Params._dummy(), "quantileProbabilities", - "quantile probabilities array. Values of the quantile probabilities array " + - "should be in the range (0, 1) and the array should be non-empty.", - typeConverter=TypeConverters.toListFloat) - quantilesCol = Param(Params._dummy(), "quantilesCol", - "quantiles column name. 
This column will output quantiles of " + - "corresponding quantileProbabilities if it is set.", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", - quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]), - quantilesCol=None, aggregationDepth=2): - """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", \ - quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], \ - quantilesCol=None, aggregationDepth=2) - """ - super(AFTSurvivalRegression, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.regression.AFTSurvivalRegression", self.uid) - self._setDefault(censorCol="censor", - quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], - maxIter=100, tol=1E-6) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - @since("1.6.0") - def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", - quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]), - quantilesCol=None, aggregationDepth=2): - """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ - fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", \ - quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], \ - quantilesCol=None, aggregationDepth=2): - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return AFTSurvivalRegressionModel(java_model) - - @since("1.6.0") - def setCensorCol(self, value): - """ - Sets the value of :py:attr:`censorCol`. - """ - return self._set(censorCol=value) - - @since("1.6.0") - def getCensorCol(self): - """ - Gets the value of censorCol or its default value. - """ - return self.getOrDefault(self.censorCol) - - @since("1.6.0") - def setQuantileProbabilities(self, value): - """ - Sets the value of :py:attr:`quantileProbabilities`. - """ - return self._set(quantileProbabilities=value) - - @since("1.6.0") - def getQuantileProbabilities(self): - """ - Gets the value of quantileProbabilities or its default value. - """ - return self.getOrDefault(self.quantileProbabilities) - - @since("1.6.0") - def setQuantilesCol(self, value): - """ - Sets the value of :py:attr:`quantilesCol`. - """ - return self._set(quantilesCol=value) - - @since("1.6.0") - def getQuantilesCol(self): - """ - Gets the value of quantilesCol or its default value. - """ - return self.getOrDefault(self.quantilesCol) - - -class AFTSurvivalRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): - """ - .. note:: Experimental - - Model fitted by :class:`AFTSurvivalRegression`. - - .. versionadded:: 1.6.0 - """ - - @property - @since("2.0.0") - def coefficients(self): - """ - Model coefficients. - """ - return self._call_java("coefficients") - - @property - @since("1.6.0") - def intercept(self): - """ - Model intercept. - """ - return self._call_java("intercept") - - @property - @since("1.6.0") - def scale(self): - """ - Model scale parameter. 
- """ - return self._call_java("scale") - - @since("2.0.0") - def predictQuantiles(self, features): - """ - Predicted Quantiles - """ - return self._call_java("predictQuantiles", features) - - @since("2.0.0") - def predict(self, features): - """ - Predicted value - """ - return self._call_java("predict", features) - - -@inherit_doc -class GeneralizedLinearRegression(JavaEstimator, HasLabelCol, HasFeaturesCol, HasPredictionCol, - HasFitIntercept, HasMaxIter, HasTol, HasRegParam, HasWeightCol, - HasSolver, JavaMLWritable, JavaMLReadable): - """ - .. note:: Experimental - - Generalized Linear Regression. - - Fit a Generalized Linear Model specified by giving a symbolic description of the linear - predictor (link function) and a description of the error distribution (family). It supports - "gaussian", "binomial", "poisson", "gamma" and "tweedie" as family. Valid link functions for - each family is listed below. The first link function of each family is the default one. - - * "gaussian" -> "identity", "log", "inverse" - - * "binomial" -> "logit", "probit", "cloglog" - - * "poisson" -> "log", "identity", "sqrt" - - * "gamma" -> "inverse", "identity", "log" - - * "tweedie" -> power link function specified through "linkPower". \ - The default link power in the tweedie family is 1 - variancePower. - - .. seealso:: `GLM `_ - - >>> from pyspark.ml.linalg import Vectors - >>> df = spark.createDataFrame([ - ... (1.0, Vectors.dense(0.0, 0.0)), - ... (1.0, Vectors.dense(1.0, 2.0)), - ... (2.0, Vectors.dense(0.0, 0.0)), - ... (2.0, Vectors.dense(1.0, 1.0)),], ["label", "features"]) - >>> glr = GeneralizedLinearRegression(family="gaussian", link="identity", linkPredictionCol="p") - >>> model = glr.fit(df) - >>> transformed = model.transform(df) - >>> abs(transformed.head().prediction - 1.5) < 0.001 - True - >>> abs(transformed.head().p - 1.5) < 0.001 - True - >>> model.coefficients - DenseVector([1.5..., -1.0...]) - >>> model.numFeatures - 2 - >>> abs(model.intercept - 1.5) < 0.001 - True - >>> glr_path = temp_path + "/glr" - >>> glr.save(glr_path) - >>> glr2 = GeneralizedLinearRegression.load(glr_path) - >>> glr.getFamily() == glr2.getFamily() - True - >>> model_path = temp_path + "/glr_model" - >>> model.save(model_path) - >>> model2 = GeneralizedLinearRegressionModel.load(model_path) - >>> model.intercept == model2.intercept - True - >>> model.coefficients[0] == model2.coefficients[0] - True - - .. versionadded:: 2.0.0 - """ - - family = Param(Params._dummy(), "family", "The name of family which is a description of " + - "the error distribution to be used in the model. Supported options: " + - "gaussian (default), binomial, poisson, gamma and tweedie.", - typeConverter=TypeConverters.toString) - link = Param(Params._dummy(), "link", "The name of link function which provides the " + - "relationship between the linear predictor and the mean of the distribution " + - "function. Supported options: identity, log, inverse, logit, probit, cloglog " + - "and sqrt.", typeConverter=TypeConverters.toString) - linkPredictionCol = Param(Params._dummy(), "linkPredictionCol", "link prediction (linear " + - "predictor) column name", typeConverter=TypeConverters.toString) - variancePower = Param(Params._dummy(), "variancePower", "The power in the variance function " + - "of the Tweedie distribution which characterizes the relationship " + - "between the variance and mean of the distribution. Only applicable " + - "for the Tweedie family. 
Supported values: 0 and [1, Inf).", - typeConverter=TypeConverters.toFloat) - linkPower = Param(Params._dummy(), "linkPower", "The index in the power link function. " + - "Only applicable to the Tweedie family.", - typeConverter=TypeConverters.toFloat) - solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + - "options: irls.", typeConverter=TypeConverters.toString) - offsetCol = Param(Params._dummy(), "offsetCol", "The offset column name. If this is not set " + - "or empty, we treat all instance offsets as 0.0", - typeConverter=TypeConverters.toString) - - @keyword_only - def __init__(self, labelCol="label", featuresCol="features", predictionCol="prediction", - family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, - regParam=0.0, weightCol=None, solver="irls", linkPredictionCol=None, - variancePower=0.0, linkPower=None, offsetCol=None): - """ - __init__(self, labelCol="label", featuresCol="features", predictionCol="prediction", \ - family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, \ - regParam=0.0, weightCol=None, solver="irls", linkPredictionCol=None, \ - variancePower=0.0, linkPower=None, offsetCol=None) - """ - super(GeneralizedLinearRegression, self).__init__() - self._java_obj = self._new_java_obj( - "org.apache.spark.ml.regression.GeneralizedLinearRegression", self.uid) - self._setDefault(family="gaussian", maxIter=25, tol=1e-6, regParam=0.0, solver="irls", - variancePower=0.0) - kwargs = self._input_kwargs - - self.setParams(**kwargs) - - @keyword_only - @since("2.0.0") - def setParams(self, labelCol="label", featuresCol="features", predictionCol="prediction", - family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, - regParam=0.0, weightCol=None, solver="irls", linkPredictionCol=None, - variancePower=0.0, linkPower=None, offsetCol=None): - """ - setParams(self, labelCol="label", featuresCol="features", predictionCol="prediction", \ - family="gaussian", link=None, fitIntercept=True, maxIter=25, tol=1e-6, \ - regParam=0.0, weightCol=None, solver="irls", linkPredictionCol=None, \ - variancePower=0.0, linkPower=None, offsetCol=None) - Sets params for generalized linear regression. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _create_model(self, java_model): - return GeneralizedLinearRegressionModel(java_model) - - @since("2.0.0") - def setFamily(self, value): - """ - Sets the value of :py:attr:`family`. - """ - return self._set(family=value) - - @since("2.0.0") - def getFamily(self): - """ - Gets the value of family or its default value. - """ - return self.getOrDefault(self.family) - - @since("2.0.0") - def setLinkPredictionCol(self, value): - """ - Sets the value of :py:attr:`linkPredictionCol`. - """ - return self._set(linkPredictionCol=value) - - @since("2.0.0") - def getLinkPredictionCol(self): - """ - Gets the value of linkPredictionCol or its default value. - """ - return self.getOrDefault(self.linkPredictionCol) - - @since("2.0.0") - def setLink(self, value): - """ - Sets the value of :py:attr:`link`. - """ - return self._set(link=value) - - @since("2.0.0") - def getLink(self): - """ - Gets the value of link or its default value. - """ - return self.getOrDefault(self.link) - - @since("2.2.0") - def setVariancePower(self, value): - """ - Sets the value of :py:attr:`variancePower`. - """ - return self._set(variancePower=value) - - @since("2.2.0") - def getVariancePower(self): - """ - Gets the value of variancePower or its default value. 
- """ - return self.getOrDefault(self.variancePower) - - @since("2.2.0") - def setLinkPower(self, value): - """ - Sets the value of :py:attr:`linkPower`. - """ - return self._set(linkPower=value) - - @since("2.2.0") - def getLinkPower(self): - """ - Gets the value of linkPower or its default value. - """ - return self.getOrDefault(self.linkPower) - - @since("2.3.0") - def setOffsetCol(self, value): - """ - Sets the value of :py:attr:`offsetCol`. - """ - return self._set(offsetCol=value) - - @since("2.3.0") - def getOffsetCol(self): - """ - Gets the value of offsetCol or its default value. - """ - return self.getOrDefault(self.offsetCol) - - -class GeneralizedLinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, - JavaMLReadable): - """ - .. note:: Experimental - - Model fitted by :class:`GeneralizedLinearRegression`. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def coefficients(self): - """ - Model coefficients. - """ - return self._call_java("coefficients") - - @property - @since("2.0.0") - def intercept(self): - """ - Model intercept. - """ - return self._call_java("intercept") - - @property - @since("2.0.0") - def summary(self): - """ - Gets summary (e.g. residuals, deviance, pValues) of model on - training set. An exception is thrown if - `trainingSummary is None`. - """ - if self.hasSummary: - java_glrt_summary = self._call_java("summary") - return GeneralizedLinearRegressionTrainingSummary(java_glrt_summary) - else: - raise RuntimeError("No training summary available for this %s" % - self.__class__.__name__) - - @property - @since("2.0.0") - def hasSummary(self): - """ - Indicates whether a training summary exists for this model - instance. - """ - return self._call_java("hasSummary") - - @since("2.0.0") - def evaluate(self, dataset): - """ - Evaluates the model on a test dataset. - - :param dataset: - Test dataset to evaluate model on, where dataset is an - instance of :py:class:`pyspark.sql.DataFrame` - """ - if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) - java_glr_summary = self._call_java("evaluate", dataset) - return GeneralizedLinearRegressionSummary(java_glr_summary) - - -class GeneralizedLinearRegressionSummary(JavaWrapper): - """ - .. note:: Experimental - - Generalized linear regression results evaluated on a dataset. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def predictions(self): - """ - Predictions output by the model's `transform` method. - """ - return self._call_java("predictions") - - @property - @since("2.0.0") - def predictionCol(self): - """ - Field in :py:attr:`predictions` which gives the predicted value of each instance. - This is set to a new column name if the original model's `predictionCol` is not set. - """ - return self._call_java("predictionCol") - - @property - @since("2.2.0") - def numInstances(self): - """ - Number of instances in DataFrame predictions. - """ - return self._call_java("numInstances") - - @property - @since("2.0.0") - def rank(self): - """ - The numeric rank of the fitted linear model. - """ - return self._call_java("rank") - - @property - @since("2.0.0") - def degreesOfFreedom(self): - """ - Degrees of freedom. - """ - return self._call_java("degreesOfFreedom") - - @property - @since("2.0.0") - def residualDegreeOfFreedom(self): - """ - The residual degrees of freedom. 
- """ - return self._call_java("residualDegreeOfFreedom") - - @property - @since("2.0.0") - def residualDegreeOfFreedomNull(self): - """ - The residual degrees of freedom for the null model. - """ - return self._call_java("residualDegreeOfFreedomNull") - - @since("2.0.0") - def residuals(self, residualsType="deviance"): - """ - Get the residuals of the fitted model by type. - - :param residualsType: The type of residuals which should be returned. - Supported options: deviance (default), pearson, working, and response. - """ - return self._call_java("residuals", residualsType) - - @property - @since("2.0.0") - def nullDeviance(self): - """ - The deviance for the null model. - """ - return self._call_java("nullDeviance") - - @property - @since("2.0.0") - def deviance(self): - """ - The deviance for the fitted model. - """ - return self._call_java("deviance") - - @property - @since("2.0.0") - def dispersion(self): - """ - The dispersion of the fitted model. - It is taken as 1.0 for the "binomial" and "poisson" families, and otherwise - estimated by the residual Pearson's Chi-Squared statistic (which is defined as - sum of the squares of the Pearson residuals) divided by the residual degrees of freedom. - """ - return self._call_java("dispersion") - - @property - @since("2.0.0") - def aic(self): - """ - Akaike's "An Information Criterion"(AIC) for the fitted model. - """ - return self._call_java("aic") - - -@inherit_doc -class GeneralizedLinearRegressionTrainingSummary(GeneralizedLinearRegressionSummary): - """ - .. note:: Experimental - - Generalized linear regression training results. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def numIterations(self): - """ - Number of training iterations. - """ - return self._call_java("numIterations") - - @property - @since("2.0.0") - def solver(self): - """ - The numeric solver used for training. - """ - return self._call_java("solver") - - @property - @since("2.0.0") - def coefficientStandardErrors(self): - """ - Standard error of estimated coefficients and intercept. - - If :py:attr:`GeneralizedLinearRegression.fitIntercept` is set to True, - then the last element returned corresponds to the intercept. - """ - return self._call_java("coefficientStandardErrors") - - @property - @since("2.0.0") - def tValues(self): - """ - T-statistic of estimated coefficients and intercept. - - If :py:attr:`GeneralizedLinearRegression.fitIntercept` is set to True, - then the last element returned corresponds to the intercept. - """ - return self._call_java("tValues") - - @property - @since("2.0.0") - def pValues(self): - """ - Two-sided p-value of estimated coefficients and intercept. - - If :py:attr:`GeneralizedLinearRegression.fitIntercept` is set to True, - then the last element returned corresponds to the intercept. 
- """ - return self._call_java("pValues") - - def __repr__(self): - return self._call_java("toString") - - -if __name__ == "__main__": - import doctest - import pyspark.ml.regression - from pyspark.sql import SparkSession - globs = pyspark.ml.regression.__dict__.copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("ml.regression tests")\ - .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc - globs['spark'] = spark - import tempfile - temp_path = tempfile.mkdtemp() - globs['temp_path'] = temp_path - try: - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - finally: - from shutil import rmtree - try: - rmtree(temp_path) - except OSError: - pass - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/stat.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/stat.py deleted file mode 100644 index 3f42102..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/stat.py +++ /dev/null @@ -1,414 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys - -from pyspark import since, SparkContext -from pyspark.ml.common import _java2py, _py2java -from pyspark.ml.wrapper import JavaWrapper, _jvm -from pyspark.sql.column import Column, _to_seq -from pyspark.sql.functions import lit - - -class ChiSquareTest(object): - """ - .. note:: Experimental - - Conduct Pearson's independence test for every feature against the label. For each feature, - the (feature, label) pairs are converted into a contingency matrix for which the Chi-squared - statistic is computed. All label and feature values must be categorical. - - The null hypothesis is that the occurrence of the outcomes is statistically independent. - - .. versionadded:: 2.2.0 - - """ - @staticmethod - @since("2.2.0") - def test(dataset, featuresCol, labelCol): - """ - Perform a Pearson's independence test using dataset. - - :param dataset: - DataFrame of categorical labels and categorical features. - Real-valued features will be treated as categorical for each distinct value. - :param featuresCol: - Name of features column in dataset, of type `Vector` (`VectorUDT`). - :param labelCol: - Name of label column in dataset, of any numerical type. - :return: - DataFrame containing the test result for every feature against the label. - This DataFrame will contain a single Row with the following fields: - - `pValues: Vector` - - `degreesOfFreedom: Array[Int]` - - `statistics: Vector` - Each of these fields has one value per feature. 
- - >>> from pyspark.ml.linalg import Vectors - >>> from pyspark.ml.stat import ChiSquareTest - >>> dataset = [[0, Vectors.dense([0, 0, 1])], - ... [0, Vectors.dense([1, 0, 1])], - ... [1, Vectors.dense([2, 1, 1])], - ... [1, Vectors.dense([3, 1, 1])]] - >>> dataset = spark.createDataFrame(dataset, ["label", "features"]) - >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label') - >>> chiSqResult.select("degreesOfFreedom").collect()[0] - Row(degreesOfFreedom=[3, 1, 0]) - """ - sc = SparkContext._active_spark_context - javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest - args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)] - return _java2py(sc, javaTestObj.test(*args)) - - -class Correlation(object): - """ - .. note:: Experimental - - Compute the correlation matrix for the input dataset of Vectors using the specified method. - Methods currently supported: `pearson` (default), `spearman`. - - .. note:: For Spearman, a rank correlation, we need to create an RDD[Double] for each column - and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector], - which is fairly costly. Cache the input Dataset before calling corr with `method = 'spearman'` - to avoid recomputing the common lineage. - - .. versionadded:: 2.2.0 - - """ - @staticmethod - @since("2.2.0") - def corr(dataset, column, method="pearson"): - """ - Compute the correlation matrix with specified method using dataset. - - :param dataset: - A Dataset or a DataFrame. - :param column: - The name of the column of vectors for which the correlation coefficient needs - to be computed. This must be a column of the dataset, and it must contain - Vector objects. - :param method: - String specifying the method to use for computing correlation. - Supported: `pearson` (default), `spearman`. - :return: - A DataFrame that contains the correlation matrix of the column of vectors. This - DataFrame contains a single row and a single column of name - '$METHODNAME($COLUMN)'. - - >>> from pyspark.ml.linalg import Vectors - >>> from pyspark.ml.stat import Correlation - >>> dataset = [[Vectors.dense([1, 0, 0, -2])], - ... [Vectors.dense([4, 5, 0, 3])], - ... [Vectors.dense([6, 7, 0, 8])], - ... [Vectors.dense([9, 0, 0, 1])]] - >>> dataset = spark.createDataFrame(dataset, ['features']) - >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0] - >>> print(str(pearsonCorr).replace('nan', 'NaN')) - DenseMatrix([[ 1. , 0.0556..., NaN, 0.4004...], - [ 0.0556..., 1. , NaN, 0.9135...], - [ NaN, NaN, 1. , NaN], - [ 0.4004..., 0.9135..., NaN, 1. ]]) - >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0] - >>> print(str(spearmanCorr).replace('nan', 'NaN')) - DenseMatrix([[ 1. , 0.1054..., NaN, 0.4 ], - [ 0.1054..., 1. , NaN, 0.9486... ], - [ NaN, NaN, 1. , NaN], - [ 0.4 , 0.9486... , NaN, 1. ]]) - """ - sc = SparkContext._active_spark_context - javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation - args = [_py2java(sc, arg) for arg in (dataset, column, method)] - return _java2py(sc, javaCorrObj.corr(*args)) - - -class KolmogorovSmirnovTest(object): - """ - .. note:: Experimental - - Conduct the two-sided Kolmogorov Smirnov (KS) test for data sampled from a continuous - distribution. 
- - By comparing the largest difference between the empirical cumulative - distribution of the sample data and the theoretical distribution we can provide a test for the - the null hypothesis that the sample data comes from that theoretical distribution. - - .. versionadded:: 2.4.0 - - """ - @staticmethod - @since("2.4.0") - def test(dataset, sampleCol, distName, *params): - """ - Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution - equality. Currently supports the normal distribution, taking as parameters the mean and - standard deviation. - - :param dataset: - a Dataset or a DataFrame containing the sample of data to test. - :param sampleCol: - Name of sample column in dataset, of any numerical type. - :param distName: - a `string` name for a theoretical distribution, currently only support "norm". - :param params: - a list of `Double` values specifying the parameters to be used for the theoretical - distribution. For "norm" distribution, the parameters includes mean and variance. - :return: - A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data. - This DataFrame will contain a single Row with the following fields: - - `pValue: Double` - - `statistic: Double` - - >>> from pyspark.ml.stat import KolmogorovSmirnovTest - >>> dataset = [[-1.0], [0.0], [1.0]] - >>> dataset = spark.createDataFrame(dataset, ['sample']) - >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first() - >>> round(ksResult.pValue, 3) - 1.0 - >>> round(ksResult.statistic, 3) - 0.175 - >>> dataset = [[2.0], [3.0], [4.0]] - >>> dataset = spark.createDataFrame(dataset, ['sample']) - >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first() - >>> round(ksResult.pValue, 3) - 1.0 - >>> round(ksResult.statistic, 3) - 0.175 - """ - sc = SparkContext._active_spark_context - javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest - dataset = _py2java(sc, dataset) - params = [float(param) for param in params] - return _java2py(sc, javaTestObj.test(dataset, sampleCol, distName, - _jvm().PythonUtils.toSeq(params))) - - -class Summarizer(object): - """ - .. note:: Experimental - - Tools for vectorized statistics on MLlib Vectors. - The methods in this package provide various statistics for Vectors contained inside DataFrames. - This class lets users pick the statistics they would like to extract for a given column. - - >>> from pyspark.ml.stat import Summarizer - >>> from pyspark.sql import Row - >>> from pyspark.ml.linalg import Vectors - >>> summarizer = Summarizer.metrics("mean", "count") - >>> df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), - ... 
Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() - >>> df.select(summarizer.summary(df.features, df.weight)).show(truncate=False) - +-----------------------------------+ - |aggregate_metrics(features, weight)| - +-----------------------------------+ - |[[1.0,1.0,1.0], 1] | - +-----------------------------------+ - - >>> df.select(summarizer.summary(df.features)).show(truncate=False) - +--------------------------------+ - |aggregate_metrics(features, 1.0)| - +--------------------------------+ - |[[1.0,1.5,2.0], 2] | - +--------------------------------+ - - >>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) - +--------------+ - |mean(features)| - +--------------+ - |[1.0,1.0,1.0] | - +--------------+ - - >>> df.select(Summarizer.mean(df.features)).show(truncate=False) - +--------------+ - |mean(features)| - +--------------+ - |[1.0,1.5,2.0] | - +--------------+ - - - .. versionadded:: 2.4.0 - - """ - @staticmethod - @since("2.4.0") - def mean(col, weightCol=None): - """ - return a column of mean summary - """ - return Summarizer._get_single_metric(col, weightCol, "mean") - - @staticmethod - @since("2.4.0") - def variance(col, weightCol=None): - """ - return a column of variance summary - """ - return Summarizer._get_single_metric(col, weightCol, "variance") - - @staticmethod - @since("2.4.0") - def count(col, weightCol=None): - """ - return a column of count summary - """ - return Summarizer._get_single_metric(col, weightCol, "count") - - @staticmethod - @since("2.4.0") - def numNonZeros(col, weightCol=None): - """ - return a column of numNonZero summary - """ - return Summarizer._get_single_metric(col, weightCol, "numNonZeros") - - @staticmethod - @since("2.4.0") - def max(col, weightCol=None): - """ - return a column of max summary - """ - return Summarizer._get_single_metric(col, weightCol, "max") - - @staticmethod - @since("2.4.0") - def min(col, weightCol=None): - """ - return a column of min summary - """ - return Summarizer._get_single_metric(col, weightCol, "min") - - @staticmethod - @since("2.4.0") - def normL1(col, weightCol=None): - """ - return a column of normL1 summary - """ - return Summarizer._get_single_metric(col, weightCol, "normL1") - - @staticmethod - @since("2.4.0") - def normL2(col, weightCol=None): - """ - return a column of normL2 summary - """ - return Summarizer._get_single_metric(col, weightCol, "normL2") - - @staticmethod - def _check_param(featuresCol, weightCol): - if weightCol is None: - weightCol = lit(1.0) - if not isinstance(featuresCol, Column) or not isinstance(weightCol, Column): - raise TypeError("featureCol and weightCol should be a Column") - return featuresCol, weightCol - - @staticmethod - def _get_single_metric(col, weightCol, metric): - col, weightCol = Summarizer._check_param(col, weightCol) - return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric, - col._jc, weightCol._jc)) - - @staticmethod - @since("2.4.0") - def metrics(*metrics): - """ - Given a list of metrics, provides a builder that it turns computes metrics from a column. - - See the documentation of [[Summarizer]] for an example. - - The following metrics are accepted (case sensitive): - - mean: a vector that contains the coefficient-wise mean. - - variance: a vector tha contains the coefficient-wise variance. - - count: the count of all vectors seen. - - numNonzeros: a vector with the number of non-zeros for each coefficients - - max: the maximum for each coefficient. - - min: the minimum for each coefficient. 
- - normL2: the Euclidean norm for each coefficient. - - normL1: the L1 norm of each coefficient (sum of the absolute values). - - :param metrics: - metrics that can be provided. - :return: - an object of :py:class:`pyspark.ml.stat.SummaryBuilder` - - Note: Currently, the performance of this interface is about 2x~3x slower then using the RDD - interface. - """ - sc = SparkContext._active_spark_context - js = JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer.metrics", - _to_seq(sc, metrics)) - return SummaryBuilder(js) - - -class SummaryBuilder(JavaWrapper): - """ - .. note:: Experimental - - A builder object that provides summary statistics about a given column. - - Users should not directly create such builders, but instead use one of the methods in - :py:class:`pyspark.ml.stat.Summarizer` - - .. versionadded:: 2.4.0 - - """ - def __init__(self, jSummaryBuilder): - super(SummaryBuilder, self).__init__(jSummaryBuilder) - - @since("2.4.0") - def summary(self, featuresCol, weightCol=None): - """ - Returns an aggregate object that contains the summary of the column with the requested - metrics. - - :param featuresCol: - a column that contains features Vector object. - :param weightCol: - a column that contains weight value. Default weight is 1.0. - :return: - an aggregate column that contains the statistics. The exact content of this - structure is determined during the creation of the builder. - """ - featuresCol, weightCol = Summarizer._check_param(featuresCol, weightCol) - return Column(self._java_obj.summary(featuresCol._jc, weightCol._jc)) - - -if __name__ == "__main__": - import doctest - import numpy - import pyspark.ml.stat - from pyspark.sql import SparkSession - try: - # Numpy 1.14+ changed it's string format. - numpy.set_printoptions(legacy='1.13') - except TypeError: - pass - - globs = pyspark.ml.stat.__dict__.copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder \ - .master("local[2]") \ - .appName("ml.stat tests") \ - .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc - globs['spark'] = spark - - failure_count, test_count = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/tests.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/tests.py deleted file mode 100755 index 821e037..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/tests.py +++ /dev/null @@ -1,2762 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Unit tests for MLlib Python DataFrame-based APIs. 
-""" -import sys -if sys.version > '3': - xrange = range - basestring = str - -try: - import xmlrunner -except ImportError: - xmlrunner = None - -if sys.version_info[:2] <= (2, 6): - try: - import unittest2 as unittest - except ImportError: - sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') - sys.exit(1) -else: - import unittest - -from shutil import rmtree -import tempfile -import array as pyarray -import numpy as np -from numpy import abs, all, arange, array, array_equal, inf, ones, tile, zeros -import inspect -import py4j - -from pyspark import keyword_only, SparkContext -from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer, UnaryTransformer -from pyspark.ml.classification import * -from pyspark.ml.clustering import * -from pyspark.ml.common import _java2py, _py2java -from pyspark.ml.evaluation import BinaryClassificationEvaluator, ClusteringEvaluator, \ - MulticlassClassificationEvaluator, RegressionEvaluator -from pyspark.ml.feature import * -from pyspark.ml.fpm import FPGrowth, FPGrowthModel -from pyspark.ml.image import ImageSchema -from pyspark.ml.linalg import DenseMatrix, DenseMatrix, DenseVector, Matrices, MatrixUDT, \ - SparseMatrix, SparseVector, Vector, VectorUDT, Vectors -from pyspark.ml.param import Param, Params, TypeConverters -from pyspark.ml.param.shared import HasInputCol, HasMaxIter, HasSeed -from pyspark.ml.recommendation import ALS -from pyspark.ml.regression import DecisionTreeRegressor, GeneralizedLinearRegression, \ - LinearRegression -from pyspark.ml.stat import ChiSquareTest -from pyspark.ml.tuning import * -from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaParams, JavaWrapper -from pyspark.serializers import PickleSerializer -from pyspark.sql import DataFrame, Row, SparkSession, HiveContext -from pyspark.sql.functions import rand -from pyspark.sql.types import DoubleType, IntegerType -from pyspark.storagelevel import * -from pyspark.tests import QuietTest, ReusedPySparkTestCase as PySparkTestCase - -ser = PickleSerializer() - - -class MLlibTestCase(unittest.TestCase): - def setUp(self): - self.sc = SparkContext('local[4]', "MLlib tests") - self.spark = SparkSession(self.sc) - - def tearDown(self): - self.spark.stop() - - -class SparkSessionTestCase(PySparkTestCase): - @classmethod - def setUpClass(cls): - PySparkTestCase.setUpClass() - cls.spark = SparkSession(cls.sc) - - @classmethod - def tearDownClass(cls): - PySparkTestCase.tearDownClass() - cls.spark.stop() - - -class MockDataset(DataFrame): - - def __init__(self): - self.index = 0 - - -class HasFake(Params): - - def __init__(self): - super(HasFake, self).__init__() - self.fake = Param(self, "fake", "fake param") - - def getFake(self): - return self.getOrDefault(self.fake) - - -class MockTransformer(Transformer, HasFake): - - def __init__(self): - super(MockTransformer, self).__init__() - self.dataset_index = None - - def _transform(self, dataset): - self.dataset_index = dataset.index - dataset.index += 1 - return dataset - - -class MockUnaryTransformer(UnaryTransformer, DefaultParamsReadable, DefaultParamsWritable): - - shift = Param(Params._dummy(), "shift", "The amount by which to shift " + - "data in a DataFrame", - typeConverter=TypeConverters.toFloat) - - def __init__(self, shiftVal=1): - super(MockUnaryTransformer, self).__init__() - self._setDefault(shift=1) - self._set(shift=shiftVal) - - def getShift(self): - return self.getOrDefault(self.shift) - - def setShift(self, shift): - self._set(shift=shift) - - def 
createTransformFunc(self): - shiftVal = self.getShift() - return lambda x: x + shiftVal - - def outputDataType(self): - return DoubleType() - - def validateInputType(self, inputType): - if inputType != DoubleType(): - raise TypeError("Bad input type: {}. ".format(inputType) + - "Requires Double.") - - -class MockEstimator(Estimator, HasFake): - - def __init__(self): - super(MockEstimator, self).__init__() - self.dataset_index = None - - def _fit(self, dataset): - self.dataset_index = dataset.index - model = MockModel() - self._copyValues(model) - return model - - -class MockModel(MockTransformer, Model, HasFake): - pass - - -class JavaWrapperMemoryTests(SparkSessionTestCase): - - def test_java_object_gets_detached(self): - df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), - (0.0, 2.0, Vectors.sparse(1, [], []))], - ["label", "weight", "features"]) - lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight", - fitIntercept=False) - - model = lr.fit(df) - summary = model.summary - - self.assertIsInstance(model, JavaWrapper) - self.assertIsInstance(summary, JavaWrapper) - self.assertIsInstance(model, JavaParams) - self.assertNotIsInstance(summary, JavaParams) - - error_no_object = 'Target Object ID does not exist for this gateway' - - self.assertIn("LinearRegression_", model._java_obj.toString()) - self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString()) - - model.__del__() - - with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object): - model._java_obj.toString() - self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString()) - - try: - summary.__del__() - except: - pass - - with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object): - model._java_obj.toString() - with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object): - summary._java_obj.toString() - - -class ParamTypeConversionTests(PySparkTestCase): - """ - Test that param type conversion happens. 
- """ - - def test_int(self): - lr = LogisticRegression(maxIter=5.0) - self.assertEqual(lr.getMaxIter(), 5) - self.assertTrue(type(lr.getMaxIter()) == int) - self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt")) - self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1)) - - def test_float(self): - lr = LogisticRegression(tol=1) - self.assertEqual(lr.getTol(), 1.0) - self.assertTrue(type(lr.getTol()) == float) - self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat")) - - def test_vector(self): - ewp = ElementwiseProduct(scalingVec=[1, 3]) - self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0])) - ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4])) - self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4])) - self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"])) - - def test_list(self): - l = [0, 1] - for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), - range(len(l)), l), pyarray.array('l', l), xrange(2), tuple(l)]: - converted = TypeConverters.toList(lst_like) - self.assertEqual(type(converted), list) - self.assertListEqual(converted, l) - - def test_list_int(self): - for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]), - SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0), - pyarray.array('d', [1.0, 2.0])]: - vs = VectorSlicer(indices=indices) - self.assertListEqual(vs.getIndices(), [1, 2]) - self.assertTrue(all([type(v) == int for v in vs.getIndices()])) - self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"])) - - def test_list_float(self): - b = Bucketizer(splits=[1, 4]) - self.assertEqual(b.getSplits(), [1.0, 4.0]) - self.assertTrue(all([type(v) == float for v in b.getSplits()])) - self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0])) - - def test_list_string(self): - for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]: - idx_to_string = IndexToString(labels=labels) - self.assertListEqual(idx_to_string.getLabels(), ['a', 'b']) - self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2])) - - def test_string(self): - lr = LogisticRegression() - for col in ['features', u'features', np.str_('features')]: - lr.setFeaturesCol(col) - self.assertEqual(lr.getFeaturesCol(), 'features') - self.assertRaises(TypeError, lambda: LogisticRegression(featuresCol=2.3)) - - def test_bool(self): - self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1)) - self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false")) - - -class PipelineTests(PySparkTestCase): - - def test_pipeline(self): - dataset = MockDataset() - estimator0 = MockEstimator() - transformer1 = MockTransformer() - estimator2 = MockEstimator() - transformer3 = MockTransformer() - pipeline = Pipeline(stages=[estimator0, transformer1, estimator2, transformer3]) - pipeline_model = pipeline.fit(dataset, {estimator0.fake: 0, transformer1.fake: 1}) - model0, transformer1, model2, transformer3 = pipeline_model.stages - self.assertEqual(0, model0.dataset_index) - self.assertEqual(0, model0.getFake()) - self.assertEqual(1, transformer1.dataset_index) - self.assertEqual(1, transformer1.getFake()) - self.assertEqual(2, dataset.index) - self.assertIsNone(model2.dataset_index, "The last model shouldn't be called in fit.") - self.assertIsNone(transformer3.dataset_index, - "The last transformer shouldn't be called in fit.") - dataset = pipeline_model.transform(dataset) - self.assertEqual(2, model0.dataset_index) - 
self.assertEqual(3, transformer1.dataset_index) - self.assertEqual(4, model2.dataset_index) - self.assertEqual(5, transformer3.dataset_index) - self.assertEqual(6, dataset.index) - - def test_identity_pipeline(self): - dataset = MockDataset() - - def doTransform(pipeline): - pipeline_model = pipeline.fit(dataset) - return pipeline_model.transform(dataset) - # check that empty pipeline did not perform any transformation - self.assertEqual(dataset.index, doTransform(Pipeline(stages=[])).index) - # check that failure to set stages param will raise KeyError for missing param - self.assertRaises(KeyError, lambda: doTransform(Pipeline())) - - -class TestParams(HasMaxIter, HasInputCol, HasSeed): - """ - A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed. - """ - @keyword_only - def __init__(self, seed=None): - super(TestParams, self).__init__() - self._setDefault(maxIter=10) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - def setParams(self, seed=None): - """ - setParams(self, seed=None) - Sets params for this test. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - -class OtherTestParams(HasMaxIter, HasInputCol, HasSeed): - """ - A subclass of Params mixed with HasMaxIter, HasInputCol and HasSeed. - """ - @keyword_only - def __init__(self, seed=None): - super(OtherTestParams, self).__init__() - self._setDefault(maxIter=10) - kwargs = self._input_kwargs - self.setParams(**kwargs) - - @keyword_only - def setParams(self, seed=None): - """ - setParams(self, seed=None) - Sets params for this test. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - -class HasThrowableProperty(Params): - - def __init__(self): - super(HasThrowableProperty, self).__init__() - self.p = Param(self, "none", "empty param") - - @property - def test_property(self): - raise RuntimeError("Test property to raise error when invoked") - - -class ParamTests(SparkSessionTestCase): - - def test_copy_new_parent(self): - testParams = TestParams() - # Copying an instantiated param should fail - with self.assertRaises(ValueError): - testParams.maxIter._copy_new_parent(testParams) - # Copying a dummy param should succeed - TestParams.maxIter._copy_new_parent(testParams) - maxIter = testParams.maxIter - self.assertEqual(maxIter.name, "maxIter") - self.assertEqual(maxIter.doc, "max number of iterations (>= 0).") - self.assertTrue(maxIter.parent == testParams.uid) - - def test_param(self): - testParams = TestParams() - maxIter = testParams.maxIter - self.assertEqual(maxIter.name, "maxIter") - self.assertEqual(maxIter.doc, "max number of iterations (>= 0).") - self.assertTrue(maxIter.parent == testParams.uid) - - def test_hasparam(self): - testParams = TestParams() - self.assertTrue(all([testParams.hasParam(p.name) for p in testParams.params])) - self.assertFalse(testParams.hasParam("notAParameter")) - self.assertTrue(testParams.hasParam(u"maxIter")) - - def test_resolveparam(self): - testParams = TestParams() - self.assertEqual(testParams._resolveParam(testParams.maxIter), testParams.maxIter) - self.assertEqual(testParams._resolveParam("maxIter"), testParams.maxIter) - - self.assertEqual(testParams._resolveParam(u"maxIter"), testParams.maxIter) - if sys.version_info[0] >= 3: - # In Python 3, it is allowed to get/set attributes with non-ascii characters. 
- e_cls = AttributeError - else: - e_cls = UnicodeEncodeError - self.assertRaises(e_cls, lambda: testParams._resolveParam(u"아")) - - def test_params(self): - testParams = TestParams() - maxIter = testParams.maxIter - inputCol = testParams.inputCol - seed = testParams.seed - - params = testParams.params - self.assertEqual(params, [inputCol, maxIter, seed]) - - self.assertTrue(testParams.hasParam(maxIter.name)) - self.assertTrue(testParams.hasDefault(maxIter)) - self.assertFalse(testParams.isSet(maxIter)) - self.assertTrue(testParams.isDefined(maxIter)) - self.assertEqual(testParams.getMaxIter(), 10) - testParams.setMaxIter(100) - self.assertTrue(testParams.isSet(maxIter)) - self.assertEqual(testParams.getMaxIter(), 100) - - self.assertTrue(testParams.hasParam(inputCol.name)) - self.assertFalse(testParams.hasDefault(inputCol)) - self.assertFalse(testParams.isSet(inputCol)) - self.assertFalse(testParams.isDefined(inputCol)) - with self.assertRaises(KeyError): - testParams.getInputCol() - - otherParam = Param(Params._dummy(), "otherParam", "Parameter used to test that " + - "set raises an error for a non-member parameter.", - typeConverter=TypeConverters.toString) - with self.assertRaises(ValueError): - testParams.set(otherParam, "value") - - # Since the default is normally random, set it to a known number for debug str - testParams._setDefault(seed=41) - testParams.setSeed(43) - - self.assertEqual( - testParams.explainParams(), - "\n".join(["inputCol: input column name. (undefined)", - "maxIter: max number of iterations (>= 0). (default: 10, current: 100)", - "seed: random seed. (default: 41, current: 43)"])) - - def test_kmeans_param(self): - algo = KMeans() - self.assertEqual(algo.getInitMode(), "k-means||") - algo.setK(10) - self.assertEqual(algo.getK(), 10) - algo.setInitSteps(10) - self.assertEqual(algo.getInitSteps(), 10) - self.assertEqual(algo.getDistanceMeasure(), "euclidean") - algo.setDistanceMeasure("cosine") - self.assertEqual(algo.getDistanceMeasure(), "cosine") - - def test_hasseed(self): - noSeedSpecd = TestParams() - withSeedSpecd = TestParams(seed=42) - other = OtherTestParams() - # Check that we no longer use 42 as the magic number - self.assertNotEqual(noSeedSpecd.getSeed(), 42) - origSeed = noSeedSpecd.getSeed() - # Check that we only compute the seed once - self.assertEqual(noSeedSpecd.getSeed(), origSeed) - # Check that a specified seed is honored - self.assertEqual(withSeedSpecd.getSeed(), 42) - # Check that a different class has a different seed - self.assertNotEqual(other.getSeed(), noSeedSpecd.getSeed()) - - def test_param_property_error(self): - param_store = HasThrowableProperty() - self.assertRaises(RuntimeError, lambda: param_store.test_property) - params = param_store.params # should not invoke the property 'test_property' - self.assertEqual(len(params), 1) - - def test_word2vec_param(self): - model = Word2Vec().setWindowSize(6) - # Check windowSize is set properly - self.assertEqual(model.getWindowSize(), 6) - - def test_copy_param_extras(self): - tp = TestParams(seed=42) - extra = {tp.getParam(TestParams.inputCol.name): "copy_input"} - tp_copy = tp.copy(extra=extra) - self.assertEqual(tp.uid, tp_copy.uid) - self.assertEqual(tp.params, tp_copy.params) - for k, v in extra.items(): - self.assertTrue(tp_copy.isDefined(k)) - self.assertEqual(tp_copy.getOrDefault(k), v) - copied_no_extra = {} - for k, v in tp_copy._paramMap.items(): - if k not in extra: - copied_no_extra[k] = v - self.assertEqual(tp._paramMap, copied_no_extra) - 
self.assertEqual(tp._defaultParamMap, tp_copy._defaultParamMap) - - def test_logistic_regression_check_thresholds(self): - self.assertIsInstance( - LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]), - LogisticRegression - ) - - self.assertRaisesRegexp( - ValueError, - "Logistic Regression getThreshold found inconsistent.*$", - LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] - ) - - def test_preserve_set_state(self): - dataset = self.spark.createDataFrame([(0.5,)], ["data"]) - binarizer = Binarizer(inputCol="data") - self.assertFalse(binarizer.isSet("threshold")) - binarizer.transform(dataset) - binarizer._transfer_params_from_java() - self.assertFalse(binarizer.isSet("threshold"), - "Params not explicitly set should remain unset after transform") - - def test_default_params_transferred(self): - dataset = self.spark.createDataFrame([(0.5,)], ["data"]) - binarizer = Binarizer(inputCol="data") - # intentionally change the pyspark default, but don't set it - binarizer._defaultParamMap[binarizer.outputCol] = "my_default" - result = binarizer.transform(dataset).select("my_default").collect() - self.assertFalse(binarizer.isSet(binarizer.outputCol)) - self.assertEqual(result[0][0], 1.0) - - @staticmethod - def check_params(test_self, py_stage, check_params_exist=True): - """ - Checks common requirements for Params.params: - - set of params exist in Java and Python and are ordered by names - - param parent has the same UID as the object's UID - - default param value from Java matches value in Python - - optionally check if all params from Java also exist in Python - """ - py_stage_str = "%s %s" % (type(py_stage), py_stage) - if not hasattr(py_stage, "_to_java"): - return - java_stage = py_stage._to_java() - if java_stage is None: - return - test_self.assertEqual(py_stage.uid, java_stage.uid(), msg=py_stage_str) - if check_params_exist: - param_names = [p.name for p in py_stage.params] - java_params = list(java_stage.params()) - java_param_names = [jp.name() for jp in java_params] - test_self.assertEqual( - param_names, sorted(java_param_names), - "Param list in Python does not match Java for %s:\nJava = %s\nPython = %s" - % (py_stage_str, java_param_names, param_names)) - for p in py_stage.params: - test_self.assertEqual(p.parent, py_stage.uid) - java_param = java_stage.getParam(p.name) - py_has_default = py_stage.hasDefault(p) - java_has_default = java_stage.hasDefault(java_param) - test_self.assertEqual(py_has_default, java_has_default, - "Default value mismatch of param %s for Params %s" - % (p.name, str(py_stage))) - if py_has_default: - if p.name == "seed": - continue # Random seeds between Spark and PySpark are different - java_default = _java2py(test_self.sc, - java_stage.clear(java_param).getOrDefault(java_param)) - py_stage._clear(p) - py_default = py_stage.getOrDefault(p) - # equality test for NaN is always False - if isinstance(java_default, float) and np.isnan(java_default): - java_default = "NaN" - py_default = "NaN" if np.isnan(py_default) else "not NaN" - test_self.assertEqual( - java_default, py_default, - "Java default %s != python default %s of param %s for Params %s" - % (str(java_default), str(py_default), p.name, str(py_stage))) - - -class EvaluatorTests(SparkSessionTestCase): - - def test_java_params(self): - """ - This tests a bug fixed by SPARK-18274 which causes multiple copies - of a Params instance in Python to be linked to the same Java instance. 
- """ - evaluator = RegressionEvaluator(metricName="r2") - df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)]) - evaluator.evaluate(df) - self.assertEqual(evaluator._java_obj.getMetricName(), "r2") - evaluatorCopy = evaluator.copy({evaluator.metricName: "mae"}) - evaluator.evaluate(df) - evaluatorCopy.evaluate(df) - self.assertEqual(evaluator._java_obj.getMetricName(), "r2") - self.assertEqual(evaluatorCopy._java_obj.getMetricName(), "mae") - - def test_clustering_evaluator_with_cosine_distance(self): - featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]), - [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0), ([1.0, 0.5], 2.0), - ([10.0, 4.4], 2.0), ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)]) - dataset = self.spark.createDataFrame(featureAndPredictions, ["features", "prediction"]) - evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine") - self.assertEqual(evaluator.getDistanceMeasure(), "cosine") - self.assertTrue(np.isclose(evaluator.evaluate(dataset), 0.992671213, atol=1e-5)) - - -class FeatureTests(SparkSessionTestCase): - - def test_binarizer(self): - b0 = Binarizer() - self.assertListEqual(b0.params, [b0.inputCol, b0.outputCol, b0.threshold]) - self.assertTrue(all([~b0.isSet(p) for p in b0.params])) - self.assertTrue(b0.hasDefault(b0.threshold)) - self.assertEqual(b0.getThreshold(), 0.0) - b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0) - self.assertTrue(all([b0.isSet(p) for p in b0.params])) - self.assertEqual(b0.getThreshold(), 1.0) - self.assertEqual(b0.getInputCol(), "input") - self.assertEqual(b0.getOutputCol(), "output") - - b0c = b0.copy({b0.threshold: 2.0}) - self.assertEqual(b0c.uid, b0.uid) - self.assertListEqual(b0c.params, b0.params) - self.assertEqual(b0c.getThreshold(), 2.0) - - b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output") - self.assertNotEqual(b1.uid, b0.uid) - self.assertEqual(b1.getThreshold(), 2.0) - self.assertEqual(b1.getInputCol(), "input") - self.assertEqual(b1.getOutputCol(), "output") - - def test_idf(self): - dataset = self.spark.createDataFrame([ - (DenseVector([1.0, 2.0]),), - (DenseVector([0.0, 1.0]),), - (DenseVector([3.0, 0.2]),)], ["tf"]) - idf0 = IDF(inputCol="tf") - self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol]) - idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"}) - self.assertEqual(idf0m.uid, idf0.uid, - "Model should inherit the UID from its parent estimator.") - output = idf0m.transform(dataset) - self.assertIsNotNone(output.head().idf) - # Test that parameters transferred to Python Model - ParamTests.check_params(self, idf0m) - - def test_ngram(self): - dataset = self.spark.createDataFrame([ - Row(input=["a", "b", "c", "d", "e"])]) - ngram0 = NGram(n=4, inputCol="input", outputCol="output") - self.assertEqual(ngram0.getN(), 4) - self.assertEqual(ngram0.getInputCol(), "input") - self.assertEqual(ngram0.getOutputCol(), "output") - transformedDF = ngram0.transform(dataset) - self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"]) - - def test_stopwordsremover(self): - dataset = self.spark.createDataFrame([Row(input=["a", "panda"])]) - stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output") - # Default - self.assertEqual(stopWordRemover.getInputCol(), "input") - transformedDF = stopWordRemover.transform(dataset) - self.assertEqual(transformedDF.head().output, ["panda"]) - self.assertEqual(type(stopWordRemover.getStopWords()), list) - self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], 
basestring)) - # Custom - stopwords = ["panda"] - stopWordRemover.setStopWords(stopwords) - self.assertEqual(stopWordRemover.getInputCol(), "input") - self.assertEqual(stopWordRemover.getStopWords(), stopwords) - transformedDF = stopWordRemover.transform(dataset) - self.assertEqual(transformedDF.head().output, ["a"]) - # with language selection - stopwords = StopWordsRemover.loadDefaultStopWords("turkish") - dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])]) - stopWordRemover.setStopWords(stopwords) - self.assertEqual(stopWordRemover.getStopWords(), stopwords) - transformedDF = stopWordRemover.transform(dataset) - self.assertEqual(transformedDF.head().output, []) - # with locale - stopwords = ["BELKİ"] - dataset = self.spark.createDataFrame([Row(input=["belki"])]) - stopWordRemover.setStopWords(stopwords).setLocale("tr") - self.assertEqual(stopWordRemover.getStopWords(), stopwords) - transformedDF = stopWordRemover.transform(dataset) - self.assertEqual(transformedDF.head().output, []) - - def test_count_vectorizer_with_binary(self): - dataset = self.spark.createDataFrame([ - (0, "a a a b b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),), - (1, "a a".split(' '), SparseVector(3, {0: 1.0}),), - (2, "a b".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),), - (3, "c".split(' '), SparseVector(3, {2: 1.0}),)], ["id", "words", "expected"]) - cv = CountVectorizer(binary=True, inputCol="words", outputCol="features") - model = cv.fit(dataset) - - transformedList = model.transform(dataset).select("features", "expected").collect() - - for r in transformedList: - feature, expected = r - self.assertEqual(feature, expected) - - def test_count_vectorizer_with_maxDF(self): - dataset = self.spark.createDataFrame([ - (0, "a b c d".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),), - (1, "a b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),), - (2, "a b".split(' '), SparseVector(3, {0: 1.0}),), - (3, "a".split(' '), SparseVector(3, {}),)], ["id", "words", "expected"]) - cv = CountVectorizer(inputCol="words", outputCol="features") - model1 = cv.setMaxDF(3).fit(dataset) - self.assertEqual(model1.vocabulary, ['b', 'c', 'd']) - - transformedList1 = model1.transform(dataset).select("features", "expected").collect() - - for r in transformedList1: - feature, expected = r - self.assertEqual(feature, expected) - - model2 = cv.setMaxDF(0.75).fit(dataset) - self.assertEqual(model2.vocabulary, ['b', 'c', 'd']) - - transformedList2 = model2.transform(dataset).select("features", "expected").collect() - - for r in transformedList2: - feature, expected = r - self.assertEqual(feature, expected) - - def test_count_vectorizer_from_vocab(self): - model = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words", - outputCol="features", minTF=2) - self.assertEqual(model.vocabulary, ["a", "b", "c"]) - self.assertEqual(model.getMinTF(), 2) - - dataset = self.spark.createDataFrame([ - (0, "a a a b b c".split(' '), SparseVector(3, {0: 3.0, 1: 2.0}),), - (1, "a a".split(' '), SparseVector(3, {0: 2.0}),), - (2, "a b".split(' '), SparseVector(3, {}),)], ["id", "words", "expected"]) - - transformed_list = model.transform(dataset).select("features", "expected").collect() - - for r in transformed_list: - feature, expected = r - self.assertEqual(feature, expected) - - # Test an empty vocabulary - with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "vocabSize.*invalid.*0"): - CountVectorizerModel.from_vocabulary([], inputCol="words") - - # Test model with default settings can 
transform - model_default = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words") - transformed_list = model_default.transform(dataset)\ - .select(model_default.getOrDefault(model_default.outputCol)).collect() - self.assertEqual(len(transformed_list), 3) - - def test_rformula_force_index_label(self): - df = self.spark.createDataFrame([ - (1.0, 1.0, "a"), - (0.0, 2.0, "b"), - (1.0, 0.0, "a")], ["y", "x", "s"]) - # Does not index label by default since it's numeric type. - rf = RFormula(formula="y ~ x + s") - model = rf.fit(df) - transformedDF = model.transform(df) - self.assertEqual(transformedDF.head().label, 1.0) - # Force to index label. - rf2 = RFormula(formula="y ~ x + s").setForceIndexLabel(True) - model2 = rf2.fit(df) - transformedDF2 = model2.transform(df) - self.assertEqual(transformedDF2.head().label, 0.0) - - def test_rformula_string_indexer_order_type(self): - df = self.spark.createDataFrame([ - (1.0, 1.0, "a"), - (0.0, 2.0, "b"), - (1.0, 0.0, "a")], ["y", "x", "s"]) - rf = RFormula(formula="y ~ x + s", stringIndexerOrderType="alphabetDesc") - self.assertEqual(rf.getStringIndexerOrderType(), 'alphabetDesc') - transformedDF = rf.fit(df).transform(df) - observed = transformedDF.select("features").collect() - expected = [[1.0, 0.0], [2.0, 1.0], [0.0, 0.0]] - for i in range(0, len(expected)): - self.assertTrue(all(observed[i]["features"].toArray() == expected[i])) - - def test_string_indexer_handle_invalid(self): - df = self.spark.createDataFrame([ - (0, "a"), - (1, "d"), - (2, None)], ["id", "label"]) - - si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep", - stringOrderType="alphabetAsc") - model1 = si1.fit(df) - td1 = model1.transform(df) - actual1 = td1.select("id", "indexed").collect() - expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)] - self.assertEqual(actual1, expected1) - - si2 = si1.setHandleInvalid("skip") - model2 = si2.fit(df) - td2 = model2.transform(df) - actual2 = td2.select("id", "indexed").collect() - expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)] - self.assertEqual(actual2, expected2) - - def test_string_indexer_from_labels(self): - model = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label", - outputCol="indexed", handleInvalid="keep") - self.assertEqual(model.labels, ["a", "b", "c"]) - - df1 = self.spark.createDataFrame([ - (0, "a"), - (1, "c"), - (2, None), - (3, "b"), - (4, "b")], ["id", "label"]) - - result1 = model.transform(df1) - actual1 = result1.select("id", "indexed").collect() - expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=2.0), Row(id=2, indexed=3.0), - Row(id=3, indexed=1.0), Row(id=4, indexed=1.0)] - self.assertEqual(actual1, expected1) - - model_empty_labels = StringIndexerModel.from_labels( - [], inputCol="label", outputCol="indexed", handleInvalid="keep") - actual2 = model_empty_labels.transform(df1).select("id", "indexed").collect() - expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=0.0), Row(id=2, indexed=0.0), - Row(id=3, indexed=0.0), Row(id=4, indexed=0.0)] - self.assertEqual(actual2, expected2) - - # Test model with default settings can transform - model_default = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label") - df2 = self.spark.createDataFrame([ - (0, "a"), - (1, "c"), - (2, "b"), - (3, "b"), - (4, "b")], ["id", "label"]) - transformed_list = model_default.transform(df2)\ - .select(model_default.getOrDefault(model_default.outputCol)).collect() - self.assertEqual(len(transformed_list), 5) - - def 
test_vector_size_hint(self): - df = self.spark.createDataFrame( - [(0, Vectors.dense([0.0, 10.0, 0.5])), - (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])), - (2, Vectors.dense([2.0, 12.0]))], - ["id", "vector"]) - - sizeHint = VectorSizeHint( - inputCol="vector", - handleInvalid="skip") - sizeHint.setSize(3) - self.assertEqual(sizeHint.getSize(), 3) - - output = sizeHint.transform(df).head().vector - expected = DenseVector([0.0, 10.0, 0.5]) - self.assertEqual(output, expected) - - -class HasInducedError(Params): - - def __init__(self): - super(HasInducedError, self).__init__() - self.inducedError = Param(self, "inducedError", - "Uniformly-distributed error added to feature") - - def getInducedError(self): - return self.getOrDefault(self.inducedError) - - -class InducedErrorModel(Model, HasInducedError): - - def __init__(self): - super(InducedErrorModel, self).__init__() - - def _transform(self, dataset): - return dataset.withColumn("prediction", - dataset.feature + (rand(0) * self.getInducedError())) - - -class InducedErrorEstimator(Estimator, HasInducedError): - - def __init__(self, inducedError=1.0): - super(InducedErrorEstimator, self).__init__() - self._set(inducedError=inducedError) - - def _fit(self, dataset): - model = InducedErrorModel() - self._copyValues(model) - return model - - -class CrossValidatorTests(SparkSessionTestCase): - - def test_copy(self): - dataset = self.spark.createDataFrame([ - (10, 10.0), - (50, 50.0), - (100, 100.0), - (500, 500.0)] * 10, - ["feature", "label"]) - - iee = InducedErrorEstimator() - evaluator = RegressionEvaluator(metricName="rmse") - - grid = (ParamGridBuilder() - .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) - .build()) - cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) - cvCopied = cv.copy() - self.assertEqual(cv.getEstimator().uid, cvCopied.getEstimator().uid) - - cvModel = cv.fit(dataset) - cvModelCopied = cvModel.copy() - for index in range(len(cvModel.avgMetrics)): - self.assertTrue(abs(cvModel.avgMetrics[index] - cvModelCopied.avgMetrics[index]) - < 0.0001) - - def test_fit_minimize_metric(self): - dataset = self.spark.createDataFrame([ - (10, 10.0), - (50, 50.0), - (100, 100.0), - (500, 500.0)] * 10, - ["feature", "label"]) - - iee = InducedErrorEstimator() - evaluator = RegressionEvaluator(metricName="rmse") - - grid = (ParamGridBuilder() - .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) - .build()) - cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) - cvModel = cv.fit(dataset) - bestModel = cvModel.bestModel - bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) - - self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), - "Best model should have zero induced error") - self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0") - - def test_fit_maximize_metric(self): - dataset = self.spark.createDataFrame([ - (10, 10.0), - (50, 50.0), - (100, 100.0), - (500, 500.0)] * 10, - ["feature", "label"]) - - iee = InducedErrorEstimator() - evaluator = RegressionEvaluator(metricName="r2") - - grid = (ParamGridBuilder() - .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) - .build()) - cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) - cvModel = cv.fit(dataset) - bestModel = cvModel.bestModel - bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) - - self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), - "Best model should have zero induced error") - self.assertEqual(1.0, bestModelMetric, "Best model has 
R-squared of 1") - - def test_param_grid_type_coercion(self): - lr = LogisticRegression(maxIter=10) - paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.5, 1]).build() - for param in paramGrid: - for v in param.values(): - assert(type(v) == float) - - def test_save_load_trained_model(self): - # This tests saving and loading the trained model only. - # Save/load for CrossValidator will be added later: SPARK-13786 - temp_path = tempfile.mkdtemp() - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - lr = LogisticRegression() - grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - evaluator = BinaryClassificationEvaluator() - cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) - cvModel = cv.fit(dataset) - lrModel = cvModel.bestModel - - cvModelPath = temp_path + "/cvModel" - lrModel.save(cvModelPath) - loadedLrModel = LogisticRegressionModel.load(cvModelPath) - self.assertEqual(loadedLrModel.uid, lrModel.uid) - self.assertEqual(loadedLrModel.intercept, lrModel.intercept) - - def test_save_load_simple_estimator(self): - temp_path = tempfile.mkdtemp() - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - - lr = LogisticRegression() - grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - evaluator = BinaryClassificationEvaluator() - - # test save/load of CrossValidator - cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) - cvModel = cv.fit(dataset) - cvPath = temp_path + "/cv" - cv.save(cvPath) - loadedCV = CrossValidator.load(cvPath) - self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid) - self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid) - self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps()) - - # test save/load of CrossValidatorModel - cvModelPath = temp_path + "/cvModel" - cvModel.save(cvModelPath) - loadedModel = CrossValidatorModel.load(cvModelPath) - self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) - - def test_parallel_evaluation(self): - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - - lr = LogisticRegression() - grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build() - evaluator = BinaryClassificationEvaluator() - - # test save/load of CrossValidator - cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) - cv.setParallelism(1) - cvSerialModel = cv.fit(dataset) - cv.setParallelism(2) - cvParallelModel = cv.fit(dataset) - self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics) - - def test_expose_sub_models(self): - temp_path = tempfile.mkdtemp() - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - - lr = LogisticRegression() - grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - evaluator = BinaryClassificationEvaluator() - - numFolds = 3 - cv = CrossValidator(estimator=lr, 
estimatorParamMaps=grid, evaluator=evaluator, - numFolds=numFolds, collectSubModels=True) - - def checkSubModels(subModels): - self.assertEqual(len(subModels), numFolds) - for i in range(numFolds): - self.assertEqual(len(subModels[i]), len(grid)) - - cvModel = cv.fit(dataset) - checkSubModels(cvModel.subModels) - - # Test the default value for option "persistSubModel" to be "true" - testSubPath = temp_path + "/testCrossValidatorSubModels" - savingPathWithSubModels = testSubPath + "cvModel3" - cvModel.save(savingPathWithSubModels) - cvModel3 = CrossValidatorModel.load(savingPathWithSubModels) - checkSubModels(cvModel3.subModels) - cvModel4 = cvModel3.copy() - checkSubModels(cvModel4.subModels) - - savingPathWithoutSubModels = testSubPath + "cvModel2" - cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels) - cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels) - self.assertEqual(cvModel2.subModels, None) - - for i in range(numFolds): - for j in range(len(grid)): - self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid) - - def test_save_load_nested_estimator(self): - temp_path = tempfile.mkdtemp() - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - - ova = OneVsRest(classifier=LogisticRegression()) - lr1 = LogisticRegression().setMaxIter(100) - lr2 = LogisticRegression().setMaxIter(150) - grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build() - evaluator = MulticlassClassificationEvaluator() - - # test save/load of CrossValidator - cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator) - cvModel = cv.fit(dataset) - cvPath = temp_path + "/cv" - cv.save(cvPath) - loadedCV = CrossValidator.load(cvPath) - self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid) - self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid) - - originalParamMap = cv.getEstimatorParamMaps() - loadedParamMap = loadedCV.getEstimatorParamMaps() - for i, param in enumerate(loadedParamMap): - for p in param: - if p.name == "classifier": - self.assertEqual(param[p].uid, originalParamMap[i][p].uid) - else: - self.assertEqual(param[p], originalParamMap[i][p]) - - # test save/load of CrossValidatorModel - cvModelPath = temp_path + "/cvModel" - cvModel.save(cvModelPath) - loadedModel = CrossValidatorModel.load(cvModelPath) - self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) - - -class TrainValidationSplitTests(SparkSessionTestCase): - - def test_fit_minimize_metric(self): - dataset = self.spark.createDataFrame([ - (10, 10.0), - (50, 50.0), - (100, 100.0), - (500, 500.0)] * 10, - ["feature", "label"]) - - iee = InducedErrorEstimator() - evaluator = RegressionEvaluator(metricName="rmse") - - grid = ParamGridBuilder() \ - .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \ - .build() - tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) - tvsModel = tvs.fit(dataset) - bestModel = tvsModel.bestModel - bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) - validationMetrics = tvsModel.validationMetrics - - self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), - "Best model should have zero induced error") - self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0") - self.assertEqual(len(grid), len(validationMetrics), - "validationMetrics has the same 
size of grid parameter") - self.assertEqual(0.0, min(validationMetrics)) - - def test_fit_maximize_metric(self): - dataset = self.spark.createDataFrame([ - (10, 10.0), - (50, 50.0), - (100, 100.0), - (500, 500.0)] * 10, - ["feature", "label"]) - - iee = InducedErrorEstimator() - evaluator = RegressionEvaluator(metricName="r2") - - grid = ParamGridBuilder() \ - .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \ - .build() - tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) - tvsModel = tvs.fit(dataset) - bestModel = tvsModel.bestModel - bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) - validationMetrics = tvsModel.validationMetrics - - self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), - "Best model should have zero induced error") - self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1") - self.assertEqual(len(grid), len(validationMetrics), - "validationMetrics has the same size of grid parameter") - self.assertEqual(1.0, max(validationMetrics)) - - def test_save_load_trained_model(self): - # This tests saving and loading the trained model only. - # Save/load for TrainValidationSplit will be added later: SPARK-13786 - temp_path = tempfile.mkdtemp() - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - lr = LogisticRegression() - grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - evaluator = BinaryClassificationEvaluator() - tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) - tvsModel = tvs.fit(dataset) - lrModel = tvsModel.bestModel - - tvsModelPath = temp_path + "/tvsModel" - lrModel.save(tvsModelPath) - loadedLrModel = LogisticRegressionModel.load(tvsModelPath) - self.assertEqual(loadedLrModel.uid, lrModel.uid) - self.assertEqual(loadedLrModel.intercept, lrModel.intercept) - - def test_save_load_simple_estimator(self): - # This tests saving and loading the trained model only. 
- # Save/load for TrainValidationSplit will be added later: SPARK-13786 - temp_path = tempfile.mkdtemp() - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - lr = LogisticRegression() - grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - evaluator = BinaryClassificationEvaluator() - tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) - tvsModel = tvs.fit(dataset) - - tvsPath = temp_path + "/tvs" - tvs.save(tvsPath) - loadedTvs = TrainValidationSplit.load(tvsPath) - self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) - self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid) - self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps()) - - tvsModelPath = temp_path + "/tvsModel" - tvsModel.save(tvsModelPath) - loadedModel = TrainValidationSplitModel.load(tvsModelPath) - self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid) - - def test_parallel_evaluation(self): - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - lr = LogisticRegression() - grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build() - evaluator = BinaryClassificationEvaluator() - tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) - tvs.setParallelism(1) - tvsSerialModel = tvs.fit(dataset) - tvs.setParallelism(2) - tvsParallelModel = tvs.fit(dataset) - self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics) - - def test_expose_sub_models(self): - temp_path = tempfile.mkdtemp() - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - lr = LogisticRegression() - grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - evaluator = BinaryClassificationEvaluator() - tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, - collectSubModels=True) - tvsModel = tvs.fit(dataset) - self.assertEqual(len(tvsModel.subModels), len(grid)) - - # Test the default value for option "persistSubModel" to be "true" - testSubPath = temp_path + "/testTrainValidationSplitSubModels" - savingPathWithSubModels = testSubPath + "cvModel3" - tvsModel.save(savingPathWithSubModels) - tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels) - self.assertEqual(len(tvsModel3.subModels), len(grid)) - tvsModel4 = tvsModel3.copy() - self.assertEqual(len(tvsModel4.subModels), len(grid)) - - savingPathWithoutSubModels = testSubPath + "cvModel2" - tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels) - tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels) - self.assertEqual(tvsModel2.subModels, None) - - for i in range(len(grid)): - self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid) - - def test_save_load_nested_estimator(self): - # This tests saving and loading the trained model only. 
- # Save/load for TrainValidationSplit will be added later: SPARK-13786 - temp_path = tempfile.mkdtemp() - dataset = self.spark.createDataFrame( - [(Vectors.dense([0.0]), 0.0), - (Vectors.dense([0.4]), 1.0), - (Vectors.dense([0.5]), 0.0), - (Vectors.dense([0.6]), 1.0), - (Vectors.dense([1.0]), 1.0)] * 10, - ["features", "label"]) - ova = OneVsRest(classifier=LogisticRegression()) - lr1 = LogisticRegression().setMaxIter(100) - lr2 = LogisticRegression().setMaxIter(150) - grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build() - evaluator = MulticlassClassificationEvaluator() - - tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator) - tvsModel = tvs.fit(dataset) - tvsPath = temp_path + "/tvs" - tvs.save(tvsPath) - loadedTvs = TrainValidationSplit.load(tvsPath) - self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) - self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid) - - originalParamMap = tvs.getEstimatorParamMaps() - loadedParamMap = loadedTvs.getEstimatorParamMaps() - for i, param in enumerate(loadedParamMap): - for p in param: - if p.name == "classifier": - self.assertEqual(param[p].uid, originalParamMap[i][p].uid) - else: - self.assertEqual(param[p], originalParamMap[i][p]) - - tvsModelPath = temp_path + "/tvsModel" - tvsModel.save(tvsModelPath) - loadedModel = TrainValidationSplitModel.load(tvsModelPath) - self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid) - - def test_copy(self): - dataset = self.spark.createDataFrame([ - (10, 10.0), - (50, 50.0), - (100, 100.0), - (500, 500.0)] * 10, - ["feature", "label"]) - - iee = InducedErrorEstimator() - evaluator = RegressionEvaluator(metricName="r2") - - grid = ParamGridBuilder() \ - .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \ - .build() - tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) - tvsModel = tvs.fit(dataset) - tvsCopied = tvs.copy() - tvsModelCopied = tvsModel.copy() - - self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid, - "Copied TrainValidationSplit has the same uid of Estimator") - - self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid) - self.assertEqual(len(tvsModel.validationMetrics), - len(tvsModelCopied.validationMetrics), - "Copied validationMetrics has the same size of the original") - for index in range(len(tvsModel.validationMetrics)): - self.assertEqual(tvsModel.validationMetrics[index], - tvsModelCopied.validationMetrics[index]) - - -class PersistenceTest(SparkSessionTestCase): - - def test_linear_regression(self): - lr = LinearRegression(maxIter=1) - path = tempfile.mkdtemp() - lr_path = path + "/lr" - lr.save(lr_path) - lr2 = LinearRegression.load(lr_path) - self.assertEqual(lr.uid, lr2.uid) - self.assertEqual(type(lr.uid), type(lr2.uid)) - self.assertEqual(lr2.uid, lr2.maxIter.parent, - "Loaded LinearRegression instance uid (%s) did not match Param's uid (%s)" - % (lr2.uid, lr2.maxIter.parent)) - self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter], - "Loaded LinearRegression instance default params did not match " + - "original defaults") - try: - rmtree(path) - except OSError: - pass - - def test_linear_regression_pmml_basic(self): - # Most of the validation is done in the Scala side, here we just check - # that we output text rather than parquet (e.g. that the format flag - # was respected). 
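# (Editorial sketch, not part of the removed file.) The PMML test below only needs to
# confirm that write().format("pmml") emits plain XML text rather than the usual
# parquet output. A minimal stand-alone round trip, assuming an active SparkSession
# named `spark` and a throwaway output directory, could look like this:
import tempfile
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression

pmml_df = spark.createDataFrame(
    [(1.0, Vectors.dense(1.0)), (0.0, Vectors.dense(2.0))], ["label", "features"])
pmml_model = LinearRegression(maxIter=1).fit(pmml_df)
pmml_path = tempfile.mkdtemp() + "/lr-pmml-example"
pmml_model.write().format("pmml").save(pmml_path)  # writes PMML XML as text files
assert any("PMML" in line for line in spark.sparkContext.textFile(pmml_path).collect())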
- df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), - (0.0, 2.0, Vectors.sparse(1, [], []))], - ["label", "weight", "features"]) - lr = LinearRegression(maxIter=1) - model = lr.fit(df) - path = tempfile.mkdtemp() - lr_path = path + "/lr-pmml" - model.write().format("pmml").save(lr_path) - pmml_text_list = self.sc.textFile(lr_path).collect() - pmml_text = "\n".join(pmml_text_list) - self.assertIn("Apache Spark", pmml_text) - self.assertIn("PMML", pmml_text) - - def test_logistic_regression(self): - lr = LogisticRegression(maxIter=1) - path = tempfile.mkdtemp() - lr_path = path + "/logreg" - lr.save(lr_path) - lr2 = LogisticRegression.load(lr_path) - self.assertEqual(lr2.uid, lr2.maxIter.parent, - "Loaded LogisticRegression instance uid (%s) " - "did not match Param's uid (%s)" - % (lr2.uid, lr2.maxIter.parent)) - self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter], - "Loaded LogisticRegression instance default params did not match " + - "original defaults") - try: - rmtree(path) - except OSError: - pass - - def _compare_params(self, m1, m2, param): - """ - Compare 2 ML Params instances for the given param, and assert both have the same param value - and parent. The param must be a parameter of m1. - """ - # Prevent key not found error in case of some param in neither paramMap nor defaultParamMap. - if m1.isDefined(param): - paramValue1 = m1.getOrDefault(param) - paramValue2 = m2.getOrDefault(m2.getParam(param.name)) - if isinstance(paramValue1, Params): - self._compare_pipelines(paramValue1, paramValue2) - else: - self.assertEqual(paramValue1, paramValue2) # for general types param - # Assert parents are equal - self.assertEqual(param.parent, m2.getParam(param.name).parent) - else: - # If m1 is not defined param, then m2 should not, too. See SPARK-14931. - self.assertFalse(m2.isDefined(m2.getParam(param.name))) - - def _compare_pipelines(self, m1, m2): - """ - Compare 2 ML types, asserting that they are equivalent. 
- This currently supports: - - basic types - - Pipeline, PipelineModel - - OneVsRest, OneVsRestModel - This checks: - - uid - - type - - Param values and parents - """ - self.assertEqual(m1.uid, m2.uid) - self.assertEqual(type(m1), type(m2)) - if isinstance(m1, JavaParams) or isinstance(m1, Transformer): - self.assertEqual(len(m1.params), len(m2.params)) - for p in m1.params: - self._compare_params(m1, m2, p) - elif isinstance(m1, Pipeline): - self.assertEqual(len(m1.getStages()), len(m2.getStages())) - for s1, s2 in zip(m1.getStages(), m2.getStages()): - self._compare_pipelines(s1, s2) - elif isinstance(m1, PipelineModel): - self.assertEqual(len(m1.stages), len(m2.stages)) - for s1, s2 in zip(m1.stages, m2.stages): - self._compare_pipelines(s1, s2) - elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel): - for p in m1.params: - self._compare_params(m1, m2, p) - if isinstance(m1, OneVsRestModel): - self.assertEqual(len(m1.models), len(m2.models)) - for x, y in zip(m1.models, m2.models): - self._compare_pipelines(x, y) - else: - raise RuntimeError("_compare_pipelines does not yet support type: %s" % type(m1)) - - def test_pipeline_persistence(self): - """ - Pipeline[HashingTF, PCA] - """ - temp_path = tempfile.mkdtemp() - - try: - df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"]) - tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features") - pca = PCA(k=2, inputCol="features", outputCol="pca_features") - pl = Pipeline(stages=[tf, pca]) - model = pl.fit(df) - - pipeline_path = temp_path + "/pipeline" - pl.save(pipeline_path) - loaded_pipeline = Pipeline.load(pipeline_path) - self._compare_pipelines(pl, loaded_pipeline) - - model_path = temp_path + "/pipeline-model" - model.save(model_path) - loaded_model = PipelineModel.load(model_path) - self._compare_pipelines(model, loaded_model) - finally: - try: - rmtree(temp_path) - except OSError: - pass - - def test_nested_pipeline_persistence(self): - """ - Pipeline[HashingTF, Pipeline[PCA]] - """ - temp_path = tempfile.mkdtemp() - - try: - df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"]) - tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features") - pca = PCA(k=2, inputCol="features", outputCol="pca_features") - p0 = Pipeline(stages=[pca]) - pl = Pipeline(stages=[tf, p0]) - model = pl.fit(df) - - pipeline_path = temp_path + "/pipeline" - pl.save(pipeline_path) - loaded_pipeline = Pipeline.load(pipeline_path) - self._compare_pipelines(pl, loaded_pipeline) - - model_path = temp_path + "/pipeline-model" - model.save(model_path) - loaded_model = PipelineModel.load(model_path) - self._compare_pipelines(model, loaded_model) - finally: - try: - rmtree(temp_path) - except OSError: - pass - - def test_python_transformer_pipeline_persistence(self): - """ - Pipeline[MockUnaryTransformer, Binarizer] - """ - temp_path = tempfile.mkdtemp() - - try: - df = self.spark.range(0, 10).toDF('input') - tf = MockUnaryTransformer(shiftVal=2)\ - .setInputCol("input").setOutputCol("shiftedInput") - tf2 = Binarizer(threshold=6, inputCol="shiftedInput", outputCol="binarized") - pl = Pipeline(stages=[tf, tf2]) - model = pl.fit(df) - - pipeline_path = temp_path + "/pipeline" - pl.save(pipeline_path) - loaded_pipeline = Pipeline.load(pipeline_path) - self._compare_pipelines(pl, loaded_pipeline) - - model_path = temp_path + "/pipeline-model" - model.save(model_path) - loaded_model = PipelineModel.load(model_path) - self._compare_pipelines(model, loaded_model) - finally: - 
try: - rmtree(temp_path) - except OSError: - pass - - def test_onevsrest(self): - temp_path = tempfile.mkdtemp() - df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), - (1.0, Vectors.sparse(2, [], [])), - (2.0, Vectors.dense(0.5, 0.5))] * 10, - ["label", "features"]) - lr = LogisticRegression(maxIter=5, regParam=0.01) - ovr = OneVsRest(classifier=lr) - model = ovr.fit(df) - ovrPath = temp_path + "/ovr" - ovr.save(ovrPath) - loadedOvr = OneVsRest.load(ovrPath) - self._compare_pipelines(ovr, loadedOvr) - modelPath = temp_path + "/ovrModel" - model.save(modelPath) - loadedModel = OneVsRestModel.load(modelPath) - self._compare_pipelines(model, loadedModel) - - def test_decisiontree_classifier(self): - dt = DecisionTreeClassifier(maxDepth=1) - path = tempfile.mkdtemp() - dtc_path = path + "/dtc" - dt.save(dtc_path) - dt2 = DecisionTreeClassifier.load(dtc_path) - self.assertEqual(dt2.uid, dt2.maxDepth.parent, - "Loaded DecisionTreeClassifier instance uid (%s) " - "did not match Param's uid (%s)" - % (dt2.uid, dt2.maxDepth.parent)) - self.assertEqual(dt._defaultParamMap[dt.maxDepth], dt2._defaultParamMap[dt2.maxDepth], - "Loaded DecisionTreeClassifier instance default params did not match " + - "original defaults") - try: - rmtree(path) - except OSError: - pass - - def test_decisiontree_regressor(self): - dt = DecisionTreeRegressor(maxDepth=1) - path = tempfile.mkdtemp() - dtr_path = path + "/dtr" - dt.save(dtr_path) - dt2 = DecisionTreeClassifier.load(dtr_path) - self.assertEqual(dt2.uid, dt2.maxDepth.parent, - "Loaded DecisionTreeRegressor instance uid (%s) " - "did not match Param's uid (%s)" - % (dt2.uid, dt2.maxDepth.parent)) - self.assertEqual(dt._defaultParamMap[dt.maxDepth], dt2._defaultParamMap[dt2.maxDepth], - "Loaded DecisionTreeRegressor instance default params did not match " + - "original defaults") - try: - rmtree(path) - except OSError: - pass - - def test_default_read_write(self): - temp_path = tempfile.mkdtemp() - - lr = LogisticRegression() - lr.setMaxIter(50) - lr.setThreshold(.75) - writer = DefaultParamsWriter(lr) - - savePath = temp_path + "/lr" - writer.save(savePath) - - reader = DefaultParamsReadable.read() - lr2 = reader.load(savePath) - - self.assertEqual(lr.uid, lr2.uid) - self.assertEqual(lr.extractParamMap(), lr2.extractParamMap()) - - # test overwrite - lr.setThreshold(.8) - writer.overwrite().save(savePath) - - reader = DefaultParamsReadable.read() - lr3 = reader.load(savePath) - - self.assertEqual(lr.uid, lr3.uid) - self.assertEqual(lr.extractParamMap(), lr3.extractParamMap()) - - def test_default_read_write_default_params(self): - lr = LogisticRegression() - self.assertFalse(lr.isSet(lr.getParam("threshold"))) - - lr.setMaxIter(50) - lr.setThreshold(.75) - - # `threshold` is set by user, default param `predictionCol` is not set by user. 
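# (Editorial sketch, not part of the removed file.) The assertions that follow rely
# on the Params distinction between values explicitly set by the user and values
# that only exist as defaults. A minimal illustration, assuming an active
# SparkContext (JVM-backed estimators cannot be constructed without one):
from pyspark.ml.classification import LogisticRegression

lr_example = LogisticRegression(maxIter=50)              # maxIter explicitly set
assert lr_example.isSet(lr_example.maxIter)              # user-set -> isSet() is True
assert not lr_example.isSet(lr_example.predictionCol)    # never touched by the user
assert lr_example.hasDefault(lr_example.predictionCol)   # but a default value exists
assert lr_example.getOrDefault(lr_example.predictionCol) == "prediction"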
- self.assertTrue(lr.isSet(lr.getParam("threshold"))) - self.assertFalse(lr.isSet(lr.getParam("predictionCol"))) - self.assertTrue(lr.hasDefault(lr.getParam("predictionCol"))) - - writer = DefaultParamsWriter(lr) - metadata = json.loads(writer._get_metadata_to_save(lr, self.sc)) - self.assertTrue("defaultParamMap" in metadata) - - reader = DefaultParamsReadable.read() - metadataStr = json.dumps(metadata, separators=[',', ':']) - loadedMetadata = reader._parseMetaData(metadataStr, ) - reader.getAndSetParams(lr, loadedMetadata) - - self.assertTrue(lr.isSet(lr.getParam("threshold"))) - self.assertFalse(lr.isSet(lr.getParam("predictionCol"))) - self.assertTrue(lr.hasDefault(lr.getParam("predictionCol"))) - - # manually create metadata without `defaultParamMap` section. - del metadata['defaultParamMap'] - metadataStr = json.dumps(metadata, separators=[',', ':']) - loadedMetadata = reader._parseMetaData(metadataStr, ) - with self.assertRaisesRegexp(AssertionError, "`defaultParamMap` section not found"): - reader.getAndSetParams(lr, loadedMetadata) - - # Prior to 2.4.0, metadata doesn't have `defaultParamMap`. - metadata['sparkVersion'] = '2.3.0' - metadataStr = json.dumps(metadata, separators=[',', ':']) - loadedMetadata = reader._parseMetaData(metadataStr, ) - reader.getAndSetParams(lr, loadedMetadata) - - -class LDATest(SparkSessionTestCase): - - def _compare(self, m1, m2): - """ - Temp method for comparing instances. - TODO: Replace with generic implementation once SPARK-14706 is merged. - """ - self.assertEqual(m1.uid, m2.uid) - self.assertEqual(type(m1), type(m2)) - self.assertEqual(len(m1.params), len(m2.params)) - for p in m1.params: - if m1.isDefined(p): - self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p)) - self.assertEqual(p.parent, m2.getParam(p.name).parent) - if isinstance(m1, LDAModel): - self.assertEqual(m1.vocabSize(), m2.vocabSize()) - self.assertEqual(m1.topicsMatrix(), m2.topicsMatrix()) - - def test_persistence(self): - # Test save/load for LDA, LocalLDAModel, DistributedLDAModel. 
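# (Editorial sketch, not part of the removed file.) The persistence check below is the
# same estimator round trip used throughout this file: save() persists the uid,
# user-set params and defaults, and load() rebuilds an equivalent instance.
# Assumes an active SparkSession; the output path is a throwaway directory.
import tempfile
from pyspark.ml.clustering import LDA

lda_example = LDA(k=2, seed=1, optimizer="em")
lda_example_path = tempfile.mkdtemp() + "/lda-roundtrip-example"
lda_example.save(lda_example_path)
lda_reloaded = LDA.load(lda_example_path)
assert lda_reloaded.uid == lda_example.uid
assert lda_reloaded.getK() == lda_example.getK() == 2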
- df = self.spark.createDataFrame([ - [1, Vectors.dense([0.0, 1.0])], - [2, Vectors.sparse(2, {0: 1.0})], - ], ["id", "features"]) - # Fit model - lda = LDA(k=2, seed=1, optimizer="em") - distributedModel = lda.fit(df) - self.assertTrue(distributedModel.isDistributed()) - localModel = distributedModel.toLocal() - self.assertFalse(localModel.isDistributed()) - # Define paths - path = tempfile.mkdtemp() - lda_path = path + "/lda" - dist_model_path = path + "/distLDAModel" - local_model_path = path + "/localLDAModel" - # Test LDA - lda.save(lda_path) - lda2 = LDA.load(lda_path) - self._compare(lda, lda2) - # Test DistributedLDAModel - distributedModel.save(dist_model_path) - distributedModel2 = DistributedLDAModel.load(dist_model_path) - self._compare(distributedModel, distributedModel2) - # Test LocalLDAModel - localModel.save(local_model_path) - localModel2 = LocalLDAModel.load(local_model_path) - self._compare(localModel, localModel2) - # Clean up - try: - rmtree(path) - except OSError: - pass - - -class TrainingSummaryTest(SparkSessionTestCase): - - def test_linear_regression_summary(self): - df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), - (0.0, 2.0, Vectors.sparse(1, [], []))], - ["label", "weight", "features"]) - lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight", - fitIntercept=False) - model = lr.fit(df) - self.assertTrue(model.hasSummary) - s = model.summary - # test that api is callable and returns expected types - self.assertGreater(s.totalIterations, 0) - self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.predictionCol, "prediction") - self.assertEqual(s.labelCol, "label") - self.assertEqual(s.featuresCol, "features") - objHist = s.objectiveHistory - self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) - self.assertAlmostEqual(s.explainedVariance, 0.25, 2) - self.assertAlmostEqual(s.meanAbsoluteError, 0.0) - self.assertAlmostEqual(s.meanSquaredError, 0.0) - self.assertAlmostEqual(s.rootMeanSquaredError, 0.0) - self.assertAlmostEqual(s.r2, 1.0, 2) - self.assertAlmostEqual(s.r2adj, 1.0, 2) - self.assertTrue(isinstance(s.residuals, DataFrame)) - self.assertEqual(s.numInstances, 2) - self.assertEqual(s.degreesOfFreedom, 1) - devResiduals = s.devianceResiduals - self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float)) - coefStdErr = s.coefficientStandardErrors - self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float)) - tValues = s.tValues - self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float)) - pValues = s.pValues - self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float)) - # test evaluation (with training dataset) produces a summary with same values - # one check is enough to verify a summary is returned - # The child class LinearRegressionTrainingSummary runs full test - sameSummary = model.evaluate(df) - self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance) - - def test_glr_summary(self): - from pyspark.ml.linalg import Vectors - df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), - (0.0, 2.0, Vectors.sparse(1, [], []))], - ["label", "weight", "features"]) - glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight", - fitIntercept=False) - model = glr.fit(df) - self.assertTrue(model.hasSummary) - s = model.summary - # test that api is callable and returns expected types - self.assertEqual(s.numIterations, 1) # this should 
default to a single iteration of WLS - self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.predictionCol, "prediction") - self.assertEqual(s.numInstances, 2) - self.assertTrue(isinstance(s.residuals(), DataFrame)) - self.assertTrue(isinstance(s.residuals("pearson"), DataFrame)) - coefStdErr = s.coefficientStandardErrors - self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float)) - tValues = s.tValues - self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float)) - pValues = s.pValues - self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float)) - self.assertEqual(s.degreesOfFreedom, 1) - self.assertEqual(s.residualDegreeOfFreedom, 1) - self.assertEqual(s.residualDegreeOfFreedomNull, 2) - self.assertEqual(s.rank, 1) - self.assertTrue(isinstance(s.solver, basestring)) - self.assertTrue(isinstance(s.aic, float)) - self.assertTrue(isinstance(s.deviance, float)) - self.assertTrue(isinstance(s.nullDeviance, float)) - self.assertTrue(isinstance(s.dispersion, float)) - # test evaluation (with training dataset) produces a summary with same values - # one check is enough to verify a summary is returned - # The child class GeneralizedLinearRegressionTrainingSummary runs full test - sameSummary = model.evaluate(df) - self.assertAlmostEqual(sameSummary.deviance, s.deviance) - - def test_binary_logistic_regression_summary(self): - df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), - (0.0, 2.0, Vectors.sparse(1, [], []))], - ["label", "weight", "features"]) - lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False) - model = lr.fit(df) - self.assertTrue(model.hasSummary) - s = model.summary - # test that api is callable and returns expected types - self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.probabilityCol, "probability") - self.assertEqual(s.labelCol, "label") - self.assertEqual(s.featuresCol, "features") - self.assertEqual(s.predictionCol, "prediction") - objHist = s.objectiveHistory - self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) - self.assertGreater(s.totalIterations, 0) - self.assertTrue(isinstance(s.labels, list)) - self.assertTrue(isinstance(s.truePositiveRateByLabel, list)) - self.assertTrue(isinstance(s.falsePositiveRateByLabel, list)) - self.assertTrue(isinstance(s.precisionByLabel, list)) - self.assertTrue(isinstance(s.recallByLabel, list)) - self.assertTrue(isinstance(s.fMeasureByLabel(), list)) - self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list)) - self.assertTrue(isinstance(s.roc, DataFrame)) - self.assertAlmostEqual(s.areaUnderROC, 1.0, 2) - self.assertTrue(isinstance(s.pr, DataFrame)) - self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame)) - self.assertTrue(isinstance(s.precisionByThreshold, DataFrame)) - self.assertTrue(isinstance(s.recallByThreshold, DataFrame)) - self.assertAlmostEqual(s.accuracy, 1.0, 2) - self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2) - self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2) - self.assertAlmostEqual(s.weightedRecall, 1.0, 2) - self.assertAlmostEqual(s.weightedPrecision, 1.0, 2) - self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2) - self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2) - # test evaluation (with training dataset) produces a summary with same values - # one check is enough to verify a summary is returned, Scala version runs full test - sameSummary = model.evaluate(df) - 
self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) - - def test_multiclass_logistic_regression_summary(self): - df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), - (0.0, 2.0, Vectors.sparse(1, [], [])), - (2.0, 2.0, Vectors.dense(2.0)), - (2.0, 2.0, Vectors.dense(1.9))], - ["label", "weight", "features"]) - lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False) - model = lr.fit(df) - self.assertTrue(model.hasSummary) - s = model.summary - # test that api is callable and returns expected types - self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.probabilityCol, "probability") - self.assertEqual(s.labelCol, "label") - self.assertEqual(s.featuresCol, "features") - self.assertEqual(s.predictionCol, "prediction") - objHist = s.objectiveHistory - self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) - self.assertGreater(s.totalIterations, 0) - self.assertTrue(isinstance(s.labels, list)) - self.assertTrue(isinstance(s.truePositiveRateByLabel, list)) - self.assertTrue(isinstance(s.falsePositiveRateByLabel, list)) - self.assertTrue(isinstance(s.precisionByLabel, list)) - self.assertTrue(isinstance(s.recallByLabel, list)) - self.assertTrue(isinstance(s.fMeasureByLabel(), list)) - self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list)) - self.assertAlmostEqual(s.accuracy, 0.75, 2) - self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2) - self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2) - self.assertAlmostEqual(s.weightedRecall, 0.75, 2) - self.assertAlmostEqual(s.weightedPrecision, 0.583, 2) - self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2) - self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2) - # test evaluation (with training dataset) produces a summary with same values - # one check is enough to verify a summary is returned, Scala version runs full test - sameSummary = model.evaluate(df) - self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) - - def test_gaussian_mixture_summary(self): - data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),), - (Vectors.sparse(1, [], []),)] - df = self.spark.createDataFrame(data, ["features"]) - gmm = GaussianMixture(k=2) - model = gmm.fit(df) - self.assertTrue(model.hasSummary) - s = model.summary - self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.probabilityCol, "probability") - self.assertTrue(isinstance(s.probability, DataFrame)) - self.assertEqual(s.featuresCol, "features") - self.assertEqual(s.predictionCol, "prediction") - self.assertTrue(isinstance(s.cluster, DataFrame)) - self.assertEqual(len(s.clusterSizes), 2) - self.assertEqual(s.k, 2) - self.assertEqual(s.numIter, 3) - - def test_bisecting_kmeans_summary(self): - data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),), - (Vectors.sparse(1, [], []),)] - df = self.spark.createDataFrame(data, ["features"]) - bkm = BisectingKMeans(k=2) - model = bkm.fit(df) - self.assertTrue(model.hasSummary) - s = model.summary - self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.featuresCol, "features") - self.assertEqual(s.predictionCol, "prediction") - self.assertTrue(isinstance(s.cluster, DataFrame)) - self.assertEqual(len(s.clusterSizes), 2) - self.assertEqual(s.k, 2) - self.assertEqual(s.numIter, 20) - - def test_kmeans_summary(self): - data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), - (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)] - df = 
self.spark.createDataFrame(data, ["features"]) - kmeans = KMeans(k=2, seed=1) - model = kmeans.fit(df) - self.assertTrue(model.hasSummary) - s = model.summary - self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.featuresCol, "features") - self.assertEqual(s.predictionCol, "prediction") - self.assertTrue(isinstance(s.cluster, DataFrame)) - self.assertEqual(len(s.clusterSizes), 2) - self.assertEqual(s.k, 2) - self.assertEqual(s.numIter, 1) - - -class KMeansTests(SparkSessionTestCase): - - def test_kmeans_cosine_distance(self): - data = [(Vectors.dense([1.0, 1.0]),), (Vectors.dense([10.0, 10.0]),), - (Vectors.dense([1.0, 0.5]),), (Vectors.dense([10.0, 4.4]),), - (Vectors.dense([-1.0, 1.0]),), (Vectors.dense([-100.0, 90.0]),)] - df = self.spark.createDataFrame(data, ["features"]) - kmeans = KMeans(k=3, seed=1, distanceMeasure="cosine") - model = kmeans.fit(df) - result = model.transform(df).collect() - self.assertTrue(result[0].prediction == result[1].prediction) - self.assertTrue(result[2].prediction == result[3].prediction) - self.assertTrue(result[4].prediction == result[5].prediction) - - -class OneVsRestTests(SparkSessionTestCase): - - def test_copy(self): - df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), - (1.0, Vectors.sparse(2, [], [])), - (2.0, Vectors.dense(0.5, 0.5))], - ["label", "features"]) - lr = LogisticRegression(maxIter=5, regParam=0.01) - ovr = OneVsRest(classifier=lr) - ovr1 = ovr.copy({lr.maxIter: 10}) - self.assertEqual(ovr.getClassifier().getMaxIter(), 5) - self.assertEqual(ovr1.getClassifier().getMaxIter(), 10) - model = ovr.fit(df) - model1 = model.copy({model.predictionCol: "indexed"}) - self.assertEqual(model1.getPredictionCol(), "indexed") - - def test_output_columns(self): - df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), - (1.0, Vectors.sparse(2, [], [])), - (2.0, Vectors.dense(0.5, 0.5))], - ["label", "features"]) - lr = LogisticRegression(maxIter=5, regParam=0.01) - ovr = OneVsRest(classifier=lr, parallelism=1) - model = ovr.fit(df) - output = model.transform(df) - self.assertEqual(output.columns, ["label", "features", "prediction"]) - - def test_parallelism_doesnt_change_output(self): - df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), - (1.0, Vectors.sparse(2, [], [])), - (2.0, Vectors.dense(0.5, 0.5))], - ["label", "features"]) - ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1) - modelPar1 = ovrPar1.fit(df) - ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2) - modelPar2 = ovrPar2.fit(df) - for i, model in enumerate(modelPar1.models): - self.assertTrue(np.allclose(model.coefficients.toArray(), - modelPar2.models[i].coefficients.toArray(), atol=1E-4)) - self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4)) - - def test_support_for_weightCol(self): - df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0), - (1.0, Vectors.sparse(2, [], []), 1.0), - (2.0, Vectors.dense(0.5, 0.5), 1.0)], - ["label", "features", "weight"]) - # classifier inherits hasWeightCol - lr = LogisticRegression(maxIter=5, regParam=0.01) - ovr = OneVsRest(classifier=lr, weightCol="weight") - self.assertIsNotNone(ovr.fit(df)) - # classifier doesn't inherit hasWeightCol - dt = DecisionTreeClassifier() - ovr2 = OneVsRest(classifier=dt, weightCol="weight") - self.assertIsNotNone(ovr2.fit(df)) - - -class HashingTFTest(SparkSessionTestCase): - - def test_apply_binary_term_freqs(self): - - df = 
self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"]) - n = 10 - hashingTF = HashingTF() - hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True) - output = hashingTF.transform(df) - features = output.select("features").first().features.toArray() - expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray() - for i in range(0, n): - self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) + - ": expected " + str(expected[i]) + ", got " + str(features[i])) - - -class GeneralizedLinearRegressionTest(SparkSessionTestCase): - - def test_tweedie_distribution(self): - - df = self.spark.createDataFrame( - [(1.0, Vectors.dense(0.0, 0.0)), - (1.0, Vectors.dense(1.0, 2.0)), - (2.0, Vectors.dense(0.0, 0.0)), - (2.0, Vectors.dense(1.0, 1.0)), ], ["label", "features"]) - - glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6) - model = glr.fit(df) - self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 0.3402], atol=1E-4)) - self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1E-4)) - - model2 = glr.setLinkPower(-1.0).fit(df) - self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1E-4)) - self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4)) - - def test_offset(self): - - df = self.spark.createDataFrame( - [(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)), - (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)), - (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)), - (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "offset", "features"]) - - glr = GeneralizedLinearRegression(family="poisson", weightCol="weight", offsetCol="offset") - model = glr.fit(df) - self.assertTrue(np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581], - atol=1E-4)) - self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4)) - - -class LinearRegressionTest(SparkSessionTestCase): - - def test_linear_regression_with_huber_loss(self): - - data_path = "data/mllib/sample_linear_regression_data.txt" - df = self.spark.read.format("libsvm").load(data_path) - - lir = LinearRegression(loss="huber", epsilon=2.0) - model = lir.fit(df) - - expectedCoefficients = [0.136, 0.7648, -0.7761, 2.4236, 0.537, - 1.2612, -0.333, -0.5694, -0.6311, 0.6053] - expectedIntercept = 0.1607 - expectedScale = 9.758 - - self.assertTrue( - np.allclose(model.coefficients.toArray(), expectedCoefficients, atol=1E-3)) - self.assertTrue(np.isclose(model.intercept, expectedIntercept, atol=1E-3)) - self.assertTrue(np.isclose(model.scale, expectedScale, atol=1E-3)) - - -class LogisticRegressionTest(SparkSessionTestCase): - - def test_binomial_logistic_regression_with_bound(self): - - df = self.spark.createDataFrame( - [(1.0, 1.0, Vectors.dense(0.0, 5.0)), - (0.0, 2.0, Vectors.dense(1.0, 2.0)), - (1.0, 3.0, Vectors.dense(2.0, 1.0)), - (0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"]) - - lor = LogisticRegression(regParam=0.01, weightCol="weight", - lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]), - upperBoundsOnIntercepts=Vectors.dense(0.0)) - model = lor.fit(df) - self.assertTrue( - np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4)) - self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4)) - - def test_multinomial_logistic_regression_with_bound(self): - - data_path = "data/mllib/sample_multiclass_classification_data.txt" - df = self.spark.read.format("libsvm").load(data_path) - - lor = LogisticRegression(regParam=0.01, - 
lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)), - upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0)) - model = lor.fit(df) - expected = [[4.593, 4.5516, 9.0099, 12.2904], - [1.0, 8.1093, 7.0, 10.0], - [3.041, 5.0, 8.0, 11.0]] - for i in range(0, len(expected)): - self.assertTrue( - np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4)) - self.assertTrue( - np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4)) - - -class MultilayerPerceptronClassifierTest(SparkSessionTestCase): - - def test_raw_and_probability_prediction(self): - - data_path = "data/mllib/sample_multiclass_classification_data.txt" - df = self.spark.read.format("libsvm").load(data_path) - - mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3], - blockSize=128, seed=123) - model = mlp.fit(df) - test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF() - result = model.transform(test).head() - expected_prediction = 2.0 - expected_probability = [0.0, 0.0, 1.0] - expected_rawPrediction = [57.3955, -124.5462, 67.9943] - self.assertTrue(result.prediction, expected_prediction) - self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4)) - self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4)) - - -class FPGrowthTests(SparkSessionTestCase): - def setUp(self): - super(FPGrowthTests, self).setUp() - self.data = self.spark.createDataFrame( - [([1, 2], ), ([1, 2], ), ([1, 2, 3], ), ([1, 3], )], - ["items"]) - - def test_association_rules(self): - fp = FPGrowth() - fpm = fp.fit(self.data) - - expected_association_rules = self.spark.createDataFrame( - [([3], [1], 1.0, 1.0), ([2], [1], 1.0, 1.0)], - ["antecedent", "consequent", "confidence", "lift"] - ) - actual_association_rules = fpm.associationRules - - self.assertEqual(actual_association_rules.subtract(expected_association_rules).count(), 0) - self.assertEqual(expected_association_rules.subtract(actual_association_rules).count(), 0) - - def test_freq_itemsets(self): - fp = FPGrowth() - fpm = fp.fit(self.data) - - expected_freq_itemsets = self.spark.createDataFrame( - [([1], 4), ([2], 3), ([2, 1], 3), ([3], 2), ([3, 1], 2)], - ["items", "freq"] - ) - actual_freq_itemsets = fpm.freqItemsets - - self.assertEqual(actual_freq_itemsets.subtract(expected_freq_itemsets).count(), 0) - self.assertEqual(expected_freq_itemsets.subtract(actual_freq_itemsets).count(), 0) - - def tearDown(self): - del self.data - - -class ImageReaderTest(SparkSessionTestCase): - - def test_read_images(self): - data_path = 'data/mllib/images/origin/kittens' - df = ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True) - self.assertEqual(df.count(), 4) - first_row = df.take(1)[0][0] - array = ImageSchema.toNDArray(first_row) - self.assertEqual(len(array), first_row[1]) - self.assertEqual(ImageSchema.toImage(array, origin=first_row[0]), first_row) - self.assertEqual(df.schema, ImageSchema.imageSchema) - self.assertEqual(df.schema["image"].dataType, ImageSchema.columnSchema) - expected = {'CV_8UC3': 16, 'Undefined': -1, 'CV_8U': 0, 'CV_8UC1': 0, 'CV_8UC4': 24} - self.assertEqual(ImageSchema.ocvTypes, expected) - expected = ['origin', 'height', 'width', 'nChannels', 'mode', 'data'] - self.assertEqual(ImageSchema.imageFields, expected) - self.assertEqual(ImageSchema.undefinedImageType, "Undefined") - - with QuietTest(self.sc): - self.assertRaisesRegexp( - TypeError, - "image argument should be pyspark.sql.types.Row; however", - lambda: 
ImageSchema.toNDArray("a")) - - with QuietTest(self.sc): - self.assertRaisesRegexp( - ValueError, - "image argument should have attributes specified in", - lambda: ImageSchema.toNDArray(Row(a=1))) - - with QuietTest(self.sc): - self.assertRaisesRegexp( - TypeError, - "array argument should be numpy.ndarray; however, it got", - lambda: ImageSchema.toImage("a")) - - -class ImageReaderTest2(PySparkTestCase): - - @classmethod - def setUpClass(cls): - super(ImageReaderTest2, cls).setUpClass() - cls.hive_available = True - # Note that here we enable Hive's support. - cls.spark = None - try: - cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf() - except py4j.protocol.Py4JError: - cls.tearDownClass() - cls.hive_available = False - except TypeError: - cls.tearDownClass() - cls.hive_available = False - if cls.hive_available: - cls.spark = HiveContext._createForTesting(cls.sc) - - def setUp(self): - if not self.hive_available: - self.skipTest("Hive is not available.") - - @classmethod - def tearDownClass(cls): - super(ImageReaderTest2, cls).tearDownClass() - if cls.spark is not None: - cls.spark.sparkSession.stop() - cls.spark = None - - def test_read_images_multiple_times(self): - # This test case is to check if `ImageSchema.readImages` tries to - # initiate Hive client multiple times. See SPARK-22651. - data_path = 'data/mllib/images/origin/kittens' - ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True) - ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True) - - -class ALSTest(SparkSessionTestCase): - - def test_storage_levels(self): - df = self.spark.createDataFrame( - [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)], - ["user", "item", "rating"]) - als = ALS().setMaxIter(1).setRank(1) - # test default params - als.fit(df) - self.assertEqual(als.getIntermediateStorageLevel(), "MEMORY_AND_DISK") - self.assertEqual(als._java_obj.getIntermediateStorageLevel(), "MEMORY_AND_DISK") - self.assertEqual(als.getFinalStorageLevel(), "MEMORY_AND_DISK") - self.assertEqual(als._java_obj.getFinalStorageLevel(), "MEMORY_AND_DISK") - # test non-default params - als.setIntermediateStorageLevel("MEMORY_ONLY_2") - als.setFinalStorageLevel("DISK_ONLY") - als.fit(df) - self.assertEqual(als.getIntermediateStorageLevel(), "MEMORY_ONLY_2") - self.assertEqual(als._java_obj.getIntermediateStorageLevel(), "MEMORY_ONLY_2") - self.assertEqual(als.getFinalStorageLevel(), "DISK_ONLY") - self.assertEqual(als._java_obj.getFinalStorageLevel(), "DISK_ONLY") - - -class DefaultValuesTests(PySparkTestCase): - """ - Test :py:class:`JavaParams` classes to see if their default Param values match - those in their Scala counterparts. 
- """ - - def test_java_params(self): - import pyspark.ml.feature - import pyspark.ml.classification - import pyspark.ml.clustering - import pyspark.ml.evaluation - import pyspark.ml.pipeline - import pyspark.ml.recommendation - import pyspark.ml.regression - - modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering, - pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation, - pyspark.ml.regression] - for module in modules: - for name, cls in inspect.getmembers(module, inspect.isclass): - if not name.endswith('Model') and not name.endswith('Params')\ - and issubclass(cls, JavaParams) and not inspect.isabstract(cls): - # NOTE: disable check_params_exist until there is parity with Scala API - ParamTests.check_params(self, cls(), check_params_exist=False) - - # Additional classes that need explicit construction - from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel - ParamTests.check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'), - check_params_exist=False) - ParamTests.check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'), - check_params_exist=False) - - -def _squared_distance(a, b): - if isinstance(a, Vector): - return a.squared_distance(b) - else: - return b.squared_distance(a) - - -class VectorTests(MLlibTestCase): - - def _test_serialize(self, v): - self.assertEqual(v, ser.loads(ser.dumps(v))) - jvec = self.sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(bytearray(ser.dumps(v))) - nv = ser.loads(bytes(self.sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(jvec))) - self.assertEqual(v, nv) - vs = [v] * 100 - jvecs = self.sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(bytearray(ser.dumps(vs))) - nvs = ser.loads(bytes(self.sc._jvm.org.apache.spark.ml.python.MLSerDe.dumps(jvecs))) - self.assertEqual(vs, nvs) - - def test_serialize(self): - self._test_serialize(DenseVector(range(10))) - self._test_serialize(DenseVector(array([1., 2., 3., 4.]))) - self._test_serialize(DenseVector(pyarray.array('d', range(10)))) - self._test_serialize(SparseVector(4, {1: 1, 3: 2})) - self._test_serialize(SparseVector(3, {})) - self._test_serialize(DenseMatrix(2, 3, range(6))) - sm1 = SparseMatrix( - 3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0]) - self._test_serialize(sm1) - - def test_dot(self): - sv = SparseVector(4, {1: 1, 3: 2}) - dv = DenseVector(array([1., 2., 3., 4.])) - lst = DenseVector([1, 2, 3, 4]) - mat = array([[1., 2., 3., 4.], - [1., 2., 3., 4.], - [1., 2., 3., 4.], - [1., 2., 3., 4.]]) - arr = pyarray.array('d', [0, 1, 2, 3]) - self.assertEqual(10.0, sv.dot(dv)) - self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat))) - self.assertEqual(30.0, dv.dot(dv)) - self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat))) - self.assertEqual(30.0, lst.dot(dv)) - self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat))) - self.assertEqual(7.0, sv.dot(arr)) - - def test_squared_distance(self): - sv = SparseVector(4, {1: 1, 3: 2}) - dv = DenseVector(array([1., 2., 3., 4.])) - lst = DenseVector([4, 3, 2, 1]) - lst1 = [4, 3, 2, 1] - arr = pyarray.array('d', [0, 2, 1, 3]) - narr = array([0, 2, 1, 3]) - self.assertEqual(15.0, _squared_distance(sv, dv)) - self.assertEqual(25.0, _squared_distance(sv, lst)) - self.assertEqual(20.0, _squared_distance(dv, lst)) - self.assertEqual(15.0, _squared_distance(dv, sv)) - self.assertEqual(25.0, _squared_distance(lst, sv)) - self.assertEqual(20.0, _squared_distance(lst, dv)) - self.assertEqual(0.0, _squared_distance(sv, sv)) - 
self.assertEqual(0.0, _squared_distance(dv, dv)) - self.assertEqual(0.0, _squared_distance(lst, lst)) - self.assertEqual(25.0, _squared_distance(sv, lst1)) - self.assertEqual(3.0, _squared_distance(sv, arr)) - self.assertEqual(3.0, _squared_distance(sv, narr)) - - def test_hash(self): - v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v4 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertEqual(hash(v1), hash(v2)) - self.assertEqual(hash(v1), hash(v3)) - self.assertEqual(hash(v2), hash(v3)) - self.assertFalse(hash(v1) == hash(v4)) - self.assertFalse(hash(v2) == hash(v4)) - - def test_eq(self): - v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v4 = SparseVector(6, [(1, 1.0), (3, 5.5)]) - v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) - v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertEqual(v1, v2) - self.assertEqual(v1, v3) - self.assertFalse(v2 == v4) - self.assertFalse(v1 == v5) - self.assertFalse(v1 == v6) - - def test_equals(self): - indices = [1, 2, 4] - values = [1., 3., 2.] - self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.])) - self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.])) - self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.])) - self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.])) - - def test_conversion(self): - # numpy arrays should be automatically upcast to float64 - # tests for fix of [SPARK-5089] - v = array([1, 2, 3, 4], dtype='float64') - dv = DenseVector(v) - self.assertTrue(dv.array.dtype == 'float64') - v = array([1, 2, 3, 4], dtype='float32') - dv = DenseVector(v) - self.assertTrue(dv.array.dtype == 'float64') - - def test_sparse_vector_indexing(self): - sv = SparseVector(5, {1: 1, 3: 2}) - self.assertEqual(sv[0], 0.) - self.assertEqual(sv[3], 2.) - self.assertEqual(sv[1], 1.) - self.assertEqual(sv[2], 0.) - self.assertEqual(sv[4], 0.) - self.assertEqual(sv[-1], 0.) - self.assertEqual(sv[-2], 2.) - self.assertEqual(sv[-3], 0.) - self.assertEqual(sv[-5], 0.) 
- for ind in [5, -6]: - self.assertRaises(IndexError, sv.__getitem__, ind) - for ind in [7.8, '1']: - self.assertRaises(TypeError, sv.__getitem__, ind) - - zeros = SparseVector(4, {}) - self.assertEqual(zeros[0], 0.0) - self.assertEqual(zeros[3], 0.0) - for ind in [4, -5]: - self.assertRaises(IndexError, zeros.__getitem__, ind) - - empty = SparseVector(0, {}) - for ind in [-1, 0, 1]: - self.assertRaises(IndexError, empty.__getitem__, ind) - - def test_sparse_vector_iteration(self): - self.assertListEqual(list(SparseVector(3, [], [])), [0.0, 0.0, 0.0]) - self.assertListEqual(list(SparseVector(5, [0, 3], [1.0, 2.0])), [1.0, 0.0, 0.0, 2.0, 0.0]) - - def test_matrix_indexing(self): - mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10]) - expected = [[0, 6], [1, 8], [4, 10]] - for i in range(3): - for j in range(2): - self.assertEqual(mat[i, j], expected[i][j]) - - for i, j in [(-1, 0), (4, 1), (3, 4)]: - self.assertRaises(IndexError, mat.__getitem__, (i, j)) - - def test_repr_dense_matrix(self): - mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10]) - self.assertTrue( - repr(mat), - 'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)') - - mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True) - self.assertTrue( - repr(mat), - 'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)') - - mat = DenseMatrix(6, 3, zeros(18)) - self.assertTrue( - repr(mat), - 'DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., \ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)') - - def test_repr_sparse_matrix(self): - sm1t = SparseMatrix( - 3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], - isTransposed=True) - self.assertTrue( - repr(sm1t), - 'SparseMatrix(3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], True)') - - indices = tile(arange(6), 3) - values = ones(18) - sm = SparseMatrix(6, 3, [0, 6, 12, 18], indices, values) - self.assertTrue( - repr(sm), "SparseMatrix(6, 3, [0, 6, 12, 18], \ - [0, 1, 2, 3, 4, 5, 0, 1, ..., 4, 5, 0, 1, 2, 3, 4, 5], \ - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..., \ - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], False)") - - self.assertTrue( - str(sm), - "6 X 3 CSCMatrix\n\ - (0,0) 1.0\n(1,0) 1.0\n(2,0) 1.0\n(3,0) 1.0\n(4,0) 1.0\n(5,0) 1.0\n\ - (0,1) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(3,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n\ - (0,2) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(3,2) 1.0\n..\n..") - - sm = SparseMatrix(1, 18, zeros(19), [], []) - self.assertTrue( - repr(sm), - 'SparseMatrix(1, 18, \ - [0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0], [], [], False)') - - def test_sparse_matrix(self): - # Test sparse matrix creation. - sm1 = SparseMatrix( - 3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0]) - self.assertEqual(sm1.numRows, 3) - self.assertEqual(sm1.numCols, 4) - self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4]) - self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2]) - self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0]) - self.assertTrue( - repr(sm1), - 'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)') - - # Test indexing - expected = [ - [0, 0, 0, 0], - [1, 0, 4, 0], - [2, 0, 5, 0]] - - for i in range(3): - for j in range(4): - self.assertEqual(expected[i][j], sm1[i, j]) - self.assertTrue(array_equal(sm1.toArray(), expected)) - - for i, j in [(-1, 1), (4, 3), (3, 5)]: - self.assertRaises(IndexError, sm1.__getitem__, (i, j)) - - # Test conversion to dense and sparse. 
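For reference, the CSC (compressed sparse column) layout exercised by test_sparse_matrix above can be decoded without Spark. A minimal sketch, assuming only NumPy; the helper name csc_to_dense is illustrative:

import numpy as np

def csc_to_dense(num_rows, num_cols, col_ptrs, row_indices, values):
    # Column j owns the slice [col_ptrs[j], col_ptrs[j + 1]) of row_indices/values.
    out = np.zeros((num_rows, num_cols))
    for j in range(num_cols):
        for k in range(col_ptrs[j], col_ptrs[j + 1]):
            out[row_indices[k], j] = values[k]
    return out

# Reproduces the `expected` array asserted above for sm1.
print(csc_to_dense(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0]))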
- smnew = sm1.toDense().toSparse() - self.assertEqual(sm1.numRows, smnew.numRows) - self.assertEqual(sm1.numCols, smnew.numCols) - self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs)) - self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices)) - self.assertTrue(array_equal(sm1.values, smnew.values)) - - sm1t = SparseMatrix( - 3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], - isTransposed=True) - self.assertEqual(sm1t.numRows, 3) - self.assertEqual(sm1t.numCols, 4) - self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5]) - self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2]) - self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0]) - - expected = [ - [3, 2, 0, 0], - [0, 0, 4, 0], - [9, 0, 8, 0]] - - for i in range(3): - for j in range(4): - self.assertEqual(expected[i][j], sm1t[i, j]) - self.assertTrue(array_equal(sm1t.toArray(), expected)) - - def test_dense_matrix_is_transposed(self): - mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True) - mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9]) - self.assertEqual(mat1, mat) - - expected = [[0, 4], [1, 6], [3, 9]] - for i in range(3): - for j in range(2): - self.assertEqual(mat1[i, j], expected[i][j]) - self.assertTrue(array_equal(mat1.toArray(), expected)) - - sm = mat1.toSparse() - self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2])) - self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5])) - self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9])) - - def test_norms(self): - a = DenseVector([0, 2, 3, -1]) - self.assertAlmostEqual(a.norm(2), 3.742, 3) - self.assertTrue(a.norm(1), 6) - self.assertTrue(a.norm(inf), 3) - a = SparseVector(4, [0, 2], [3, -4]) - self.assertAlmostEqual(a.norm(2), 5) - self.assertTrue(a.norm(1), 7) - self.assertTrue(a.norm(inf), 4) - - tmp = SparseVector(4, [0, 2], [3, 0]) - self.assertEqual(tmp.numNonzeros(), 1) - - -class VectorUDTTests(MLlibTestCase): - - dv0 = DenseVector([]) - dv1 = DenseVector([1.0, 2.0]) - sv0 = SparseVector(2, [], []) - sv1 = SparseVector(2, [1], [2.0]) - udt = VectorUDT() - - def test_json_schema(self): - self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt) - - def test_serialization(self): - for v in [self.dv0, self.dv1, self.sv0, self.sv1]: - self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v))) - - def test_infer_schema(self): - rdd = self.sc.parallelize([Row(label=1.0, features=self.dv1), - Row(label=0.0, features=self.sv1)]) - df = rdd.toDF() - schema = df.schema - field = [f for f in schema.fields if f.name == "features"][0] - self.assertEqual(field.dataType, self.udt) - vectors = df.rdd.map(lambda p: p.features).collect() - self.assertEqual(len(vectors), 2) - for v in vectors: - if isinstance(v, SparseVector): - self.assertEqual(v, self.sv1) - elif isinstance(v, DenseVector): - self.assertEqual(v, self.dv1) - else: - raise TypeError("expecting a vector but got %r of type %r" % (v, type(v))) - - -class MatrixUDTTests(MLlibTestCase): - - dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10]) - dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True) - sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0]) - sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True) - udt = MatrixUDT() - - def test_json_schema(self): - self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt) - - def test_serialization(self): - for m in [self.dm1, self.dm2, self.sm1, self.sm2]: - self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m))) - - def test_infer_schema(self): - rdd = self.sc.parallelize([("dense", 
self.dm1), ("sparse", self.sm1)]) - df = rdd.toDF() - schema = df.schema - self.assertTrue(schema.fields[1].dataType, self.udt) - matrices = df.rdd.map(lambda x: x._2).collect() - self.assertEqual(len(matrices), 2) - for m in matrices: - if isinstance(m, DenseMatrix): - self.assertTrue(m, self.dm1) - elif isinstance(m, SparseMatrix): - self.assertTrue(m, self.sm1) - else: - raise ValueError("Expected a matrix but got type %r" % type(m)) - - -class WrapperTests(MLlibTestCase): - - def test_new_java_array(self): - # test array of strings - str_list = ["a", "b", "c"] - java_class = self.sc._gateway.jvm.java.lang.String - java_array = JavaWrapper._new_java_array(str_list, java_class) - self.assertEqual(_java2py(self.sc, java_array), str_list) - # test array of integers - int_list = [1, 2, 3] - java_class = self.sc._gateway.jvm.java.lang.Integer - java_array = JavaWrapper._new_java_array(int_list, java_class) - self.assertEqual(_java2py(self.sc, java_array), int_list) - # test array of floats - float_list = [0.1, 0.2, 0.3] - java_class = self.sc._gateway.jvm.java.lang.Double - java_array = JavaWrapper._new_java_array(float_list, java_class) - self.assertEqual(_java2py(self.sc, java_array), float_list) - # test array of bools - bool_list = [False, True, True] - java_class = self.sc._gateway.jvm.java.lang.Boolean - java_array = JavaWrapper._new_java_array(bool_list, java_class) - self.assertEqual(_java2py(self.sc, java_array), bool_list) - # test array of Java DenseVectors - v1 = DenseVector([0.0, 1.0]) - v2 = DenseVector([1.0, 0.0]) - vec_java_list = [_py2java(self.sc, v1), _py2java(self.sc, v2)] - java_class = self.sc._gateway.jvm.org.apache.spark.ml.linalg.DenseVector - java_array = JavaWrapper._new_java_array(vec_java_list, java_class) - self.assertEqual(_java2py(self.sc, java_array), [v1, v2]) - # test empty array - java_class = self.sc._gateway.jvm.java.lang.Integer - java_array = JavaWrapper._new_java_array([], java_class) - self.assertEqual(_java2py(self.sc, java_array), []) - - -class ChiSquareTestTests(SparkSessionTestCase): - - def test_chisquaretest(self): - data = [[0, Vectors.dense([0, 1, 2])], - [1, Vectors.dense([1, 1, 1])], - [2, Vectors.dense([2, 1, 0])]] - df = self.spark.createDataFrame(data, ['label', 'feat']) - res = ChiSquareTest.test(df, 'feat', 'label') - # This line is hitting the collect bug described in #17218, commented for now. 
- # pValues = res.select("degreesOfFreedom").collect()) - self.assertIsInstance(res, DataFrame) - fieldNames = set(field.name for field in res.schema.fields) - expectedFields = ["pValues", "degreesOfFreedom", "statistics"] - self.assertTrue(all(field in fieldNames for field in expectedFields)) - - -class UnaryTransformerTests(SparkSessionTestCase): - - def test_unary_transformer_validate_input_type(self): - shiftVal = 3 - transformer = MockUnaryTransformer(shiftVal=shiftVal)\ - .setInputCol("input").setOutputCol("output") - - # should not raise any errors - transformer.validateInputType(DoubleType()) - - with self.assertRaises(TypeError): - # passing the wrong input type should raise an error - transformer.validateInputType(IntegerType()) - - def test_unary_transformer_transform(self): - shiftVal = 3 - transformer = MockUnaryTransformer(shiftVal=shiftVal)\ - .setInputCol("input").setOutputCol("output") - - df = self.spark.range(0, 10).toDF('input') - df = df.withColumn("input", df.input.cast(dataType="double")) - - transformed_df = transformer.transform(df) - results = transformed_df.select("input", "output").collect() - - for res in results: - self.assertEqual(res.input + shiftVal, res.output) - - -class EstimatorTest(unittest.TestCase): - - def testDefaultFitMultiple(self): - N = 4 - data = MockDataset() - estimator = MockEstimator() - params = [{estimator.fake: i} for i in range(N)] - modelIter = estimator.fitMultiple(data, params) - indexList = [] - for index, model in modelIter: - self.assertEqual(model.getFake(), index) - indexList.append(index) - self.assertEqual(sorted(indexList), list(range(N))) - - -if __name__ == "__main__": - from pyspark.ml.tests import * - if xmlrunner: - unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2) - else: - unittest.main(verbosity=2) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/tuning.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/tuning.py deleted file mode 100644 index 1f4abf5..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/tuning.py +++ /dev/null @@ -1,785 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import itertools -import sys -from multiprocessing.pool import ThreadPool - -import numpy as np - -from pyspark import since, keyword_only -from pyspark.ml import Estimator, Model -from pyspark.ml.common import _py2java -from pyspark.ml.param import Params, Param, TypeConverters -from pyspark.ml.param.shared import HasCollectSubModels, HasParallelism, HasSeed -from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaParams -from pyspark.sql.functions import rand - -__all__ = ['ParamGridBuilder', 'CrossValidator', 'CrossValidatorModel', 'TrainValidationSplit', - 'TrainValidationSplitModel'] - - -def _parallelFitTasks(est, train, eva, validation, epm, collectSubModel): - """ - Creates a list of callables which can be called from different threads to fit and evaluate - an estimator in parallel. Each callable returns an `(index, metric)` pair. - - :param est: Estimator, the estimator to be fit. - :param train: DataFrame, training data set, used for fitting. - :param eva: Evaluator, used to compute `metric` - :param validation: DataFrame, validation data set, used for evaluation. - :param epm: Sequence of ParamMap, params maps to be used during fitting & evaluation. - :param collectSubModel: Whether to collect sub model. - :return: (int, float, subModel), an index into `epm` and the associated metric value. - """ - modelIter = est.fitMultiple(train, epm) - - def singleTask(): - index, model = next(modelIter) - metric = eva.evaluate(model.transform(validation, epm[index])) - return index, metric, model if collectSubModel else None - - return [singleTask] * len(epm) - - -class ParamGridBuilder(object): - r""" - Builder for a param grid used in grid search-based model selection. - - >>> from pyspark.ml.classification import LogisticRegression - >>> lr = LogisticRegression() - >>> output = ParamGridBuilder() \ - ... .baseOn({lr.labelCol: 'l'}) \ - ... .baseOn([lr.predictionCol, 'p']) \ - ... .addGrid(lr.regParam, [1.0, 2.0]) \ - ... .addGrid(lr.maxIter, [1, 5]) \ - ... .build() - >>> expected = [ - ... {lr.regParam: 1.0, lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'}, - ... {lr.regParam: 2.0, lr.maxIter: 1, lr.labelCol: 'l', lr.predictionCol: 'p'}, - ... {lr.regParam: 1.0, lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'}, - ... {lr.regParam: 2.0, lr.maxIter: 5, lr.labelCol: 'l', lr.predictionCol: 'p'}] - >>> len(output) == len(expected) - True - >>> all([m in expected for m in output]) - True - - .. versionadded:: 1.4.0 - """ - - def __init__(self): - self._param_grid = {} - - @since("1.4.0") - def addGrid(self, param, values): - """ - Sets the given parameters in this grid to fixed values. - """ - self._param_grid[param] = values - - return self - - @since("1.4.0") - def baseOn(self, *args): - """ - Sets the given parameters in this grid to fixed values. - Accepts either a parameter dictionary or a list of (parameter, value) pairs. - """ - if isinstance(args[0], dict): - self.baseOn(*args[0].items()) - else: - for (param, value) in args: - self.addGrid(param, [value]) - - return self - - @since("1.4.0") - def build(self): - """ - Builds and returns all combinations of parameters specified - by the param grid. 
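The build() implementation that follows takes the cartesian product of the per-parameter value lists. A minimal sketch of the same idea with plain dictionaries; the parameter names regParam and maxIter are only illustrative:

import itertools

param_grid = {"regParam": [1.0, 2.0], "maxIter": [1, 5]}
keys = list(param_grid.keys())
combos = [dict(zip(keys, values))
          for values in itertools.product(*param_grid.values())]
# combos holds 4 param maps: every pairing of a regParam value with a maxIter value.
print(combos)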
- """ - keys = self._param_grid.keys() - grid_values = self._param_grid.values() - - def to_key_value_pairs(keys, values): - return [(key, key.typeConverter(value)) for key, value in zip(keys, values)] - - return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)] - - -class ValidatorParams(HasSeed): - """ - Common params for TrainValidationSplit and CrossValidator. - """ - - estimator = Param(Params._dummy(), "estimator", "estimator to be cross-validated") - estimatorParamMaps = Param(Params._dummy(), "estimatorParamMaps", "estimator param maps") - evaluator = Param( - Params._dummy(), "evaluator", - "evaluator used to select hyper-parameters that maximize the validator metric") - - def setEstimator(self, value): - """ - Sets the value of :py:attr:`estimator`. - """ - return self._set(estimator=value) - - def getEstimator(self): - """ - Gets the value of estimator or its default value. - """ - return self.getOrDefault(self.estimator) - - def setEstimatorParamMaps(self, value): - """ - Sets the value of :py:attr:`estimatorParamMaps`. - """ - return self._set(estimatorParamMaps=value) - - def getEstimatorParamMaps(self): - """ - Gets the value of estimatorParamMaps or its default value. - """ - return self.getOrDefault(self.estimatorParamMaps) - - def setEvaluator(self, value): - """ - Sets the value of :py:attr:`evaluator`. - """ - return self._set(evaluator=value) - - def getEvaluator(self): - """ - Gets the value of evaluator or its default value. - """ - return self.getOrDefault(self.evaluator) - - @classmethod - def _from_java_impl(cls, java_stage): - """ - Return Python estimator, estimatorParamMaps, and evaluator from a Java ValidatorParams. - """ - - # Load information from java_stage to the instance. - estimator = JavaParams._from_java(java_stage.getEstimator()) - evaluator = JavaParams._from_java(java_stage.getEvaluator()) - epms = [estimator._transfer_param_map_from_java(epm) - for epm in java_stage.getEstimatorParamMaps()] - return estimator, epms, evaluator - - def _to_java_impl(self): - """ - Return Java estimator, estimatorParamMaps, and evaluator from this Python instance. - """ - - gateway = SparkContext._gateway - cls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap - - java_epms = gateway.new_array(cls, len(self.getEstimatorParamMaps())) - for idx, epm in enumerate(self.getEstimatorParamMaps()): - java_epms[idx] = self.getEstimator()._transfer_param_map_to_java(epm) - - java_estimator = self.getEstimator()._to_java() - java_evaluator = self.getEvaluator()._to_java() - return java_estimator, java_epms, java_evaluator - - -class CrossValidator(Estimator, ValidatorParams, HasParallelism, HasCollectSubModels, - MLReadable, MLWritable): - """ - - K-fold cross validation performs model selection by splitting the dataset into a set of - non-overlapping randomly partitioned folds which are used as separate training and test datasets - e.g., with k=3 folds, K-fold cross validation will generate 3 (training, test) dataset pairs, - each of which uses 2/3 of the data for training and 1/3 for testing. Each fold is used as the - test set exactly once. - - - >>> from pyspark.ml.classification import LogisticRegression - >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator - >>> from pyspark.ml.linalg import Vectors - >>> dataset = spark.createDataFrame( - ... [(Vectors.dense([0.0]), 0.0), - ... (Vectors.dense([0.4]), 1.0), - ... (Vectors.dense([0.5]), 0.0), - ... (Vectors.dense([0.6]), 1.0), - ... 
(Vectors.dense([1.0]), 1.0)] * 10, - ... ["features", "label"]) - >>> lr = LogisticRegression() - >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - >>> evaluator = BinaryClassificationEvaluator() - >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, - ... parallelism=2) - >>> cvModel = cv.fit(dataset) - >>> cvModel.avgMetrics[0] - 0.5 - >>> evaluator.evaluate(cvModel.transform(dataset)) - 0.8333... - - .. versionadded:: 1.4.0 - """ - - numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation", - typeConverter=TypeConverters.toInt) - - @keyword_only - def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, - seed=None, parallelism=1, collectSubModels=False): - """ - __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3,\ - seed=None, parallelism=1, collectSubModels=False) - """ - super(CrossValidator, self).__init__() - self._setDefault(numFolds=3, parallelism=1) - kwargs = self._input_kwargs - self._set(**kwargs) - - @keyword_only - @since("1.4.0") - def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, - seed=None, parallelism=1, collectSubModels=False): - """ - setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3,\ - seed=None, parallelism=1, collectSubModels=False): - Sets params for cross validator. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("1.4.0") - def setNumFolds(self, value): - """ - Sets the value of :py:attr:`numFolds`. - """ - return self._set(numFolds=value) - - @since("1.4.0") - def getNumFolds(self): - """ - Gets the value of numFolds or its default value. - """ - return self.getOrDefault(self.numFolds) - - def _fit(self, dataset): - est = self.getOrDefault(self.estimator) - epm = self.getOrDefault(self.estimatorParamMaps) - numModels = len(epm) - eva = self.getOrDefault(self.evaluator) - nFolds = self.getOrDefault(self.numFolds) - seed = self.getOrDefault(self.seed) - h = 1.0 / nFolds - randCol = self.uid + "_rand" - df = dataset.select("*", rand(seed).alias(randCol)) - metrics = [0.0] * numModels - - pool = ThreadPool(processes=min(self.getParallelism(), numModels)) - subModels = None - collectSubModelsParam = self.getCollectSubModels() - if collectSubModelsParam: - subModels = [[None for j in range(numModels)] for i in range(nFolds)] - - for i in range(nFolds): - validateLB = i * h - validateUB = (i + 1) * h - condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB) - validation = df.filter(condition).cache() - train = df.filter(~condition).cache() - - tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam) - for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks): - metrics[j] += (metric / nFolds) - if collectSubModelsParam: - subModels[i][j] = subModel - - validation.unpersist() - train.unpersist() - - if eva.isLargerBetter(): - bestIndex = np.argmax(metrics) - else: - bestIndex = np.argmin(metrics) - bestModel = est.fit(dataset, epm[bestIndex]) - return self._copyValues(CrossValidatorModel(bestModel, metrics, subModels)) - - @since("1.4.0") - def copy(self, extra=None): - """ - Creates a copy of this instance with a randomly generated uid - and some extra params. This copies creates a deep copy of - the embedded paramMap, and copies the embedded and extra parameters over. 
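The _fit() method above buckets rows into folds by attaching a uniform random column and letting fold i claim the interval [i/k, (i+1)/k). A Spark-free sketch of that bucketing rule, using a hypothetical helper assign_folds:

import random

def assign_folds(n_rows, n_folds, seed=42):
    rng = random.Random(seed)
    h = 1.0 / n_folds
    folds = [[] for _ in range(n_folds)]
    for row in range(n_rows):
        r = rng.random()  # plays the role of the rand(seed) column
        folds[min(int(r / h), n_folds - 1)].append(row)
    return folds

# Each row lands in exactly one fold; fold i serves as the test set on iteration i.
print([len(f) for f in assign_folds(100, 3)])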
- - :param extra: Extra parameters to copy to the new instance - :return: Copy of this instance - """ - if extra is None: - extra = dict() - newCV = Params.copy(self, extra) - if self.isSet(self.estimator): - newCV.setEstimator(self.getEstimator().copy(extra)) - # estimatorParamMaps remain the same - if self.isSet(self.evaluator): - newCV.setEvaluator(self.getEvaluator().copy(extra)) - return newCV - - @since("2.3.0") - def write(self): - """Returns an MLWriter instance for this ML instance.""" - return JavaMLWriter(self) - - @classmethod - @since("2.3.0") - def read(cls): - """Returns an MLReader instance for this class.""" - return JavaMLReader(cls) - - @classmethod - def _from_java(cls, java_stage): - """ - Given a Java CrossValidator, create and return a Python wrapper of it. - Used for ML persistence. - """ - - estimator, epms, evaluator = super(CrossValidator, cls)._from_java_impl(java_stage) - numFolds = java_stage.getNumFolds() - seed = java_stage.getSeed() - parallelism = java_stage.getParallelism() - collectSubModels = java_stage.getCollectSubModels() - # Create a new instance of this stage. - py_stage = cls(estimator=estimator, estimatorParamMaps=epms, evaluator=evaluator, - numFolds=numFolds, seed=seed, parallelism=parallelism, - collectSubModels=collectSubModels) - py_stage._resetUid(java_stage.uid()) - return py_stage - - def _to_java(self): - """ - Transfer this instance to a Java CrossValidator. Used for ML persistence. - - :return: Java object equivalent to this instance. - """ - - estimator, epms, evaluator = super(CrossValidator, self)._to_java_impl() - - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidator", self.uid) - _java_obj.setEstimatorParamMaps(epms) - _java_obj.setEvaluator(evaluator) - _java_obj.setEstimator(estimator) - _java_obj.setSeed(self.getSeed()) - _java_obj.setNumFolds(self.getNumFolds()) - _java_obj.setParallelism(self.getParallelism()) - _java_obj.setCollectSubModels(self.getCollectSubModels()) - - return _java_obj - - -class CrossValidatorModel(Model, ValidatorParams, MLReadable, MLWritable): - """ - - CrossValidatorModel contains the model with the highest average cross-validation - metric across folds and uses this model to transform input data. CrossValidatorModel - also tracks the metrics for each param map evaluated. - - .. versionadded:: 1.4.0 - """ - - def __init__(self, bestModel, avgMetrics=[], subModels=None): - super(CrossValidatorModel, self).__init__() - #: best model from cross validation - self.bestModel = bestModel - #: Average cross-validation metrics for each paramMap in - #: CrossValidator.estimatorParamMaps, in the corresponding order. - self.avgMetrics = avgMetrics - #: sub model list from cross validation - self.subModels = subModels - - def _transform(self, dataset): - return self.bestModel.transform(dataset) - - @since("1.4.0") - def copy(self, extra=None): - """ - Creates a copy of this instance with a randomly generated uid - and some extra params. This copies the underlying bestModel, - creates a deep copy of the embedded paramMap, and - copies the embedded and extra parameters over. - It does not copy the extra Params into the subModels. 
- - :param extra: Extra parameters to copy to the new instance - :return: Copy of this instance - """ - if extra is None: - extra = dict() - bestModel = self.bestModel.copy(extra) - avgMetrics = self.avgMetrics - subModels = self.subModels - return CrossValidatorModel(bestModel, avgMetrics, subModels) - - @since("2.3.0") - def write(self): - """Returns an MLWriter instance for this ML instance.""" - return JavaMLWriter(self) - - @classmethod - @since("2.3.0") - def read(cls): - """Returns an MLReader instance for this class.""" - return JavaMLReader(cls) - - @classmethod - def _from_java(cls, java_stage): - """ - Given a Java CrossValidatorModel, create and return a Python wrapper of it. - Used for ML persistence. - """ - bestModel = JavaParams._from_java(java_stage.bestModel()) - estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage) - - py_stage = cls(bestModel=bestModel).setEstimator(estimator) - py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator) - - if java_stage.hasSubModels(): - py_stage.subModels = [[JavaParams._from_java(sub_model) - for sub_model in fold_sub_models] - for fold_sub_models in java_stage.subModels()] - - py_stage._resetUid(java_stage.uid()) - return py_stage - - def _to_java(self): - """ - Transfer this instance to a Java CrossValidatorModel. Used for ML persistence. - - :return: Java object equivalent to this instance. - """ - - sc = SparkContext._active_spark_context - # TODO: persist average metrics as well - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel", - self.uid, - self.bestModel._to_java(), - _py2java(sc, [])) - estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl() - - _java_obj.set("evaluator", evaluator) - _java_obj.set("estimator", estimator) - _java_obj.set("estimatorParamMaps", epms) - - if self.subModels is not None: - java_sub_models = [[sub_model._to_java() for sub_model in fold_sub_models] - for fold_sub_models in self.subModels] - _java_obj.setSubModels(java_sub_models) - return _java_obj - - -class TrainValidationSplit(Estimator, ValidatorParams, HasParallelism, HasCollectSubModels, - MLReadable, MLWritable): - """ - .. note:: Experimental - - Validation for hyper-parameter tuning. Randomly splits the input dataset into train and - validation sets, and uses evaluation metric on the validation set to select the best model. - Similar to :class:`CrossValidator`, but only splits the set once. - - >>> from pyspark.ml.classification import LogisticRegression - >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator - >>> from pyspark.ml.linalg import Vectors - >>> dataset = spark.createDataFrame( - ... [(Vectors.dense([0.0]), 0.0), - ... (Vectors.dense([0.4]), 1.0), - ... (Vectors.dense([0.5]), 0.0), - ... (Vectors.dense([0.6]), 1.0), - ... (Vectors.dense([1.0]), 1.0)] * 10, - ... ["features", "label"]) - >>> lr = LogisticRegression() - >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - >>> evaluator = BinaryClassificationEvaluator() - >>> tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, - ... parallelism=2) - >>> tvsModel = tvs.fit(dataset) - >>> evaluator.evaluate(tvsModel.transform(dataset)) - 0.8333... - - .. versionadded:: 2.0.0 - """ - - trainRatio = Param(Params._dummy(), "trainRatio", "Param for ratio between train and\ - validation data. 
Must be between 0 and 1.", typeConverter=TypeConverters.toFloat) - - @keyword_only - def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75, - parallelism=1, collectSubModels=False, seed=None): - """ - __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75,\ - parallelism=1, collectSubModels=False, seed=None) - """ - super(TrainValidationSplit, self).__init__() - self._setDefault(trainRatio=0.75, parallelism=1) - kwargs = self._input_kwargs - self._set(**kwargs) - - @since("2.0.0") - @keyword_only - def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75, - parallelism=1, collectSubModels=False, seed=None): - """ - setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75,\ - parallelism=1, collectSubModels=False, seed=None): - Sets params for the train validation split. - """ - kwargs = self._input_kwargs - return self._set(**kwargs) - - @since("2.0.0") - def setTrainRatio(self, value): - """ - Sets the value of :py:attr:`trainRatio`. - """ - return self._set(trainRatio=value) - - @since("2.0.0") - def getTrainRatio(self): - """ - Gets the value of trainRatio or its default value. - """ - return self.getOrDefault(self.trainRatio) - - def _fit(self, dataset): - est = self.getOrDefault(self.estimator) - epm = self.getOrDefault(self.estimatorParamMaps) - numModels = len(epm) - eva = self.getOrDefault(self.evaluator) - tRatio = self.getOrDefault(self.trainRatio) - seed = self.getOrDefault(self.seed) - randCol = self.uid + "_rand" - df = dataset.select("*", rand(seed).alias(randCol)) - condition = (df[randCol] >= tRatio) - validation = df.filter(condition).cache() - train = df.filter(~condition).cache() - - subModels = None - collectSubModelsParam = self.getCollectSubModels() - if collectSubModelsParam: - subModels = [None for i in range(numModels)] - - tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam) - pool = ThreadPool(processes=min(self.getParallelism(), numModels)) - metrics = [None] * numModels - for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks): - metrics[j] = metric - if collectSubModelsParam: - subModels[j] = subModel - - train.unpersist() - validation.unpersist() - - if eva.isLargerBetter(): - bestIndex = np.argmax(metrics) - else: - bestIndex = np.argmin(metrics) - bestModel = est.fit(dataset, epm[bestIndex]) - return self._copyValues(TrainValidationSplitModel(bestModel, metrics, subModels)) - - @since("2.0.0") - def copy(self, extra=None): - """ - Creates a copy of this instance with a randomly generated uid - and some extra params. This copies creates a deep copy of - the embedded paramMap, and copies the embedded and extra parameters over. 
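Unlike CrossValidator, the _fit() above performs a single split: a row goes to the validation set when its random value is >= trainRatio, otherwise to the training set. A Spark-free sketch of that rule; the helper name split_rows is illustrative:

import random

def split_rows(n_rows, train_ratio=0.75, seed=0):
    rng = random.Random(seed)
    train, validation = [], []
    for row in range(n_rows):
        (validation if rng.random() >= train_ratio else train).append(row)
    return train, validation

train, validation = split_rows(1000)
# Roughly 75% of rows end up in train, the remainder in validation.
print(len(train), len(validation))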
- - :param extra: Extra parameters to copy to the new instance - :return: Copy of this instance - """ - if extra is None: - extra = dict() - newTVS = Params.copy(self, extra) - if self.isSet(self.estimator): - newTVS.setEstimator(self.getEstimator().copy(extra)) - # estimatorParamMaps remain the same - if self.isSet(self.evaluator): - newTVS.setEvaluator(self.getEvaluator().copy(extra)) - return newTVS - - @since("2.3.0") - def write(self): - """Returns an MLWriter instance for this ML instance.""" - return JavaMLWriter(self) - - @classmethod - @since("2.3.0") - def read(cls): - """Returns an MLReader instance for this class.""" - return JavaMLReader(cls) - - @classmethod - def _from_java(cls, java_stage): - """ - Given a Java TrainValidationSplit, create and return a Python wrapper of it. - Used for ML persistence. - """ - - estimator, epms, evaluator = super(TrainValidationSplit, cls)._from_java_impl(java_stage) - trainRatio = java_stage.getTrainRatio() - seed = java_stage.getSeed() - parallelism = java_stage.getParallelism() - collectSubModels = java_stage.getCollectSubModels() - # Create a new instance of this stage. - py_stage = cls(estimator=estimator, estimatorParamMaps=epms, evaluator=evaluator, - trainRatio=trainRatio, seed=seed, parallelism=parallelism, - collectSubModels=collectSubModels) - py_stage._resetUid(java_stage.uid()) - return py_stage - - def _to_java(self): - """ - Transfer this instance to a Java TrainValidationSplit. Used for ML persistence. - :return: Java object equivalent to this instance. - """ - - estimator, epms, evaluator = super(TrainValidationSplit, self)._to_java_impl() - - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.TrainValidationSplit", - self.uid) - _java_obj.setEstimatorParamMaps(epms) - _java_obj.setEvaluator(evaluator) - _java_obj.setEstimator(estimator) - _java_obj.setTrainRatio(self.getTrainRatio()) - _java_obj.setSeed(self.getSeed()) - _java_obj.setParallelism(self.getParallelism()) - _java_obj.setCollectSubModels(self.getCollectSubModels()) - return _java_obj - - -class TrainValidationSplitModel(Model, ValidatorParams, MLReadable, MLWritable): - """ - .. note:: Experimental - - Model from train validation split. - - .. versionadded:: 2.0.0 - """ - - def __init__(self, bestModel, validationMetrics=[], subModels=None): - super(TrainValidationSplitModel, self).__init__() - #: best model from train validation split - self.bestModel = bestModel - #: evaluated validation metrics - self.validationMetrics = validationMetrics - #: sub models from train validation split - self.subModels = subModels - - def _transform(self, dataset): - return self.bestModel.transform(dataset) - - @since("2.0.0") - def copy(self, extra=None): - """ - Creates a copy of this instance with a randomly generated uid - and some extra params. This copies the underlying bestModel, - creates a deep copy of the embedded paramMap, and - copies the embedded and extra parameters over. - And, this creates a shallow copy of the validationMetrics. - It does not copy the extra Params into the subModels. 
- - :param extra: Extra parameters to copy to the new instance - :return: Copy of this instance - """ - if extra is None: - extra = dict() - bestModel = self.bestModel.copy(extra) - validationMetrics = list(self.validationMetrics) - subModels = self.subModels - return TrainValidationSplitModel(bestModel, validationMetrics, subModels) - - @since("2.3.0") - def write(self): - """Returns an MLWriter instance for this ML instance.""" - return JavaMLWriter(self) - - @classmethod - @since("2.3.0") - def read(cls): - """Returns an MLReader instance for this class.""" - return JavaMLReader(cls) - - @classmethod - def _from_java(cls, java_stage): - """ - Given a Java TrainValidationSplitModel, create and return a Python wrapper of it. - Used for ML persistence. - """ - - # Load information from java_stage to the instance. - bestModel = JavaParams._from_java(java_stage.bestModel()) - estimator, epms, evaluator = super(TrainValidationSplitModel, - cls)._from_java_impl(java_stage) - # Create a new instance of this stage. - py_stage = cls(bestModel=bestModel).setEstimator(estimator) - py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator) - - if java_stage.hasSubModels(): - py_stage.subModels = [JavaParams._from_java(sub_model) - for sub_model in java_stage.subModels()] - - py_stage._resetUid(java_stage.uid()) - return py_stage - - def _to_java(self): - """ - Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence. - :return: Java object equivalent to this instance. - """ - - sc = SparkContext._active_spark_context - # TODO: persst validation metrics as well - _java_obj = JavaParams._new_java_obj( - "org.apache.spark.ml.tuning.TrainValidationSplitModel", - self.uid, - self.bestModel._to_java(), - _py2java(sc, [])) - estimator, epms, evaluator = super(TrainValidationSplitModel, self)._to_java_impl() - - _java_obj.set("evaluator", evaluator) - _java_obj.set("estimator", estimator) - _java_obj.set("estimatorParamMaps", epms) - - if self.subModels is not None: - java_sub_models = [sub_model._to_java() for sub_model in self.subModels] - _java_obj.setSubModels(java_sub_models) - - return _java_obj - - -if __name__ == "__main__": - import doctest - - from pyspark.sql import SparkSession - globs = globals().copy() - - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("ml.tuning tests")\ - .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc - globs['spark'] = spark - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/util.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/util.py deleted file mode 100644 index e846834..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/util.py +++ /dev/null @@ -1,613 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import json -import sys -import os -import time -import uuid -import warnings - -if sys.version > '3': - basestring = str - unicode = str - long = int - -from pyspark import SparkContext, since -from pyspark.ml.common import inherit_doc -from pyspark.sql import SparkSession -from pyspark.util import VersionUtils - - -def _jvm(): - """ - Returns the JVM view associated with SparkContext. Must be called - after SparkContext is initialized. - """ - jvm = SparkContext._jvm - if jvm: - return jvm - else: - raise AttributeError("Cannot load _jvm from SparkContext. Is SparkContext initialized?") - - -class Identifiable(object): - """ - Object with a unique ID. - """ - - def __init__(self): - #: A unique id for the object. - self.uid = self._randomUID() - - def __repr__(self): - return self.uid - - @classmethod - def _randomUID(cls): - """ - Generate a unique unicode id for the object. The default implementation - concatenates the class name, "_", and 12 random hex chars. - """ - return unicode(cls.__name__ + "_" + uuid.uuid4().hex[-12:]) - - -@inherit_doc -class BaseReadWrite(object): - """ - Base class for MLWriter and MLReader. Stores information about the SparkContext - and SparkSession. - - .. versionadded:: 2.3.0 - """ - - def __init__(self): - self._sparkSession = None - - def context(self, sqlContext): - """ - Sets the Spark SQLContext to use for saving/loading. - - .. note:: Deprecated in 2.1 and will be removed in 3.0, use session instead. - """ - raise NotImplementedError("Read/Write is not yet implemented for type: %s" % type(self)) - - def session(self, sparkSession): - """ - Sets the Spark Session to use for saving/loading. - """ - self._sparkSession = sparkSession - return self - - @property - def sparkSession(self): - """ - Returns the user-specified Spark Session or the default. - """ - if self._sparkSession is None: - self._sparkSession = SparkSession.builder.getOrCreate() - return self._sparkSession - - @property - def sc(self): - """ - Returns the underlying `SparkContext`. - """ - return self.sparkSession.sparkContext - - -@inherit_doc -class MLWriter(BaseReadWrite): - """ - Utility class that can save ML instances. - - .. versionadded:: 2.0.0 - """ - - def __init__(self): - super(MLWriter, self).__init__() - self.shouldOverwrite = False - - def _handleOverwrite(self, path): - from pyspark.ml.wrapper import JavaWrapper - - _java_obj = JavaWrapper._new_java_obj("org.apache.spark.ml.util.FileSystemOverwrite") - wrapper = JavaWrapper(_java_obj) - wrapper._call_java("handleOverwrite", path, True, self.sc._jsc.sc()) - - def save(self, path): - """Save the ML instance to the input path.""" - if self.shouldOverwrite: - self._handleOverwrite(path) - self.saveImpl(path) - - def saveImpl(self, path): - """ - save() handles overwriting and then calls this method. Subclasses should override this - method to implement the actual saving of the instance. 
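For orientation, the writer classes above are normally reached through a short fluent chain. A sketch, assuming a local PySpark installation; the Binarizer stage and output path are only illustrative:

from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer

spark = SparkSession.builder.master("local[1]").appName("writer-sketch").getOrCreate()
binarizer = Binarizer(threshold=0.5, inputCol="value", outputCol="flag")
# overwrite() flips shouldOverwrite; save() then runs _handleOverwrite()
# to clear existing output before delegating the serialization to saveImpl().
binarizer.write().overwrite().save("/tmp/binarizer-sketch")
spark.stop()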
- """ - raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self)) - - def overwrite(self): - """Overwrites if the output path already exists.""" - self.shouldOverwrite = True - return self - - -@inherit_doc -class GeneralMLWriter(MLWriter): - """ - Utility class that can save ML instances in different formats. - - .. versionadded:: 2.4.0 - """ - - def format(self, source): - """ - Specifies the format of ML export (e.g. "pmml", "internal", or the fully qualified class - name for export). - """ - self.source = source - return self - - -@inherit_doc -class JavaMLWriter(MLWriter): - """ - (Private) Specialization of :py:class:`MLWriter` for :py:class:`JavaParams` types - """ - - def __init__(self, instance): - super(JavaMLWriter, self).__init__() - _java_obj = instance._to_java() - self._jwrite = _java_obj.write() - - def save(self, path): - """Save the ML instance to the input path.""" - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) - self._jwrite.save(path) - - def overwrite(self): - """Overwrites if the output path already exists.""" - self._jwrite.overwrite() - return self - - def option(self, key, value): - self._jwrite.option(key, value) - return self - - def context(self, sqlContext): - """ - Sets the SQL context to use for saving. - - .. note:: Deprecated in 2.1 and will be removed in 3.0, use session instead. - """ - warnings.warn( - "Deprecated in 2.1 and will be removed in 3.0, use session instead.", - DeprecationWarning) - self._jwrite.context(sqlContext._ssql_ctx) - return self - - def session(self, sparkSession): - """Sets the Spark Session to use for saving.""" - self._jwrite.session(sparkSession._jsparkSession) - return self - - -@inherit_doc -class GeneralJavaMLWriter(JavaMLWriter): - """ - (Private) Specialization of :py:class:`GeneralMLWriter` for :py:class:`JavaParams` types - """ - - def __init__(self, instance): - super(GeneralJavaMLWriter, self).__init__(instance) - - def format(self, source): - """ - Specifies the format of ML export (e.g. "pmml", "internal", or the fully qualified class - name for export). - """ - self._jwrite.format(source) - return self - - -@inherit_doc -class MLWritable(object): - """ - Mixin for ML instances that provide :py:class:`MLWriter`. - - .. versionadded:: 2.0.0 - """ - - def write(self): - """Returns an MLWriter instance for this ML instance.""" - raise NotImplementedError("MLWritable is not yet implemented for type: %r" % type(self)) - - def save(self, path): - """Save this ML instance to the given path, a shortcut of 'write().save(path)'.""" - self.write().save(path) - - -@inherit_doc -class JavaMLWritable(MLWritable): - """ - (Private) Mixin for ML instances that provide :py:class:`JavaMLWriter`. - """ - - def write(self): - """Returns an MLWriter instance for this ML instance.""" - return JavaMLWriter(self) - - -@inherit_doc -class GeneralJavaMLWritable(JavaMLWritable): - """ - (Private) Mixin for ML instances that provide :py:class:`GeneralJavaMLWriter`. - """ - - def write(self): - """Returns an GeneralMLWriter instance for this ML instance.""" - return GeneralJavaMLWriter(self) - - -@inherit_doc -class MLReader(BaseReadWrite): - """ - Utility class that can load ML instances. - - .. 
versionadded:: 2.0.0 - """ - - def __init__(self): - super(MLReader, self).__init__() - - def load(self, path): - """Load the ML instance from the input path.""" - raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self)) - - -@inherit_doc -class JavaMLReader(MLReader): - """ - (Private) Specialization of :py:class:`MLReader` for :py:class:`JavaParams` types - """ - - def __init__(self, clazz): - super(JavaMLReader, self).__init__() - self._clazz = clazz - self._jread = self._load_java_obj(clazz).read() - - def load(self, path): - """Load the ML instance from the input path.""" - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) - java_obj = self._jread.load(path) - if not hasattr(self._clazz, "_from_java"): - raise NotImplementedError("This Java ML type cannot be loaded into Python currently: %r" - % self._clazz) - return self._clazz._from_java(java_obj) - - def context(self, sqlContext): - """ - Sets the SQL context to use for loading. - - .. note:: Deprecated in 2.1 and will be removed in 3.0, use session instead. - """ - warnings.warn( - "Deprecated in 2.1 and will be removed in 3.0, use session instead.", - DeprecationWarning) - self._jread.context(sqlContext._ssql_ctx) - return self - - def session(self, sparkSession): - """Sets the Spark Session to use for loading.""" - self._jread.session(sparkSession._jsparkSession) - return self - - @classmethod - def _java_loader_class(cls, clazz): - """ - Returns the full class name of the Java ML instance. The default - implementation replaces "pyspark" by "org.apache.spark" in - the Python full class name. - """ - java_package = clazz.__module__.replace("pyspark", "org.apache.spark") - if clazz.__name__ in ("Pipeline", "PipelineModel"): - # Remove the last package name "pipeline" for Pipeline and PipelineModel. - java_package = ".".join(java_package.split(".")[0:-1]) - return java_package + "." + clazz.__name__ - - @classmethod - def _load_java_obj(cls, clazz): - """Load the peer Java object of the ML instance.""" - java_class = cls._java_loader_class(clazz) - java_obj = _jvm() - for name in java_class.split("."): - java_obj = getattr(java_obj, name) - return java_obj - - -@inherit_doc -class MLReadable(object): - """ - Mixin for instances that provide :py:class:`MLReader`. - - .. versionadded:: 2.0.0 - """ - - @classmethod - def read(cls): - """Returns an MLReader instance for this class.""" - raise NotImplementedError("MLReadable.read() not implemented for type: %r" % cls) - - @classmethod - def load(cls, path): - """Reads an ML instance from the input path, a shortcut of `read().load(path)`.""" - return cls.read().load(path) - - -@inherit_doc -class JavaMLReadable(MLReadable): - """ - (Private) Mixin for instances that provide JavaMLReader. - """ - - @classmethod - def read(cls): - """Returns an MLReader instance for this class.""" - return JavaMLReader(cls) - - -@inherit_doc -class JavaPredictionModel(): - """ - (Private) Java Model for prediction tasks (regression and classification). - To be mixed in with class:`pyspark.ml.JavaModel` - """ - - @property - @since("2.1.0") - def numFeatures(self): - """ - Returns the number of features the model was trained on. If unknown, returns -1 - """ - return self._call_java("numFeatures") - - -@inherit_doc -class DefaultParamsWritable(MLWritable): - """ - .. note:: DeveloperApi - - Helper trait for making simple :py:class:`Params` types writable. 
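The _java_loader_class() helper above derives the Java peer class purely by string substitution on the Python module path. A standalone sketch of that mapping, no Spark required:

def java_loader_class(module, name):
    java_package = module.replace("pyspark", "org.apache.spark")
    if name in ("Pipeline", "PipelineModel"):
        # Pipeline and PipelineModel live one package up on the Scala side.
        java_package = ".".join(java_package.split(".")[:-1])
    return java_package + "." + name

print(java_loader_class("pyspark.ml.classification", "LogisticRegression"))
# -> org.apache.spark.ml.classification.LogisticRegression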
If a :py:class:`Params` - class stores all data as :py:class:`Param` values, then extending this trait will provide - a default implementation of writing saved instances of the class. - This only handles simple :py:class:`Param` types; e.g., it will not handle - :py:class:`Dataset`. See :py:class:`DefaultParamsReadable`, the counterpart to this trait. - - .. versionadded:: 2.3.0 - """ - - def write(self): - """Returns a DefaultParamsWriter instance for this class.""" - from pyspark.ml.param import Params - - if isinstance(self, Params): - return DefaultParamsWriter(self) - else: - raise TypeError("Cannot use DefautParamsWritable with type %s because it does not " + - " extend Params.", type(self)) - - -@inherit_doc -class DefaultParamsWriter(MLWriter): - """ - .. note:: DeveloperApi - - Specialization of :py:class:`MLWriter` for :py:class:`Params` types - - Class for writing Estimators and Transformers whose parameters are JSON-serializable. - - .. versionadded:: 2.3.0 - """ - - def __init__(self, instance): - super(DefaultParamsWriter, self).__init__() - self.instance = instance - - def saveImpl(self, path): - DefaultParamsWriter.saveMetadata(self.instance, path, self.sc) - - @staticmethod - def saveMetadata(instance, path, sc, extraMetadata=None, paramMap=None): - """ - Saves metadata + Params to: path + "/metadata" - - class - - timestamp - - sparkVersion - - uid - - paramMap - - defaultParamMap (since 2.4.0) - - (optionally, extra metadata) - :param extraMetadata: Extra metadata to be saved at same level as uid, paramMap, etc. - :param paramMap: If given, this is saved in the "paramMap" field. - """ - metadataPath = os.path.join(path, "metadata") - metadataJson = DefaultParamsWriter._get_metadata_to_save(instance, - sc, - extraMetadata, - paramMap) - sc.parallelize([metadataJson], 1).saveAsTextFile(metadataPath) - - @staticmethod - def _get_metadata_to_save(instance, sc, extraMetadata=None, paramMap=None): - """ - Helper for :py:meth:`DefaultParamsWriter.saveMetadata` which extracts the JSON to save. - This is useful for ensemble models which need to save metadata for many sub-models. - - .. note:: :py:meth:`DefaultParamsWriter.saveMetadata` for details on what this includes. - """ - uid = instance.uid - cls = instance.__module__ + '.' + instance.__class__.__name__ - - # User-supplied param values - params = instance._paramMap - jsonParams = {} - if paramMap is not None: - jsonParams = paramMap - else: - for p in params: - jsonParams[p.name] = params[p] - - # Default param values - jsonDefaultParams = {} - for p in instance._defaultParamMap: - jsonDefaultParams[p.name] = instance._defaultParamMap[p] - - basicMetadata = {"class": cls, "timestamp": long(round(time.time() * 1000)), - "sparkVersion": sc.version, "uid": uid, "paramMap": jsonParams, - "defaultParamMap": jsonDefaultParams} - if extraMetadata is not None: - basicMetadata.update(extraMetadata) - return json.dumps(basicMetadata, separators=[',', ':']) - - -@inherit_doc -class DefaultParamsReadable(MLReadable): - """ - .. note:: DeveloperApi - - Helper trait for making simple :py:class:`Params` types readable. - If a :py:class:`Params` class stores all data as :py:class:`Param` values, - then extending this trait will provide a default implementation of reading saved - instances of the class. This only handles simple :py:class:`Param` types; - e.g., it will not handle :py:class:`Dataset`. See :py:class:`DefaultParamsWritable`, - the counterpart to this trait. - - .. 
versionadded:: 2.3.0 - """ - - @classmethod - def read(cls): - """Returns a DefaultParamsReader instance for this class.""" - return DefaultParamsReader(cls) - - -@inherit_doc -class DefaultParamsReader(MLReader): - """ - .. note:: DeveloperApi - - Specialization of :py:class:`MLReader` for :py:class:`Params` types - - Default :py:class:`MLReader` implementation for transformers and estimators that - contain basic (json-serializable) params and no data. This will not handle - more complex params or types with data (e.g., models with coefficients). - - .. versionadded:: 2.3.0 - """ - - def __init__(self, cls): - super(DefaultParamsReader, self).__init__() - self.cls = cls - - @staticmethod - def __get_class(clazz): - """ - Loads Python class from its name. - """ - parts = clazz.split('.') - module = ".".join(parts[:-1]) - m = __import__(module) - for comp in parts[1:]: - m = getattr(m, comp) - return m - - def load(self, path): - metadata = DefaultParamsReader.loadMetadata(path, self.sc) - py_type = DefaultParamsReader.__get_class(metadata['class']) - instance = py_type() - instance._resetUid(metadata['uid']) - DefaultParamsReader.getAndSetParams(instance, metadata) - return instance - - @staticmethod - def loadMetadata(path, sc, expectedClassName=""): - """ - Load metadata saved using :py:meth:`DefaultParamsWriter.saveMetadata` - - :param expectedClassName: If non empty, this is checked against the loaded metadata. - """ - metadataPath = os.path.join(path, "metadata") - metadataStr = sc.textFile(metadataPath, 1).first() - loadedVals = DefaultParamsReader._parseMetaData(metadataStr, expectedClassName) - return loadedVals - - @staticmethod - def _parseMetaData(metadataStr, expectedClassName=""): - """ - Parse metadata JSON string produced by :py:meth`DefaultParamsWriter._get_metadata_to_save`. - This is a helper function for :py:meth:`DefaultParamsReader.loadMetadata`. - - :param metadataStr: JSON string of metadata - :param expectedClassName: If non empty, this is checked against the loaded metadata. - """ - metadata = json.loads(metadataStr) - className = metadata['class'] - if len(expectedClassName) > 0: - assert className == expectedClassName, "Error loading metadata: Expected " + \ - "class name {} but found class name {}".format(expectedClassName, className) - return metadata - - @staticmethod - def getAndSetParams(instance, metadata): - """ - Extract Params from metadata, and set them in the instance. - """ - # Set user-supplied param values - for paramName in metadata['paramMap']: - param = instance.getParam(paramName) - paramValue = metadata['paramMap'][paramName] - instance.set(param, paramValue) - - # Set default param values - majorAndMinorVersions = VersionUtils.majorMinorVersion(metadata['sparkVersion']) - major = majorAndMinorVersions[0] - minor = majorAndMinorVersions[1] - - # For metadata file prior to Spark 2.4, there is no default section. - if major > 2 or (major == 2 and minor >= 4): - assert 'defaultParamMap' in metadata, "Error loading metadata: Expected " + \ - "`defaultParamMap` section not found" - - for paramName in metadata['defaultParamMap']: - paramValue = metadata['defaultParamMap'][paramName] - instance._setDefault(**{paramName: paramValue}) - - @staticmethod - def loadParamsInstance(path, sc): - """ - Load a :py:class:`Params` instance from the given path, and return it. - This assumes the instance inherits from :py:class:`MLReadable`. 
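A minimal sketch, in plain Python with no SparkContext, of the metadata record that DefaultParamsWriter.saveMetadata writes and DefaultParamsReader.loadMetadata parses back, following the field list documented above. The uid and param values are illustrative placeholders, not real model data.

import json
import time

def build_metadata(class_name, uid, param_map, default_param_map, spark_version):
    # One JSON object per saved instance, stored under <path>/metadata.
    record = {
        "class": class_name,
        "timestamp": int(round(time.time() * 1000)),
        "sparkVersion": spark_version,
        "uid": uid,
        "paramMap": param_map,
        "defaultParamMap": default_param_map,
    }
    return json.dumps(record, separators=(',', ':'))

metadata_json = build_metadata(
    "org.apache.spark.ml.feature.Binarizer",      # JVM class name, as stored by Spark
    "Binarizer_4f2a",                              # hypothetical uid
    {"threshold": 0.5}, {"inputCol": "features"}, "2.4.3")
parsed = json.loads(metadata_json)
assert parsed["paramMap"]["threshold"] == 0.5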
- """ - metadata = DefaultParamsReader.loadMetadata(path, sc) - pythonClassName = metadata['class'].replace("org.apache.spark", "pyspark") - py_type = DefaultParamsReader.__get_class(pythonClassName) - instance = py_type.load(path) - return instance diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/wrapper.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/wrapper.py deleted file mode 100644 index d325633..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/ml/wrapper.py +++ /dev/null @@ -1,347 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from abc import ABCMeta, abstractmethod -import sys -if sys.version >= '3': - xrange = range - -from pyspark import SparkContext -from pyspark.sql import DataFrame -from pyspark.ml import Estimator, Transformer, Model -from pyspark.ml.param import Params -from pyspark.ml.util import _jvm -from pyspark.ml.common import inherit_doc, _java2py, _py2java - - -class JavaWrapper(object): - """ - Wrapper class for a Java companion object - """ - def __init__(self, java_obj=None): - super(JavaWrapper, self).__init__() - self._java_obj = java_obj - - def __del__(self): - if SparkContext._active_spark_context and self._java_obj is not None: - SparkContext._active_spark_context._gateway.detach(self._java_obj) - - @classmethod - def _create_from_java_class(cls, java_class, *args): - """ - Construct this object from given Java classname and arguments - """ - java_obj = JavaWrapper._new_java_obj(java_class, *args) - return cls(java_obj) - - def _call_java(self, name, *args): - m = getattr(self._java_obj, name) - sc = SparkContext._active_spark_context - java_args = [_py2java(sc, arg) for arg in args] - return _java2py(sc, m(*java_args)) - - @staticmethod - def _new_java_obj(java_class, *args): - """ - Returns a new Java object. - """ - sc = SparkContext._active_spark_context - java_obj = _jvm() - for name in java_class.split("."): - java_obj = getattr(java_obj, name) - java_args = [_py2java(sc, arg) for arg in args] - return java_obj(*java_args) - - @staticmethod - def _new_java_array(pylist, java_class): - """ - Create a Java array of given java_class type. Useful for - calling a method with a Scala Array from Python with Py4J. - - :param pylist: - Python list to convert to a Java Array. - :param java_class: - Java class to specify the type of Array. Should be in the - form of sc._gateway.jvm.* (sc is a valid Spark Context). - :return: - Java Array of converted pylist. 
- - Example primitive Java classes: - - basestring -> sc._gateway.jvm.java.lang.String - - int -> sc._gateway.jvm.java.lang.Integer - - float -> sc._gateway.jvm.java.lang.Double - - bool -> sc._gateway.jvm.java.lang.Boolean - """ - sc = SparkContext._active_spark_context - java_array = sc._gateway.new_array(java_class, len(pylist)) - for i in xrange(len(pylist)): - java_array[i] = pylist[i] - return java_array - - -@inherit_doc -class JavaParams(JavaWrapper, Params): - """ - Utility class to help create wrapper classes from Java/Scala - implementations of pipeline components. - """ - #: The param values in the Java object should be - #: synced with the Python wrapper in fit/transform/evaluate/copy. - - __metaclass__ = ABCMeta - - def _make_java_param_pair(self, param, value): - """ - Makes a Java param pair. - """ - sc = SparkContext._active_spark_context - param = self._resolveParam(param) - java_param = self._java_obj.getParam(param.name) - java_value = _py2java(sc, value) - return java_param.w(java_value) - - def _transfer_params_to_java(self): - """ - Transforms the embedded params to the companion Java object. - """ - pair_defaults = [] - for param in self.params: - if self.isSet(param): - pair = self._make_java_param_pair(param, self._paramMap[param]) - self._java_obj.set(pair) - if self.hasDefault(param): - pair = self._make_java_param_pair(param, self._defaultParamMap[param]) - pair_defaults.append(pair) - if len(pair_defaults) > 0: - sc = SparkContext._active_spark_context - pair_defaults_seq = sc._jvm.PythonUtils.toSeq(pair_defaults) - self._java_obj.setDefault(pair_defaults_seq) - - def _transfer_param_map_to_java(self, pyParamMap): - """ - Transforms a Python ParamMap into a Java ParamMap. - """ - paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap") - for param in self.params: - if param in pyParamMap: - pair = self._make_java_param_pair(param, pyParamMap[param]) - paramMap.put([pair]) - return paramMap - - def _create_params_from_java(self): - """ - SPARK-10931: Temporary fix to create params that are defined in the Java obj but not here - """ - java_params = list(self._java_obj.params()) - from pyspark.ml.param import Param - for java_param in java_params: - java_param_name = java_param.name() - if not hasattr(self, java_param_name): - param = Param(self, java_param_name, java_param.doc()) - setattr(param, "created_from_java_param", True) - setattr(self, java_param_name, param) - self._params = None # need to reset so self.params will discover new params - - def _transfer_params_from_java(self): - """ - Transforms the embedded params from the companion Java object. - """ - sc = SparkContext._active_spark_context - for param in self.params: - if self._java_obj.hasParam(param.name): - java_param = self._java_obj.getParam(param.name) - # SPARK-14931: Only check set params back to avoid default params mismatch. - if self._java_obj.isSet(java_param): - value = _java2py(sc, self._java_obj.getOrDefault(java_param)) - self._set(**{param.name: value}) - # SPARK-10931: Temporary fix for params that have a default in Java - if self._java_obj.hasDefault(java_param) and not self.isDefined(param): - value = _java2py(sc, self._java_obj.getDefault(java_param)).get() - self._setDefault(**{param.name: value}) - - def _transfer_param_map_from_java(self, javaParamMap): - """ - Transforms a Java ParamMap into a Python ParamMap. 
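A minimal sketch of the Py4J pattern that _new_java_array relies on: allocate a typed Java array through the gateway and copy the Python values in element by element. It assumes an already-running SparkContext bound to sc (for example inside a pyspark shell); the helper name is hypothetical.

def to_java_string_array(sc, pylist):
    # sc._gateway is the Py4J JavaGateway backing the SparkContext.
    gateway = sc._gateway
    java_array = gateway.new_array(gateway.jvm.java.lang.String, len(pylist))
    for i, value in enumerate(pylist):
        java_array[i] = value
    return java_array

# Usage (pyspark shell): to_java_string_array(sc, ["a", "b", "c"])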
- """ - sc = SparkContext._active_spark_context - paramMap = dict() - for pair in javaParamMap.toList(): - param = pair.param() - if self.hasParam(str(param.name())): - paramMap[self.getParam(param.name())] = _java2py(sc, pair.value()) - return paramMap - - @staticmethod - def _empty_java_param_map(): - """ - Returns an empty Java ParamMap reference. - """ - return _jvm().org.apache.spark.ml.param.ParamMap() - - def _to_java(self): - """ - Transfer this instance's Params to the wrapped Java object, and return the Java object. - Used for ML persistence. - - Meta-algorithms such as Pipeline should override this method. - - :return: Java object equivalent to this instance. - """ - self._transfer_params_to_java() - return self._java_obj - - @staticmethod - def _from_java(java_stage): - """ - Given a Java object, create and return a Python wrapper of it. - Used for ML persistence. - - Meta-algorithms such as Pipeline should override this method as a classmethod. - """ - def __get_class(clazz): - """ - Loads Python class from its name. - """ - parts = clazz.split('.') - module = ".".join(parts[:-1]) - m = __import__(module) - for comp in parts[1:]: - m = getattr(m, comp) - return m - stage_name = java_stage.getClass().getName().replace("org.apache.spark", "pyspark") - # Generate a default new instance from the stage_name class. - py_type = __get_class(stage_name) - if issubclass(py_type, JavaParams): - # Load information from java_stage to the instance. - py_stage = py_type() - py_stage._java_obj = java_stage - - # SPARK-10931: Temporary fix so that persisted models would own params from Estimator - if issubclass(py_type, JavaModel): - py_stage._create_params_from_java() - - py_stage._resetUid(java_stage.uid()) - py_stage._transfer_params_from_java() - elif hasattr(py_type, "_from_java"): - py_stage = py_type._from_java(java_stage) - else: - raise NotImplementedError("This Java stage cannot be loaded into Python currently: %r" - % stage_name) - return py_stage - - def copy(self, extra=None): - """ - Creates a copy of this instance with the same uid and some - extra params. This implementation first calls Params.copy and - then make a copy of the companion Java pipeline component with - extra params. So both the Python wrapper and the Java pipeline - component get copied. - - :param extra: Extra parameters to copy to the new instance - :return: Copy of this instance - """ - if extra is None: - extra = dict() - that = super(JavaParams, self).copy(extra) - if self._java_obj is not None: - that._java_obj = self._java_obj.copy(self._empty_java_param_map()) - that._transfer_params_to_java() - return that - - -@inherit_doc -class JavaEstimator(JavaParams, Estimator): - """ - Base class for :py:class:`Estimator`s that wrap Java/Scala - implementations. - """ - - __metaclass__ = ABCMeta - - @abstractmethod - def _create_model(self, java_model): - """ - Creates a model from the input Java model reference. - """ - raise NotImplementedError() - - def _fit_java(self, dataset): - """ - Fits a Java model to the input dataset. 
- - :param dataset: input dataset, which is an instance of - :py:class:`pyspark.sql.DataFrame` - :param params: additional params (overwriting embedded values) - :return: fitted Java model - """ - self._transfer_params_to_java() - return self._java_obj.fit(dataset._jdf) - - def _fit(self, dataset): - java_model = self._fit_java(dataset) - model = self._create_model(java_model) - return self._copyValues(model) - - -@inherit_doc -class JavaTransformer(JavaParams, Transformer): - """ - Base class for :py:class:`Transformer`s that wrap Java/Scala - implementations. Subclasses should ensure they have the transformer Java object - available as _java_obj. - """ - - __metaclass__ = ABCMeta - - def _transform(self, dataset): - self._transfer_params_to_java() - return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx) - - -@inherit_doc -class JavaModel(JavaTransformer, Model): - """ - Base class for :py:class:`Model`s that wrap Java/Scala - implementations. Subclasses should inherit this class before - param mix-ins, because this sets the UID from the Java model. - """ - - __metaclass__ = ABCMeta - - def __init__(self, java_model=None): - """ - Initialize this instance with a Java model object. - Subclasses should call this constructor, initialize params, - and then call _transfer_params_from_java. - - This instance can be instantiated without specifying java_model, - it will be assigned after that, but this scenario only used by - :py:class:`JavaMLReader` to load models. This is a bit of a - hack, but it is easiest since a proper fix would require - MLReader (in pyspark.ml.util) to depend on these wrappers, but - these wrappers depend on pyspark.ml.util (both directly and via - other ML classes). - """ - super(JavaModel, self).__init__(java_model) - if java_model is not None: - - # SPARK-10931: This is a temporary fix to allow models to own params - # from estimators. Eventually, these params should be in models through - # using common base classes between estimators and models. - self._create_params_from_java() - - self._resetUid(java_model.uid()) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/__init__.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/__init__.py deleted file mode 100644 index ae26521..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -RDD-based machine learning APIs for Python (in maintenance mode). - -The `pyspark.mllib` package is in maintenance mode as of the Spark 2.0.0 release to encourage -migration to the DataFrame-based APIs under the `pyspark.ml` package. 
-""" -from __future__ import absolute_import - -# MLlib currently needs NumPy 1.4+, so complain if lower - -import numpy - -ver = [int(x) for x in numpy.version.version.split('.')[:2]] -if ver < [1, 4]: - raise Exception("MLlib requires NumPy 1.4+") - -__all__ = ['classification', 'clustering', 'feature', 'fpm', 'linalg', 'random', - 'recommendation', 'regression', 'stat', 'tree', 'util'] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/classification.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/classification.py deleted file mode 100644 index e00ed95..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/classification.py +++ /dev/null @@ -1,771 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from math import exp -import sys -import warnings - -import numpy -from numpy import array - -from pyspark import RDD, since -from pyspark.streaming import DStream -from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py -from pyspark.mllib.linalg import DenseVector, SparseVector, _convert_to_vector -from pyspark.mllib.regression import ( - LabeledPoint, LinearModel, _regression_train_wrapper, - StreamingLinearAlgorithm) -from pyspark.mllib.util import Saveable, Loader, inherit_doc - - -__all__ = ['LogisticRegressionModel', 'LogisticRegressionWithSGD', 'LogisticRegressionWithLBFGS', - 'SVMModel', 'SVMWithSGD', 'NaiveBayesModel', 'NaiveBayes', - 'StreamingLogisticRegressionWithSGD'] - - -class LinearClassificationModel(LinearModel): - """ - A private abstract class representing a multiclass classification - model. The categories are represented by int values: 0, 1, 2, etc. - """ - def __init__(self, weights, intercept): - super(LinearClassificationModel, self).__init__(weights, intercept) - self._threshold = None - - @since('1.4.0') - def setThreshold(self, value): - """ - Sets the threshold that separates positive predictions from - negative predictions. An example with prediction score greater - than or equal to this threshold is identified as a positive, - and negative otherwise. It is used for binary classification - only. - """ - self._threshold = value - - @property - @since('1.4.0') - def threshold(self): - """ - Returns the threshold (if any) used for converting raw - prediction scores into 0/1 predictions. It is used for - binary classification only. - """ - return self._threshold - - @since('1.4.0') - def clearThreshold(self): - """ - Clears the threshold so that `predict` will output raw - prediction scores. It is used for binary classification only. - """ - self._threshold = None - - @since('1.4.0') - def predict(self, test): - """ - Predict values for a single data point or an RDD of points - using the model trained. 
- """ - raise NotImplementedError - - -class LogisticRegressionModel(LinearClassificationModel): - - """ - Classification model trained using Multinomial/Binary Logistic - Regression. - - :param weights: - Weights computed for every feature. - :param intercept: - Intercept computed for this model. (Only used in Binary Logistic - Regression. In Multinomial Logistic Regression, the intercepts will - not bea single value, so the intercepts will be part of the - weights.) - :param numFeatures: - The dimension of the features. - :param numClasses: - The number of possible outcomes for k classes classification problem - in Multinomial Logistic Regression. By default, it is binary - logistic regression so numClasses will be set to 2. - - >>> data = [ - ... LabeledPoint(0.0, [0.0, 1.0]), - ... LabeledPoint(1.0, [1.0, 0.0]), - ... ] - >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10) - >>> lrm.predict([1.0, 0.0]) - 1 - >>> lrm.predict([0.0, 1.0]) - 0 - >>> lrm.predict(sc.parallelize([[1.0, 0.0], [0.0, 1.0]])).collect() - [1, 0] - >>> lrm.clearThreshold() - >>> lrm.predict([0.0, 1.0]) - 0.279... - - >>> sparse_data = [ - ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), - ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) - ... ] - >>> lrm = LogisticRegressionWithSGD.train(sc.parallelize(sparse_data), iterations=10) - >>> lrm.predict(array([0.0, 1.0])) - 1 - >>> lrm.predict(array([1.0, 0.0])) - 0 - >>> lrm.predict(SparseVector(2, {1: 1.0})) - 1 - >>> lrm.predict(SparseVector(2, {0: 1.0})) - 0 - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> lrm.save(sc, path) - >>> sameModel = LogisticRegressionModel.load(sc, path) - >>> sameModel.predict(array([0.0, 1.0])) - 1 - >>> sameModel.predict(SparseVector(2, {0: 1.0})) - 0 - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except: - ... pass - >>> multi_class_data = [ - ... LabeledPoint(0.0, [0.0, 1.0, 0.0]), - ... LabeledPoint(1.0, [1.0, 0.0, 0.0]), - ... LabeledPoint(2.0, [0.0, 0.0, 1.0]) - ... ] - >>> data = sc.parallelize(multi_class_data) - >>> mcm = LogisticRegressionWithLBFGS.train(data, iterations=10, numClasses=3) - >>> mcm.predict([0.0, 0.5, 0.0]) - 0 - >>> mcm.predict([0.8, 0.0, 0.0]) - 1 - >>> mcm.predict([0.0, 0.0, 0.3]) - 2 - - .. versionadded:: 0.9.0 - """ - def __init__(self, weights, intercept, numFeatures, numClasses): - super(LogisticRegressionModel, self).__init__(weights, intercept) - self._numFeatures = int(numFeatures) - self._numClasses = int(numClasses) - self._threshold = 0.5 - if self._numClasses == 2: - self._dataWithBiasSize = None - self._weightsMatrix = None - else: - self._dataWithBiasSize = self._coeff.size // (self._numClasses - 1) - self._weightsMatrix = self._coeff.toArray().reshape(self._numClasses - 1, - self._dataWithBiasSize) - - @property - @since('1.4.0') - def numFeatures(self): - """ - Dimension of the features. - """ - return self._numFeatures - - @property - @since('1.4.0') - def numClasses(self): - """ - Number of possible outcomes for k classes classification problem - in Multinomial Logistic Regression. - """ - return self._numClasses - - @since('0.9.0') - def predict(self, x): - """ - Predict values for a single data point or an RDD of points - using the model trained. 
- """ - if isinstance(x, RDD): - return x.map(lambda v: self.predict(v)) - - x = _convert_to_vector(x) - if self.numClasses == 2: - margin = self.weights.dot(x) + self._intercept - if margin > 0: - prob = 1 / (1 + exp(-margin)) - else: - exp_margin = exp(margin) - prob = exp_margin / (1 + exp_margin) - if self._threshold is None: - return prob - else: - return 1 if prob > self._threshold else 0 - else: - best_class = 0 - max_margin = 0.0 - if x.size + 1 == self._dataWithBiasSize: - for i in range(0, self._numClasses - 1): - margin = x.dot(self._weightsMatrix[i][0:x.size]) + \ - self._weightsMatrix[i][x.size] - if margin > max_margin: - max_margin = margin - best_class = i + 1 - else: - for i in range(0, self._numClasses - 1): - margin = x.dot(self._weightsMatrix[i]) - if margin > max_margin: - max_margin = margin - best_class = i + 1 - return best_class - - @since('1.4.0') - def save(self, sc, path): - """ - Save this model to the given path. - """ - java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel( - _py2java(sc, self._coeff), self.intercept, self.numFeatures, self.numClasses) - java_model.save(sc._jsc.sc(), path) - - @classmethod - @since('1.4.0') - def load(cls, sc, path): - """ - Load a model from the given path. - """ - java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel.load( - sc._jsc.sc(), path) - weights = _java2py(sc, java_model.weights()) - intercept = java_model.intercept() - numFeatures = java_model.numFeatures() - numClasses = java_model.numClasses() - threshold = java_model.getThreshold().get() - model = LogisticRegressionModel(weights, intercept, numFeatures, numClasses) - model.setThreshold(threshold) - return model - - def __repr__(self): - return self._call_java("toString") - - -class LogisticRegressionWithSGD(object): - """ - .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.classification.LogisticRegression or - LogisticRegressionWithLBFGS. - """ - @classmethod - @since('0.9.0') - def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, - initialWeights=None, regParam=0.01, regType="l2", intercept=False, - validateData=True, convergenceTol=0.001): - """ - Train a logistic regression model on the given data. - - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param regType: - The type of regularizer used for training our model. - Supported values: - - - "l1" for using L1 regularization - - "l2" for using L2 regularization (default) - - None for no regularization - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e., whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) - """ - warnings.warn( - "Deprecated in 2.0.0. 
Use ml.classification.LogisticRegression or " - "LogisticRegressionWithLBFGS.", DeprecationWarning) - - def train(rdd, i): - return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations), - float(step), float(miniBatchFraction), i, float(regParam), regType, - bool(intercept), bool(validateData), float(convergenceTol)) - - return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights) - - -class LogisticRegressionWithLBFGS(object): - """ - .. versionadded:: 1.2.0 - """ - @classmethod - @since('1.2.0') - def train(cls, data, iterations=100, initialWeights=None, regParam=0.0, regType="l2", - intercept=False, corrections=10, tolerance=1e-6, validateData=True, numClasses=2): - """ - Train a logistic regression model on the given data. - - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param initialWeights: - The initial weights. - (default: None) - :param regParam: - The regularizer parameter. - (default: 0.0) - :param regType: - The type of regularizer used for training our model. - Supported values: - - - "l1" for using L1 regularization - - "l2" for using L2 regularization (default) - - None for no regularization - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e., whether bias - features are activated or not). - (default: False) - :param corrections: - The number of corrections used in the LBFGS update. - If a known updater is used for binary classification, - it calls the ml implementation and this parameter will - have no effect. (default: 10) - :param tolerance: - The convergence tolerance of iterations for L-BFGS. - (default: 1e-6) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param numClasses: - The number of classes (i.e., outcomes) a label can take in - Multinomial Logistic Regression. - (default: 2) - - >>> data = [ - ... LabeledPoint(0.0, [0.0, 1.0]), - ... LabeledPoint(1.0, [1.0, 0.0]), - ... ] - >>> lrm = LogisticRegressionWithLBFGS.train(sc.parallelize(data), iterations=10) - >>> lrm.predict([1.0, 0.0]) - 1 - >>> lrm.predict([0.0, 1.0]) - 0 - """ - def train(rdd, i): - return callMLlibFunc("trainLogisticRegressionModelWithLBFGS", rdd, int(iterations), i, - float(regParam), regType, bool(intercept), int(corrections), - float(tolerance), bool(validateData), int(numClasses)) - - if initialWeights is None: - if numClasses == 2: - initialWeights = [0.0] * len(data.first().features) - else: - if intercept: - initialWeights = [0.0] * (len(data.first().features) + 1) * (numClasses - 1) - else: - initialWeights = [0.0] * len(data.first().features) * (numClasses - 1) - return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights) - - -class SVMModel(LinearClassificationModel): - - """ - Model for Support Vector Machines (SVMs). - - :param weights: - Weights computed for every feature. - :param intercept: - Intercept computed for this model. - - >>> data = [ - ... LabeledPoint(0.0, [0.0]), - ... LabeledPoint(1.0, [1.0]), - ... LabeledPoint(1.0, [2.0]), - ... LabeledPoint(1.0, [3.0]) - ... ] - >>> svm = SVMWithSGD.train(sc.parallelize(data), iterations=10) - >>> svm.predict([1.0]) - 1 - >>> svm.predict(sc.parallelize([[1.0]])).collect() - [1] - >>> svm.clearThreshold() - >>> svm.predict(array([1.0])) - 1.44... - - >>> sparse_data = [ - ... 
LabeledPoint(0.0, SparseVector(2, {0: -1.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), - ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) - ... ] - >>> svm = SVMWithSGD.train(sc.parallelize(sparse_data), iterations=10) - >>> svm.predict(SparseVector(2, {1: 1.0})) - 1 - >>> svm.predict(SparseVector(2, {0: -1.0})) - 0 - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> svm.save(sc, path) - >>> sameModel = SVMModel.load(sc, path) - >>> sameModel.predict(SparseVector(2, {1: 1.0})) - 1 - >>> sameModel.predict(SparseVector(2, {0: -1.0})) - 0 - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except: - ... pass - - .. versionadded:: 0.9.0 - """ - def __init__(self, weights, intercept): - super(SVMModel, self).__init__(weights, intercept) - self._threshold = 0.0 - - @since('0.9.0') - def predict(self, x): - """ - Predict values for a single data point or an RDD of points - using the model trained. - """ - if isinstance(x, RDD): - return x.map(lambda v: self.predict(v)) - - x = _convert_to_vector(x) - margin = self.weights.dot(x) + self.intercept - if self._threshold is None: - return margin - else: - return 1 if margin > self._threshold else 0 - - @since('1.4.0') - def save(self, sc, path): - """ - Save this model to the given path. - """ - java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel( - _py2java(sc, self._coeff), self.intercept) - java_model.save(sc._jsc.sc(), path) - - @classmethod - @since('1.4.0') - def load(cls, sc, path): - """ - Load a model from the given path. - """ - java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel.load( - sc._jsc.sc(), path) - weights = _java2py(sc, java_model.weights()) - intercept = java_model.intercept() - threshold = java_model.getThreshold().get() - model = SVMModel(weights, intercept) - model.setThreshold(threshold) - return model - - -class SVMWithSGD(object): - """ - .. versionadded:: 0.9.0 - """ - - @classmethod - @since('0.9.0') - def train(cls, data, iterations=100, step=1.0, regParam=0.01, - miniBatchFraction=1.0, initialWeights=None, regType="l2", - intercept=False, validateData=True, convergenceTol=0.001): - """ - Train a support vector machine on the given data. - - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param regType: - The type of regularizer used for training our model. - Allowed values: - - - "l1" for using L1 regularization - - "l2" for using L2 regularization (default) - - None for no regularization - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e. whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. 
- (default: 0.001) - """ - def train(rdd, i): - return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step), - float(regParam), float(miniBatchFraction), i, regType, - bool(intercept), bool(validateData), float(convergenceTol)) - - return _regression_train_wrapper(train, SVMModel, data, initialWeights) - - -@inherit_doc -class NaiveBayesModel(Saveable, Loader): - - """ - Model for Naive Bayes classifiers. - - :param labels: - List of labels. - :param pi: - Log of class priors, whose dimension is C, number of labels. - :param theta: - Log of class conditional probabilities, whose dimension is C-by-D, - where D is number of features. - - >>> data = [ - ... LabeledPoint(0.0, [0.0, 0.0]), - ... LabeledPoint(0.0, [0.0, 1.0]), - ... LabeledPoint(1.0, [1.0, 0.0]), - ... ] - >>> model = NaiveBayes.train(sc.parallelize(data)) - >>> model.predict(array([0.0, 1.0])) - 0.0 - >>> model.predict(array([1.0, 0.0])) - 1.0 - >>> model.predict(sc.parallelize([[1.0, 0.0]])).collect() - [1.0] - >>> sparse_data = [ - ... LabeledPoint(0.0, SparseVector(2, {1: 0.0})), - ... LabeledPoint(0.0, SparseVector(2, {1: 1.0})), - ... LabeledPoint(1.0, SparseVector(2, {0: 1.0})) - ... ] - >>> model = NaiveBayes.train(sc.parallelize(sparse_data)) - >>> model.predict(SparseVector(2, {1: 1.0})) - 0.0 - >>> model.predict(SparseVector(2, {0: 1.0})) - 1.0 - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> model.save(sc, path) - >>> sameModel = NaiveBayesModel.load(sc, path) - >>> sameModel.predict(SparseVector(2, {0: 1.0})) == model.predict(SparseVector(2, {0: 1.0})) - True - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except OSError: - ... pass - - .. versionadded:: 0.9.0 - """ - def __init__(self, labels, pi, theta): - self.labels = labels - self.pi = pi - self.theta = theta - - @since('0.9.0') - def predict(self, x): - """ - Return the most likely class for a data vector - or an RDD of vectors - """ - if isinstance(x, RDD): - return x.map(lambda v: self.predict(v)) - x = _convert_to_vector(x) - return self.labels[numpy.argmax(self.pi + x.dot(self.theta.transpose()))] - - def save(self, sc, path): - """ - Save this model to the given path. - """ - java_labels = _py2java(sc, self.labels.tolist()) - java_pi = _py2java(sc, self.pi.tolist()) - java_theta = _py2java(sc, self.theta.tolist()) - java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel( - java_labels, java_pi, java_theta) - java_model.save(sc._jsc.sc(), path) - - @classmethod - @since('1.4.0') - def load(cls, sc, path): - """ - Load a model from the given path. - """ - java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load( - sc._jsc.sc(), path) - # Can not unpickle array.array from Pyrolite in Python3 with "bytes" - py_labels = _java2py(sc, java_model.labels(), "latin1") - py_pi = _java2py(sc, java_model.pi(), "latin1") - py_theta = _java2py(sc, java_model.theta(), "latin1") - return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta)) - - -class NaiveBayes(object): - """ - .. versionadded:: 0.9.0 - """ - - @classmethod - @since('0.9.0') - def train(cls, data, lambda_=1.0): - """ - Train a Naive Bayes model given an RDD of (label, features) - vectors. - - This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which - can handle all kinds of discrete data. For example, by - converting documents into TF-IDF vectors, it can be used for - document classification. By making every vector a 0-1 vector, - it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}). 
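A tiny numpy sketch of the multinomial Naive Bayes scoring rule used by NaiveBayesModel.predict above: add the log class priors (pi) to the dot product of the feature vector with the log conditional probabilities (theta) and take the argmax. All numbers are made up for illustration.

import numpy as np

labels = np.array([0.0, 1.0])
log_priors = np.log(np.array([0.6, 0.4]))       # pi, shape (C,)
log_cond = np.log(np.array([[0.7, 0.3],         # theta, shape (C, D)
                            [0.2, 0.8]]))
x = np.array([1.0, 3.0])                        # nonnegative counts / TF-IDF weights
scores = log_priors + x.dot(log_cond.T)
predicted = labels[int(np.argmax(scores))]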
- The input feature values must be nonnegative. - - :param data: - RDD of LabeledPoint. - :param lambda_: - The smoothing parameter. - (default: 1.0) - """ - first = data.first() - if not isinstance(first, LabeledPoint): - raise ValueError("`data` should be an RDD of LabeledPoint") - labels, pi, theta = callMLlibFunc("trainNaiveBayesModel", data, lambda_) - return NaiveBayesModel(labels.toArray(), pi.toArray(), numpy.array(theta)) - - -@inherit_doc -class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm): - """ - Train or predict a logistic regression model on streaming data. - Training uses Stochastic Gradient Descent to update the model based on - each new batch of incoming data from a DStream. - - Each batch of data is assumed to be an RDD of LabeledPoints. - The number of data points per batch can vary, but the number - of features must be constant. An initial weight - vector must be provided. - - :param stepSize: - Step size for each iteration of gradient descent. - (default: 0.1) - :param numIterations: - Number of iterations run for each batch of data. - (default: 50) - :param miniBatchFraction: - Fraction of each batch of data to use for updates. - (default: 1.0) - :param regParam: - L2 Regularization parameter. - (default: 0.0) - :param convergenceTol: - Value used to determine when to terminate iterations. - (default: 0.001) - - .. versionadded:: 1.5.0 - """ - def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.0, - convergenceTol=0.001): - self.stepSize = stepSize - self.numIterations = numIterations - self.regParam = regParam - self.miniBatchFraction = miniBatchFraction - self.convergenceTol = convergenceTol - self._model = None - super(StreamingLogisticRegressionWithSGD, self).__init__( - model=self._model) - - @since('1.5.0') - def setInitialWeights(self, initialWeights): - """ - Set the initial value of weights. - - This must be set before running trainOn and predictOn. - """ - initialWeights = _convert_to_vector(initialWeights) - - # LogisticRegressionWithSGD does only binary classification. - self._model = LogisticRegressionModel( - initialWeights, 0, initialWeights.size, 2) - return self - - @since('1.5.0') - def trainOn(self, dstream): - """Train the model on the incoming dstream.""" - self._validate(dstream) - - def update(rdd): - # LogisticRegressionWithSGD.train raises an error for an empty RDD. 
- if not rdd.isEmpty(): - self._model = LogisticRegressionWithSGD.train( - rdd, self.numIterations, self.stepSize, - self.miniBatchFraction, self._model.weights, - regParam=self.regParam, convergenceTol=self.convergenceTol) - - dstream.foreachRDD(update) - - -def _test(): - import doctest - from pyspark.sql import SparkSession - import pyspark.mllib.classification - globs = pyspark.mllib.classification.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("mllib.classification tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/clustering.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/clustering.py deleted file mode 100644 index b1a8af6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/clustering.py +++ /dev/null @@ -1,1061 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import array as pyarray -import warnings - -if sys.version > '3': - xrange = range - basestring = str - -from math import exp, log - -from numpy import array, random, tile - -from collections import namedtuple - -from pyspark import SparkContext, since -from pyspark.rdd import RDD, ignore_unicode_prefix -from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py -from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.stat.distribution import MultivariateGaussian -from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable -from pyspark.streaming import DStream - -__all__ = ['BisectingKMeansModel', 'BisectingKMeans', 'KMeansModel', 'KMeans', - 'GaussianMixtureModel', 'GaussianMixture', 'PowerIterationClusteringModel', - 'PowerIterationClustering', 'StreamingKMeans', 'StreamingKMeansModel', - 'LDA', 'LDAModel'] - - -@inherit_doc -class BisectingKMeansModel(JavaModelWrapper): - """ - A clustering model derived from the bisecting k-means method. - - >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2) - >>> bskm = BisectingKMeans() - >>> model = bskm.train(sc.parallelize(data, 2), k=4) - >>> p = array([0.0, 0.0]) - >>> model.predict(p) - 0 - >>> model.k - 4 - >>> model.computeCost(p) - 0.0 - - .. 
versionadded:: 2.0.0 - """ - - def __init__(self, java_model): - super(BisectingKMeansModel, self).__init__(java_model) - self.centers = [c.toArray() for c in self.call("clusterCenters")] - - @property - @since('2.0.0') - def clusterCenters(self): - """Get the cluster centers, represented as a list of NumPy - arrays.""" - return self.centers - - @property - @since('2.0.0') - def k(self): - """Get the number of clusters""" - return self.call("k") - - @since('2.0.0') - def predict(self, x): - """ - Find the cluster that each of the points belongs to in this - model. - - :param x: - A data point (or RDD of points) to determine cluster index. - :return: - Predicted cluster index or an RDD of predicted cluster indices - if the input is an RDD. - """ - if isinstance(x, RDD): - vecs = x.map(_convert_to_vector) - return self.call("predict", vecs) - - x = _convert_to_vector(x) - return self.call("predict", x) - - @since('2.0.0') - def computeCost(self, x): - """ - Return the Bisecting K-means cost (sum of squared distances of - points to their nearest center) for this model on the given - data. If provided with an RDD of points returns the sum. - - :param point: - A data point (or RDD of points) to compute the cost(s). - """ - if isinstance(x, RDD): - vecs = x.map(_convert_to_vector) - return self.call("computeCost", vecs) - - return self.call("computeCost", _convert_to_vector(x)) - - -class BisectingKMeans(object): - """ - A bisecting k-means algorithm based on the paper "A comparison of - document clustering techniques" by Steinbach, Karypis, and Kumar, - with modification to fit Spark. - The algorithm starts from a single cluster that contains all points. - Iteratively it finds divisible clusters on the bottom level and - bisects each of them using k-means, until there are `k` leaf - clusters in total or no leaf clusters are divisible. - The bisecting steps of clusters on the same level are grouped - together to increase parallelism. If bisecting all divisible - clusters on the bottom level would result more than `k` leaf - clusters, larger clusters get higher priority. - - Based on - U{http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf} - Steinbach, Karypis, and Kumar, A comparison of document clustering - techniques, KDD Workshop on Text Mining, 2000. - - .. versionadded:: 2.0.0 - """ - - @classmethod - @since('2.0.0') - def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604): - """ - Runs the bisecting k-means algorithm return the model. - - :param rdd: - Training points as an `RDD` of `Vector` or convertible - sequence types. - :param k: - The desired number of leaf clusters. The actual number could - be smaller if there are no divisible leaf clusters. - (default: 4) - :param maxIterations: - Maximum number of iterations allowed to split clusters. - (default: 20) - :param minDivisibleClusterSize: - Minimum number of points (if >= 1.0) or the minimum proportion - of points (if < 1.0) of a divisible cluster. - (default: 1) - :param seed: - Random seed value for cluster initialization. - (default: -1888008604 from classOf[BisectingKMeans].getName.##) - """ - java_model = callMLlibFunc( - "trainBisectingKMeans", rdd.map(_convert_to_vector), - k, maxIterations, minDivisibleClusterSize, seed) - return BisectingKMeansModel(java_model) - - -@inherit_doc -class KMeansModel(Saveable, Loader): - - """A clustering model derived from the k-means method. 
- - >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2) - >>> model = KMeans.train( - ... sc.parallelize(data), 2, maxIterations=10, initializationMode="random", - ... seed=50, initializationSteps=5, epsilon=1e-4) - >>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0])) - True - >>> model.predict(array([8.0, 9.0])) == model.predict(array([9.0, 8.0])) - True - >>> model.k - 2 - >>> model.computeCost(sc.parallelize(data)) - 2.0000000000000004 - >>> model = KMeans.train(sc.parallelize(data), 2) - >>> sparse_data = [ - ... SparseVector(3, {1: 1.0}), - ... SparseVector(3, {1: 1.1}), - ... SparseVector(3, {2: 1.0}), - ... SparseVector(3, {2: 1.1}) - ... ] - >>> model = KMeans.train(sc.parallelize(sparse_data), 2, initializationMode="k-means||", - ... seed=50, initializationSteps=5, epsilon=1e-4) - >>> model.predict(array([0., 1., 0.])) == model.predict(array([0, 1.1, 0.])) - True - >>> model.predict(array([0., 0., 1.])) == model.predict(array([0, 0, 1.1])) - True - >>> model.predict(sparse_data[0]) == model.predict(sparse_data[1]) - True - >>> model.predict(sparse_data[2]) == model.predict(sparse_data[3]) - True - >>> isinstance(model.clusterCenters, list) - True - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> model.save(sc, path) - >>> sameModel = KMeansModel.load(sc, path) - >>> sameModel.predict(sparse_data[0]) == model.predict(sparse_data[0]) - True - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except OSError: - ... pass - - >>> data = array([-383.1,-382.9, 28.7,31.2, 366.2,367.3]).reshape(3, 2) - >>> model = KMeans.train(sc.parallelize(data), 3, maxIterations=0, - ... initialModel = KMeansModel([(-1000.0,-1000.0),(5.0,5.0),(1000.0,1000.0)])) - >>> model.clusterCenters - [array([-1000., -1000.]), array([ 5., 5.]), array([ 1000., 1000.])] - - .. versionadded:: 0.9.0 - """ - - def __init__(self, centers): - self.centers = centers - - @property - @since('1.0.0') - def clusterCenters(self): - """Get the cluster centers, represented as a list of NumPy arrays.""" - return self.centers - - @property - @since('1.4.0') - def k(self): - """Total number of clusters.""" - return len(self.centers) - - @since('0.9.0') - def predict(self, x): - """ - Find the cluster that each of the points belongs to in this - model. - - :param x: - A data point (or RDD of points) to determine cluster index. - :return: - Predicted cluster index or an RDD of predicted cluster indices - if the input is an RDD. - """ - best = 0 - best_distance = float("inf") - if isinstance(x, RDD): - return x.map(self.predict) - - x = _convert_to_vector(x) - for i in xrange(len(self.centers)): - distance = x.squared_distance(self.centers[i]) - if distance < best_distance: - best = i - best_distance = distance - return best - - @since('1.4.0') - def computeCost(self, rdd): - """ - Return the K-means cost (sum of squared distances of points to - their nearest center) for this model on the given - data. - - :param rdd: - The RDD of points to compute the cost on. - """ - cost = callMLlibFunc("computeCostKmeansModel", rdd.map(_convert_to_vector), - [_convert_to_vector(c) for c in self.centers]) - return cost - - @since('1.4.0') - def save(self, sc, path): - """ - Save this model to the given path. 
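A small numpy sketch of the assignment rule KMeansModel.predict applies above: each point goes to the centre with the smallest squared Euclidean distance, and computeCost is the sum of those minimum squared distances. The centres and points are toy values.

import numpy as np

centers = np.array([[0.0, 0.0], [10.0, 10.0]])
points = np.array([[1.0, 1.0], [9.0, 8.0]])
sq_dist = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
assignments = sq_dist.argmin(axis=1)    # -> array([0, 1])
cost = sq_dist.min(axis=1).sum()        # k-means cost on these points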
- """ - java_centers = _py2java(sc, [_convert_to_vector(c) for c in self.centers]) - java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(java_centers) - java_model.save(sc._jsc.sc(), path) - - @classmethod - @since('1.4.0') - def load(cls, sc, path): - """ - Load a model from the given path. - """ - java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel.load(sc._jsc.sc(), path) - return KMeansModel(_java2py(sc, java_model.clusterCenters())) - - -class KMeans(object): - """ - .. versionadded:: 0.9.0 - """ - - @classmethod - @since('0.9.0') - def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||", - seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None): - """ - Train a k-means clustering model. - - :param rdd: - Training points as an `RDD` of `Vector` or convertible - sequence types. - :param k: - Number of clusters to create. - :param maxIterations: - Maximum number of iterations allowed. - (default: 100) - :param runs: - This param has no effect since Spark 2.0.0. - :param initializationMode: - The initialization algorithm. This can be either "random" or - "k-means||". - (default: "k-means||") - :param seed: - Random seed value for cluster initialization. Set as None to - generate seed based on system time. - (default: None) - :param initializationSteps: - Number of steps for the k-means|| initialization mode. - This is an advanced setting -- the default of 2 is almost - always enough. - (default: 2) - :param epsilon: - Distance threshold within which a center will be considered to - have converged. If all centers move less than this Euclidean - distance, iterations are stopped. - (default: 1e-4) - :param initialModel: - Initial cluster centers can be provided as a KMeansModel object - rather than using the random or k-means|| initializationModel. - (default: None) - """ - if runs != 1: - warnings.warn("The param `runs` has no effect since Spark 2.0.0.") - clusterInitialModel = [] - if initialModel is not None: - if not isinstance(initialModel, KMeansModel): - raise Exception("initialModel is of "+str(type(initialModel))+". It needs " - "to be of ") - clusterInitialModel = [_convert_to_vector(c) for c in initialModel.clusterCenters] - model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector), k, maxIterations, - runs, initializationMode, seed, initializationSteps, epsilon, - clusterInitialModel) - centers = callJavaFunc(rdd.context, model.clusterCenters) - return KMeansModel([c.toArray() for c in centers]) - - -@inherit_doc -class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader): - - """ - A clustering model derived from the Gaussian Mixture Model method. - - >>> from pyspark.mllib.linalg import Vectors, DenseMatrix - >>> from numpy.testing import assert_equal - >>> from shutil import rmtree - >>> import os, tempfile - - >>> clusterdata_1 = sc.parallelize(array([-0.1,-0.05,-0.01,-0.1, - ... 0.9,0.8,0.75,0.935, - ... -0.83,-0.68,-0.91,-0.76 ]).reshape(6, 2), 2) - >>> model = GaussianMixture.train(clusterdata_1, 3, convergenceTol=0.0001, - ... 
maxIterations=50, seed=10) - >>> labels = model.predict(clusterdata_1).collect() - >>> labels[0]==labels[1] - False - >>> labels[1]==labels[2] - False - >>> labels[4]==labels[5] - True - >>> model.predict([-0.1,-0.05]) - 0 - >>> softPredicted = model.predictSoft([-0.1,-0.05]) - >>> abs(softPredicted[0] - 1.0) < 0.001 - True - >>> abs(softPredicted[1] - 0.0) < 0.001 - True - >>> abs(softPredicted[2] - 0.0) < 0.001 - True - - >>> path = tempfile.mkdtemp() - >>> model.save(sc, path) - >>> sameModel = GaussianMixtureModel.load(sc, path) - >>> assert_equal(model.weights, sameModel.weights) - >>> mus, sigmas = list( - ... zip(*[(g.mu, g.sigma) for g in model.gaussians])) - >>> sameMus, sameSigmas = list( - ... zip(*[(g.mu, g.sigma) for g in sameModel.gaussians])) - >>> mus == sameMus - True - >>> sigmas == sameSigmas - True - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except OSError: - ... pass - - >>> data = array([-5.1971, -2.5359, -3.8220, - ... -5.2211, -5.0602, 4.7118, - ... 6.8989, 3.4592, 4.6322, - ... 5.7048, 4.6567, 5.5026, - ... 4.5605, 5.2043, 6.2734]) - >>> clusterdata_2 = sc.parallelize(data.reshape(5,3)) - >>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001, - ... maxIterations=150, seed=4) - >>> labels = model.predict(clusterdata_2).collect() - >>> labels[0]==labels[1] - True - >>> labels[2]==labels[3]==labels[4] - True - - .. versionadded:: 1.3.0 - """ - - @property - @since('1.4.0') - def weights(self): - """ - Weights for each Gaussian distribution in the mixture, where weights[i] is - the weight for Gaussian i, and weights.sum == 1. - """ - return array(self.call("weights")) - - @property - @since('1.4.0') - def gaussians(self): - """ - Array of MultivariateGaussian where gaussians[i] represents - the Multivariate Gaussian (Normal) Distribution for Gaussian i. - """ - return [ - MultivariateGaussian(gaussian[0], gaussian[1]) - for gaussian in self.call("gaussians")] - - @property - @since('1.4.0') - def k(self): - """Number of gaussians in mixture.""" - return len(self.weights) - - @since('1.3.0') - def predict(self, x): - """ - Find the cluster to which the point 'x' or each point in RDD 'x' - has maximum membership in this model. - - :param x: - A feature vector or an RDD of vectors representing data points. - :return: - Predicted cluster label or an RDD of predicted cluster labels - if the input is an RDD. - """ - if isinstance(x, RDD): - cluster_labels = self.predictSoft(x).map(lambda z: z.index(max(z))) - return cluster_labels - else: - z = self.predictSoft(x) - return z.argmax() - - @since('1.3.0') - def predictSoft(self, x): - """ - Find the membership of point 'x' or each point in RDD 'x' to all mixture components. - - :param x: - A feature vector or an RDD of vectors representing data points. - :return: - The membership value to all mixture components for vector 'x' - or each vector in RDD 'x'. - """ - if isinstance(x, RDD): - means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians]) - membership_matrix = callMLlibFunc("predictSoftGMM", x.map(_convert_to_vector), - _convert_to_vector(self.weights), means, sigmas) - return membership_matrix.map(lambda x: pyarray.array('d', x)) - else: - return self.call("predictSoft", _convert_to_vector(x)).toArray() - - @classmethod - @since('1.5.0') - def load(cls, sc, path): - """Load the GaussianMixtureModel from disk. - - :param sc: - SparkContext. - :param path: - Path to where the model is stored. 
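A minimal sketch of the relationship between predictSoft and predict described above for GaussianMixtureModel: the hard label is just the index of the largest soft-membership value. The membership row below is illustrative, not the output of a trained model.

soft = [0.05, 0.90, 0.05]            # per-component memberships for one point, summing to 1
hard = soft.index(max(soft))         # -> 1, matching what predict would return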
- """ - model = cls._load_java(sc, path) - wrapper = sc._jvm.org.apache.spark.mllib.api.python.GaussianMixtureModelWrapper(model) - return cls(wrapper) - - -class GaussianMixture(object): - """ - Learning algorithm for Gaussian Mixtures using the expectation-maximization algorithm. - - .. versionadded:: 1.3.0 - """ - @classmethod - @since('1.3.0') - def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initialModel=None): - """ - Train a Gaussian Mixture clustering model. - - :param rdd: - Training points as an `RDD` of `Vector` or convertible - sequence types. - :param k: - Number of independent Gaussians in the mixture model. - :param convergenceTol: - Maximum change in log-likelihood at which convergence is - considered to have occurred. - (default: 1e-3) - :param maxIterations: - Maximum number of iterations allowed. - (default: 100) - :param seed: - Random seed for initial Gaussian distribution. Set as None to - generate seed based on system time. - (default: None) - :param initialModel: - Initial GMM starting point, bypassing the random - initialization. - (default: None) - """ - initialModelWeights = None - initialModelMu = None - initialModelSigma = None - if initialModel is not None: - if initialModel.k != k: - raise Exception("Mismatched cluster count, initialModel.k = %s, however k = %s" - % (initialModel.k, k)) - initialModelWeights = list(initialModel.weights) - initialModelMu = [initialModel.gaussians[i].mu for i in range(initialModel.k)] - initialModelSigma = [initialModel.gaussians[i].sigma for i in range(initialModel.k)] - java_model = callMLlibFunc("trainGaussianMixtureModel", rdd.map(_convert_to_vector), - k, convergenceTol, maxIterations, seed, - initialModelWeights, initialModelMu, initialModelSigma) - return GaussianMixtureModel(java_model) - - -class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader): - - """ - Model produced by [[PowerIterationClustering]]. - - >>> import math - >>> def genCircle(r, n): - ... points = [] - ... for i in range(0, n): - ... theta = 2.0 * math.pi * i / n - ... points.append((r * math.cos(theta), r * math.sin(theta))) - ... return points - >>> def sim(x, y): - ... dist2 = (x[0] - y[0]) * (x[0] - y[0]) + (x[1] - y[1]) * (x[1] - y[1]) - ... return math.exp(-dist2 / 2.0) - >>> r1 = 1.0 - >>> n1 = 10 - >>> r2 = 4.0 - >>> n2 = 40 - >>> n = n1 + n2 - >>> points = genCircle(r1, n1) + genCircle(r2, n2) - >>> similarities = [(i, j, sim(points[i], points[j])) for i in range(1, n) for j in range(0, i)] - >>> rdd = sc.parallelize(similarities, 2) - >>> model = PowerIterationClustering.train(rdd, 2, 40) - >>> model.k - 2 - >>> result = sorted(model.assignments().collect(), key=lambda x: x.id) - >>> result[0].cluster == result[1].cluster == result[2].cluster == result[3].cluster - True - >>> result[4].cluster == result[5].cluster == result[6].cluster == result[7].cluster - True - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> model.save(sc, path) - >>> sameModel = PowerIterationClusteringModel.load(sc, path) - >>> sameModel.k - 2 - >>> result = sorted(model.assignments().collect(), key=lambda x: x.id) - >>> result[0].cluster == result[1].cluster == result[2].cluster == result[3].cluster - True - >>> result[4].cluster == result[5].cluster == result[6].cluster == result[7].cluster - True - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except OSError: - ... pass - - .. 
versionadded:: 1.5.0 - """ - - @property - @since('1.5.0') - def k(self): - """ - Returns the number of clusters. - """ - return self.call("k") - - @since('1.5.0') - def assignments(self): - """ - Returns the cluster assignments of this model. - """ - return self.call("getAssignments").map( - lambda x: (PowerIterationClustering.Assignment(*x))) - - @classmethod - @since('1.5.0') - def load(cls, sc, path): - """ - Load a model from the given path. - """ - model = cls._load_java(sc, path) - wrapper =\ - sc._jvm.org.apache.spark.mllib.api.python.PowerIterationClusteringModelWrapper(model) - return PowerIterationClusteringModel(wrapper) - - -class PowerIterationClustering(object): - """ - Power Iteration Clustering (PIC), a scalable graph clustering algorithm - developed by [[http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf Lin and Cohen]]. - From the abstract: PIC finds a very low-dimensional embedding of a - dataset using truncated power iteration on a normalized pair-wise - similarity matrix of the data. - - .. versionadded:: 1.5.0 - """ - - @classmethod - @since('1.5.0') - def train(cls, rdd, k, maxIterations=100, initMode="random"): - r""" - :param rdd: - An RDD of (i, j, s\ :sub:`ij`\) tuples representing the - affinity matrix, which is the matrix A in the PIC paper. The - similarity s\ :sub:`ij`\ must be nonnegative. This is a symmetric - matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\ For any (i, j) with - nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or - (j, i, s\ :sub:`ji`\) in the input. Tuples with i = j are ignored, - because it is assumed s\ :sub:`ij`\ = 0.0. - :param k: - Number of clusters. - :param maxIterations: - Maximum number of iterations of the PIC algorithm. - (default: 100) - :param initMode: - Initialization mode. This can be either "random" to use - a random vector as vertex properties, or "degree" to use - normalized sum similarities. - (default: "random") - """ - model = callMLlibFunc("trainPowerIterationClusteringModel", - rdd.map(_convert_to_vector), int(k), int(maxIterations), initMode) - return PowerIterationClusteringModel(model) - - class Assignment(namedtuple("Assignment", ["id", "cluster"])): - """ - Represents an (id, cluster) tuple. - - .. versionadded:: 1.5.0 - """ - - -class StreamingKMeansModel(KMeansModel): - """ - Clustering model which can perform an online update of the centroids. - - The update formula for each centroid is given by - - * c_t+1 = ((c_t * n_t * a) + (x_t * m_t)) / (n_t + m_t) - * n_t+1 = n_t * a + m_t - - where - - * c_t: Centroid at the n_th iteration. - * n_t: Number of samples (or) weights associated with the centroid - at the n_th iteration. - * x_t: Centroid of the new data closest to c_t. - * m_t: Number of samples (or) weights of the new data closest to c_t - * c_t+1: New centroid. - * n_t+1: New number of weights. - * a: Decay Factor, which gives the forgetfulness. - - .. note:: If a is set to 1, it is the weighted mean of the previous - and new data. If it set to zero, the old centroids are completely - forgotten. - - :param clusterCenters: - Initial cluster centers. - :param clusterWeights: - List of weights assigned to each cluster. - - >>> initCenters = [[0.0, 0.0], [1.0, 1.0]] - >>> initWeights = [1.0, 1.0] - >>> stkm = StreamingKMeansModel(initCenters, initWeights) - >>> data = sc.parallelize([[-0.1, -0.1], [0.1, 0.1], - ... 
[0.9, 0.9], [1.1, 1.1]]) - >>> stkm = stkm.update(data, 1.0, u"batches") - >>> stkm.centers - array([[ 0., 0.], - [ 1., 1.]]) - >>> stkm.predict([-0.1, -0.1]) - 0 - >>> stkm.predict([0.9, 0.9]) - 1 - >>> stkm.clusterWeights - [3.0, 3.0] - >>> decayFactor = 0.0 - >>> data = sc.parallelize([DenseVector([1.5, 1.5]), DenseVector([0.2, 0.2])]) - >>> stkm = stkm.update(data, 0.0, u"batches") - >>> stkm.centers - array([[ 0.2, 0.2], - [ 1.5, 1.5]]) - >>> stkm.clusterWeights - [1.0, 1.0] - >>> stkm.predict([0.2, 0.2]) - 0 - >>> stkm.predict([1.5, 1.5]) - 1 - - .. versionadded:: 1.5.0 - """ - def __init__(self, clusterCenters, clusterWeights): - super(StreamingKMeansModel, self).__init__(centers=clusterCenters) - self._clusterWeights = list(clusterWeights) - - @property - @since('1.5.0') - def clusterWeights(self): - """Return the cluster weights.""" - return self._clusterWeights - - @ignore_unicode_prefix - @since('1.5.0') - def update(self, data, decayFactor, timeUnit): - """Update the centroids, according to data - - :param data: - RDD with new data for the model update. - :param decayFactor: - Forgetfulness of the previous centroids. - :param timeUnit: - Can be "batches" or "points". If points, then the decay factor - is raised to the power of number of new points and if batches, - then decay factor will be used as is. - """ - if not isinstance(data, RDD): - raise TypeError("Data should be of an RDD, got %s." % type(data)) - data = data.map(_convert_to_vector) - decayFactor = float(decayFactor) - if timeUnit not in ["batches", "points"]: - raise ValueError( - "timeUnit should be 'batches' or 'points', got %s." % timeUnit) - vectorCenters = [_convert_to_vector(center) for center in self.centers] - updatedModel = callMLlibFunc( - "updateStreamingKMeansModel", vectorCenters, self._clusterWeights, - data, decayFactor, timeUnit) - self.centers = array(updatedModel[0]) - self._clusterWeights = list(updatedModel[1]) - return self - - -class StreamingKMeans(object): - """ - Provides methods to set k, decayFactor, timeUnit to configure the - KMeans algorithm for fitting and predicting on incoming dstreams. - More details on how the centroids are updated are provided under the - docs of StreamingKMeansModel. - - :param k: - Number of clusters. - (default: 2) - :param decayFactor: - Forgetfulness of the previous centroids. - (default: 1.0) - :param timeUnit: - Can be "batches" or "points". If points, then the decay factor is - raised to the power of number of new points and if batches, then - decay factor will be used as is. - (default: "batches") - - .. versionadded:: 1.5.0 - """ - def __init__(self, k=2, decayFactor=1.0, timeUnit="batches"): - self._k = k - self._decayFactor = decayFactor - if timeUnit not in ["batches", "points"]: - raise ValueError( - "timeUnit should be 'batches' or 'points', got %s." 
% timeUnit) - self._timeUnit = timeUnit - self._model = None - - @since('1.5.0') - def latestModel(self): - """Return the latest model""" - return self._model - - def _validate(self, dstream): - if self._model is None: - raise ValueError( - "Initial centers should be set either by setInitialCenters " - "or setRandomCenters.") - if not isinstance(dstream, DStream): - raise TypeError( - "Expected dstream to be of type DStream, " - "got type %s" % type(dstream)) - - @since('1.5.0') - def setK(self, k): - """Set number of clusters.""" - self._k = k - return self - - @since('1.5.0') - def setDecayFactor(self, decayFactor): - """Set decay factor.""" - self._decayFactor = decayFactor - return self - - @since('1.5.0') - def setHalfLife(self, halfLife, timeUnit): - """ - Set number of batches after which the centroids of that - particular batch has half the weightage. - """ - self._timeUnit = timeUnit - self._decayFactor = exp(log(0.5) / halfLife) - return self - - @since('1.5.0') - def setInitialCenters(self, centers, weights): - """ - Set initial centers. Should be set before calling trainOn. - """ - self._model = StreamingKMeansModel(centers, weights) - return self - - @since('1.5.0') - def setRandomCenters(self, dim, weight, seed): - """ - Set the initial centres to be random samples from - a gaussian population with constant weights. - """ - rng = random.RandomState(seed) - clusterCenters = rng.randn(self._k, dim) - clusterWeights = tile(weight, self._k) - self._model = StreamingKMeansModel(clusterCenters, clusterWeights) - return self - - @since('1.5.0') - def trainOn(self, dstream): - """Train the model on the incoming dstream.""" - self._validate(dstream) - - def update(rdd): - self._model.update(rdd, self._decayFactor, self._timeUnit) - - dstream.foreachRDD(update) - - @since('1.5.0') - def predictOn(self, dstream): - """ - Make predictions on a dstream. - Returns a transformed dstream object - """ - self._validate(dstream) - return dstream.map(lambda x: self._model.predict(x)) - - @since('1.5.0') - def predictOnValues(self, dstream): - """ - Make predictions on a keyed dstream. - Returns a transformed dstream object. - """ - self._validate(dstream) - return dstream.mapValues(lambda x: self._model.predict(x)) - - -class LDAModel(JavaModelWrapper, JavaSaveable, Loader): - - """ A clustering model derived from the LDA method. - - Latent Dirichlet Allocation (LDA), a topic model designed for text documents. - Terminology - - "word" = "term": an element of the vocabulary - - "token": instance of a term appearing in a document - - "topic": multinomial distribution over words representing some concept - References: - - Original LDA paper (journal version): - Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003. - - >>> from pyspark.mllib.linalg import Vectors - >>> from numpy.testing import assert_almost_equal, assert_equal - >>> data = [ - ... [1, Vectors.dense([0.0, 1.0])], - ... [2, SparseVector(2, {0: 1.0})], - ... 
] - >>> rdd = sc.parallelize(data) - >>> model = LDA.train(rdd, k=2, seed=1) - >>> model.vocabSize() - 2 - >>> model.describeTopics() - [([1, 0], [0.5..., 0.49...]), ([0, 1], [0.5..., 0.49...])] - >>> model.describeTopics(1) - [([1], [0.5...]), ([0], [0.5...])] - - >>> topics = model.topicsMatrix() - >>> topics_expect = array([[0.5, 0.5], [0.5, 0.5]]) - >>> assert_almost_equal(topics, topics_expect, 1) - - >>> import os, tempfile - >>> from shutil import rmtree - >>> path = tempfile.mkdtemp() - >>> model.save(sc, path) - >>> sameModel = LDAModel.load(sc, path) - >>> assert_equal(sameModel.topicsMatrix(), model.topicsMatrix()) - >>> sameModel.vocabSize() == model.vocabSize() - True - >>> try: - ... rmtree(path) - ... except OSError: - ... pass - - .. versionadded:: 1.5.0 - """ - - @since('1.5.0') - def topicsMatrix(self): - """Inferred topics, where each topic is represented by a distribution over terms.""" - return self.call("topicsMatrix").toArray() - - @since('1.5.0') - def vocabSize(self): - """Vocabulary size (number of terms or terms in the vocabulary)""" - return self.call("vocabSize") - - @since('1.6.0') - def describeTopics(self, maxTermsPerTopic=None): - """Return the topics described by weighted terms. - - WARNING: If vocabSize and k are large, this can return a large object! - - :param maxTermsPerTopic: - Maximum number of terms to collect for each topic. - (default: vocabulary size) - :return: - Array over topics. Each topic is represented as a pair of - matching arrays: (term indices, term weights in topic). - Each topic's terms are sorted in order of decreasing weight. - """ - if maxTermsPerTopic is None: - topics = self.call("describeTopics") - else: - topics = self.call("describeTopics", maxTermsPerTopic) - return topics - - @classmethod - @since('1.5.0') - def load(cls, sc, path): - """Load the LDAModel from disk. - - :param sc: - SparkContext. - :param path: - Path to where the model is stored. - """ - if not isinstance(sc, SparkContext): - raise TypeError("sc should be a SparkContext, got type %s" % type(sc)) - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) - model = callMLlibFunc("loadLDAModel", sc, path) - return LDAModel(model) - - -class LDA(object): - """ - .. versionadded:: 1.5.0 - """ - - @classmethod - @since('1.5.0') - def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0, - topicConcentration=-1.0, seed=None, checkpointInterval=10, optimizer="em"): - """Train a LDA model. - - :param rdd: - RDD of documents, which are tuples of document IDs and term - (word) count vectors. The term count vectors are "bags of - words" with a fixed-size vocabulary (where the vocabulary size - is the length of the vector). Document IDs must be unique - and >= 0. - :param k: - Number of topics to infer, i.e., the number of soft cluster - centers. - (default: 10) - :param maxIterations: - Maximum number of iterations allowed. - (default: 20) - :param docConcentration: - Concentration parameter (commonly named "alpha") for the prior - placed on documents' distributions over topics ("theta"). - (default: -1.0) - :param topicConcentration: - Concentration parameter (commonly named "beta" or "eta") for - the prior placed on topics' distributions over terms. - (default: -1.0) - :param seed: - Random seed for cluster initialization. Set as None to generate - seed based on system time. - (default: None) - :param checkpointInterval: - Period (in iterations) between checkpoints. 
- (default: 10) - :param optimizer: - LDAOptimizer used to perform the actual calculation. Currently - "em", "online" are supported. - (default: "em") - """ - model = callMLlibFunc("trainLDAModel", rdd, k, maxIterations, - docConcentration, topicConcentration, seed, - checkpointInterval, optimizer) - return LDAModel(model) - - -def _test(): - import doctest - import numpy - import pyspark.mllib.clustering - try: - # Numpy 1.14+ changed it's string format. - numpy.set_printoptions(legacy='1.13') - except TypeError: - pass - globs = pyspark.mllib.clustering.__dict__.copy() - globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/common.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/common.py deleted file mode 100644 index bac8f35..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/common.py +++ /dev/null @@ -1,163 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -if sys.version >= '3': - long = int - unicode = str - -import py4j.protocol -from py4j.protocol import Py4JJavaError -from py4j.java_gateway import JavaObject -from py4j.java_collections import JavaArray, JavaList - -from pyspark import RDD, SparkContext -from pyspark.serializers import PickleSerializer, AutoBatchedSerializer -from pyspark.sql import DataFrame, SQLContext - -# Hack for support float('inf') in Py4j -_old_smart_decode = py4j.protocol.smart_decode - -_float_str_mapping = { - 'nan': 'NaN', - 'inf': 'Infinity', - '-inf': '-Infinity', -} - - -def _new_smart_decode(obj): - if isinstance(obj, float): - s = str(obj) - return _float_str_mapping.get(s, s) - return _old_smart_decode(obj) - -py4j.protocol.smart_decode = _new_smart_decode - - -_picklable_classes = [ - 'LinkedList', - 'SparseVector', - 'DenseVector', - 'DenseMatrix', - 'Rating', - 'LabeledPoint', -] - - -# this will call the MLlib version of pythonToJava() -def _to_java_object_rdd(rdd): - """ Return a JavaRDD of Object by unpickling - - It will convert each Python object into Java object by Pyrolite, whenever the - RDD is serialized in batch or not. 
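# Editorial aside, not part of the file being deleted above: a standalone,
# py4j-free restatement of the float handling shown in this module. Non-finite
# Python floats have to be spelled the Java way ('NaN', 'Infinity',
# '-Infinity') before crossing the Py4J bridge; to_java_float_literal below is
# an illustrative name, not a Spark or py4j API.
_FLOAT_STR_MAPPING = {'nan': 'NaN', 'inf': 'Infinity', '-inf': '-Infinity'}

def to_java_float_literal(x):
    s = str(x)
    return _FLOAT_STR_MAPPING.get(s, s)

assert to_java_float_literal(float('inf')) == 'Infinity'
assert to_java_float_literal(float('-inf')) == '-Infinity'
assert to_java_float_literal(float('nan')) == 'NaN'
assert to_java_float_literal(1.5) == '1.5'   # ordinary floats pass through unchanged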
- """ - rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer())) - return rdd.ctx._jvm.org.apache.spark.mllib.api.python.SerDe.pythonToJava(rdd._jrdd, True) - - -def _py2java(sc, obj): - """ Convert Python object into Java """ - if isinstance(obj, RDD): - obj = _to_java_object_rdd(obj) - elif isinstance(obj, DataFrame): - obj = obj._jdf - elif isinstance(obj, SparkContext): - obj = obj._jsc - elif isinstance(obj, list): - obj = [_py2java(sc, x) for x in obj] - elif isinstance(obj, JavaObject): - pass - elif isinstance(obj, (int, long, float, bool, bytes, unicode)): - pass - else: - data = bytearray(PickleSerializer().dumps(obj)) - obj = sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(data) - return obj - - -def _java2py(sc, r, encoding="bytes"): - if isinstance(r, JavaObject): - clsName = r.getClass().getSimpleName() - # convert RDD into JavaRDD - if clsName != 'JavaRDD' and clsName.endswith("RDD"): - r = r.toJavaRDD() - clsName = 'JavaRDD' - - if clsName == 'JavaRDD': - jrdd = sc._jvm.org.apache.spark.mllib.api.python.SerDe.javaToPython(r) - return RDD(jrdd, sc) - - if clsName == 'Dataset': - return DataFrame(r, SQLContext.getOrCreate(sc)) - - if clsName in _picklable_classes: - r = sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(r) - elif isinstance(r, (JavaArray, JavaList)): - try: - r = sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(r) - except Py4JJavaError: - pass # not pickable - - if isinstance(r, (bytearray, bytes)): - r = PickleSerializer().loads(bytes(r), encoding=encoding) - return r - - -def callJavaFunc(sc, func, *args): - """ Call Java Function """ - args = [_py2java(sc, a) for a in args] - return _java2py(sc, func(*args)) - - -def callMLlibFunc(name, *args): - """ Call API in PythonMLLibAPI """ - sc = SparkContext.getOrCreate() - api = getattr(sc._jvm.PythonMLLibAPI(), name) - return callJavaFunc(sc, api, *args) - - -class JavaModelWrapper(object): - """ - Wrapper for the model in JVM - """ - def __init__(self, java_model): - self._sc = SparkContext.getOrCreate() - self._java_model = java_model - - def __del__(self): - self._sc._gateway.detach(self._java_model) - - def call(self, name, *a): - """Call method of java_model""" - return callJavaFunc(self._sc, getattr(self._java_model, name), *a) - - -def inherit_doc(cls): - """ - A decorator that makes a class inherit documentation from its parents. - """ - for name, func in vars(cls).items(): - # only inherit docstring for public functions - if name.startswith("_"): - continue - if not func.__doc__: - for parent in cls.__bases__: - parent_func = getattr(parent, name, None) - if parent_func and getattr(parent_func, "__doc__", None): - func.__doc__ = parent_func.__doc__ - break - return cls diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/evaluation.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/evaluation.py deleted file mode 100644 index 0bb0ca3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/evaluation.py +++ /dev/null @@ -1,556 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import warnings - -from pyspark import since -from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc -from pyspark.sql import SQLContext -from pyspark.sql.types import StructField, StructType, DoubleType, IntegerType, ArrayType - -__all__ = ['BinaryClassificationMetrics', 'RegressionMetrics', - 'MulticlassMetrics', 'RankingMetrics'] - - -class BinaryClassificationMetrics(JavaModelWrapper): - """ - Evaluator for binary classification. - - :param scoreAndLabels: an RDD of (score, label) pairs - - >>> scoreAndLabels = sc.parallelize([ - ... (0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)], 2) - >>> metrics = BinaryClassificationMetrics(scoreAndLabels) - >>> metrics.areaUnderROC - 0.70... - >>> metrics.areaUnderPR - 0.83... - >>> metrics.unpersist() - - .. versionadded:: 1.4.0 - """ - - def __init__(self, scoreAndLabels): - sc = scoreAndLabels.ctx - sql_ctx = SQLContext.getOrCreate(sc) - df = sql_ctx.createDataFrame(scoreAndLabels, schema=StructType([ - StructField("score", DoubleType(), nullable=False), - StructField("label", DoubleType(), nullable=False)])) - java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics - java_model = java_class(df._jdf) - super(BinaryClassificationMetrics, self).__init__(java_model) - - @property - @since('1.4.0') - def areaUnderROC(self): - """ - Computes the area under the receiver operating characteristic - (ROC) curve. - """ - return self.call("areaUnderROC") - - @property - @since('1.4.0') - def areaUnderPR(self): - """ - Computes the area under the precision-recall curve. - """ - return self.call("areaUnderPR") - - @since('1.4.0') - def unpersist(self): - """ - Unpersists intermediate RDDs used in the computation. - """ - self.call("unpersist") - - -class RegressionMetrics(JavaModelWrapper): - """ - Evaluator for regression. - - :param predictionAndObservations: an RDD of (prediction, - observation) pairs. - - >>> predictionAndObservations = sc.parallelize([ - ... (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)]) - >>> metrics = RegressionMetrics(predictionAndObservations) - >>> metrics.explainedVariance - 8.859... - >>> metrics.meanAbsoluteError - 0.5... - >>> metrics.meanSquaredError - 0.37... - >>> metrics.rootMeanSquaredError - 0.61... - >>> metrics.r2 - 0.94... - - .. versionadded:: 1.4.0 - """ - - def __init__(self, predictionAndObservations): - sc = predictionAndObservations.ctx - sql_ctx = SQLContext.getOrCreate(sc) - df = sql_ctx.createDataFrame(predictionAndObservations, schema=StructType([ - StructField("prediction", DoubleType(), nullable=False), - StructField("observation", DoubleType(), nullable=False)])) - java_class = sc._jvm.org.apache.spark.mllib.evaluation.RegressionMetrics - java_model = java_class(df._jdf) - super(RegressionMetrics, self).__init__(java_model) - - @property - @since('1.4.0') - def explainedVariance(self): - r""" - Returns the explained variance regression score. 
- explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}` - """ - return self.call("explainedVariance") - - @property - @since('1.4.0') - def meanAbsoluteError(self): - """ - Returns the mean absolute error, which is a risk function corresponding to the - expected value of the absolute error loss or l1-norm loss. - """ - return self.call("meanAbsoluteError") - - @property - @since('1.4.0') - def meanSquaredError(self): - """ - Returns the mean squared error, which is a risk function corresponding to the - expected value of the squared error loss or quadratic loss. - """ - return self.call("meanSquaredError") - - @property - @since('1.4.0') - def rootMeanSquaredError(self): - """ - Returns the root mean squared error, which is defined as the square root of - the mean squared error. - """ - return self.call("rootMeanSquaredError") - - @property - @since('1.4.0') - def r2(self): - """ - Returns R^2^, the coefficient of determination. - """ - return self.call("r2") - - -class MulticlassMetrics(JavaModelWrapper): - """ - Evaluator for multiclass classification. - - :param predictionAndLabels: an RDD of (prediction, label) pairs. - - >>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), - ... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]) - >>> metrics = MulticlassMetrics(predictionAndLabels) - >>> metrics.confusionMatrix().toArray() - array([[ 2., 1., 1.], - [ 1., 3., 0.], - [ 0., 0., 1.]]) - >>> metrics.falsePositiveRate(0.0) - 0.2... - >>> metrics.precision(1.0) - 0.75... - >>> metrics.recall(2.0) - 1.0... - >>> metrics.fMeasure(0.0, 2.0) - 0.52... - >>> metrics.accuracy - 0.66... - >>> metrics.weightedFalsePositiveRate - 0.19... - >>> metrics.weightedPrecision - 0.68... - >>> metrics.weightedRecall - 0.66... - >>> metrics.weightedFMeasure() - 0.66... - >>> metrics.weightedFMeasure(2.0) - 0.65... - - .. versionadded:: 1.4.0 - """ - - def __init__(self, predictionAndLabels): - sc = predictionAndLabels.ctx - sql_ctx = SQLContext.getOrCreate(sc) - df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([ - StructField("prediction", DoubleType(), nullable=False), - StructField("label", DoubleType(), nullable=False)])) - java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics - java_model = java_class(df._jdf) - super(MulticlassMetrics, self).__init__(java_model) - - @since('1.4.0') - def confusionMatrix(self): - """ - Returns confusion matrix: predicted classes are in columns, - they are ordered by class label ascending, as in "labels". - """ - return self.call("confusionMatrix") - - @since('1.4.0') - def truePositiveRate(self, label): - """ - Returns true positive rate for a given label (category). - """ - return self.call("truePositiveRate", label) - - @since('1.4.0') - def falsePositiveRate(self, label): - """ - Returns false positive rate for a given label (category). - """ - return self.call("falsePositiveRate", label) - - @since('1.4.0') - def precision(self, label=None): - """ - Returns precision or precision for a given label (category) if specified. - """ - if label is None: - # note:: Deprecated in 2.0.0. Use accuracy. - warnings.warn("Deprecated in 2.0.0. Use accuracy.", DeprecationWarning) - return self.call("precision") - else: - return self.call("precision", float(label)) - - @since('1.4.0') - def recall(self, label=None): - """ - Returns recall or recall for a given label (category) if specified. - """ - if label is None: - # note:: Deprecated in 2.0.0. Use accuracy. 
- warnings.warn("Deprecated in 2.0.0. Use accuracy.", DeprecationWarning) - return self.call("recall") - else: - return self.call("recall", float(label)) - - @since('1.4.0') - def fMeasure(self, label=None, beta=None): - """ - Returns f-measure or f-measure for a given label (category) if specified. - """ - if beta is None: - if label is None: - # note:: Deprecated in 2.0.0. Use accuracy. - warnings.warn("Deprecated in 2.0.0. Use accuracy.", DeprecationWarning) - return self.call("fMeasure") - else: - return self.call("fMeasure", label) - else: - if label is None: - raise Exception("If the beta parameter is specified, label can not be none") - else: - return self.call("fMeasure", label, beta) - - @property - @since('2.0.0') - def accuracy(self): - """ - Returns accuracy (equals to the total number of correctly classified instances - out of the total number of instances). - """ - return self.call("accuracy") - - @property - @since('1.4.0') - def weightedTruePositiveRate(self): - """ - Returns weighted true positive rate. - (equals to precision, recall and f-measure) - """ - return self.call("weightedTruePositiveRate") - - @property - @since('1.4.0') - def weightedFalsePositiveRate(self): - """ - Returns weighted false positive rate. - """ - return self.call("weightedFalsePositiveRate") - - @property - @since('1.4.0') - def weightedRecall(self): - """ - Returns weighted averaged recall. - (equals to precision, recall and f-measure) - """ - return self.call("weightedRecall") - - @property - @since('1.4.0') - def weightedPrecision(self): - """ - Returns weighted averaged precision. - """ - return self.call("weightedPrecision") - - @since('1.4.0') - def weightedFMeasure(self, beta=None): - """ - Returns weighted averaged f-measure. - """ - if beta is None: - return self.call("weightedFMeasure") - else: - return self.call("weightedFMeasure", beta) - - -class RankingMetrics(JavaModelWrapper): - """ - Evaluator for ranking algorithms. - - :param predictionAndLabels: an RDD of (predicted ranking, - ground truth set) pairs. - - >>> predictionAndLabels = sc.parallelize([ - ... ([1, 6, 2, 7, 8, 3, 9, 10, 4, 5], [1, 2, 3, 4, 5]), - ... ([4, 1, 5, 6, 2, 7, 3, 8, 9, 10], [1, 2, 3]), - ... ([1, 2, 3, 4, 5], [])]) - >>> metrics = RankingMetrics(predictionAndLabels) - >>> metrics.precisionAt(1) - 0.33... - >>> metrics.precisionAt(5) - 0.26... - >>> metrics.precisionAt(15) - 0.17... - >>> metrics.meanAveragePrecision - 0.35... - >>> metrics.ndcgAt(3) - 0.33... - >>> metrics.ndcgAt(10) - 0.48... - - .. versionadded:: 1.4.0 - """ - - def __init__(self, predictionAndLabels): - sc = predictionAndLabels.ctx - sql_ctx = SQLContext.getOrCreate(sc) - df = sql_ctx.createDataFrame(predictionAndLabels, - schema=sql_ctx._inferSchema(predictionAndLabels)) - java_model = callMLlibFunc("newRankingMetrics", df._jdf) - super(RankingMetrics, self).__init__(java_model) - - @since('1.4.0') - def precisionAt(self, k): - """ - Compute the average precision of all the queries, truncated at ranking position k. - - If for a query, the ranking algorithm returns n (n < k) results, the precision value - will be computed as #(relevant items retrieved) / k. This formula also applies when - the size of the ground truth set is less than k. - - If a query has an empty ground truth set, zero will be used as precision together - with a log warning. - """ - return self.call("precisionAt", int(k)) - - @property - @since('1.4.0') - def meanAveragePrecision(self): - """ - Returns the mean average precision (MAP) of all the queries. 
- If a query has an empty ground truth set, the average precision will be zero and - a log warining is generated. - """ - return self.call("meanAveragePrecision") - - @since('1.4.0') - def ndcgAt(self, k): - """ - Compute the average NDCG value of all the queries, truncated at ranking position k. - The discounted cumulative gain at position k is computed as: - sum,,i=1,,^k^ (2^{relevance of ''i''th item}^ - 1) / log(i + 1), - and the NDCG is obtained by dividing the DCG value on the ground truth set. - In the current implementation, the relevance value is binary. - If a query has an empty ground truth set, zero will be used as NDCG together with - a log warning. - """ - return self.call("ndcgAt", int(k)) - - -class MultilabelMetrics(JavaModelWrapper): - """ - Evaluator for multilabel classification. - - :param predictionAndLabels: an RDD of (predictions, labels) pairs, - both are non-null Arrays, each with - unique elements. - - >>> predictionAndLabels = sc.parallelize([([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]), - ... ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]), - ... ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])]) - >>> metrics = MultilabelMetrics(predictionAndLabels) - >>> metrics.precision(0.0) - 1.0 - >>> metrics.recall(1.0) - 0.66... - >>> metrics.f1Measure(2.0) - 0.5 - >>> metrics.precision() - 0.66... - >>> metrics.recall() - 0.64... - >>> metrics.f1Measure() - 0.63... - >>> metrics.microPrecision - 0.72... - >>> metrics.microRecall - 0.66... - >>> metrics.microF1Measure - 0.69... - >>> metrics.hammingLoss - 0.33... - >>> metrics.subsetAccuracy - 0.28... - >>> metrics.accuracy - 0.54... - - .. versionadded:: 1.4.0 - """ - - def __init__(self, predictionAndLabels): - sc = predictionAndLabels.ctx - sql_ctx = SQLContext.getOrCreate(sc) - df = sql_ctx.createDataFrame(predictionAndLabels, - schema=sql_ctx._inferSchema(predictionAndLabels)) - java_class = sc._jvm.org.apache.spark.mllib.evaluation.MultilabelMetrics - java_model = java_class(df._jdf) - super(MultilabelMetrics, self).__init__(java_model) - - @since('1.4.0') - def precision(self, label=None): - """ - Returns precision or precision for a given label (category) if specified. - """ - if label is None: - return self.call("precision") - else: - return self.call("precision", float(label)) - - @since('1.4.0') - def recall(self, label=None): - """ - Returns recall or recall for a given label (category) if specified. - """ - if label is None: - return self.call("recall") - else: - return self.call("recall", float(label)) - - @since('1.4.0') - def f1Measure(self, label=None): - """ - Returns f1Measure or f1Measure for a given label (category) if specified. - """ - if label is None: - return self.call("f1Measure") - else: - return self.call("f1Measure", float(label)) - - @property - @since('1.4.0') - def microPrecision(self): - """ - Returns micro-averaged label-based precision. - (equals to micro-averaged document-based precision) - """ - return self.call("microPrecision") - - @property - @since('1.4.0') - def microRecall(self): - """ - Returns micro-averaged label-based recall. - (equals to micro-averaged document-based recall) - """ - return self.call("microRecall") - - @property - @since('1.4.0') - def microF1Measure(self): - """ - Returns micro-averaged label-based f1-measure. - (equals to micro-averaged document-based f1-measure) - """ - return self.call("microF1Measure") - - @property - @since('1.4.0') - def hammingLoss(self): - """ - Returns Hamming-loss. 
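# Editorial aside, not part of the file being deleted above: a pure-Python
# re-computation of RankingMetrics.precisionAt for the doctest data used
# earlier in this file, following the rule stated in its docstring (always
# divide by k; an empty ground-truth set contributes 0 with a warning). This
# is a sketch of the definition, not Spark's implementation.
def precision_at(pred_and_labels, k):
    vals = []
    for pred, truth in pred_and_labels:
        truth = set(truth)
        if not truth:
            vals.append(0.0)                                    # empty ground truth -> 0
        else:
            hits = sum(1 for item in pred[:k] if item in truth)
            vals.append(hits / float(k))                        # divide by k even if fewer results
    return sum(vals) / len(vals)

queries = [([1, 6, 2, 7, 8, 3, 9, 10, 4, 5], [1, 2, 3, 4, 5]),
           ([4, 1, 5, 6, 2, 7, 3, 8, 9, 10], [1, 2, 3]),
           ([1, 2, 3, 4, 5], [])]
print(round(precision_at(queries, 1), 2))   # 0.33, matching metrics.precisionAt(1)
print(round(precision_at(queries, 5), 2))   # 0.27, i.e. the 0.26... value shown by the doctest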
- """ - return self.call("hammingLoss") - - @property - @since('1.4.0') - def subsetAccuracy(self): - """ - Returns subset accuracy. - (for equal sets of labels) - """ - return self.call("subsetAccuracy") - - @property - @since('1.4.0') - def accuracy(self): - """ - Returns accuracy. - """ - return self.call("accuracy") - - -def _test(): - import doctest - import numpy - from pyspark.sql import SparkSession - import pyspark.mllib.evaluation - try: - # Numpy 1.14+ changed it's string format. - numpy.set_printoptions(legacy='1.13') - except TypeError: - pass - globs = pyspark.mllib.evaluation.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("mllib.evaluation tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/feature.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/feature.py deleted file mode 100644 index 6d7d4d6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/feature.py +++ /dev/null @@ -1,826 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Python package for feature in MLlib. -""" -from __future__ import absolute_import - -import sys -import warnings -import random -import binascii -if sys.version >= '3': - basestring = str - unicode = str - -from py4j.protocol import Py4JJavaError - -from pyspark import since -from pyspark.rdd import RDD, ignore_unicode_prefix -from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper -from pyspark.mllib.linalg import ( - Vector, Vectors, DenseVector, SparseVector, _convert_to_vector) -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.util import JavaLoader, JavaSaveable - -__all__ = ['Normalizer', 'StandardScalerModel', 'StandardScaler', - 'HashingTF', 'IDFModel', 'IDF', 'Word2Vec', 'Word2VecModel', - 'ChiSqSelector', 'ChiSqSelectorModel', 'ElementwiseProduct'] - - -class VectorTransformer(object): - """ - .. note:: DeveloperApi - - Base class for transformation of a vector or RDD of vector - """ - def transform(self, vector): - """ - Applies transformation on a vector. - - :param vector: vector to be transformed. - """ - raise NotImplementedError - - -class Normalizer(VectorTransformer): - r""" - Normalizes samples individually to unit L\ :sup:`p`\ norm - - For any 1 <= `p` < float('inf'), normalizes samples using - sum(abs(vector) :sup:`p`) :sup:`(1/p)` as norm. - - For `p` = float('inf'), max(abs(vector)) will be used as norm for - normalization. - - :param p: Normalization in L^p^ space, p = 2 by default. 
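# Editorial aside, not part of the file being deleted above: a NumPy check of
# the L^p normalization rule described for Normalizer -- divide by
# sum(abs(v)**p)**(1/p), or by max(abs(v)) when p is infinite, and return the
# input unchanged when the norm is zero. lp_normalize is an illustrative
# helper, not a Spark API; the two prints reproduce the Normalizer(1) and
# Normalizer(float("inf")) doctest values that follow.
import numpy as np

def lp_normalize(v, p):
    v = np.asarray(v, dtype=float)
    norm = np.abs(v).max() if np.isinf(p) else (np.abs(v) ** p).sum() ** (1.0 / p)
    return v if norm == 0 else v / norm

print(lp_normalize([0.0, 1.0, 2.0], 1.0))           # ~[0, 0.333, 0.667]; the L1 norm is 3
print(lp_normalize([0.0, 1.0, 2.0], float('inf')))  # [0, 0.5, 1.0]; the max-abs norm is 2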
- - >>> v = Vectors.dense(range(3)) - >>> nor = Normalizer(1) - >>> nor.transform(v) - DenseVector([0.0, 0.3333, 0.6667]) - - >>> rdd = sc.parallelize([v]) - >>> nor.transform(rdd).collect() - [DenseVector([0.0, 0.3333, 0.6667])] - - >>> nor2 = Normalizer(float("inf")) - >>> nor2.transform(v) - DenseVector([0.0, 0.5, 1.0]) - - .. versionadded:: 1.2.0 - """ - def __init__(self, p=2.0): - assert p >= 1.0, "p should be greater than 1.0" - self.p = float(p) - - @since('1.2.0') - def transform(self, vector): - """ - Applies unit length normalization on a vector. - - :param vector: vector or RDD of vector to be normalized. - :return: normalized vector. If the norm of the input is zero, it - will return the input vector. - """ - if isinstance(vector, RDD): - vector = vector.map(_convert_to_vector) - else: - vector = _convert_to_vector(vector) - return callMLlibFunc("normalizeVector", self.p, vector) - - -class JavaVectorTransformer(JavaModelWrapper, VectorTransformer): - """ - Wrapper for the model in JVM - """ - - def transform(self, vector): - """ - Applies transformation on a vector or an RDD[Vector]. - - .. note:: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. - - :param vector: Vector or RDD of Vector to be transformed. - """ - if isinstance(vector, RDD): - vector = vector.map(_convert_to_vector) - else: - vector = _convert_to_vector(vector) - return self.call("transform", vector) - - -class StandardScalerModel(JavaVectorTransformer): - """ - Represents a StandardScaler model that can transform vectors. - - .. versionadded:: 1.2.0 - """ - - @since('1.2.0') - def transform(self, vector): - """ - Applies standardization transformation on a vector. - - .. note:: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. - - :param vector: Vector or RDD of Vector to be standardized. - :return: Standardized vector. If the variance of a column is - zero, it will return default `0.0` for the column with - zero variance. - """ - return JavaVectorTransformer.transform(self, vector) - - @since('1.4.0') - def setWithMean(self, withMean): - """ - Setter of the boolean which decides - whether it uses mean or not - """ - self.call("setWithMean", withMean) - return self - - @since('1.4.0') - def setWithStd(self, withStd): - """ - Setter of the boolean which decides - whether it uses std or not - """ - self.call("setWithStd", withStd) - return self - - @property - @since('2.0.0') - def withStd(self): - """ - Returns if the model scales the data to unit standard deviation. - """ - return self.call("withStd") - - @property - @since('2.0.0') - def withMean(self): - """ - Returns if the model centers the data before scaling. - """ - return self.call("withMean") - - @property - @since('2.0.0') - def std(self): - """ - Return the column standard deviation values. - """ - return self.call("std") - - @property - @since('2.0.0') - def mean(self): - """ - Return the column mean values. - """ - return self.call("mean") - - -class StandardScaler(object): - """ - Standardizes features by removing the mean and scaling to unit - variance using column summary statistics on the samples in the - training set. - - :param withMean: False by default. Centers the data with mean - before scaling. It will build a dense output, so take - care when applying to sparse input. - :param withStd: True by default. Scales the data to unit - standard deviation. 
- - >>> vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])] - >>> dataset = sc.parallelize(vs) - >>> standardizer = StandardScaler(True, True) - >>> model = standardizer.fit(dataset) - >>> result = model.transform(dataset) - >>> for r in result.collect(): r - DenseVector([-0.7071, 0.7071, -0.7071]) - DenseVector([0.7071, -0.7071, 0.7071]) - >>> int(model.std[0]) - 4 - >>> int(model.mean[0]*10) - 9 - >>> model.withStd - True - >>> model.withMean - True - - .. versionadded:: 1.2.0 - """ - def __init__(self, withMean=False, withStd=True): - if not (withMean or withStd): - warnings.warn("Both withMean and withStd are false. The model does nothing.") - self.withMean = withMean - self.withStd = withStd - - @since('1.2.0') - def fit(self, dataset): - """ - Computes the mean and variance and stores as a model to be used - for later scaling. - - :param dataset: The data used to compute the mean and variance - to build the transformation model. - :return: a StandardScalarModel - """ - dataset = dataset.map(_convert_to_vector) - jmodel = callMLlibFunc("fitStandardScaler", self.withMean, self.withStd, dataset) - return StandardScalerModel(jmodel) - - -class ChiSqSelectorModel(JavaVectorTransformer): - """ - Represents a Chi Squared selector model. - - .. versionadded:: 1.4.0 - """ - - @since('1.4.0') - def transform(self, vector): - """ - Applies transformation on a vector. - - :param vector: Vector or RDD of Vector to be transformed. - :return: transformed vector. - """ - return JavaVectorTransformer.transform(self, vector) - - -class ChiSqSelector(object): - """ - Creates a ChiSquared feature selector. - The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`, - `fdr`, `fwe`. - - * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. - - * `percentile` is similar but chooses a fraction of all features - instead of a fixed number. - - * `fpr` chooses all features whose p-values are below a threshold, - thus controlling the false positive rate of selection. - - * `fdr` uses the `Benjamini-Hochberg procedure `_ - to choose all features whose false discovery rate is below a threshold. - - * `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by - 1/numFeatures, thus controlling the family-wise error rate of selection. - - By default, the selection method is `numTopFeatures`, with the default number of top features - set to 50. - - >>> data = sc.parallelize([ - ... LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})), - ... LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})), - ... LabeledPoint(1.0, [0.0, 9.0, 8.0]), - ... LabeledPoint(2.0, [7.0, 9.0, 5.0]), - ... LabeledPoint(2.0, [8.0, 7.0, 3.0]) - ... ]) - >>> model = ChiSqSelector(numTopFeatures=1).fit(data) - >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) - SparseVector(1, {}) - >>> model.transform(DenseVector([7.0, 9.0, 5.0])) - DenseVector([7.0]) - >>> model = ChiSqSelector(selectorType="fpr", fpr=0.2).fit(data) - >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) - SparseVector(1, {}) - >>> model.transform(DenseVector([7.0, 9.0, 5.0])) - DenseVector([7.0]) - >>> model = ChiSqSelector(selectorType="percentile", percentile=0.34).fit(data) - >>> model.transform(DenseVector([7.0, 9.0, 5.0])) - DenseVector([7.0]) - - .. 
versionadded:: 1.4.0 - """ - def __init__(self, numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, fpr=0.05, - fdr=0.05, fwe=0.05): - self.numTopFeatures = numTopFeatures - self.selectorType = selectorType - self.percentile = percentile - self.fpr = fpr - self.fdr = fdr - self.fwe = fwe - - @since('2.1.0') - def setNumTopFeatures(self, numTopFeatures): - """ - set numTopFeature for feature selection by number of top features. - Only applicable when selectorType = "numTopFeatures". - """ - self.numTopFeatures = int(numTopFeatures) - return self - - @since('2.1.0') - def setPercentile(self, percentile): - """ - set percentile [0.0, 1.0] for feature selection by percentile. - Only applicable when selectorType = "percentile". - """ - self.percentile = float(percentile) - return self - - @since('2.1.0') - def setFpr(self, fpr): - """ - set FPR [0.0, 1.0] for feature selection by FPR. - Only applicable when selectorType = "fpr". - """ - self.fpr = float(fpr) - return self - - @since('2.2.0') - def setFdr(self, fdr): - """ - set FDR [0.0, 1.0] for feature selection by FDR. - Only applicable when selectorType = "fdr". - """ - self.fdr = float(fdr) - return self - - @since('2.2.0') - def setFwe(self, fwe): - """ - set FWE [0.0, 1.0] for feature selection by FWE. - Only applicable when selectorType = "fwe". - """ - self.fwe = float(fwe) - return self - - @since('2.1.0') - def setSelectorType(self, selectorType): - """ - set the selector type of the ChisqSelector. - Supported options: "numTopFeatures" (default), "percentile", "fpr", "fdr", "fwe". - """ - self.selectorType = str(selectorType) - return self - - @since('1.4.0') - def fit(self, data): - """ - Returns a ChiSquared feature selector. - - :param data: an `RDD[LabeledPoint]` containing the labeled dataset - with categorical features. Real-valued features will be - treated as categorical for each distinct value. - Apply feature discretizer before using this function. - """ - jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures, - self.percentile, self.fpr, self.fdr, self.fwe, data) - return ChiSqSelectorModel(jmodel) - - -class PCAModel(JavaVectorTransformer): - """ - Model fitted by [[PCA]] that can project vectors to a low-dimensional space using PCA. - - .. versionadded:: 1.5.0 - """ - - -class PCA(object): - """ - A feature transformer that projects vectors to a low-dimensional space using PCA. - - >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), - ... Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), - ... Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])] - >>> model = PCA(2).fit(sc.parallelize(data)) - >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray() - >>> pcArray[0] - 1.648... - >>> pcArray[1] - -4.013... - - .. versionadded:: 1.5.0 - """ - def __init__(self, k): - """ - :param k: number of principal components. - """ - self.k = int(k) - - @since('1.5.0') - def fit(self, data): - """ - Computes a [[PCAModel]] that contains the principal components of the input vectors. - :param data: source vectors - """ - jmodel = callMLlibFunc("fitPCA", self.k, data) - return PCAModel(jmodel) - - -class HashingTF(object): - """ - Maps a sequence of terms to their term frequencies using the hashing - trick. - - .. note:: The terms must be hashable (can not be dict/set/list...). - - :param numFeatures: number of features (default: 2^20) - - >>> htf = HashingTF(100) - >>> doc = "a a b b c d".split(" ") - >>> htf.transform(doc) - SparseVector(100, {...}) - - .. 
versionadded:: 1.2.0 - """ - def __init__(self, numFeatures=1 << 20): - self.numFeatures = numFeatures - self.binary = False - - @since("2.0.0") - def setBinary(self, value): - """ - If True, term frequency vector will be binary such that non-zero - term counts will be set to 1 - (default: False) - """ - self.binary = value - return self - - @since('1.2.0') - def indexOf(self, term): - """ Returns the index of the input term. """ - return hash(term) % self.numFeatures - - @since('1.2.0') - def transform(self, document): - """ - Transforms the input document (list of terms) to term frequency - vectors, or transform the RDD of document to RDD of term - frequency vectors. - """ - if isinstance(document, RDD): - return document.map(self.transform) - - freq = {} - for term in document: - i = self.indexOf(term) - freq[i] = 1.0 if self.binary else freq.get(i, 0) + 1.0 - return Vectors.sparse(self.numFeatures, freq.items()) - - -class IDFModel(JavaVectorTransformer): - """ - Represents an IDF model that can transform term frequency vectors. - - .. versionadded:: 1.2.0 - """ - @since('1.2.0') - def transform(self, x): - """ - Transforms term frequency (TF) vectors to TF-IDF vectors. - - If `minDocFreq` was set for the IDF calculation, - the terms which occur in fewer than `minDocFreq` - documents will have an entry of 0. - - .. note:: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. - - :param x: an RDD of term frequency vectors or a term frequency - vector - :return: an RDD of TF-IDF vectors or a TF-IDF vector - """ - return JavaVectorTransformer.transform(self, x) - - @since('1.4.0') - def idf(self): - """ - Returns the current IDF vector. - """ - return self.call('idf') - - -class IDF(object): - """ - Inverse document frequency (IDF). - - The standard formulation is used: `idf = log((m + 1) / (d(t) + 1))`, - where `m` is the total number of documents and `d(t)` is the number - of documents that contain term `t`. - - This implementation supports filtering out terms which do not appear - in a minimum number of documents (controlled by the variable - `minDocFreq`). For terms that are not in at least `minDocFreq` - documents, the IDF is found as 0, resulting in TF-IDFs of 0. - - :param minDocFreq: minimum of documents in which a term - should appear for filtering - - >>> n = 4 - >>> freqs = [Vectors.sparse(n, (1, 3), (1.0, 2.0)), - ... Vectors.dense([0.0, 1.0, 2.0, 3.0]), - ... Vectors.sparse(n, [1], [1.0])] - >>> data = sc.parallelize(freqs) - >>> idf = IDF() - >>> model = idf.fit(data) - >>> tfidf = model.transform(data) - >>> for r in tfidf.collect(): r - SparseVector(4, {1: 0.0, 3: 0.5754}) - DenseVector([0.0, 0.0, 1.3863, 0.863]) - SparseVector(4, {1: 0.0}) - >>> model.transform(Vectors.dense([0.0, 1.0, 2.0, 3.0])) - DenseVector([0.0, 0.0, 1.3863, 0.863]) - >>> model.transform([0.0, 1.0, 2.0, 3.0]) - DenseVector([0.0, 0.0, 1.3863, 0.863]) - >>> model.transform(Vectors.sparse(n, (1, 3), (1.0, 2.0))) - SparseVector(4, {1: 0.0, 3: 0.5754}) - - .. versionadded:: 1.2.0 - """ - def __init__(self, minDocFreq=0): - self.minDocFreq = minDocFreq - - @since('1.2.0') - def fit(self, dataset): - """ - Computes the inverse document frequency. 
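# Editorial aside, not part of the file being deleted above: the IDF formula
# documented here, idf = log((m + 1) / (d(t) + 1)), can be checked by hand
# against the doctest corpus of three term-frequency vectors. Plain Python,
# no Spark required.
import math

docs = [
    {1: 1.0, 3: 2.0},                     # Vectors.sparse(4, (1, 3), (1.0, 2.0))
    {0: 0.0, 1: 1.0, 2: 2.0, 3: 3.0},     # Vectors.dense([0.0, 1.0, 2.0, 3.0])
    {1: 1.0},                             # Vectors.sparse(4, [1], [1.0])
]
m = len(docs)                             # m = 3 documents in total

def idf(term):
    d_t = sum(1 for doc in docs if doc.get(term, 0.0) > 0)   # document frequency d(t)
    return math.log((m + 1.0) / (d_t + 1.0))

print(round(2.0 * idf(3), 4))   # 0.5754 -> the {3: 0.5754} entry of the first TF-IDF vector
print(round(2.0 * idf(2), 4))   # 1.3863 -> third component of the dense TF-IDF vector
print(round(1.0 * idf(1), 4))   # 0.0    -> a term present in every document gets idf 0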
- - :param dataset: an RDD of term frequency vectors - """ - if not isinstance(dataset, RDD): - raise TypeError("dataset should be an RDD of term frequency vectors") - jmodel = callMLlibFunc("fitIDF", self.minDocFreq, dataset.map(_convert_to_vector)) - return IDFModel(jmodel) - - -class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): - """ - class for Word2Vec model - - .. versionadded:: 1.2.0 - """ - @since('1.2.0') - def transform(self, word): - """ - Transforms a word to its vector representation - - .. note:: Local use only - - :param word: a word - :return: vector representation of word(s) - """ - try: - return self.call("transform", word) - except Py4JJavaError: - raise ValueError("%s not found" % word) - - @since('1.2.0') - def findSynonyms(self, word, num): - """ - Find synonyms of a word - - :param word: a word or a vector representation of word - :param num: number of synonyms to find - :return: array of (word, cosineSimilarity) - - .. note:: Local use only - """ - if not isinstance(word, basestring): - word = _convert_to_vector(word) - words, similarity = self.call("findSynonyms", word, num) - return zip(words, similarity) - - @since('1.4.0') - def getVectors(self): - """ - Returns a map of words to their vector representations. - """ - return self.call("getVectors") - - @classmethod - @since('1.5.0') - def load(cls, sc, path): - """ - Load a model from the given path. - """ - jmodel = sc._jvm.org.apache.spark.mllib.feature \ - .Word2VecModel.load(sc._jsc.sc(), path) - model = sc._jvm.org.apache.spark.mllib.api.python.Word2VecModelWrapper(jmodel) - return Word2VecModel(model) - - -@ignore_unicode_prefix -class Word2Vec(object): - """Word2Vec creates vector representation of words in a text corpus. - The algorithm first constructs a vocabulary from the corpus - and then learns vector representation of words in the vocabulary. - The vector representation can be used as features in - natural language processing and machine learning algorithms. - - We used skip-gram model in our implementation and hierarchical - softmax method to train the model. The variable names in the - implementation matches the original C implementation. - - For original C implementation, - see https://code.google.com/p/word2vec/ - For research papers, see - Efficient Estimation of Word Representations in Vector Space - and Distributed Representations of Words and Phrases and their - Compositionality. - - >>> sentence = "a b " * 100 + "a c " * 10 - >>> localDoc = [sentence, sentence] - >>> doc = sc.parallelize(localDoc).map(lambda line: line.split(" ")) - >>> model = Word2Vec().setVectorSize(10).setSeed(42).fit(doc) - - Querying for synonyms of a word will not return that word: - - >>> syms = model.findSynonyms("a", 2) - >>> [s[0] for s in syms] - [u'b', u'c'] - - But querying for synonyms of a vector may return the word whose - representation is that vector: - - >>> vec = model.transform("a") - >>> syms = model.findSynonyms(vec, 2) - >>> [s[0] for s in syms] - [u'a', u'b'] - - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> model.save(sc, path) - >>> sameModel = Word2VecModel.load(sc, path) - >>> model.transform("a") == sameModel.transform("a") - True - >>> syms = sameModel.findSynonyms("a", 2) - >>> [s[0] for s in syms] - [u'b', u'c'] - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except OSError: - ... pass - - .. 
versionadded:: 1.2.0 - - """ - def __init__(self): - """ - Construct Word2Vec instance - """ - self.vectorSize = 100 - self.learningRate = 0.025 - self.numPartitions = 1 - self.numIterations = 1 - self.seed = None - self.minCount = 5 - self.windowSize = 5 - - @since('1.2.0') - def setVectorSize(self, vectorSize): - """ - Sets vector size (default: 100). - """ - self.vectorSize = vectorSize - return self - - @since('1.2.0') - def setLearningRate(self, learningRate): - """ - Sets initial learning rate (default: 0.025). - """ - self.learningRate = learningRate - return self - - @since('1.2.0') - def setNumPartitions(self, numPartitions): - """ - Sets number of partitions (default: 1). Use a small number for - accuracy. - """ - self.numPartitions = numPartitions - return self - - @since('1.2.0') - def setNumIterations(self, numIterations): - """ - Sets number of iterations (default: 1), which should be smaller - than or equal to number of partitions. - """ - self.numIterations = numIterations - return self - - @since('1.2.0') - def setSeed(self, seed): - """ - Sets random seed. - """ - self.seed = seed - return self - - @since('1.4.0') - def setMinCount(self, minCount): - """ - Sets minCount, the minimum number of times a token must appear - to be included in the word2vec model's vocabulary (default: 5). - """ - self.minCount = minCount - return self - - @since('2.0.0') - def setWindowSize(self, windowSize): - """ - Sets window size (default: 5). - """ - self.windowSize = windowSize - return self - - @since('1.2.0') - def fit(self, data): - """ - Computes the vector representation of each word in vocabulary. - - :param data: training data. RDD of list of string - :return: Word2VecModel instance - """ - if not isinstance(data, RDD): - raise TypeError("data should be an RDD of list of string") - jmodel = callMLlibFunc("trainWord2VecModel", data, int(self.vectorSize), - float(self.learningRate), int(self.numPartitions), - int(self.numIterations), self.seed, - int(self.minCount), int(self.windowSize)) - return Word2VecModel(jmodel) - - -class ElementwiseProduct(VectorTransformer): - """ - Scales each column of the vector, with the supplied weight vector. - i.e the elementwise product. - - >>> weight = Vectors.dense([1.0, 2.0, 3.0]) - >>> eprod = ElementwiseProduct(weight) - >>> a = Vectors.dense([2.0, 1.0, 3.0]) - >>> eprod.transform(a) - DenseVector([2.0, 2.0, 9.0]) - >>> b = Vectors.dense([9.0, 3.0, 4.0]) - >>> rdd = sc.parallelize([a, b]) - >>> eprod.transform(rdd).collect() - [DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])] - - .. versionadded:: 1.5.0 - """ - def __init__(self, scalingVector): - self.scalingVector = _convert_to_vector(scalingVector) - - @since('1.5.0') - def transform(self, vector): - """ - Computes the Hadamard product of the vector. 
- """ - if isinstance(vector, RDD): - vector = vector.map(_convert_to_vector) - - else: - vector = _convert_to_vector(vector) - return callMLlibFunc("elementwiseProductVector", self.scalingVector, vector) - - -def _test(): - import doctest - from pyspark.sql import SparkSession - globs = globals().copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("mllib.feature tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - sys.path.pop(0) - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/fpm.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/fpm.py deleted file mode 100644 index de18dad..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/fpm.py +++ /dev/null @@ -1,206 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys - -import numpy -from numpy import array -from collections import namedtuple - -from pyspark import SparkContext, since -from pyspark.rdd import ignore_unicode_prefix -from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc -from pyspark.mllib.util import JavaSaveable, JavaLoader, inherit_doc - -__all__ = ['FPGrowth', 'FPGrowthModel', 'PrefixSpan', 'PrefixSpanModel'] - - -@inherit_doc -@ignore_unicode_prefix -class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): - """ - A FP-Growth model for mining frequent itemsets - using the Parallel FP-Growth algorithm. - - >>> data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]] - >>> rdd = sc.parallelize(data, 2) - >>> model = FPGrowth.train(rdd, 0.6, 2) - >>> sorted(model.freqItemsets().collect()) - [FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'c'], freq=3), ... - >>> model_path = temp_path + "/fpm" - >>> model.save(sc, model_path) - >>> sameModel = FPGrowthModel.load(sc, model_path) - >>> sorted(model.freqItemsets().collect()) == sorted(sameModel.freqItemsets().collect()) - True - - .. versionadded:: 1.4.0 - """ - - @since("1.4.0") - def freqItemsets(self): - """ - Returns the frequent itemsets of this model. - """ - return self.call("getFreqItemsets").map(lambda x: (FPGrowth.FreqItemset(x[0], x[1]))) - - @classmethod - @since("2.0.0") - def load(cls, sc, path): - """ - Load a model from the given path. - """ - model = cls._load_java(sc, path) - wrapper = sc._jvm.org.apache.spark.mllib.api.python.FPGrowthModelWrapper(model) - return FPGrowthModel(wrapper) - - -class FPGrowth(object): - """ - A Parallel FP-growth algorithm to mine frequent itemsets. - - .. 
versionadded:: 1.4.0 - """ - - @classmethod - @since("1.4.0") - def train(cls, data, minSupport=0.3, numPartitions=-1): - """ - Computes an FP-Growth model that contains frequent itemsets. - - :param data: - The input data set, each element contains a transaction. - :param minSupport: - The minimal support level. - (default: 0.3) - :param numPartitions: - The number of partitions used by parallel FP-growth. A value - of -1 will use the same number as input data. - (default: -1) - """ - model = callMLlibFunc("trainFPGrowthModel", data, float(minSupport), int(numPartitions)) - return FPGrowthModel(model) - - class FreqItemset(namedtuple("FreqItemset", ["items", "freq"])): - """ - Represents an (items, freq) tuple. - - .. versionadded:: 1.4.0 - """ - - -@inherit_doc -@ignore_unicode_prefix -class PrefixSpanModel(JavaModelWrapper): - """ - Model fitted by PrefixSpan - - >>> data = [ - ... [["a", "b"], ["c"]], - ... [["a"], ["c", "b"], ["a", "b"]], - ... [["a", "b"], ["e"]], - ... [["f"]]] - >>> rdd = sc.parallelize(data, 2) - >>> model = PrefixSpan.train(rdd) - >>> sorted(model.freqSequences().collect()) - [FreqSequence(sequence=[[u'a']], freq=3), FreqSequence(sequence=[[u'a'], [u'a']], freq=1), ... - - .. versionadded:: 1.6.0 - """ - - @since("1.6.0") - def freqSequences(self): - """Gets frequent sequences""" - return self.call("getFreqSequences").map(lambda x: PrefixSpan.FreqSequence(x[0], x[1])) - - -class PrefixSpan(object): - """ - A parallel PrefixSpan algorithm to mine frequent sequential patterns. - The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: - Mining Sequential Patterns Efficiently by Prefix-Projected Pattern Growth - ([[http://doi.org/10.1109/ICDE.2001.914830]]). - - .. versionadded:: 1.6.0 - """ - - @classmethod - @since("1.6.0") - def train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000): - """ - Finds the complete set of frequent sequential patterns in the - input sequences of itemsets. - - :param data: - The input data set, each element contains a sequence of - itemsets. - :param minSupport: - The minimal support level of the sequential pattern, any - pattern that appears more than (minSupport * - size-of-the-dataset) times will be output. - (default: 0.1) - :param maxPatternLength: - The maximal length of the sequential pattern, any pattern - that appears less than maxPatternLength will be output. - (default: 10) - :param maxLocalProjDBSize: - The maximum number of items (including delimiters used in the - internal storage format) allowed in a projected database before - local processing. If a projected database exceeds this size, - another iteration of distributed prefix growth is run. - (default: 32000000) - """ - model = callMLlibFunc("trainPrefixSpanModel", - data, minSupport, maxPatternLength, maxLocalProjDBSize) - return PrefixSpanModel(model) - - class FreqSequence(namedtuple("FreqSequence", ["sequence", "freq"])): - """ - Represents a (sequence, freq) tuple. - - .. 
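
For reference while this module is removed: FPGrowth.train and PrefixSpan.train above are the only public entry points of pyspark.mllib.fpm. A minimal usage sketch, assuming a local SparkContext and pyspark 2.4.x on the PYTHONPATH (the transactions mirror the doctests above; the minSupport and maxPatternLength values here are illustrative, not defaults):

    from pyspark import SparkContext
    from pyspark.mllib.fpm import FPGrowth, PrefixSpan

    sc = SparkContext("local[2]", "fpm-sketch")

    # Frequent itemsets: keep items present in at least 60% of the baskets,
    # mined with 2 partitions of parallel FP-growth.
    baskets = sc.parallelize(
        [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]], 2)
    fp_model = FPGrowth.train(baskets, minSupport=0.6, numPartitions=2)
    for itemset in fp_model.freqItemsets().collect():
        print(itemset.items, itemset.freq)

    # Frequent sequential patterns: each element is a sequence of itemsets.
    sequences = sc.parallelize(
        [[["a", "b"], ["c"]],
         [["a"], ["c", "b"], ["a", "b"]],
         [["a", "b"], ["e"]],
         [["f"]]], 2)
    ps_model = PrefixSpan.train(sequences, minSupport=0.5, maxPatternLength=5)
    for seq in ps_model.freqSequences().collect():
        print(seq.sequence, seq.freq)

    sc.stop()
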
versionadded:: 1.6.0 - """ - - -def _test(): - import doctest - from pyspark.sql import SparkSession - import pyspark.mllib.fpm - globs = pyspark.mllib.fpm.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("mllib.fpm tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - import tempfile - - temp_path = tempfile.mkdtemp() - globs['temp_path'] = temp_path - try: - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - finally: - from shutil import rmtree - try: - rmtree(temp_path) - except OSError: - pass - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/linalg/__init__.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/linalg/__init__.py deleted file mode 100644 index 94a3e2a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/linalg/__init__.py +++ /dev/null @@ -1,1386 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -MLlib utilities for linear algebra. For dense vectors, MLlib -uses the NumPy C{array} type, so you can simply pass NumPy arrays -around. For sparse vectors, users can construct a L{SparseVector} -object from MLlib or pass SciPy C{scipy.sparse} column vectors if -SciPy is available in their environment. -""" - -import sys -import array -import struct - -if sys.version >= '3': - basestring = str - xrange = range - import copyreg as copy_reg - long = int -else: - from itertools import izip as zip - import copy_reg - -import numpy as np - -from pyspark import since -from pyspark.ml import linalg as newlinalg -from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ - IntegerType, ByteType, BooleanType - - -__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', - 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices', - 'QRDecomposition'] - - -if sys.version_info[:2] == (2, 7): - # speed up pickling array in Python 2.7 - def fast_pickle_array(ar): - return array.array, (ar.typecode, ar.tostring()) - copy_reg.pickle(array.array, fast_pickle_array) - - -# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, -# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. - -try: - import scipy.sparse - _have_scipy = True -except: - # No SciPy in environment, but that's okay - _have_scipy = False - - -def _convert_to_vector(l): - if isinstance(l, Vector): - return l - elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): - return DenseVector(l) - elif _have_scipy and scipy.sparse.issparse(l): - assert l.shape[1] == 1, "Expected column vector" - # Make sure the converted csc_matrix has sorted indices. 
- csc = l.tocsc() - if not csc.has_sorted_indices: - csc.sort_indices() - return SparseVector(l.shape[0], csc.indices, csc.data) - else: - raise TypeError("Cannot convert type %s into Vector" % type(l)) - - -def _vector_size(v): - """ - Returns the size of the vector. - - >>> _vector_size([1., 2., 3.]) - 3 - >>> _vector_size((1., 2., 3.)) - 3 - >>> _vector_size(array.array('d', [1., 2., 3.])) - 3 - >>> _vector_size(np.zeros(3)) - 3 - >>> _vector_size(np.zeros((3, 1))) - 3 - >>> _vector_size(np.zeros((1, 3))) - Traceback (most recent call last): - ... - ValueError: Cannot treat an ndarray of shape (1, 3) as a vector - """ - if isinstance(v, Vector): - return len(v) - elif type(v) in (array.array, list, tuple, xrange): - return len(v) - elif type(v) == np.ndarray: - if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): - return len(v) - else: - raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape)) - elif _have_scipy and scipy.sparse.issparse(v): - assert v.shape[1] == 1, "Expected column vector" - return v.shape[0] - else: - raise TypeError("Cannot treat type %s as a vector" % type(v)) - - -def _format_float(f, digits=4): - s = str(round(f, digits)) - if '.' in s: - s = s[:s.index('.') + 1 + digits] - return s - - -def _format_float_list(l): - return [_format_float(x) for x in l] - - -def _double_to_long_bits(value): - if np.isnan(value): - value = float('nan') - # pack double into 64 bits, then unpack as long int - return struct.unpack('Q', struct.pack('d', value))[0] - - -class VectorUDT(UserDefinedType): - """ - SQL user-defined type (UDT) for Vector. - """ - - @classmethod - def sqlType(cls): - return StructType([ - StructField("type", ByteType(), False), - StructField("size", IntegerType(), True), - StructField("indices", ArrayType(IntegerType(), False), True), - StructField("values", ArrayType(DoubleType(), False), True)]) - - @classmethod - def module(cls): - return "pyspark.mllib.linalg" - - @classmethod - def scalaUDT(cls): - return "org.apache.spark.mllib.linalg.VectorUDT" - - def serialize(self, obj): - if isinstance(obj, SparseVector): - indices = [int(i) for i in obj.indices] - values = [float(v) for v in obj.values] - return (0, obj.size, indices, values) - elif isinstance(obj, DenseVector): - values = [float(v) for v in obj] - return (1, None, None, values) - else: - raise TypeError("cannot serialize %r of type %r" % (obj, type(obj))) - - def deserialize(self, datum): - assert len(datum) == 4, \ - "VectorUDT.deserialize given row with length %d but requires 4" % len(datum) - tpe = datum[0] - if tpe == 0: - return SparseVector(datum[1], datum[2], datum[3]) - elif tpe == 1: - return DenseVector(datum[3]) - else: - raise ValueError("do not recognize type %r" % tpe) - - def simpleString(self): - return "vector" - - -class MatrixUDT(UserDefinedType): - """ - SQL user-defined type (UDT) for Matrix. 
- """ - - @classmethod - def sqlType(cls): - return StructType([ - StructField("type", ByteType(), False), - StructField("numRows", IntegerType(), False), - StructField("numCols", IntegerType(), False), - StructField("colPtrs", ArrayType(IntegerType(), False), True), - StructField("rowIndices", ArrayType(IntegerType(), False), True), - StructField("values", ArrayType(DoubleType(), False), True), - StructField("isTransposed", BooleanType(), False)]) - - @classmethod - def module(cls): - return "pyspark.mllib.linalg" - - @classmethod - def scalaUDT(cls): - return "org.apache.spark.mllib.linalg.MatrixUDT" - - def serialize(self, obj): - if isinstance(obj, SparseMatrix): - colPtrs = [int(i) for i in obj.colPtrs] - rowIndices = [int(i) for i in obj.rowIndices] - values = [float(v) for v in obj.values] - return (0, obj.numRows, obj.numCols, colPtrs, - rowIndices, values, bool(obj.isTransposed)) - elif isinstance(obj, DenseMatrix): - values = [float(v) for v in obj.values] - return (1, obj.numRows, obj.numCols, None, None, values, - bool(obj.isTransposed)) - else: - raise TypeError("cannot serialize type %r" % (type(obj))) - - def deserialize(self, datum): - assert len(datum) == 7, \ - "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum) - tpe = datum[0] - if tpe == 0: - return SparseMatrix(*datum[1:]) - elif tpe == 1: - return DenseMatrix(datum[1], datum[2], datum[5], datum[6]) - else: - raise ValueError("do not recognize type %r" % tpe) - - def simpleString(self): - return "matrix" - - -class Vector(object): - - __UDT__ = VectorUDT() - - """ - Abstract class for DenseVector and SparseVector - """ - def toArray(self): - """ - Convert the vector into an numpy.ndarray - - :return: numpy.ndarray - """ - raise NotImplementedError - - def asML(self): - """ - Convert this vector to the new mllib-local representation. - This does NOT copy the data; it copies references. - - :return: :py:class:`pyspark.ml.linalg.Vector` - """ - raise NotImplementedError - - -class DenseVector(Vector): - """ - A dense vector represented by a value array. We use numpy array for - storage and arithmetics will be delegated to the underlying numpy - array. - - >>> v = Vectors.dense([1.0, 2.0]) - >>> u = Vectors.dense([3.0, 4.0]) - >>> v + u - DenseVector([4.0, 6.0]) - >>> 2 - v - DenseVector([1.0, 0.0]) - >>> v / 2 - DenseVector([0.5, 1.0]) - >>> v * u - DenseVector([3.0, 8.0]) - >>> u / v - DenseVector([3.0, 2.0]) - >>> u % 2 - DenseVector([1.0, 0.0]) - >>> -v - DenseVector([-1.0, -2.0]) - """ - def __init__(self, ar): - if isinstance(ar, bytes): - ar = np.frombuffer(ar, dtype=np.float64) - elif not isinstance(ar, np.ndarray): - ar = np.array(ar, dtype=np.float64) - if ar.dtype != np.float64: - ar = ar.astype(np.float64) - self.array = ar - - @staticmethod - def parse(s): - """ - Parse string representation back into the DenseVector. - - >>> DenseVector.parse(' [ 0.0,1.0,2.0, 3.0]') - DenseVector([0.0, 1.0, 2.0, 3.0]) - """ - start = s.find('[') - if start == -1: - raise ValueError("Array should start with '['.") - end = s.find(']') - if end == -1: - raise ValueError("Array should end with ']'.") - s = s[start + 1: end] - - try: - values = [float(val) for val in s.split(',') if val] - except ValueError: - raise ValueError("Unable to parse values from %s" % s) - return DenseVector(values) - - def __reduce__(self): - return DenseVector, (self.array.tostring(),) - - def numNonzeros(self): - """ - Number of nonzero elements. 
This scans all active values and count non zeros - """ - return np.count_nonzero(self.array) - - def norm(self, p): - """ - Calculates the norm of a DenseVector. - - >>> a = DenseVector([0, -1, 2, -3]) - >>> a.norm(2) - 3.7... - >>> a.norm(1) - 6.0 - """ - return np.linalg.norm(self.array, p) - - def dot(self, other): - """ - Compute the dot product of two Vectors. We support - (Numpy array, list, SparseVector, or SciPy sparse) - and a target NumPy array that is either 1- or 2-dimensional. - Equivalent to calling numpy.dot of the two vectors. - - >>> dense = DenseVector(array.array('d', [1., 2.])) - >>> dense.dot(dense) - 5.0 - >>> dense.dot(SparseVector(2, [0, 1], [2., 1.])) - 4.0 - >>> dense.dot(range(1, 3)) - 5.0 - >>> dense.dot(np.array(range(1, 3))) - 5.0 - >>> dense.dot([1.,]) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F')) - array([ 5., 11.]) - >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F')) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - """ - if type(other) == np.ndarray: - if other.ndim > 1: - assert len(self) == other.shape[0], "dimension mismatch" - return np.dot(self.array, other) - elif _have_scipy and scipy.sparse.issparse(other): - assert len(self) == other.shape[0], "dimension mismatch" - return other.transpose().dot(self.toArray()) - else: - assert len(self) == _vector_size(other), "dimension mismatch" - if isinstance(other, SparseVector): - return other.dot(self) - elif isinstance(other, Vector): - return np.dot(self.toArray(), other.toArray()) - else: - return np.dot(self.toArray(), other) - - def squared_distance(self, other): - """ - Squared distance of two Vectors. - - >>> dense1 = DenseVector(array.array('d', [1., 2.])) - >>> dense1.squared_distance(dense1) - 0.0 - >>> dense2 = np.array([2., 1.]) - >>> dense1.squared_distance(dense2) - 2.0 - >>> dense3 = [2., 1.] - >>> dense1.squared_distance(dense3) - 2.0 - >>> sparse1 = SparseVector(2, [0, 1], [2., 1.]) - >>> dense1.squared_distance(sparse1) - 2.0 - >>> dense1.squared_distance([1.,]) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> dense1.squared_distance(SparseVector(1, [0,], [1.,])) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - """ - assert len(self) == _vector_size(other), "dimension mismatch" - if isinstance(other, SparseVector): - return other.squared_distance(self) - elif _have_scipy and scipy.sparse.issparse(other): - return _convert_to_vector(other).squared_distance(self) - - if isinstance(other, Vector): - other = other.toArray() - elif not isinstance(other, np.ndarray): - other = np.array(other) - diff = self.toArray() - other - return np.dot(diff, diff) - - def toArray(self): - """ - Returns an numpy.ndarray - """ - return self.array - - def asML(self): - """ - Convert this vector to the new mllib-local representation. - This does NOT copy the data; it copies references. - - :return: :py:class:`pyspark.ml.linalg.DenseVector` - - .. 
versionadded:: 2.0.0 - """ - return newlinalg.DenseVector(self.array) - - @property - def values(self): - """ - Returns a list of values - """ - return self.array - - def __getitem__(self, item): - return self.array[item] - - def __len__(self): - return len(self.array) - - def __str__(self): - return "[" + ",".join([str(v) for v in self.array]) + "]" - - def __repr__(self): - return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array)) - - def __eq__(self, other): - if isinstance(other, DenseVector): - return np.array_equal(self.array, other.array) - elif isinstance(other, SparseVector): - if len(self) != other.size: - return False - return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) - return False - - def __ne__(self, other): - return not self == other - - def __hash__(self): - size = len(self) - result = 31 + size - nnz = 0 - i = 0 - while i < size and nnz < 128: - if self.array[i] != 0: - result = 31 * result + i - bits = _double_to_long_bits(self.array[i]) - result = 31 * result + (bits ^ (bits >> 32)) - nnz += 1 - i += 1 - return result - - def __getattr__(self, item): - return getattr(self.array, item) - - def __neg__(self): - return DenseVector(-self.array) - - def _delegate(op): - def func(self, other): - if isinstance(other, DenseVector): - other = other.array - return DenseVector(getattr(self.array, op)(other)) - return func - - __add__ = _delegate("__add__") - __sub__ = _delegate("__sub__") - __mul__ = _delegate("__mul__") - __div__ = _delegate("__div__") - __truediv__ = _delegate("__truediv__") - __mod__ = _delegate("__mod__") - __radd__ = _delegate("__radd__") - __rsub__ = _delegate("__rsub__") - __rmul__ = _delegate("__rmul__") - __rdiv__ = _delegate("__rdiv__") - __rtruediv__ = _delegate("__rtruediv__") - __rmod__ = _delegate("__rmod__") - - -class SparseVector(Vector): - """ - A simple sparse vector class for passing data to MLlib. Users may - alternatively pass SciPy's {scipy.sparse} data types. - """ - def __init__(self, size, *args): - """ - Create a sparse vector, using either a dictionary, a list of - (index, value) pairs, or two separate arrays of indices and - values (sorted by index). - - :param size: Size of the vector. - :param args: Active entries, as a dictionary {index: value, ...}, - a list of tuples [(index, value), ...], or a list of strictly - increasing indices and a list of corresponding values [index, ...], - [value, ...]. Inactive entries are treated as zeros. - - >>> SparseVector(4, {1: 1.0, 3: 5.5}) - SparseVector(4, {1: 1.0, 3: 5.5}) - >>> SparseVector(4, [(1, 1.0), (3, 5.5)]) - SparseVector(4, {1: 1.0, 3: 5.5}) - >>> SparseVector(4, [1, 3], [1.0, 5.5]) - SparseVector(4, {1: 1.0, 3: 5.5}) - """ - self.size = int(size) - """ Size of the vector. """ - assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments" - if len(args) == 1: - pairs = args[0] - if type(pairs) == dict: - pairs = pairs.items() - pairs = sorted(pairs) - self.indices = np.array([p[0] for p in pairs], dtype=np.int32) - """ A list of indices corresponding to active entries. """ - self.values = np.array([p[1] for p in pairs], dtype=np.float64) - """ A list of values corresponding to active entries. 
""" - else: - if isinstance(args[0], bytes): - assert isinstance(args[1], bytes), "values should be string too" - if args[0]: - self.indices = np.frombuffer(args[0], np.int32) - self.values = np.frombuffer(args[1], np.float64) - else: - # np.frombuffer() doesn't work well with empty string in older version - self.indices = np.array([], dtype=np.int32) - self.values = np.array([], dtype=np.float64) - else: - self.indices = np.array(args[0], dtype=np.int32) - self.values = np.array(args[1], dtype=np.float64) - assert len(self.indices) == len(self.values), "index and value arrays not same length" - for i in xrange(len(self.indices) - 1): - if self.indices[i] >= self.indices[i + 1]: - raise TypeError( - "Indices %s and %s are not strictly increasing" - % (self.indices[i], self.indices[i + 1])) - - def numNonzeros(self): - """ - Number of nonzero elements. This scans all active values and count non zeros. - """ - return np.count_nonzero(self.values) - - def norm(self, p): - """ - Calculates the norm of a SparseVector. - - >>> a = SparseVector(4, [0, 1], [3., -4.]) - >>> a.norm(1) - 7.0 - >>> a.norm(2) - 5.0 - """ - return np.linalg.norm(self.values, p) - - def __reduce__(self): - return ( - SparseVector, - (self.size, self.indices.tostring(), self.values.tostring())) - - @staticmethod - def parse(s): - """ - Parse string representation back into the SparseVector. - - >>> SparseVector.parse(' (4, [0,1 ],[ 4.0,5.0] )') - SparseVector(4, {0: 4.0, 1: 5.0}) - """ - start = s.find('(') - if start == -1: - raise ValueError("Tuple should start with '('") - end = s.find(')') - if end == -1: - raise ValueError("Tuple should end with ')'") - s = s[start + 1: end].strip() - - size = s[: s.find(',')] - try: - size = int(size) - except ValueError: - raise ValueError("Cannot parse size %s." % size) - - ind_start = s.find('[') - if ind_start == -1: - raise ValueError("Indices array should start with '['.") - ind_end = s.find(']') - if ind_end == -1: - raise ValueError("Indices array should end with ']'") - new_s = s[ind_start + 1: ind_end] - ind_list = new_s.split(',') - try: - indices = [int(ind) for ind in ind_list if ind] - except ValueError: - raise ValueError("Unable to parse indices from %s." % new_s) - s = s[ind_end + 1:].strip() - - val_start = s.find('[') - if val_start == -1: - raise ValueError("Values array should start with '['.") - val_end = s.find(']') - if val_end == -1: - raise ValueError("Values array should end with ']'.") - val_list = s[val_start + 1: val_end].split(',') - try: - values = [float(val) for val in val_list if val] - except ValueError: - raise ValueError("Unable to parse values from %s." % s) - return SparseVector(size, indices, values) - - def dot(self, other): - """ - Dot product with a SparseVector or 1- or 2-dimensional Numpy array. - - >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) - >>> a.dot(a) - 25.0 - >>> a.dot(array.array('d', [1., 2., 3., 4.])) - 22.0 - >>> b = SparseVector(4, [2], [1.0]) - >>> a.dot(b) - 0.0 - >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]])) - array([ 22., 22.]) - >>> a.dot([1., 2., 3.]) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> a.dot(np.array([1., 2.])) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> a.dot(DenseVector([1., 2.])) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> a.dot(np.zeros((3, 2))) - Traceback (most recent call last): - ... 
- AssertionError: dimension mismatch - """ - - if isinstance(other, np.ndarray): - if other.ndim not in [2, 1]: - raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim) - assert len(self) == other.shape[0], "dimension mismatch" - return np.dot(self.values, other[self.indices]) - - assert len(self) == _vector_size(other), "dimension mismatch" - - if isinstance(other, DenseVector): - return np.dot(other.array[self.indices], self.values) - - elif isinstance(other, SparseVector): - # Find out common indices. - self_cmind = np.in1d(self.indices, other.indices, assume_unique=True) - self_values = self.values[self_cmind] - if self_values.size == 0: - return 0.0 - else: - other_cmind = np.in1d(other.indices, self.indices, assume_unique=True) - return np.dot(self_values, other.values[other_cmind]) - - else: - return self.dot(_convert_to_vector(other)) - - def squared_distance(self, other): - """ - Squared distance from a SparseVector or 1-dimensional NumPy array. - - >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) - >>> a.squared_distance(a) - 0.0 - >>> a.squared_distance(array.array('d', [1., 2., 3., 4.])) - 11.0 - >>> a.squared_distance(np.array([1., 2., 3., 4.])) - 11.0 - >>> b = SparseVector(4, [2], [1.0]) - >>> a.squared_distance(b) - 26.0 - >>> b.squared_distance(a) - 26.0 - >>> b.squared_distance([1., 2.]) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - >>> b.squared_distance(SparseVector(3, [1,], [1.0,])) - Traceback (most recent call last): - ... - AssertionError: dimension mismatch - """ - assert len(self) == _vector_size(other), "dimension mismatch" - - if isinstance(other, np.ndarray) or isinstance(other, DenseVector): - if isinstance(other, np.ndarray) and other.ndim != 1: - raise Exception("Cannot call squared_distance with %d-dimensional array" % - other.ndim) - if isinstance(other, DenseVector): - other = other.array - sparse_ind = np.zeros(other.size, dtype=bool) - sparse_ind[self.indices] = True - dist = other[sparse_ind] - self.values - result = np.dot(dist, dist) - - other_ind = other[~sparse_ind] - result += np.dot(other_ind, other_ind) - return result - - elif isinstance(other, SparseVector): - result = 0.0 - i, j = 0, 0 - while i < len(self.indices) and j < len(other.indices): - if self.indices[i] == other.indices[j]: - diff = self.values[i] - other.values[j] - result += diff * diff - i += 1 - j += 1 - elif self.indices[i] < other.indices[j]: - result += self.values[i] * self.values[i] - i += 1 - else: - result += other.values[j] * other.values[j] - j += 1 - while i < len(self.indices): - result += self.values[i] * self.values[i] - i += 1 - while j < len(other.indices): - result += other.values[j] * other.values[j] - j += 1 - return result - else: - return self.squared_distance(_convert_to_vector(other)) - - def toArray(self): - """ - Returns a copy of this SparseVector as a 1-dimensional NumPy array. - """ - arr = np.zeros((self.size,), dtype=np.float64) - arr[self.indices] = self.values - return arr - - def asML(self): - """ - Convert this vector to the new mllib-local representation. - This does NOT copy the data; it copies references. - - :return: :py:class:`pyspark.ml.linalg.SparseVector` - - .. 
versionadded:: 2.0.0 - """ - return newlinalg.SparseVector(self.size, self.indices, self.values) - - def __len__(self): - return self.size - - def __str__(self): - inds = "[" + ",".join([str(i) for i in self.indices]) + "]" - vals = "[" + ",".join([str(v) for v in self.values]) + "]" - return "(" + ",".join((str(self.size), inds, vals)) + ")" - - def __repr__(self): - inds = self.indices - vals = self.values - entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i])) - for i in xrange(len(inds))]) - return "SparseVector({0}, {{{1}}})".format(self.size, entries) - - def __eq__(self, other): - if isinstance(other, SparseVector): - return other.size == self.size and np.array_equal(other.indices, self.indices) \ - and np.array_equal(other.values, self.values) - elif isinstance(other, DenseVector): - if self.size != len(other): - return False - return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) - return False - - def __getitem__(self, index): - inds = self.indices - vals = self.values - if not isinstance(index, int): - raise TypeError( - "Indices must be of type integer, got type %s" % type(index)) - - if index >= self.size or index < -self.size: - raise IndexError("Index %d out of bounds." % index) - if index < 0: - index += self.size - - if (inds.size == 0) or (index > inds.item(-1)): - return 0. - - insert_index = np.searchsorted(inds, index) - row_ind = inds[insert_index] - if row_ind == index: - return vals[insert_index] - return 0. - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - result = 31 + self.size - nnz = 0 - i = 0 - while i < len(self.values) and nnz < 128: - if self.values[i] != 0: - result = 31 * result + int(self.indices[i]) - bits = _double_to_long_bits(self.values[i]) - result = 31 * result + (bits ^ (bits >> 32)) - nnz += 1 - i += 1 - return result - - -class Vectors(object): - - """ - Factory methods for working with vectors. - - .. note:: Dense vectors are simply represented as NumPy array objects, - so there is no need to covert them for use in MLlib. For sparse vectors, - the factory methods in this class create an MLlib-compatible type, or users - can pass in SciPy's C{scipy.sparse} column vectors. - """ - - @staticmethod - def sparse(size, *args): - """ - Create a sparse vector, using either a dictionary, a list of - (index, value) pairs, or two separate arrays of indices and - values (sorted by index). - - :param size: Size of the vector. - :param args: Non-zero entries, as a dictionary, list of tuples, - or two sorted lists containing indices and values. - - >>> Vectors.sparse(4, {1: 1.0, 3: 5.5}) - SparseVector(4, {1: 1.0, 3: 5.5}) - >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) - SparseVector(4, {1: 1.0, 3: 5.5}) - >>> Vectors.sparse(4, [1, 3], [1.0, 5.5]) - SparseVector(4, {1: 1.0, 3: 5.5}) - """ - return SparseVector(size, *args) - - @staticmethod - def dense(*elements): - """ - Create a dense vector of 64-bit floats from a Python list or numbers. - - >>> Vectors.dense([1, 2, 3]) - DenseVector([1.0, 2.0, 3.0]) - >>> Vectors.dense(1.0, 2.0) - DenseVector([1.0, 2.0]) - """ - if len(elements) == 1 and not isinstance(elements[0], (float, int, long)): - # it's list, numpy.array or other iterable object. - elements = elements[0] - return DenseVector(elements) - - @staticmethod - def fromML(vec): - """ - Convert a vector from the new mllib-local representation. - This does NOT copy the data; it copies references. 
- - :param vec: a :py:class:`pyspark.ml.linalg.Vector` - :return: a :py:class:`pyspark.mllib.linalg.Vector` - - .. versionadded:: 2.0.0 - """ - if isinstance(vec, newlinalg.DenseVector): - return DenseVector(vec.array) - elif isinstance(vec, newlinalg.SparseVector): - return SparseVector(vec.size, vec.indices, vec.values) - else: - raise TypeError("Unsupported vector type %s" % type(vec)) - - @staticmethod - def stringify(vector): - """ - Converts a vector into a string, which can be recognized by - Vectors.parse(). - - >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0])) - '(2,[1],[1.0])' - >>> Vectors.stringify(Vectors.dense([0.0, 1.0])) - '[0.0,1.0]' - """ - return str(vector) - - @staticmethod - def squared_distance(v1, v2): - """ - Squared distance between two vectors. - a and b can be of type SparseVector, DenseVector, np.ndarray - or array.array. - - >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) - >>> b = Vectors.dense([2, 5, 4, 1]) - >>> a.squared_distance(b) - 51.0 - """ - v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2) - return v1.squared_distance(v2) - - @staticmethod - def norm(vector, p): - """ - Find norm of the given vector. - """ - return _convert_to_vector(vector).norm(p) - - @staticmethod - def parse(s): - """Parse a string representation back into the Vector. - - >>> Vectors.parse('[2,1,2 ]') - DenseVector([2.0, 1.0, 2.0]) - >>> Vectors.parse(' ( 100, [0], [2])') - SparseVector(100, {0: 2.0}) - """ - if s.find('(') == -1 and s.find('[') != -1: - return DenseVector.parse(s) - elif s.find('(') != -1: - return SparseVector.parse(s) - else: - raise ValueError( - "Cannot find tokens '[' or '(' from the input string.") - - @staticmethod - def zeros(size): - return DenseVector(np.zeros(size)) - - @staticmethod - def _equals(v1_indices, v1_values, v2_indices, v2_values): - """ - Check equality between sparse/dense vectors, - v1_indices and v2_indices assume to be strictly increasing. - """ - v1_size = len(v1_values) - v2_size = len(v2_values) - k1 = 0 - k2 = 0 - all_equal = True - while all_equal: - while k1 < v1_size and v1_values[k1] == 0: - k1 += 1 - while k2 < v2_size and v2_values[k2] == 0: - k2 += 1 - - if k1 >= v1_size or k2 >= v2_size: - return k1 >= v1_size and k2 >= v2_size - - all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2] - k1 += 1 - k2 += 1 - return all_equal - - -class Matrix(object): - - __UDT__ = MatrixUDT() - - """ - Represents a local matrix. - """ - def __init__(self, numRows, numCols, isTransposed=False): - self.numRows = numRows - self.numCols = numCols - self.isTransposed = isTransposed - - def toArray(self): - """ - Returns its elements in a NumPy ndarray. - """ - raise NotImplementedError - - def asML(self): - """ - Convert this matrix to the new mllib-local representation. - This does NOT copy the data; it copies references. - """ - raise NotImplementedError - - @staticmethod - def _convert_to_array(array_like, dtype): - """ - Convert Matrix attributes which are array-like or buffer to array. - """ - if isinstance(array_like, bytes): - return np.frombuffer(array_like, dtype=dtype) - return np.asarray(array_like, dtype=dtype) - - -class DenseMatrix(Matrix): - """ - Column-major dense matrix. 
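
A side note on the Vectors._equals helper above: because explicitly stored zeros are skipped on both sides, a DenseVector and a SparseVector holding the same values compare equal regardless of representation. A small illustration with made-up numbers (pure local objects, no SparkContext needed):

    from pyspark.mllib.linalg import Vectors

    dense = Vectors.dense([1.0, 0.0, 3.0])

    # Same content in sparse form: equal despite the different classes.
    print(dense == Vectors.sparse(3, [0, 2], [1.0, 3.0]))          # True

    # An explicitly stored zero is skipped by Vectors._equals, so this
    # still compares equal to the dense vector above.
    print(dense == Vectors.sparse(3, [0, 1, 2], [1.0, 0.0, 3.0]))  # True

    # A size mismatch compares unequal.
    print(dense == Vectors.sparse(4, [0, 2], [1.0, 3.0]))          # False
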
- """ - def __init__(self, numRows, numCols, values, isTransposed=False): - Matrix.__init__(self, numRows, numCols, isTransposed) - values = self._convert_to_array(values, np.float64) - assert len(values) == numRows * numCols - self.values = values - - def __reduce__(self): - return DenseMatrix, ( - self.numRows, self.numCols, self.values.tostring(), - int(self.isTransposed)) - - def __str__(self): - """ - Pretty printing of a DenseMatrix - - >>> dm = DenseMatrix(2, 2, range(4)) - >>> print(dm) - DenseMatrix([[ 0., 2.], - [ 1., 3.]]) - >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True) - >>> print(dm) - DenseMatrix([[ 0., 1.], - [ 2., 3.]]) - """ - # Inspired by __repr__ in scipy matrices. - array_lines = repr(self.toArray()).splitlines() - - # We need to adjust six spaces which is the difference in number - # of letters between "DenseMatrix" and "array" - x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]]) - return array_lines[0].replace("array", "DenseMatrix") + "\n" + x - - def __repr__(self): - """ - Representation of a DenseMatrix - - >>> dm = DenseMatrix(2, 2, range(4)) - >>> dm - DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False) - """ - # If the number of values are less than seventeen then return as it is. - # Else return first eight values and last eight values. - if len(self.values) < 17: - entries = _format_float_list(self.values) - else: - entries = ( - _format_float_list(self.values[:8]) + - ["..."] + - _format_float_list(self.values[-8:]) - ) - - entries = ", ".join(entries) - return "DenseMatrix({0}, {1}, [{2}], {3})".format( - self.numRows, self.numCols, entries, self.isTransposed) - - def toArray(self): - """ - Return an numpy.ndarray - - >>> m = DenseMatrix(2, 2, range(4)) - >>> m.toArray() - array([[ 0., 2.], - [ 1., 3.]]) - """ - if self.isTransposed: - return np.asfortranarray( - self.values.reshape((self.numRows, self.numCols))) - else: - return self.values.reshape((self.numRows, self.numCols), order='F') - - def toSparse(self): - """Convert to SparseMatrix""" - if self.isTransposed: - values = np.ravel(self.toArray(), order='F') - else: - values = self.values - indices = np.nonzero(values)[0] - colCounts = np.bincount(indices // self.numRows) - colPtrs = np.cumsum(np.hstack( - (0, colCounts, np.zeros(self.numCols - colCounts.size)))) - values = values[indices] - rowIndices = indices % self.numRows - - return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values) - - def asML(self): - """ - Convert this matrix to the new mllib-local representation. - This does NOT copy the data; it copies references. - - :return: :py:class:`pyspark.ml.linalg.DenseMatrix` - - .. 
versionadded:: 2.0.0 - """ - return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed) - - def __getitem__(self, indices): - i, j = indices - if i < 0 or i >= self.numRows: - raise IndexError("Row index %d is out of range [0, %d)" - % (i, self.numRows)) - if j >= self.numCols or j < 0: - raise IndexError("Column index %d is out of range [0, %d)" - % (j, self.numCols)) - - if self.isTransposed: - return self.values[i * self.numCols + j] - else: - return self.values[i + j * self.numRows] - - def __eq__(self, other): - if (not isinstance(other, DenseMatrix) or - self.numRows != other.numRows or - self.numCols != other.numCols): - return False - - self_values = np.ravel(self.toArray(), order='F') - other_values = np.ravel(other.toArray(), order='F') - return all(self_values == other_values) - - -class SparseMatrix(Matrix): - """Sparse Matrix stored in CSC format.""" - def __init__(self, numRows, numCols, colPtrs, rowIndices, values, - isTransposed=False): - Matrix.__init__(self, numRows, numCols, isTransposed) - self.colPtrs = self._convert_to_array(colPtrs, np.int32) - self.rowIndices = self._convert_to_array(rowIndices, np.int32) - self.values = self._convert_to_array(values, np.float64) - - if self.isTransposed: - if self.colPtrs.size != numRows + 1: - raise ValueError("Expected colPtrs of size %d, got %d." - % (numRows + 1, self.colPtrs.size)) - else: - if self.colPtrs.size != numCols + 1: - raise ValueError("Expected colPtrs of size %d, got %d." - % (numCols + 1, self.colPtrs.size)) - if self.rowIndices.size != self.values.size: - raise ValueError("Expected rowIndices of length %d, got %d." - % (self.rowIndices.size, self.values.size)) - - def __str__(self): - """ - Pretty printing of a SparseMatrix - - >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - >>> print(sm1) - 2 X 2 CSCMatrix - (0,0) 2.0 - (1,0) 3.0 - (1,1) 4.0 - >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - >>> print(sm1) - 2 X 2 CSRMatrix - (0,0) 2.0 - (0,1) 3.0 - (1,1) 4.0 - """ - spstr = "{0} X {1} ".format(self.numRows, self.numCols) - if self.isTransposed: - spstr += "CSRMatrix\n" - else: - spstr += "CSCMatrix\n" - - cur_col = 0 - smlist = [] - - # Display first 16 values. - if len(self.values) <= 16: - zipindval = zip(self.rowIndices, self.values) - else: - zipindval = zip(self.rowIndices[:16], self.values[:16]) - for i, (rowInd, value) in enumerate(zipindval): - if self.colPtrs[cur_col + 1] <= i: - cur_col += 1 - if self.isTransposed: - smlist.append('({0},{1}) {2}'.format( - cur_col, rowInd, _format_float(value))) - else: - smlist.append('({0},{1}) {2}'.format( - rowInd, cur_col, _format_float(value))) - spstr += "\n".join(smlist) - - if len(self.values) > 16: - spstr += "\n.." 
* 2 - return spstr - - def __repr__(self): - """ - Representation of a SparseMatrix - - >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - >>> sm1 - SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False) - """ - rowIndices = list(self.rowIndices) - colPtrs = list(self.colPtrs) - - if len(self.values) <= 16: - values = _format_float_list(self.values) - - else: - values = ( - _format_float_list(self.values[:8]) + - ["..."] + - _format_float_list(self.values[-8:]) - ) - rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:] - - if len(self.colPtrs) > 16: - colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:] - - values = ", ".join(values) - rowIndices = ", ".join([str(ind) for ind in rowIndices]) - colPtrs = ", ".join([str(ptr) for ptr in colPtrs]) - return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format( - self.numRows, self.numCols, colPtrs, rowIndices, - values, self.isTransposed) - - def __reduce__(self): - return SparseMatrix, ( - self.numRows, self.numCols, self.colPtrs.tostring(), - self.rowIndices.tostring(), self.values.tostring(), - int(self.isTransposed)) - - def __getitem__(self, indices): - i, j = indices - if i < 0 or i >= self.numRows: - raise IndexError("Row index %d is out of range [0, %d)" - % (i, self.numRows)) - if j < 0 or j >= self.numCols: - raise IndexError("Column index %d is out of range [0, %d)" - % (j, self.numCols)) - - # If a CSR matrix is given, then the row index should be searched - # for in ColPtrs, and the column index should be searched for in the - # corresponding slice obtained from rowIndices. - if self.isTransposed: - j, i = i, j - - colStart = self.colPtrs[j] - colEnd = self.colPtrs[j + 1] - nz = self.rowIndices[colStart: colEnd] - ind = np.searchsorted(nz, i) + colStart - if ind < colEnd and self.rowIndices[ind] == i: - return self.values[ind] - else: - return 0.0 - - def toArray(self): - """ - Return an numpy.ndarray - """ - A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F') - for k in xrange(self.colPtrs.size - 1): - startptr = self.colPtrs[k] - endptr = self.colPtrs[k + 1] - if self.isTransposed: - A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr] - else: - A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr] - return A - - def toDense(self): - densevals = np.ravel(self.toArray(), order='F') - return DenseMatrix(self.numRows, self.numCols, densevals) - - def asML(self): - """ - Convert this matrix to the new mllib-local representation. - This does NOT copy the data; it copies references. - - :return: :py:class:`pyspark.ml.linalg.SparseMatrix` - - .. versionadded:: 2.0.0 - """ - return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices, - self.values, self.isTransposed) - - # TODO: More efficient implementation: - def __eq__(self, other): - return np.all(self.toArray() == other.toArray()) - - -class Matrices(object): - @staticmethod - def dense(numRows, numCols, values): - """ - Create a DenseMatrix - """ - return DenseMatrix(numRows, numCols, values) - - @staticmethod - def sparse(numRows, numCols, colPtrs, rowIndices, values): - """ - Create a SparseMatrix - """ - return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values) - - @staticmethod - def fromML(mat): - """ - Convert a matrix from the new mllib-local representation. - This does NOT copy the data; it copies references. - - :param mat: a :py:class:`pyspark.ml.linalg.Matrix` - :return: a :py:class:`pyspark.mllib.linalg.Matrix` - - .. 
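
To make the CSC lookup in __getitem__ above concrete, here is the matrix from the __str__ doctest again: colPtrs [0, 2, 3] says column 0 owns stored entries 0..1 and column 1 owns entry 2, rowIndices names their rows, and any position not stored reads as 0.0.

    from pyspark.mllib.linalg import SparseMatrix

    # 2 x 2 CSC matrix with entries (0,0)=2, (1,0)=3, (1,1)=4,
    # matching the __str__ doctest above.
    sm = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0])

    print(sm[0, 0])  # 2.0 -> searchsorted hit in rowIndices[0:2]
    print(sm[1, 1])  # 4.0 -> column 1 slice is rowIndices[2:3]
    print(sm[0, 1])  # 0.0 -> row 0 is not stored in column 1

    # toArray() materializes the same values densely.
    print(sm.toArray())  # [[2. 0.], [3. 4.]]
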
versionadded:: 2.0.0 - """ - if isinstance(mat, newlinalg.DenseMatrix): - return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed) - elif isinstance(mat, newlinalg.SparseMatrix): - return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices, - mat.values, mat.isTransposed) - else: - raise TypeError("Unsupported matrix type %s" % type(mat)) - - -class QRDecomposition(object): - """ - Represents QR factors. - """ - def __init__(self, Q, R): - self._Q = Q - self._R = R - - @property - @since('2.0.0') - def Q(self): - """ - An orthogonal matrix Q in a QR decomposition. - May be null if not computed. - """ - return self._Q - - @property - @since('2.0.0') - def R(self): - """ - An upper triangular matrix R in a QR decomposition. - """ - return self._R - - -def _test(): - import doctest - import numpy - try: - # Numpy 1.14+ changed it's string format. - numpy.set_printoptions(legacy='1.13') - except TypeError: - pass - (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS) - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/linalg/distributed.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/linalg/distributed.py deleted file mode 100644 index 7e8b150..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/linalg/distributed.py +++ /dev/null @@ -1,1389 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Package for distributed linear algebra. -""" - -import sys - -if sys.version >= '3': - long = int - -from py4j.java_gateway import JavaObject - -from pyspark import RDD, since -from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper -from pyspark.mllib.linalg import _convert_to_vector, DenseMatrix, Matrix, QRDecomposition -from pyspark.mllib.stat import MultivariateStatisticalSummary -from pyspark.storagelevel import StorageLevel - - -__all__ = ['BlockMatrix', 'CoordinateMatrix', 'DistributedMatrix', 'IndexedRow', - 'IndexedRowMatrix', 'MatrixEntry', 'RowMatrix', 'SingularValueDecomposition'] - - -class DistributedMatrix(object): - """ - Represents a distributively stored matrix backed by one or - more RDDs. - - """ - def numRows(self): - """Get or compute the number of rows.""" - raise NotImplementedError - - def numCols(self): - """Get or compute the number of cols.""" - raise NotImplementedError - - -class RowMatrix(DistributedMatrix): - """ - Represents a row-oriented distributed Matrix with no meaningful - row indices. - - :param rows: An RDD of vectors. - :param numRows: Number of rows in the matrix. A non-positive - value means unknown, at which point the number - of rows will be determined by the number of - records in the `rows` RDD. 
- :param numCols: Number of columns in the matrix. A non-positive - value means unknown, at which point the number - of columns will be determined by the size of - the first row. - """ - def __init__(self, rows, numRows=0, numCols=0): - """ - Note: This docstring is not shown publicly. - - Create a wrapper over a Java RowMatrix. - - Publicly, we require that `rows` be an RDD. However, for - internal usage, `rows` can also be a Java RowMatrix - object, in which case we can wrap it directly. This - assists in clean matrix conversions. - - >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) - >>> mat = RowMatrix(rows) - - >>> mat_diff = RowMatrix(rows) - >>> (mat_diff._java_matrix_wrapper._java_model == - ... mat._java_matrix_wrapper._java_model) - False - - >>> mat_same = RowMatrix(mat._java_matrix_wrapper._java_model) - >>> (mat_same._java_matrix_wrapper._java_model == - ... mat._java_matrix_wrapper._java_model) - True - """ - if isinstance(rows, RDD): - rows = rows.map(_convert_to_vector) - java_matrix = callMLlibFunc("createRowMatrix", rows, long(numRows), int(numCols)) - elif (isinstance(rows, JavaObject) - and rows.getClass().getSimpleName() == "RowMatrix"): - java_matrix = rows - else: - raise TypeError("rows should be an RDD of vectors, got %s" % type(rows)) - - self._java_matrix_wrapper = JavaModelWrapper(java_matrix) - - @property - def rows(self): - """ - Rows of the RowMatrix stored as an RDD of vectors. - - >>> mat = RowMatrix(sc.parallelize([[1, 2, 3], [4, 5, 6]])) - >>> rows = mat.rows - >>> rows.first() - DenseVector([1.0, 2.0, 3.0]) - """ - return self._java_matrix_wrapper.call("rows") - - def numRows(self): - """ - Get or compute the number of rows. - - >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6], - ... [7, 8, 9], [10, 11, 12]]) - - >>> mat = RowMatrix(rows) - >>> print(mat.numRows()) - 4 - - >>> mat = RowMatrix(rows, 7, 6) - >>> print(mat.numRows()) - 7 - """ - return self._java_matrix_wrapper.call("numRows") - - def numCols(self): - """ - Get or compute the number of cols. - - >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6], - ... [7, 8, 9], [10, 11, 12]]) - - >>> mat = RowMatrix(rows) - >>> print(mat.numCols()) - 3 - - >>> mat = RowMatrix(rows, 7, 6) - >>> print(mat.numCols()) - 6 - """ - return self._java_matrix_wrapper.call("numCols") - - @since('2.0.0') - def computeColumnSummaryStatistics(self): - """ - Computes column-wise summary statistics. - - :return: :class:`MultivariateStatisticalSummary` object - containing column-wise summary statistics. - - >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) - >>> mat = RowMatrix(rows) - - >>> colStats = mat.computeColumnSummaryStatistics() - >>> colStats.mean() - array([ 2.5, 3.5, 4.5]) - """ - java_col_stats = self._java_matrix_wrapper.call("computeColumnSummaryStatistics") - return MultivariateStatisticalSummary(java_col_stats) - - @since('2.0.0') - def computeCovariance(self): - """ - Computes the covariance matrix, treating each row as an - observation. - - .. note:: This cannot be computed on matrices with more than 65535 columns. - - >>> rows = sc.parallelize([[1, 2], [2, 1]]) - >>> mat = RowMatrix(rows) - - >>> mat.computeCovariance() - DenseMatrix(2, 2, [0.5, -0.5, -0.5, 0.5], 0) - """ - return self._java_matrix_wrapper.call("computeCovariance") - - @since('2.0.0') - def computeGramianMatrix(self): - """ - Computes the Gramian matrix `A^T A`. - - .. note:: This cannot be computed on matrices with more than 65535 columns. 
- - >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) - >>> mat = RowMatrix(rows) - - >>> mat.computeGramianMatrix() - DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0) - """ - return self._java_matrix_wrapper.call("computeGramianMatrix") - - @since('2.0.0') - def columnSimilarities(self, threshold=0.0): - """ - Compute similarities between columns of this matrix. - - The threshold parameter is a trade-off knob between estimate - quality and computational cost. - - The default threshold setting of 0 guarantees deterministically - correct results, but uses the brute-force approach of computing - normalized dot products. - - Setting the threshold to positive values uses a sampling - approach and incurs strictly less computational cost than the - brute-force approach. However the similarities computed will - be estimates. - - The sampling guarantees relative-error correctness for those - pairs of columns that have similarity greater than the given - similarity threshold. - - To describe the guarantee, we set some notation: - * Let A be the smallest in magnitude non-zero element of - this matrix. - * Let B be the largest in magnitude non-zero element of - this matrix. - * Let L be the maximum number of non-zeros per row. - - For example, for {0,1} matrices: A=B=1. - Another example, for the Netflix matrix: A=1, B=5 - - For those column pairs that are above the threshold, the - computed similarity is correct to within 20% relative error - with probability at least 1 - (0.981)^10/B^ - - The shuffle size is bounded by the *smaller* of the following - two expressions: - - * O(n log(n) L / (threshold * A)) - * O(m L^2^) - - The latter is the cost of the brute-force approach, so for - non-zero thresholds, the cost is always cheaper than the - brute-force approach. - - :param: threshold: Set to 0 for deterministic guaranteed - correctness. Similarities above this - threshold are estimated with the cost vs - estimate quality trade-off described above. - :return: An n x n sparse upper-triangular CoordinateMatrix of - cosine similarities between columns of this matrix. - - >>> rows = sc.parallelize([[1, 2], [1, 5]]) - >>> mat = RowMatrix(rows) - - >>> sims = mat.columnSimilarities() - >>> sims.entries.first().value - 0.91914503... - """ - java_sims_mat = self._java_matrix_wrapper.call("columnSimilarities", float(threshold)) - return CoordinateMatrix(java_sims_mat) - - @since('2.0.0') - def tallSkinnyQR(self, computeQ=False): - """ - Compute the QR decomposition of this RowMatrix. - - The implementation is designed to optimize the QR decomposition - (factorization) for the RowMatrix of a tall and skinny shape. - - Reference: - Paul G. Constantine, David F. Gleich. "Tall and skinny QR - factorizations in MapReduce architectures" - ([[http://dx.doi.org/10.1145/1996092.1996103]]) - - :param: computeQ: whether to computeQ - :return: QRDecomposition(Q: RowMatrix, R: Matrix), where - Q = None if computeQ = false. 
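
Tying the columnSimilarities threshold discussion above to code: threshold 0.0 gives exact, deterministic cosine similarities via brute-force normalized dot products, while a positive threshold switches to the sampling scheme and only guarantees relative-error accuracy for pairs above it. A sketch, assuming a local SparkContext (the two-column data matches the doctest above; the 0.5 threshold is illustrative):

    from pyspark import SparkContext
    from pyspark.mllib.linalg.distributed import RowMatrix

    sc = SparkContext("local[2]", "column-similarities-sketch")

    mat = RowMatrix(sc.parallelize([[1.0, 2.0], [1.0, 5.0]]))

    # Exact, deterministic similarities (brute-force normalized dot products).
    exact = mat.columnSimilarities()
    print(exact.entries.collect())   # single upper-triangular entry (0, 1, ~0.919)

    # Estimated similarities for column pairs above the (illustrative) 0.5
    # threshold; cheaper shuffle, relative-error guarantee only above 0.5.
    approx = mat.columnSimilarities(threshold=0.5)
    print(approx.entries.collect())

    sc.stop()
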
- - >>> rows = sc.parallelize([[3, -6], [4, -8], [0, 1]]) - >>> mat = RowMatrix(rows) - >>> decomp = mat.tallSkinnyQR(True) - >>> Q = decomp.Q - >>> R = decomp.R - - >>> # Test with absolute values - >>> absQRows = Q.rows.map(lambda row: abs(row.toArray()).tolist()) - >>> absQRows.collect() - [[0.6..., 0.0], [0.8..., 0.0], [0.0, 1.0]] - - >>> # Test with absolute values - >>> abs(R.toArray()).tolist() - [[5.0, 10.0], [0.0, 1.0]] - """ - decomp = JavaModelWrapper(self._java_matrix_wrapper.call("tallSkinnyQR", computeQ)) - if computeQ: - java_Q = decomp.call("Q") - Q = RowMatrix(java_Q) - else: - Q = None - R = decomp.call("R") - return QRDecomposition(Q, R) - - @since('2.2.0') - def computeSVD(self, k, computeU=False, rCond=1e-9): - """ - Computes the singular value decomposition of the RowMatrix. - - The given row matrix A of dimension (m X n) is decomposed into - U * s * V'T where - - * U: (m X k) (left singular vectors) is a RowMatrix whose - columns are the eigenvectors of (A X A') - * s: DenseVector consisting of square root of the eigenvalues - (singular values) in descending order. - * v: (n X k) (right singular vectors) is a Matrix whose columns - are the eigenvectors of (A' X A) - - For more specific details on implementation, please refer - the Scala documentation. - - :param k: Number of leading singular values to keep (`0 < k <= n`). - It might return less than k if there are numerically zero singular values - or there are not enough Ritz values converged before the maximum number of - Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). - :param computeU: Whether or not to compute U. If set to be - True, then U is computed by A * V * s^-1 - :param rCond: Reciprocal condition number. All singular values - smaller than rCond * s[0] are treated as zero - where s[0] is the largest singular value. - :returns: :py:class:`SingularValueDecomposition` - - >>> rows = sc.parallelize([[3, 1, 1], [-1, 3, 1]]) - >>> rm = RowMatrix(rows) - - >>> svd_model = rm.computeSVD(2, True) - >>> svd_model.U.rows.collect() - [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])] - >>> svd_model.s - DenseVector([3.4641, 3.1623]) - >>> svd_model.V - DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0) - """ - j_model = self._java_matrix_wrapper.call( - "computeSVD", int(k), bool(computeU), float(rCond)) - return SingularValueDecomposition(j_model) - - @since('2.2.0') - def computePrincipalComponents(self, k): - """ - Computes the k principal components of the given row matrix - - .. note:: This cannot be computed on matrices with more than 65535 columns. - - :param k: Number of principal components to keep. - :returns: :py:class:`pyspark.mllib.linalg.DenseMatrix` - - >>> rows = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) - >>> rm = RowMatrix(rows) - - >>> # Returns the two principal components of rm - >>> pca = rm.computePrincipalComponents(2) - >>> pca - DenseMatrix(3, 2, [-0.349, -0.6981, 0.6252, -0.2796, -0.5592, -0.7805], 0) - - >>> # Transform into new dimensions with the greatest variance. - >>> rm.multiply(pca).rows.collect() # doctest: +NORMALIZE_WHITESPACE - [DenseVector([0.1305, -3.7394]), DenseVector([-0.3642, -6.6983]), \ - DenseVector([-4.6102, -4.9745])] - """ - return self._java_matrix_wrapper.call("computePrincipalComponents", k) - - @since('2.2.0') - def multiply(self, matrix): - """ - Multiply this matrix by a local dense matrix on the right. 
- - :param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix - :returns: :py:class:`RowMatrix` - - >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]])) - >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() - [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])] - """ - if not isinstance(matrix, DenseMatrix): - raise ValueError("Only multiplication with DenseMatrix " - "is supported.") - j_model = self._java_matrix_wrapper.call("multiply", matrix) - return RowMatrix(j_model) - - -class SingularValueDecomposition(JavaModelWrapper): - """ - Represents singular value decomposition (SVD) factors. - - .. versionadded:: 2.2.0 - """ - - @property - @since('2.2.0') - def U(self): - """ - Returns a distributed matrix whose columns are the left - singular vectors of the SingularValueDecomposition if computeU was set to be True. - """ - u = self.call("U") - if u is not None: - mat_name = u.getClass().getSimpleName() - if mat_name == "RowMatrix": - return RowMatrix(u) - elif mat_name == "IndexedRowMatrix": - return IndexedRowMatrix(u) - else: - raise TypeError("Expected RowMatrix/IndexedRowMatrix got %s" % mat_name) - - @property - @since('2.2.0') - def s(self): - """ - Returns a DenseVector with singular values in descending order. - """ - return self.call("s") - - @property - @since('2.2.0') - def V(self): - """ - Returns a DenseMatrix whose columns are the right singular - vectors of the SingularValueDecomposition. - """ - return self.call("V") - - -class IndexedRow(object): - """ - Represents a row of an IndexedRowMatrix. - - Just a wrapper over a (long, vector) tuple. - - :param index: The index for the given row. - :param vector: The row in the matrix at the given index. - """ - def __init__(self, index, vector): - self.index = long(index) - self.vector = _convert_to_vector(vector) - - def __repr__(self): - return "IndexedRow(%s, %s)" % (self.index, self.vector) - - -def _convert_to_indexed_row(row): - if isinstance(row, IndexedRow): - return row - elif isinstance(row, tuple) and len(row) == 2: - return IndexedRow(*row) - else: - raise TypeError("Cannot convert type %s into IndexedRow" % type(row)) - - -class IndexedRowMatrix(DistributedMatrix): - """ - Represents a row-oriented distributed Matrix with indexed rows. - - :param rows: An RDD of IndexedRows or (long, vector) tuples. - :param numRows: Number of rows in the matrix. A non-positive - value means unknown, at which point the number - of rows will be determined by the max row - index plus one. - :param numCols: Number of columns in the matrix. A non-positive - value means unknown, at which point the number - of columns will be determined by the size of - the first row. - """ - def __init__(self, rows, numRows=0, numCols=0): - """ - Note: This docstring is not shown publicly. - - Create a wrapper over a Java IndexedRowMatrix. - - Publicly, we require that `rows` be an RDD. However, for - internal usage, `rows` can also be a Java IndexedRowMatrix - object, in which case we can wrap it directly. This - assists in clean matrix conversions. - - >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), - ... IndexedRow(1, [4, 5, 6])]) - >>> mat = IndexedRowMatrix(rows) - - >>> mat_diff = IndexedRowMatrix(rows) - >>> (mat_diff._java_matrix_wrapper._java_model == - ... mat._java_matrix_wrapper._java_model) - False - - >>> mat_same = IndexedRowMatrix(mat._java_matrix_wrapper._java_model) - >>> (mat_same._java_matrix_wrapper._java_model == - ... 
mat._java_matrix_wrapper._java_model) - True - """ - if isinstance(rows, RDD): - rows = rows.map(_convert_to_indexed_row) - # We use DataFrames for serialization of IndexedRows from - # Python, so first convert the RDD to a DataFrame on this - # side. This will convert each IndexedRow to a Row - # containing the 'index' and 'vector' values, which can - # both be easily serialized. We will convert back to - # IndexedRows on the Scala side. - java_matrix = callMLlibFunc("createIndexedRowMatrix", rows.toDF(), - long(numRows), int(numCols)) - elif (isinstance(rows, JavaObject) - and rows.getClass().getSimpleName() == "IndexedRowMatrix"): - java_matrix = rows - else: - raise TypeError("rows should be an RDD of IndexedRows or (long, vector) tuples, " - "got %s" % type(rows)) - - self._java_matrix_wrapper = JavaModelWrapper(java_matrix) - - @property - def rows(self): - """ - Rows of the IndexedRowMatrix stored as an RDD of IndexedRows. - - >>> mat = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1, 2, 3]), - ... IndexedRow(1, [4, 5, 6])])) - >>> rows = mat.rows - >>> rows.first() - IndexedRow(0, [1.0,2.0,3.0]) - """ - # We use DataFrames for serialization of IndexedRows from - # Java, so we first convert the RDD of rows to a DataFrame - # on the Scala/Java side. Then we map each Row in the - # DataFrame back to an IndexedRow on this side. - rows_df = callMLlibFunc("getIndexedRows", self._java_matrix_wrapper._java_model) - rows = rows_df.rdd.map(lambda row: IndexedRow(row[0], row[1])) - return rows - - def numRows(self): - """ - Get or compute the number of rows. - - >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), - ... IndexedRow(1, [4, 5, 6]), - ... IndexedRow(2, [7, 8, 9]), - ... IndexedRow(3, [10, 11, 12])]) - - >>> mat = IndexedRowMatrix(rows) - >>> print(mat.numRows()) - 4 - - >>> mat = IndexedRowMatrix(rows, 7, 6) - >>> print(mat.numRows()) - 7 - """ - return self._java_matrix_wrapper.call("numRows") - - def numCols(self): - """ - Get or compute the number of cols. - - >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), - ... IndexedRow(1, [4, 5, 6]), - ... IndexedRow(2, [7, 8, 9]), - ... IndexedRow(3, [10, 11, 12])]) - - >>> mat = IndexedRowMatrix(rows) - >>> print(mat.numCols()) - 3 - - >>> mat = IndexedRowMatrix(rows, 7, 6) - >>> print(mat.numCols()) - 6 - """ - return self._java_matrix_wrapper.call("numCols") - - def columnSimilarities(self): - """ - Compute all cosine similarities between columns. - - >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), - ... IndexedRow(6, [4, 5, 6])]) - >>> mat = IndexedRowMatrix(rows) - >>> cs = mat.columnSimilarities() - >>> print(cs.numCols()) - 3 - """ - java_coordinate_matrix = self._java_matrix_wrapper.call("columnSimilarities") - return CoordinateMatrix(java_coordinate_matrix) - - @since('2.0.0') - def computeGramianMatrix(self): - """ - Computes the Gramian matrix `A^T A`. - - .. note:: This cannot be computed on matrices with more than 65535 columns. - - >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), - ... IndexedRow(1, [4, 5, 6])]) - >>> mat = IndexedRowMatrix(rows) - - >>> mat.computeGramianMatrix() - DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0) - """ - return self._java_matrix_wrapper.call("computeGramianMatrix") - - def toRowMatrix(self): - """ - Convert this matrix to a RowMatrix. - - >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), - ... 
IndexedRow(6, [4, 5, 6])]) - >>> mat = IndexedRowMatrix(rows).toRowMatrix() - >>> mat.rows.collect() - [DenseVector([1.0, 2.0, 3.0]), DenseVector([4.0, 5.0, 6.0])] - """ - java_row_matrix = self._java_matrix_wrapper.call("toRowMatrix") - return RowMatrix(java_row_matrix) - - def toCoordinateMatrix(self): - """ - Convert this matrix to a CoordinateMatrix. - - >>> rows = sc.parallelize([IndexedRow(0, [1, 0]), - ... IndexedRow(6, [0, 5])]) - >>> mat = IndexedRowMatrix(rows).toCoordinateMatrix() - >>> mat.entries.take(3) - [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 0.0), MatrixEntry(6, 0, 0.0)] - """ - java_coordinate_matrix = self._java_matrix_wrapper.call("toCoordinateMatrix") - return CoordinateMatrix(java_coordinate_matrix) - - def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): - """ - Convert this matrix to a BlockMatrix. - - :param rowsPerBlock: Number of rows that make up each block. - The blocks forming the final rows are not - required to have the given number of rows. - :param colsPerBlock: Number of columns that make up each block. - The blocks forming the final columns are not - required to have the given number of columns. - - >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), - ... IndexedRow(6, [4, 5, 6])]) - >>> mat = IndexedRowMatrix(rows).toBlockMatrix() - - >>> # This IndexedRowMatrix will have 7 effective rows, due to - >>> # the highest row index being 6, and the ensuing - >>> # BlockMatrix will have 7 rows as well. - >>> print(mat.numRows()) - 7 - - >>> print(mat.numCols()) - 3 - """ - java_block_matrix = self._java_matrix_wrapper.call("toBlockMatrix", - rowsPerBlock, - colsPerBlock) - return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock) - - @since('2.2.0') - def computeSVD(self, k, computeU=False, rCond=1e-9): - """ - Computes the singular value decomposition of the IndexedRowMatrix. - - The given row matrix A of dimension (m X n) is decomposed into - U * s * V'T where - - * U: (m X k) (left singular vectors) is a IndexedRowMatrix - whose columns are the eigenvectors of (A X A') - * s: DenseVector consisting of square root of the eigenvalues - (singular values) in descending order. - * v: (n X k) (right singular vectors) is a Matrix whose columns - are the eigenvectors of (A' X A) - - For more specific details on implementation, please refer - the scala documentation. - - :param k: Number of leading singular values to keep (`0 < k <= n`). - It might return less than k if there are numerically zero singular values - or there are not enough Ritz values converged before the maximum number of - Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). - :param computeU: Whether or not to compute U. If set to be - True, then U is computed by A * V * s^-1 - :param rCond: Reciprocal condition number. All singular values - smaller than rCond * s[0] are treated as zero - where s[0] is the largest singular value. 
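A minimal sketch of the conversion paths above, assuming a live SparkContext `sc`:

    from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

    irm = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1, 2, 3]),
                                           IndexedRow(6, [4, 5, 6])]))
    as_rows = irm.toRowMatrix()            # drops the explicit row indices
    as_coords = irm.toCoordinateMatrix()   # one MatrixEntry per element of the stored rows
    as_blocks = irm.toBlockMatrix()        # 1024 x 1024 blocks by default
    print(as_blocks.numRows(), as_blocks.numCols())  # 7 3 (max index + 1, size of first row)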
- :returns: SingularValueDecomposition object - - >>> rows = [(0, (3, 1, 1)), (1, (-1, 3, 1))] - >>> irm = IndexedRowMatrix(sc.parallelize(rows)) - >>> svd_model = irm.computeSVD(2, True) - >>> svd_model.U.rows.collect() # doctest: +NORMALIZE_WHITESPACE - [IndexedRow(0, [-0.707106781187,0.707106781187]),\ - IndexedRow(1, [-0.707106781187,-0.707106781187])] - >>> svd_model.s - DenseVector([3.4641, 3.1623]) - >>> svd_model.V - DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0) - """ - j_model = self._java_matrix_wrapper.call( - "computeSVD", int(k), bool(computeU), float(rCond)) - return SingularValueDecomposition(j_model) - - @since('2.2.0') - def multiply(self, matrix): - """ - Multiply this matrix by a local dense matrix on the right. - - :param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix - :returns: :py:class:`IndexedRowMatrix` - - >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) - >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() - [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] - """ - if not isinstance(matrix, DenseMatrix): - raise ValueError("Only multiplication with DenseMatrix " - "is supported.") - return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) - - -class MatrixEntry(object): - """ - Represents an entry of a CoordinateMatrix. - - Just a wrapper over a (long, long, float) tuple. - - :param i: The row index of the matrix. - :param j: The column index of the matrix. - :param value: The (i, j)th entry of the matrix, as a float. - """ - def __init__(self, i, j, value): - self.i = long(i) - self.j = long(j) - self.value = float(value) - - def __repr__(self): - return "MatrixEntry(%s, %s, %s)" % (self.i, self.j, self.value) - - -def _convert_to_matrix_entry(entry): - if isinstance(entry, MatrixEntry): - return entry - elif isinstance(entry, tuple) and len(entry) == 3: - return MatrixEntry(*entry) - else: - raise TypeError("Cannot convert type %s into MatrixEntry" % type(entry)) - - -class CoordinateMatrix(DistributedMatrix): - """ - Represents a matrix in coordinate format. - - :param entries: An RDD of MatrixEntry inputs or - (long, long, float) tuples. - :param numRows: Number of rows in the matrix. A non-positive - value means unknown, at which point the number - of rows will be determined by the max row - index plus one. - :param numCols: Number of columns in the matrix. A non-positive - value means unknown, at which point the number - of columns will be determined by the max row - index plus one. - """ - def __init__(self, entries, numRows=0, numCols=0): - """ - Note: This docstring is not shown publicly. - - Create a wrapper over a Java CoordinateMatrix. - - Publicly, we require that `rows` be an RDD. However, for - internal usage, `rows` can also be a Java CoordinateMatrix - object, in which case we can wrap it directly. This - assists in clean matrix conversions. - - >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), - ... MatrixEntry(6, 4, 2.1)]) - >>> mat = CoordinateMatrix(entries) - - >>> mat_diff = CoordinateMatrix(entries) - >>> (mat_diff._java_matrix_wrapper._java_model == - ... mat._java_matrix_wrapper._java_model) - False - - >>> mat_same = CoordinateMatrix(mat._java_matrix_wrapper._java_model) - >>> (mat_same._java_matrix_wrapper._java_model == - ... 
mat._java_matrix_wrapper._java_model) - True - """ - if isinstance(entries, RDD): - entries = entries.map(_convert_to_matrix_entry) - # We use DataFrames for serialization of MatrixEntry entries - # from Python, so first convert the RDD to a DataFrame on - # this side. This will convert each MatrixEntry to a Row - # containing the 'i', 'j', and 'value' values, which can - # each be easily serialized. We will convert back to - # MatrixEntry inputs on the Scala side. - java_matrix = callMLlibFunc("createCoordinateMatrix", entries.toDF(), - long(numRows), long(numCols)) - elif (isinstance(entries, JavaObject) - and entries.getClass().getSimpleName() == "CoordinateMatrix"): - java_matrix = entries - else: - raise TypeError("entries should be an RDD of MatrixEntry entries or " - "(long, long, float) tuples, got %s" % type(entries)) - - self._java_matrix_wrapper = JavaModelWrapper(java_matrix) - - @property - def entries(self): - """ - Entries of the CoordinateMatrix stored as an RDD of - MatrixEntries. - - >>> mat = CoordinateMatrix(sc.parallelize([MatrixEntry(0, 0, 1.2), - ... MatrixEntry(6, 4, 2.1)])) - >>> entries = mat.entries - >>> entries.first() - MatrixEntry(0, 0, 1.2) - """ - # We use DataFrames for serialization of MatrixEntry entries - # from Java, so we first convert the RDD of entries to a - # DataFrame on the Scala/Java side. Then we map each Row in - # the DataFrame back to a MatrixEntry on this side. - entries_df = callMLlibFunc("getMatrixEntries", self._java_matrix_wrapper._java_model) - entries = entries_df.rdd.map(lambda row: MatrixEntry(row[0], row[1], row[2])) - return entries - - def numRows(self): - """ - Get or compute the number of rows. - - >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), - ... MatrixEntry(1, 0, 2), - ... MatrixEntry(2, 1, 3.7)]) - - >>> mat = CoordinateMatrix(entries) - >>> print(mat.numRows()) - 3 - - >>> mat = CoordinateMatrix(entries, 7, 6) - >>> print(mat.numRows()) - 7 - """ - return self._java_matrix_wrapper.call("numRows") - - def numCols(self): - """ - Get or compute the number of cols. - - >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), - ... MatrixEntry(1, 0, 2), - ... MatrixEntry(2, 1, 3.7)]) - - >>> mat = CoordinateMatrix(entries) - >>> print(mat.numCols()) - 2 - - >>> mat = CoordinateMatrix(entries, 7, 6) - >>> print(mat.numCols()) - 6 - """ - return self._java_matrix_wrapper.call("numCols") - - @since('2.0.0') - def transpose(self): - """ - Transpose this CoordinateMatrix. - - >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), - ... MatrixEntry(1, 0, 2), - ... MatrixEntry(2, 1, 3.7)]) - >>> mat = CoordinateMatrix(entries) - >>> mat_transposed = mat.transpose() - - >>> print(mat_transposed.numRows()) - 2 - - >>> print(mat_transposed.numCols()) - 3 - """ - java_transposed_matrix = self._java_matrix_wrapper.call("transpose") - return CoordinateMatrix(java_transposed_matrix) - - def toRowMatrix(self): - """ - Convert this matrix to a RowMatrix. - - >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), - ... MatrixEntry(6, 4, 2.1)]) - >>> mat = CoordinateMatrix(entries).toRowMatrix() - - >>> # This CoordinateMatrix will have 7 effective rows, due to - >>> # the highest row index being 6, but the ensuing RowMatrix - >>> # will only have 2 rows since there are only entries on 2 - >>> # unique rows. - >>> print(mat.numRows()) - 2 - - >>> # This CoordinateMatrix will have 5 columns, due to the - >>> # highest column index being 4, and the ensuing RowMatrix - >>> # will have 5 columns as well. 
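A short sketch of the coordinate format in use, assuming a live SparkContext `sc`:

    from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

    entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(6, 4, 2.1)])
    cm = CoordinateMatrix(entries)
    print(cm.numRows(), cm.numCols())                  # 7 5 -- max index + 1 per dimension
    transposed = cm.transpose()
    print(transposed.numRows(), transposed.numCols())  # 5 7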
- >>> print(mat.numCols()) - 5 - """ - java_row_matrix = self._java_matrix_wrapper.call("toRowMatrix") - return RowMatrix(java_row_matrix) - - def toIndexedRowMatrix(self): - """ - Convert this matrix to an IndexedRowMatrix. - - >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), - ... MatrixEntry(6, 4, 2.1)]) - >>> mat = CoordinateMatrix(entries).toIndexedRowMatrix() - - >>> # This CoordinateMatrix will have 7 effective rows, due to - >>> # the highest row index being 6, and the ensuing - >>> # IndexedRowMatrix will have 7 rows as well. - >>> print(mat.numRows()) - 7 - - >>> # This CoordinateMatrix will have 5 columns, due to the - >>> # highest column index being 4, and the ensuing - >>> # IndexedRowMatrix will have 5 columns as well. - >>> print(mat.numCols()) - 5 - """ - java_indexed_row_matrix = self._java_matrix_wrapper.call("toIndexedRowMatrix") - return IndexedRowMatrix(java_indexed_row_matrix) - - def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): - """ - Convert this matrix to a BlockMatrix. - - :param rowsPerBlock: Number of rows that make up each block. - The blocks forming the final rows are not - required to have the given number of rows. - :param colsPerBlock: Number of columns that make up each block. - The blocks forming the final columns are not - required to have the given number of columns. - - >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), - ... MatrixEntry(6, 4, 2.1)]) - >>> mat = CoordinateMatrix(entries).toBlockMatrix() - - >>> # This CoordinateMatrix will have 7 effective rows, due to - >>> # the highest row index being 6, and the ensuing - >>> # BlockMatrix will have 7 rows as well. - >>> print(mat.numRows()) - 7 - - >>> # This CoordinateMatrix will have 5 columns, due to the - >>> # highest column index being 4, and the ensuing - >>> # BlockMatrix will have 5 columns as well. - >>> print(mat.numCols()) - 5 - """ - java_block_matrix = self._java_matrix_wrapper.call("toBlockMatrix", - rowsPerBlock, - colsPerBlock) - return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock) - - -def _convert_to_matrix_block_tuple(block): - if (isinstance(block, tuple) and len(block) == 2 - and isinstance(block[0], tuple) and len(block[0]) == 2 - and isinstance(block[1], Matrix)): - blockRowIndex = int(block[0][0]) - blockColIndex = int(block[0][1]) - subMatrix = block[1] - return ((blockRowIndex, blockColIndex), subMatrix) - else: - raise TypeError("Cannot convert type %s into a sub-matrix block tuple" % type(block)) - - -class BlockMatrix(DistributedMatrix): - """ - Represents a distributed matrix in blocks of local matrices. - - :param blocks: An RDD of sub-matrix blocks - ((blockRowIndex, blockColIndex), sub-matrix) that - form this distributed matrix. If multiple blocks - with the same index exist, the results for - operations like add and multiply will be - unpredictable. - :param rowsPerBlock: Number of rows that make up each block. - The blocks forming the final rows are not - required to have the given number of rows. - :param colsPerBlock: Number of columns that make up each block. - The blocks forming the final columns are not - required to have the given number of columns. - :param numRows: Number of rows of this matrix. If the supplied - value is less than or equal to zero, the number - of rows will be calculated when `numRows` is - invoked. - :param numCols: Number of columns of this matrix. If the supplied - value is less than or equal to zero, the number - of columns will be calculated when `numCols` is - invoked. 
- """ - def __init__(self, blocks, rowsPerBlock, colsPerBlock, numRows=0, numCols=0): - """ - Note: This docstring is not shown publicly. - - Create a wrapper over a Java BlockMatrix. - - Publicly, we require that `blocks` be an RDD. However, for - internal usage, `blocks` can also be a Java BlockMatrix - object, in which case we can wrap it directly. This - assists in clean matrix conversions. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - >>> mat = BlockMatrix(blocks, 3, 2) - - >>> mat_diff = BlockMatrix(blocks, 3, 2) - >>> (mat_diff._java_matrix_wrapper._java_model == - ... mat._java_matrix_wrapper._java_model) - False - - >>> mat_same = BlockMatrix(mat._java_matrix_wrapper._java_model, 3, 2) - >>> (mat_same._java_matrix_wrapper._java_model == - ... mat._java_matrix_wrapper._java_model) - True - """ - if isinstance(blocks, RDD): - blocks = blocks.map(_convert_to_matrix_block_tuple) - # We use DataFrames for serialization of sub-matrix blocks - # from Python, so first convert the RDD to a DataFrame on - # this side. This will convert each sub-matrix block - # tuple to a Row containing the 'blockRowIndex', - # 'blockColIndex', and 'subMatrix' values, which can - # each be easily serialized. We will convert back to - # ((blockRowIndex, blockColIndex), sub-matrix) tuples on - # the Scala side. - java_matrix = callMLlibFunc("createBlockMatrix", blocks.toDF(), - int(rowsPerBlock), int(colsPerBlock), - long(numRows), long(numCols)) - elif (isinstance(blocks, JavaObject) - and blocks.getClass().getSimpleName() == "BlockMatrix"): - java_matrix = blocks - else: - raise TypeError("blocks should be an RDD of sub-matrix blocks as " - "((int, int), matrix) tuples, got %s" % type(blocks)) - - self._java_matrix_wrapper = JavaModelWrapper(java_matrix) - - @property - def blocks(self): - """ - The RDD of sub-matrix blocks - ((blockRowIndex, blockColIndex), sub-matrix) that form this - distributed matrix. - - >>> mat = BlockMatrix( - ... sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]), 3, 2) - >>> blocks = mat.blocks - >>> blocks.first() - ((0, 0), DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 0)) - - """ - # We use DataFrames for serialization of sub-matrix blocks - # from Java, so we first convert the RDD of blocks to a - # DataFrame on the Scala/Java side. Then we map each Row in - # the DataFrame back to a sub-matrix block on this side. - blocks_df = callMLlibFunc("getMatrixBlocks", self._java_matrix_wrapper._java_model) - blocks = blocks_df.rdd.map(lambda row: ((row[0][0], row[0][1]), row[1])) - return blocks - - @property - def rowsPerBlock(self): - """ - Number of rows that make up each block. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - >>> mat = BlockMatrix(blocks, 3, 2) - >>> mat.rowsPerBlock - 3 - """ - return self._java_matrix_wrapper.call("rowsPerBlock") - - @property - def colsPerBlock(self): - """ - Number of columns that make up each block. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - >>> mat = BlockMatrix(blocks, 3, 2) - >>> mat.colsPerBlock - 2 - """ - return self._java_matrix_wrapper.call("colsPerBlock") - - @property - def numRowBlocks(self): - """ - Number of rows of blocks in the BlockMatrix. 
- - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - >>> mat = BlockMatrix(blocks, 3, 2) - >>> mat.numRowBlocks - 2 - """ - return self._java_matrix_wrapper.call("numRowBlocks") - - @property - def numColBlocks(self): - """ - Number of columns of blocks in the BlockMatrix. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - >>> mat = BlockMatrix(blocks, 3, 2) - >>> mat.numColBlocks - 1 - """ - return self._java_matrix_wrapper.call("numColBlocks") - - def numRows(self): - """ - Get or compute the number of rows. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - - >>> mat = BlockMatrix(blocks, 3, 2) - >>> print(mat.numRows()) - 6 - - >>> mat = BlockMatrix(blocks, 3, 2, 7, 6) - >>> print(mat.numRows()) - 7 - """ - return self._java_matrix_wrapper.call("numRows") - - def numCols(self): - """ - Get or compute the number of cols. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - - >>> mat = BlockMatrix(blocks, 3, 2) - >>> print(mat.numCols()) - 2 - - >>> mat = BlockMatrix(blocks, 3, 2, 7, 6) - >>> print(mat.numCols()) - 6 - """ - return self._java_matrix_wrapper.call("numCols") - - @since('2.0.0') - def cache(self): - """ - Caches the underlying RDD. - """ - self._java_matrix_wrapper.call("cache") - return self - - @since('2.0.0') - def persist(self, storageLevel): - """ - Persists the underlying RDD with the specified storage level. - """ - if not isinstance(storageLevel, StorageLevel): - raise TypeError("`storageLevel` should be a StorageLevel, got %s" % type(storageLevel)) - javaStorageLevel = self._java_matrix_wrapper._sc._getJavaStorageLevel(storageLevel) - self._java_matrix_wrapper.call("persist", javaStorageLevel) - return self - - @since('2.0.0') - def validate(self): - """ - Validates the block matrix info against the matrix data (`blocks`) - and throws an exception if any error is found. - """ - self._java_matrix_wrapper.call("validate") - - def add(self, other): - """ - Adds two block matrices together. The matrices must have the - same size and matching `rowsPerBlock` and `colsPerBlock` values. - If one of the sub matrix blocks that are being added is a - SparseMatrix, the resulting sub matrix block will also be a - SparseMatrix, even if it is being added to a DenseMatrix. If - two dense sub matrix blocks are added, the output block will - also be a DenseMatrix. 
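A compact sketch of assembling the 6 x 2 BlockMatrix used throughout these examples, assuming a live SparkContext `sc`:

    from pyspark.mllib.linalg import Matrices
    from pyspark.mllib.linalg.distributed import BlockMatrix

    blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
                             ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    mat = BlockMatrix(blocks, rowsPerBlock=3, colsPerBlock=2)
    print(mat.numRowBlocks, mat.numColBlocks)  # 2 1
    print(mat.numRows(), mat.numCols())        # 6 2
    mat.validate()                             # raises if block sizes and indices disagree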
- - >>> dm1 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) - >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]) - >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12]) - >>> blocks1 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)]) - >>> blocks2 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)]) - >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm2)]) - >>> mat1 = BlockMatrix(blocks1, 3, 2) - >>> mat2 = BlockMatrix(blocks2, 3, 2) - >>> mat3 = BlockMatrix(blocks3, 3, 2) - - >>> mat1.add(mat2).toLocalMatrix() - DenseMatrix(6, 2, [2.0, 4.0, 6.0, 14.0, 16.0, 18.0, 8.0, 10.0, 12.0, 20.0, 22.0, 24.0], 0) - - >>> mat1.add(mat3).toLocalMatrix() - DenseMatrix(6, 2, [8.0, 2.0, 3.0, 14.0, 16.0, 18.0, 4.0, 16.0, 18.0, 20.0, 22.0, 24.0], 0) - """ - if not isinstance(other, BlockMatrix): - raise TypeError("Other should be a BlockMatrix, got %s" % type(other)) - - other_java_block_matrix = other._java_matrix_wrapper._java_model - java_block_matrix = self._java_matrix_wrapper.call("add", other_java_block_matrix) - return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock) - - @since('2.0.0') - def subtract(self, other): - """ - Subtracts the given block matrix `other` from this block matrix: - `this - other`. The matrices must have the same size and - matching `rowsPerBlock` and `colsPerBlock` values. If one of - the sub matrix blocks that are being subtracted is a - SparseMatrix, the resulting sub matrix block will also be a - SparseMatrix, even if it is being subtracted from a DenseMatrix. - If two dense sub matrix blocks are subtracted, the output block - will also be a DenseMatrix. - - >>> dm1 = Matrices.dense(3, 2, [3, 1, 5, 4, 6, 2]) - >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]) - >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [1, 2, 3]) - >>> blocks1 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)]) - >>> blocks2 = sc.parallelize([((0, 0), dm2), ((1, 0), dm1)]) - >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm2)]) - >>> mat1 = BlockMatrix(blocks1, 3, 2) - >>> mat2 = BlockMatrix(blocks2, 3, 2) - >>> mat3 = BlockMatrix(blocks3, 3, 2) - - >>> mat1.subtract(mat2).toLocalMatrix() - DenseMatrix(6, 2, [-4.0, -7.0, -4.0, 4.0, 7.0, 4.0, -6.0, -5.0, -10.0, 6.0, 5.0, 10.0], 0) - - >>> mat2.subtract(mat3).toLocalMatrix() - DenseMatrix(6, 2, [6.0, 8.0, 9.0, -4.0, -7.0, -4.0, 10.0, 9.0, 9.0, -6.0, -5.0, -10.0], 0) - """ - if not isinstance(other, BlockMatrix): - raise TypeError("Other should be a BlockMatrix, got %s" % type(other)) - - other_java_block_matrix = other._java_matrix_wrapper._java_model - java_block_matrix = self._java_matrix_wrapper.call("subtract", other_java_block_matrix) - return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock) - - def multiply(self, other): - """ - Left multiplies this BlockMatrix by `other`, another - BlockMatrix. The `colsPerBlock` of this matrix must equal the - `rowsPerBlock` of `other`. If `other` contains any SparseMatrix - blocks, they will have to be converted to DenseMatrix blocks. - The output BlockMatrix will only consist of DenseMatrix blocks. - This may cause some performance issues until support for - multiplying two sparse matrices is added. 
- - >>> dm1 = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6]) - >>> dm2 = Matrices.dense(2, 3, [7, 8, 9, 10, 11, 12]) - >>> dm3 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) - >>> dm4 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]) - >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12]) - >>> blocks1 = sc.parallelize([((0, 0), dm1), ((0, 1), dm2)]) - >>> blocks2 = sc.parallelize([((0, 0), dm3), ((1, 0), dm4)]) - >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm4)]) - >>> mat1 = BlockMatrix(blocks1, 2, 3) - >>> mat2 = BlockMatrix(blocks2, 3, 2) - >>> mat3 = BlockMatrix(blocks3, 3, 2) - - >>> mat1.multiply(mat2).toLocalMatrix() - DenseMatrix(2, 2, [242.0, 272.0, 350.0, 398.0], 0) - - >>> mat1.multiply(mat3).toLocalMatrix() - DenseMatrix(2, 2, [227.0, 258.0, 394.0, 450.0], 0) - """ - if not isinstance(other, BlockMatrix): - raise TypeError("Other should be a BlockMatrix, got %s" % type(other)) - - other_java_block_matrix = other._java_matrix_wrapper._java_model - java_block_matrix = self._java_matrix_wrapper.call("multiply", other_java_block_matrix) - return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock) - - @since('2.0.0') - def transpose(self): - """ - Transpose this BlockMatrix. Returns a new BlockMatrix - instance sharing the same underlying data. Is a lazy operation. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - >>> mat = BlockMatrix(blocks, 3, 2) - - >>> mat_transposed = mat.transpose() - >>> mat_transposed.toLocalMatrix() - DenseMatrix(2, 6, [1.0, 4.0, 2.0, 5.0, 3.0, 6.0, 7.0, 10.0, 8.0, 11.0, 9.0, 12.0], 0) - """ - java_transposed_matrix = self._java_matrix_wrapper.call("transpose") - return BlockMatrix(java_transposed_matrix, self.colsPerBlock, self.rowsPerBlock) - - def toLocalMatrix(self): - """ - Collect the distributed matrix on the driver as a DenseMatrix. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - >>> mat = BlockMatrix(blocks, 3, 2).toLocalMatrix() - - >>> # This BlockMatrix will have 6 effective rows, due to - >>> # having two sub-matrix blocks stacked, each with 3 rows. - >>> # The ensuing DenseMatrix will also have 6 rows. - >>> print(mat.numRows) - 6 - - >>> # This BlockMatrix will have 2 effective columns, due to - >>> # having two sub-matrix blocks stacked, each with 2 - >>> # columns. The ensuing DenseMatrix will also have 2 columns. - >>> print(mat.numCols) - 2 - """ - return self._java_matrix_wrapper.call("toLocalMatrix") - - def toIndexedRowMatrix(self): - """ - Convert this matrix to an IndexedRowMatrix. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), - ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) - >>> mat = BlockMatrix(blocks, 3, 2).toIndexedRowMatrix() - - >>> # This BlockMatrix will have 6 effective rows, due to - >>> # having two sub-matrix blocks stacked, each with 3 rows. - >>> # The ensuing IndexedRowMatrix will also have 6 rows. - >>> print(mat.numRows()) - 6 - - >>> # This BlockMatrix will have 2 effective columns, due to - >>> # having two sub-matrix blocks stacked, each with 2 columns. - >>> # The ensuing IndexedRowMatrix will also have 2 columns. 
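A sketch of the block-wise product described above (the left matrix's colsPerBlock must match the right matrix's rowsPerBlock), assuming a live SparkContext `sc`:

    from pyspark.mllib.linalg import Matrices
    from pyspark.mllib.linalg.distributed import BlockMatrix

    A = BlockMatrix(sc.parallelize([((0, 0), Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6]))]), 2, 3)
    B = BlockMatrix(sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]))]), 3, 2)
    C = A.multiply(B)                     # 2 x 2 result made of dense blocks
    print(C.toLocalMatrix())
    print(A.transpose().toLocalMatrix())  # lazy transpose over the same underlying data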
- >>> print(mat.numCols()) - 2 - """ - java_indexed_row_matrix = self._java_matrix_wrapper.call("toIndexedRowMatrix") - return IndexedRowMatrix(java_indexed_row_matrix) - - def toCoordinateMatrix(self): - """ - Convert this matrix to a CoordinateMatrix. - - >>> blocks = sc.parallelize([((0, 0), Matrices.dense(1, 2, [1, 2])), - ... ((1, 0), Matrices.dense(1, 2, [7, 8]))]) - >>> mat = BlockMatrix(blocks, 1, 2).toCoordinateMatrix() - >>> mat.entries.take(3) - [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 0, 7.0)] - """ - java_coordinate_matrix = self._java_matrix_wrapper.call("toCoordinateMatrix") - return CoordinateMatrix(java_coordinate_matrix) - - -def _test(): - import doctest - import numpy - from pyspark.sql import SparkSession - from pyspark.mllib.linalg import Matrices - import pyspark.mllib.linalg.distributed - try: - # Numpy 1.14+ changed it's string format. - numpy.set_printoptions(legacy='1.13') - except TypeError: - pass - globs = pyspark.mllib.linalg.distributed.__dict__.copy() - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("mllib.linalg.distributed tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - globs['Matrices'] = Matrices - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/random.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/random.py deleted file mode 100644 index a8833cb..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/random.py +++ /dev/null @@ -1,429 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Python package for random data generation. -""" - -import sys -from functools import wraps - -from pyspark import since -from pyspark.mllib.common import callMLlibFunc - - -__all__ = ['RandomRDDs', ] - - -def toArray(f): - @wraps(f) - def func(sc, *a, **kw): - rdd = f(sc, *a, **kw) - return rdd.map(lambda vec: vec.toArray()) - return func - - -class RandomRDDs(object): - """ - Generator methods for creating RDDs comprised of i.i.d samples from - some distribution. - - .. versionadded:: 1.1.0 - """ - - @staticmethod - @since("1.1.0") - def uniformRDD(sc, size, numPartitions=None, seed=None): - """ - Generates an RDD comprised of i.i.d. samples from the - uniform distribution U(0.0, 1.0). - - To transform the distribution in the generated RDD from U(0.0, 1.0) - to U(a, b), use - C{RandomRDDs.uniformRDD(sc, n, p, seed)\ - .map(lambda v: a + (b - a) * v)} - - :param sc: SparkContext used to create the RDD. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). 
- :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`. - - >>> x = RandomRDDs.uniformRDD(sc, 100).collect() - >>> len(x) - 100 - >>> max(x) <= 1.0 and min(x) >= 0.0 - True - >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions() - 4 - >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions() - >>> parts == sc.defaultParallelism - True - """ - return callMLlibFunc("uniformRDD", sc._jsc, size, numPartitions, seed) - - @staticmethod - @since("1.1.0") - def normalRDD(sc, size, numPartitions=None, seed=None): - """ - Generates an RDD comprised of i.i.d. samples from the standard normal - distribution. - - To transform the distribution in the generated RDD from standard normal - to some other normal N(mean, sigma^2), use - C{RandomRDDs.normal(sc, n, p, seed)\ - .map(lambda v: mean + sigma * v)} - - :param sc: SparkContext used to create the RDD. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0). - - >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1) - >>> stats = x.stats() - >>> stats.count() - 1000 - >>> abs(stats.mean() - 0.0) < 0.1 - True - >>> abs(stats.stdev() - 1.0) < 0.1 - True - """ - return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed) - - @staticmethod - @since("1.3.0") - def logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None): - """ - Generates an RDD comprised of i.i.d. samples from the log normal - distribution with the input mean and standard distribution. - - :param sc: SparkContext used to create the RDD. - :param mean: mean for the log Normal distribution - :param std: std for the log Normal distribution - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ log N(mean, std). - - >>> from math import sqrt, exp - >>> mean = 0.0 - >>> std = 1.0 - >>> expMean = exp(mean + 0.5 * std * std) - >>> expStd = sqrt((exp(std * std) - 1.0) * exp(2.0 * mean + std * std)) - >>> x = RandomRDDs.logNormalRDD(sc, mean, std, 1000, seed=2) - >>> stats = x.stats() - >>> stats.count() - 1000 - >>> abs(stats.mean() - expMean) < 0.5 - True - >>> from math import sqrt - >>> abs(stats.stdev() - expStd) < 0.5 - True - """ - return callMLlibFunc("logNormalRDD", sc._jsc, float(mean), float(std), - size, numPartitions, seed) - - @staticmethod - @since("1.1.0") - def poissonRDD(sc, mean, size, numPartitions=None, seed=None): - """ - Generates an RDD comprised of i.i.d. samples from the Poisson - distribution with the input mean. - - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or lambda, for the Poisson distribution. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ Pois(mean). 
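A small sketch of the shift-and-scale recipes mentioned in the uniformRDD and normalRDD docstrings above, assuming a live SparkContext `sc`:

    from pyspark.mllib.random import RandomRDDs

    a, b = 2.0, 5.0
    u_ab = RandomRDDs.uniformRDD(sc, 1000, seed=1).map(lambda v: a + (b - a) * v)   # ~ U(a, b)

    mu, sigma = 10.0, 3.0
    n_ms = RandomRDDs.normalRDD(sc, 1000, seed=1).map(lambda v: mu + sigma * v)     # ~ N(mu, sigma^2)

    stats = n_ms.stats()
    print(stats.mean(), stats.stdev())   # should land near 10.0 and 3.0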
- - >>> mean = 100.0 - >>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=2) - >>> stats = x.stats() - >>> stats.count() - 1000 - >>> abs(stats.mean() - mean) < 0.5 - True - >>> from math import sqrt - >>> abs(stats.stdev() - sqrt(mean)) < 0.5 - True - """ - return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed) - - @staticmethod - @since("1.3.0") - def exponentialRDD(sc, mean, size, numPartitions=None, seed=None): - """ - Generates an RDD comprised of i.i.d. samples from the Exponential - distribution with the input mean. - - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or 1 / lambda, for the Exponential distribution. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ Exp(mean). - - >>> mean = 2.0 - >>> x = RandomRDDs.exponentialRDD(sc, mean, 1000, seed=2) - >>> stats = x.stats() - >>> stats.count() - 1000 - >>> abs(stats.mean() - mean) < 0.5 - True - >>> from math import sqrt - >>> abs(stats.stdev() - sqrt(mean)) < 0.5 - True - """ - return callMLlibFunc("exponentialRDD", sc._jsc, float(mean), size, numPartitions, seed) - - @staticmethod - @since("1.3.0") - def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None): - """ - Generates an RDD comprised of i.i.d. samples from the Gamma - distribution with the input shape and scale. - - :param sc: SparkContext used to create the RDD. - :param shape: shape (> 0) parameter for the Gamma distribution - :param scale: scale (> 0) parameter for the Gamma distribution - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ Gamma(shape, scale). - - >>> from math import sqrt - >>> shape = 1.0 - >>> scale = 2.0 - >>> expMean = shape * scale - >>> expStd = sqrt(shape * scale * scale) - >>> x = RandomRDDs.gammaRDD(sc, shape, scale, 1000, seed=2) - >>> stats = x.stats() - >>> stats.count() - 1000 - >>> abs(stats.mean() - expMean) < 0.5 - True - >>> abs(stats.stdev() - expStd) < 0.5 - True - """ - return callMLlibFunc("gammaRDD", sc._jsc, float(shape), - float(scale), size, numPartitions, seed) - - @staticmethod - @toArray - @since("1.1.0") - def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): - """ - Generates an RDD comprised of vectors containing i.i.d. samples drawn - from the uniform distribution U(0.0, 1.0). - - :param sc: SparkContext used to create the RDD. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD. - :param seed: Seed for the RNG that generates the seed for the generator in each partition. - :return: RDD of Vector with vectors containing i.i.d samples ~ `U(0.0, 1.0)`. 
- - >>> import numpy as np - >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect()) - >>> mat.shape - (10, 10) - >>> mat.max() <= 1.0 and mat.min() >= 0.0 - True - >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions() - 4 - """ - return callMLlibFunc("uniformVectorRDD", sc._jsc, numRows, numCols, numPartitions, seed) - - @staticmethod - @toArray - @since("1.1.0") - def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): - """ - Generates an RDD comprised of vectors containing i.i.d. samples drawn - from the standard normal distribution. - - :param sc: SparkContext used to create the RDD. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`. - - >>> import numpy as np - >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect()) - >>> mat.shape - (100, 100) - >>> abs(mat.mean() - 0.0) < 0.1 - True - >>> abs(mat.std() - 1.0) < 0.1 - True - """ - return callMLlibFunc("normalVectorRDD", sc._jsc, numRows, numCols, numPartitions, seed) - - @staticmethod - @toArray - @since("1.3.0") - def logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed=None): - """ - Generates an RDD comprised of vectors containing i.i.d. samples drawn - from the log normal distribution. - - :param sc: SparkContext used to create the RDD. - :param mean: Mean of the log normal distribution - :param std: Standard Deviation of the log normal distribution - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ log `N(mean, std)`. - - >>> import numpy as np - >>> from math import sqrt, exp - >>> mean = 0.0 - >>> std = 1.0 - >>> expMean = exp(mean + 0.5 * std * std) - >>> expStd = sqrt((exp(std * std) - 1.0) * exp(2.0 * mean + std * std)) - >>> m = RandomRDDs.logNormalVectorRDD(sc, mean, std, 100, 100, seed=1).collect() - >>> mat = np.matrix(m) - >>> mat.shape - (100, 100) - >>> abs(mat.mean() - expMean) < 0.1 - True - >>> abs(mat.std() - expStd) < 0.1 - True - """ - return callMLlibFunc("logNormalVectorRDD", sc._jsc, float(mean), float(std), - numRows, numCols, numPartitions, seed) - - @staticmethod - @toArray - @since("1.1.0") - def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): - """ - Generates an RDD comprised of vectors containing i.i.d. samples drawn - from the Poisson distribution with the input mean. - - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or lambda, for the Poisson distribution. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`) - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ Pois(mean). 
- - >>> import numpy as np - >>> mean = 100.0 - >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1) - >>> mat = np.mat(rdd.collect()) - >>> mat.shape - (100, 100) - >>> abs(mat.mean() - mean) < 0.5 - True - >>> from math import sqrt - >>> abs(mat.std() - sqrt(mean)) < 0.5 - True - """ - return callMLlibFunc("poissonVectorRDD", sc._jsc, float(mean), numRows, numCols, - numPartitions, seed) - - @staticmethod - @toArray - @since("1.3.0") - def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): - """ - Generates an RDD comprised of vectors containing i.i.d. samples drawn - from the Exponential distribution with the input mean. - - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or 1 / lambda, for the Exponential distribution. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`) - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean). - - >>> import numpy as np - >>> mean = 0.5 - >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1) - >>> mat = np.mat(rdd.collect()) - >>> mat.shape - (100, 100) - >>> abs(mat.mean() - mean) < 0.5 - True - >>> from math import sqrt - >>> abs(mat.std() - sqrt(mean)) < 0.5 - True - """ - return callMLlibFunc("exponentialVectorRDD", sc._jsc, float(mean), numRows, numCols, - numPartitions, seed) - - @staticmethod - @toArray - @since("1.3.0") - def gammaVectorRDD(sc, shape, scale, numRows, numCols, numPartitions=None, seed=None): - """ - Generates an RDD comprised of vectors containing i.i.d. samples drawn - from the Gamma distribution. - - :param sc: SparkContext used to create the RDD. - :param shape: Shape (> 0) of the Gamma distribution - :param scale: Scale (> 0) of the Gamma distribution - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ Gamma(shape, scale). 
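A brief sketch of driver-side inspection for these vector generators, assuming a live SparkContext `sc`; because of the @toArray decorator, each element of the returned RDD is already a NumPy array:

    import numpy as np
    from pyspark.mllib.random import RandomRDDs

    rdd = RandomRDDs.poissonVectorRDD(sc, mean=4.0, numRows=50, numCols=3, seed=7)
    local = np.vstack(rdd.collect())     # (50, 3) ndarray of Poisson(4.0) samples
    print(local.mean(axis=0))            # column means should sit near 4.0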
- - >>> import numpy as np - >>> from math import sqrt - >>> shape = 1.0 - >>> scale = 2.0 - >>> expMean = shape * scale - >>> expStd = sqrt(shape * scale * scale) - >>> mat = np.matrix(RandomRDDs.gammaVectorRDD(sc, shape, scale, 100, 100, seed=1).collect()) - >>> mat.shape - (100, 100) - >>> abs(mat.mean() - expMean) < 0.1 - True - >>> abs(mat.std() - expStd) < 0.1 - True - """ - return callMLlibFunc("gammaVectorRDD", sc._jsc, float(shape), float(scale), - numRows, numCols, numPartitions, seed) - - -def _test(): - import doctest - from pyspark.sql import SparkSession - globs = globals().copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("mllib.random tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/recommendation.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/recommendation.py deleted file mode 100644 index 3d4eae8..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/recommendation.py +++ /dev/null @@ -1,334 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import array -import sys -from collections import namedtuple - -from pyspark import SparkContext, since -from pyspark.rdd import RDD -from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, inherit_doc -from pyspark.mllib.util import JavaLoader, JavaSaveable -from pyspark.sql import DataFrame - -__all__ = ['MatrixFactorizationModel', 'ALS', 'Rating'] - - -class Rating(namedtuple("Rating", ["user", "product", "rating"])): - """ - Represents a (user, product, rating) tuple. - - >>> r = Rating(1, 2, 5.0) - >>> (r.user, r.product, r.rating) - (1, 2, 5.0) - >>> (r[0], r[1], r[2]) - (1, 2, 5.0) - - .. versionadded:: 1.2.0 - """ - - def __reduce__(self): - return Rating, (int(self.user), int(self.product), float(self.rating)) - - -@inherit_doc -class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): - - """A matrix factorisation model trained by regularized alternating - least-squares. - - >>> r1 = (1, 1, 1.0) - >>> r2 = (1, 2, 2.0) - >>> r3 = (2, 1, 2.0) - >>> ratings = sc.parallelize([r1, r2, r3]) - >>> model = ALS.trainImplicit(ratings, 1, seed=10) - >>> model.predict(2, 2) - 0.4... 
- - >>> testset = sc.parallelize([(1, 2), (1, 1)]) - >>> model = ALS.train(ratings, 2, seed=0) - >>> model.predictAll(testset).collect() - [Rating(user=1, product=1, rating=1.0...), Rating(user=1, product=2, rating=1.9...)] - - >>> model = ALS.train(ratings, 4, seed=10) - >>> model.userFeatures().collect() - [(1, array('d', [...])), (2, array('d', [...]))] - - >>> model.recommendUsers(1, 2) - [Rating(user=2, product=1, rating=1.9...), Rating(user=1, product=1, rating=1.0...)] - >>> model.recommendProducts(1, 2) - [Rating(user=1, product=2, rating=1.9...), Rating(user=1, product=1, rating=1.0...)] - >>> model.rank - 4 - - >>> first_user = model.userFeatures().take(1)[0] - >>> latents = first_user[1] - >>> len(latents) - 4 - - >>> model.productFeatures().collect() - [(1, array('d', [...])), (2, array('d', [...]))] - - >>> first_product = model.productFeatures().take(1)[0] - >>> latents = first_product[1] - >>> len(latents) - 4 - - >>> products_for_users = model.recommendProductsForUsers(1).collect() - >>> len(products_for_users) - 2 - >>> products_for_users[0] - (1, (Rating(user=1, product=2, rating=...),)) - - >>> users_for_products = model.recommendUsersForProducts(1).collect() - >>> len(users_for_products) - 2 - >>> users_for_products[0] - (1, (Rating(user=2, product=1, rating=...),)) - - >>> model = ALS.train(ratings, 1, nonnegative=True, seed=10) - >>> model.predict(2, 2) - 3.73... - - >>> df = sqlContext.createDataFrame([Rating(1, 1, 1.0), Rating(1, 2, 2.0), Rating(2, 1, 2.0)]) - >>> model = ALS.train(df, 1, nonnegative=True, seed=10) - >>> model.predict(2, 2) - 3.73... - - >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=10) - >>> model.predict(2, 2) - 0.4... - - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> model.save(sc, path) - >>> sameModel = MatrixFactorizationModel.load(sc, path) - >>> sameModel.predict(2, 2) - 0.4... - >>> sameModel.predictAll(testset).collect() - [Rating(... - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except OSError: - ... pass - - .. versionadded:: 0.9.0 - """ - @since("0.9.0") - def predict(self, user, product): - """ - Predicts rating for the given user and product. - """ - return self._java_model.predict(int(user), int(product)) - - @since("0.9.0") - def predictAll(self, user_product): - """ - Returns a list of predicted ratings for input user and product - pairs. - """ - assert isinstance(user_product, RDD), "user_product should be RDD of (user, product)" - first = user_product.first() - assert len(first) == 2, "user_product should be RDD of (user, product)" - user_product = user_product.map(lambda u_p: (int(u_p[0]), int(u_p[1]))) - return self.call("predict", user_product) - - @since("1.2.0") - def userFeatures(self): - """ - Returns a paired RDD, where the first element is the user and the - second is an array of features corresponding to that user. - """ - return self.call("getUserFeatures").mapValues(lambda v: array.array('d', v)) - - @since("1.2.0") - def productFeatures(self): - """ - Returns a paired RDD, where the first element is the product and the - second is an array of features corresponding to that product. - """ - return self.call("getProductFeatures").mapValues(lambda v: array.array('d', v)) - - @since("1.4.0") - def recommendUsers(self, product, num): - """ - Recommends the top "num" number of users for a given product and - returns a list of Rating objects sorted by the predicted rating in - descending order. 
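A condensed sketch of the explicit-feedback flow exercised by the doctests above, assuming a live SparkContext `sc`:

    from pyspark.mllib.recommendation import ALS, Rating

    ratings = sc.parallelize([Rating(1, 1, 1.0), Rating(1, 2, 2.0), Rating(2, 1, 2.0)])
    model = ALS.train(ratings, rank=2, iterations=10, seed=0)
    print(model.predict(2, 2))                                    # score for one (user, product) pair
    print(model.predictAll(sc.parallelize([(1, 2), (2, 2)])).collect())
    print(model.recommendProducts(1, 2))                          # top-2 products for user 1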
- """ - return list(self.call("recommendUsers", product, num)) - - @since("1.4.0") - def recommendProducts(self, user, num): - """ - Recommends the top "num" number of products for a given user and - returns a list of Rating objects sorted by the predicted rating in - descending order. - """ - return list(self.call("recommendProducts", user, num)) - - def recommendProductsForUsers(self, num): - """ - Recommends the top "num" number of products for all users. The - number of recommendations returned per user may be less than "num". - """ - return self.call("wrappedRecommendProductsForUsers", num) - - def recommendUsersForProducts(self, num): - """ - Recommends the top "num" number of users for all products. The - number of recommendations returned per product may be less than - "num". - """ - return self.call("wrappedRecommendUsersForProducts", num) - - @property - @since("1.4.0") - def rank(self): - """Rank for the features in this model""" - return self.call("rank") - - @classmethod - @since("1.3.1") - def load(cls, sc, path): - """Load a model from the given path""" - model = cls._load_java(sc, path) - wrapper = sc._jvm.org.apache.spark.mllib.api.python.MatrixFactorizationModelWrapper(model) - return MatrixFactorizationModel(wrapper) - - -class ALS(object): - """Alternating Least Squares matrix factorization - - .. versionadded:: 0.9.0 - """ - - @classmethod - def _prepare(cls, ratings): - if isinstance(ratings, RDD): - pass - elif isinstance(ratings, DataFrame): - ratings = ratings.rdd - else: - raise TypeError("Ratings should be represented by either an RDD or a DataFrame, " - "but got %s." % type(ratings)) - first = ratings.first() - if isinstance(first, Rating): - pass - elif isinstance(first, (tuple, list)): - ratings = ratings.map(lambda x: Rating(*x)) - else: - raise TypeError("Expect a Rating or a tuple/list, but got %s." % type(first)) - return ratings - - @classmethod - @since("0.9.0") - def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False, - seed=None): - """ - Train a matrix factorization model given an RDD of ratings by users - for a subset of products. The ratings matrix is approximated as the - product of two lower-rank matrices of a given rank (number of - features). To solve for these features, ALS is run iteratively with - a configurable level of parallelism. - - :param ratings: - RDD of `Rating` or (userID, productID, rating) tuple. - :param rank: - Number of features to use (also referred to as the number of latent factors). - :param iterations: - Number of iterations of ALS. - (default: 5) - :param lambda_: - Regularization parameter. - (default: 0.01) - :param blocks: - Number of blocks used to parallelize the computation. A value - of -1 will use an auto-configured number of blocks. - (default: -1) - :param nonnegative: - A value of True will solve least-squares with nonnegativity - constraints. - (default: False) - :param seed: - Random seed for initial matrix factorization model. A value - of None will use system time as the seed. - (default: None) - """ - model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations, - lambda_, blocks, nonnegative, seed) - return MatrixFactorizationModel(model) - - @classmethod - @since("0.9.0") - def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01, - nonnegative=False, seed=None): - """ - Train a matrix factorization model given an RDD of 'implicit - preferences' of users for a subset of products. 
The ratings matrix - is approximated as the product of two lower-rank matrices of a - given rank (number of features). To solve for these features, ALS - is run iteratively with a configurable level of parallelism. - - :param ratings: - RDD of `Rating` or (userID, productID, rating) tuple. - :param rank: - Number of features to use (also referred to as the number of latent factors). - :param iterations: - Number of iterations of ALS. - (default: 5) - :param lambda_: - Regularization parameter. - (default: 0.01) - :param blocks: - Number of blocks used to parallelize the computation. A value - of -1 will use an auto-configured number of blocks. - (default: -1) - :param alpha: - A constant used in computing confidence. - (default: 0.01) - :param nonnegative: - A value of True will solve least-squares with nonnegativity - constraints. - (default: False) - :param seed: - Random seed for initial matrix factorization model. A value - of None will use system time as the seed. - (default: None) - """ - model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings), rank, - iterations, lambda_, blocks, alpha, nonnegative, seed) - return MatrixFactorizationModel(model) - - -def _test(): - import doctest - import pyspark.mllib.recommendation - from pyspark.sql import SQLContext - globs = pyspark.mllib.recommendation.__dict__.copy() - sc = SparkContext('local[4]', 'PythonTest') - globs['sc'] = sc - globs['sqlContext'] = SQLContext(sc) - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/regression.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/regression.py deleted file mode 100644 index 6be45f5..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/regression.py +++ /dev/null @@ -1,845 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import warnings - -import numpy as np -from numpy import array - -from pyspark import RDD, since -from pyspark.streaming.dstream import DStream -from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc -from pyspark.mllib.linalg import SparseVector, Vectors, _convert_to_vector -from pyspark.mllib.util import Saveable, Loader - -__all__ = ['LabeledPoint', 'LinearModel', - 'LinearRegressionModel', 'LinearRegressionWithSGD', - 'RidgeRegressionModel', 'RidgeRegressionWithSGD', - 'LassoModel', 'LassoWithSGD', 'IsotonicRegressionModel', - 'IsotonicRegression', 'StreamingLinearAlgorithm', - 'StreamingLinearRegressionWithSGD'] - - -class LabeledPoint(object): - - """ - Class that represents the features and labels of a data point. 
- - :param label: - Label for this data point. - :param features: - Vector of features for this point (NumPy array, list, - pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix). - - .. note:: 'label' and 'features' are accessible as class attributes. - - .. versionadded:: 1.0.0 - """ - - def __init__(self, label, features): - self.label = float(label) - self.features = _convert_to_vector(features) - - def __reduce__(self): - return (LabeledPoint, (self.label, self.features)) - - def __str__(self): - return "(" + ",".join((str(self.label), str(self.features))) + ")" - - def __repr__(self): - return "LabeledPoint(%s, %s)" % (self.label, self.features) - - -class LinearModel(object): - - """ - A linear model that has a vector of coefficients and an intercept. - - :param weights: - Weights computed for every feature. - :param intercept: - Intercept computed for this model. - - .. versionadded:: 0.9.0 - """ - - def __init__(self, weights, intercept): - self._coeff = _convert_to_vector(weights) - self._intercept = float(intercept) - - @property - @since("1.0.0") - def weights(self): - """Weights computed for every feature.""" - return self._coeff - - @property - @since("1.0.0") - def intercept(self): - """Intercept computed for this model.""" - return self._intercept - - def __repr__(self): - return "(weights=%s, intercept=%r)" % (self._coeff, self._intercept) - - -@inherit_doc -class LinearRegressionModelBase(LinearModel): - - """A linear regression model. - - >>> lrmb = LinearRegressionModelBase(np.array([1.0, 2.0]), 0.1) - >>> abs(lrmb.predict(np.array([-1.03, 7.777])) - 14.624) < 1e-6 - True - >>> abs(lrmb.predict(SparseVector(2, {0: -1.03, 1: 7.777})) - 14.624) < 1e-6 - True - - .. versionadded:: 0.9.0 - """ - - @since("0.9.0") - def predict(self, x): - """ - Predict the value of the dependent variable given a vector or - an RDD of vectors containing values for the independent variables. - """ - if isinstance(x, RDD): - return x.map(self.predict) - x = _convert_to_vector(x) - return self.weights.dot(x) + self.intercept - - -@inherit_doc -class LinearRegressionModel(LinearRegressionModelBase): - - """A linear regression model derived from a least-squares fit. - - >>> from pyspark.mllib.regression import LabeledPoint - >>> data = [ - ... LabeledPoint(0.0, [0.0]), - ... LabeledPoint(1.0, [1.0]), - ... LabeledPoint(3.0, [2.0]), - ... LabeledPoint(2.0, [3.0]) - ... ] - >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, - ... initialWeights=np.array([1.0])) - >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5 - True - >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - >>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5 - True - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> lrm.save(sc, path) - >>> sameModel = LinearRegressionModel.load(sc, path) - >>> abs(sameModel.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(sameModel.predict(np.array([1.0])) - 1) < 0.5 - True - >>> abs(sameModel.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except: - ... pass - >>> data = [ - ... LabeledPoint(0.0, SparseVector(1, {0: 0.0})), - ... LabeledPoint(1.0, SparseVector(1, {0: 1.0})), - ... LabeledPoint(3.0, SparseVector(1, {0: 2.0})), - ... LabeledPoint(2.0, SparseVector(1, {0: 3.0})) - ... ] - >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, - ... 
initialWeights=array([1.0])) - >>> abs(lrm.predict(array([0.0])) - 0) < 0.5 - True - >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0, - ... miniBatchFraction=1.0, initialWeights=array([1.0]), regParam=0.1, regType="l2", - ... intercept=True, validateData=True) - >>> abs(lrm.predict(array([0.0])) - 0) < 0.5 - True - >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - - .. versionadded:: 0.9.0 - """ - @since("1.4.0") - def save(self, sc, path): - """Save a LinearRegressionModel.""" - java_model = sc._jvm.org.apache.spark.mllib.regression.LinearRegressionModel( - _py2java(sc, self._coeff), self.intercept) - java_model.save(sc._jsc.sc(), path) - - @classmethod - @since("1.4.0") - def load(cls, sc, path): - """Load a LinearRegressionModel.""" - java_model = sc._jvm.org.apache.spark.mllib.regression.LinearRegressionModel.load( - sc._jsc.sc(), path) - weights = _java2py(sc, java_model.weights()) - intercept = java_model.intercept() - model = LinearRegressionModel(weights, intercept) - return model - - -# train_func should take two parameters, namely data and initial_weights, and -# return the result of a call to the appropriate JVM stub. -# _regression_train_wrapper is responsible for setup and error checking. -def _regression_train_wrapper(train_func, modelClass, data, initial_weights): - from pyspark.mllib.classification import LogisticRegressionModel - first = data.first() - if not isinstance(first, LabeledPoint): - raise TypeError("data should be an RDD of LabeledPoint, but got %s" % type(first)) - if initial_weights is None: - initial_weights = [0.0] * len(data.first().features) - if (modelClass == LogisticRegressionModel): - weights, intercept, numFeatures, numClasses = train_func( - data, _convert_to_vector(initial_weights)) - return modelClass(weights, intercept, numFeatures, numClasses) - else: - weights, intercept = train_func(data, _convert_to_vector(initial_weights)) - return modelClass(weights, intercept) - - -class LinearRegressionWithSGD(object): - """ - .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.regression.LinearRegression. - """ - @classmethod - @since("0.9.0") - def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, - initialWeights=None, regParam=0.0, regType=None, intercept=False, - validateData=True, convergenceTol=0.001): - """ - Train a linear regression model using Stochastic Gradient - Descent (SGD). This solves the least squares regression - formulation - - f(weights) = 1/(2n) ||A weights - y||^2 - - which is the mean squared error. Here the data matrix has n rows, - and the input RDD holds the set of rows of A, each with its - corresponding right hand side label y. - See also the documentation for the precise formulation. - - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param regParam: - The regularizer parameter. - (default: 0.0) - :param regType: - The type of regularizer used for training our model. 
- Supported values: - - - "l1" for using L1 regularization - - "l2" for using L2 regularization - - None for no regularization (default) - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e., whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) - """ - warnings.warn( - "Deprecated in 2.0.0. Use ml.regression.LinearRegression.", DeprecationWarning) - - def train(rdd, i): - return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations), - float(step), float(miniBatchFraction), i, float(regParam), - regType, bool(intercept), bool(validateData), - float(convergenceTol)) - - return _regression_train_wrapper(train, LinearRegressionModel, data, initialWeights) - - -@inherit_doc -class LassoModel(LinearRegressionModelBase): - - """A linear regression model derived from a least-squares fit with - an l_1 penalty term. - - >>> from pyspark.mllib.regression import LabeledPoint - >>> data = [ - ... LabeledPoint(0.0, [0.0]), - ... LabeledPoint(1.0, [1.0]), - ... LabeledPoint(3.0, [2.0]), - ... LabeledPoint(2.0, [3.0]) - ... ] - >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, initialWeights=array([1.0])) - >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5 - True - >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - >>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5 - True - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> lrm.save(sc, path) - >>> sameModel = LassoModel.load(sc, path) - >>> abs(sameModel.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(sameModel.predict(np.array([1.0])) - 1) < 0.5 - True - >>> abs(sameModel.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except: - ... pass - >>> data = [ - ... LabeledPoint(0.0, SparseVector(1, {0: 0.0})), - ... LabeledPoint(1.0, SparseVector(1, {0: 1.0})), - ... LabeledPoint(3.0, SparseVector(1, {0: 2.0})), - ... LabeledPoint(2.0, SparseVector(1, {0: 3.0})) - ... ] - >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, - ... initialWeights=array([1.0])) - >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=10, step=1.0, - ... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True, - ... validateData=True) - >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - - .. versionadded:: 0.9.0 - """ - @since("1.4.0") - def save(self, sc, path): - """Save a LassoModel.""" - java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel( - _py2java(sc, self._coeff), self.intercept) - java_model.save(sc._jsc.sc(), path) - - @classmethod - @since("1.4.0") - def load(cls, sc, path): - """Load a LassoModel.""" - java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel.load( - sc._jsc.sc(), path) - weights = _java2py(sc, java_model.weights()) - intercept = java_model.intercept() - model = LassoModel(weights, intercept) - return model - - -class LassoWithSGD(object): - """ - .. 
versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 1.0. - Note the default regParam is 0.01 for LassoWithSGD, but is 0.0 for LinearRegression. - """ - @classmethod - @since("0.9.0") - def train(cls, data, iterations=100, step=1.0, regParam=0.01, - miniBatchFraction=1.0, initialWeights=None, intercept=False, - validateData=True, convergenceTol=0.001): - """ - Train a regression model with L1-regularization using Stochastic - Gradient Descent. This solves the l1-regularized least squares - regression formulation - - f(weights) = 1/(2n) ||A weights - y||^2 + regParam ||weights||_1 - - Here the data matrix has n rows, and the input RDD holds the set - of rows of A, each with its corresponding right hand side label y. - See also the documentation for the precise formulation. - - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e. whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) - """ - warnings.warn( - "Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 1.0. " - "Note the default regParam is 0.01 for LassoWithSGD, but is 0.0 for LinearRegression.", - DeprecationWarning) - - def train(rdd, i): - return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step), - float(regParam), float(miniBatchFraction), i, bool(intercept), - bool(validateData), float(convergenceTol)) - - return _regression_train_wrapper(train, LassoModel, data, initialWeights) - - -@inherit_doc -class RidgeRegressionModel(LinearRegressionModelBase): - - """A linear regression model derived from a least-squares fit with - an l_2 penalty term. - - >>> from pyspark.mllib.regression import LabeledPoint - >>> data = [ - ... LabeledPoint(0.0, [0.0]), - ... LabeledPoint(1.0, [1.0]), - ... LabeledPoint(3.0, [2.0]), - ... LabeledPoint(2.0, [3.0]) - ... ] - >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10, - ... initialWeights=array([1.0])) - >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(lrm.predict(np.array([1.0])) - 1) < 0.5 - True - >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - >>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5 - True - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> lrm.save(sc, path) - >>> sameModel = RidgeRegressionModel.load(sc, path) - >>> abs(sameModel.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(sameModel.predict(np.array([1.0])) - 1) < 0.5 - True - >>> abs(sameModel.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except: - ... pass - >>> data = [ - ... LabeledPoint(0.0, SparseVector(1, {0: 0.0})), - ... LabeledPoint(1.0, SparseVector(1, {0: 1.0})), - ... 
LabeledPoint(3.0, SparseVector(1, {0: 2.0})), - ... LabeledPoint(2.0, SparseVector(1, {0: 3.0})) - ... ] - >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=10, - ... initialWeights=array([1.0])) - >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=10, step=1.0, - ... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True, - ... validateData=True) - >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5 - True - >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 - True - - .. versionadded:: 0.9.0 - """ - @since("1.4.0") - def save(self, sc, path): - """Save a RidgeRegressionMode.""" - java_model = sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel( - _py2java(sc, self._coeff), self.intercept) - java_model.save(sc._jsc.sc(), path) - - @classmethod - @since("1.4.0") - def load(cls, sc, path): - """Load a RidgeRegressionMode.""" - java_model = sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel.load( - sc._jsc.sc(), path) - weights = _java2py(sc, java_model.weights()) - intercept = java_model.intercept() - model = RidgeRegressionModel(weights, intercept) - return model - - -class RidgeRegressionWithSGD(object): - """ - .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 0.0. - Note the default regParam is 0.01 for RidgeRegressionWithSGD, but is 0.0 for - LinearRegression. - """ - @classmethod - @since("0.9.0") - def train(cls, data, iterations=100, step=1.0, regParam=0.01, - miniBatchFraction=1.0, initialWeights=None, intercept=False, - validateData=True, convergenceTol=0.001): - """ - Train a regression model with L2-regularization using Stochastic - Gradient Descent. This solves the l2-regularized least squares - regression formulation - - f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2 - - Here the data matrix has n rows, and the input RDD holds the set - of rows of A, each with its corresponding right hand side label y. - See also the documentation for the precise formulation. - - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e. whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) - """ - warnings.warn( - "Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 0.0. 
" - "Note the default regParam is 0.01 for RidgeRegressionWithSGD, but is 0.0 for " - "LinearRegression.", DeprecationWarning) - - def train(rdd, i): - return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step), - float(regParam), float(miniBatchFraction), i, bool(intercept), - bool(validateData), float(convergenceTol)) - - return _regression_train_wrapper(train, RidgeRegressionModel, data, initialWeights) - - -class IsotonicRegressionModel(Saveable, Loader): - - """ - Regression model for isotonic regression. - - :param boundaries: - Array of boundaries for which predictions are known. Boundaries - must be sorted in increasing order. - :param predictions: - Array of predictions associated to the boundaries at the same - index. Results of isotonic regression and therefore monotone. - :param isotonic: - Indicates whether this is isotonic or antitonic. - - >>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)] - >>> irm = IsotonicRegression.train(sc.parallelize(data)) - >>> irm.predict(3) - 2.0 - >>> irm.predict(5) - 16.5 - >>> irm.predict(sc.parallelize([3, 5])).collect() - [2.0, 16.5] - >>> import os, tempfile - >>> path = tempfile.mkdtemp() - >>> irm.save(sc, path) - >>> sameModel = IsotonicRegressionModel.load(sc, path) - >>> sameModel.predict(3) - 2.0 - >>> sameModel.predict(5) - 16.5 - >>> from shutil import rmtree - >>> try: - ... rmtree(path) - ... except OSError: - ... pass - - .. versionadded:: 1.4.0 - """ - - def __init__(self, boundaries, predictions, isotonic): - self.boundaries = boundaries - self.predictions = predictions - self.isotonic = isotonic - - @since("1.4.0") - def predict(self, x): - """ - Predict labels for provided features. - Using a piecewise linear function. - 1) If x exactly matches a boundary then associated prediction - is returned. In case there are multiple predictions with the - same boundary then one of them is returned. Which one is - undefined (same as java.util.Arrays.binarySearch). - 2) If x is lower or higher than all boundaries then first or - last prediction is returned respectively. In case there are - multiple predictions with the same boundary then the lowest - or highest is returned respectively. - 3) If x falls between two values in boundary array then - prediction is treated as piecewise linear function and - interpolated value is returned. In case there are multiple - values with the same boundary then the same rules as in 2) - are used. - - :param x: - Feature or RDD of Features to be labeled. 
- """ - if isinstance(x, RDD): - return x.map(lambda v: self.predict(v)) - return np.interp(x, self.boundaries, self.predictions) - - @since("1.4.0") - def save(self, sc, path): - """Save an IsotonicRegressionModel.""" - java_boundaries = _py2java(sc, self.boundaries.tolist()) - java_predictions = _py2java(sc, self.predictions.tolist()) - java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel( - java_boundaries, java_predictions, self.isotonic) - java_model.save(sc._jsc.sc(), path) - - @classmethod - @since("1.4.0") - def load(cls, sc, path): - """Load an IsotonicRegressionModel.""" - java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load( - sc._jsc.sc(), path) - py_boundaries = _java2py(sc, java_model.boundaryVector()).toArray() - py_predictions = _java2py(sc, java_model.predictionVector()).toArray() - return IsotonicRegressionModel(py_boundaries, py_predictions, java_model.isotonic) - - -class IsotonicRegression(object): - """ - Isotonic regression. - Currently implemented using parallelized pool adjacent violators - algorithm. Only univariate (single feature) algorithm supported. - - Sequential PAV implementation based on: - - Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani. - "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61. - Available from http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf - - Sequential PAV parallelization based on: - - Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset. - "An approach to parallelizing isotonic regression." - Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147. - Available from http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf - - See `Isotonic regression (Wikipedia) `_. - - .. versionadded:: 1.4.0 - """ - - @classmethod - @since("1.4.0") - def train(cls, data, isotonic=True): - """ - Train an isotonic regression model on the given data. - - :param data: - RDD of (label, feature, weight) tuples. - :param isotonic: - Whether this is isotonic (which is default) or antitonic. - (default: True) - """ - boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel", - data.map(_convert_to_vector), bool(isotonic)) - return IsotonicRegressionModel(boundaries.toArray(), predictions.toArray(), isotonic) - - -class StreamingLinearAlgorithm(object): - """ - Base class that has to be inherited by any StreamingLinearAlgorithm. - - Prevents reimplementation of methods predictOn and predictOnValues. - - .. versionadded:: 1.5.0 - """ - def __init__(self, model): - self._model = model - - @since("1.5.0") - def latestModel(self): - """ - Returns the latest model. - """ - return self._model - - def _validate(self, dstream): - if not isinstance(dstream, DStream): - raise TypeError( - "dstream should be a DStream object, got %s" % type(dstream)) - if not self._model: - raise ValueError( - "Model must be intialized using setInitialWeights") - - @since("1.5.0") - def predictOn(self, dstream): - """ - Use the model to make predictions on batches of data from a - DStream. - - :return: - DStream containing predictions. - """ - self._validate(dstream) - return dstream.map(lambda x: self._model.predict(x)) - - @since("1.5.0") - def predictOnValues(self, dstream): - """ - Use the model to make predictions on the values of a DStream and - carry over its keys. - - :return: - DStream containing the input keys and the predictions as values. 
- """ - self._validate(dstream) - return dstream.mapValues(lambda x: self._model.predict(x)) - - -@inherit_doc -class StreamingLinearRegressionWithSGD(StreamingLinearAlgorithm): - """ - Train or predict a linear regression model on streaming data. - Training uses Stochastic Gradient Descent to update the model - based on each new batch of incoming data from a DStream - (see `LinearRegressionWithSGD` for model equation). - - Each batch of data is assumed to be an RDD of LabeledPoints. - The number of data points per batch can vary, but the number - of features must be constant. An initial weight vector must - be provided. - - :param stepSize: - Step size for each iteration of gradient descent. - (default: 0.1) - :param numIterations: - Number of iterations run for each batch of data. - (default: 50) - :param miniBatchFraction: - Fraction of each batch of data to use for updates. - (default: 1.0) - :param convergenceTol: - Value used to determine when to terminate iterations. - (default: 0.001) - - .. versionadded:: 1.5.0 - """ - def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, convergenceTol=0.001): - self.stepSize = stepSize - self.numIterations = numIterations - self.miniBatchFraction = miniBatchFraction - self.convergenceTol = convergenceTol - self._model = None - super(StreamingLinearRegressionWithSGD, self).__init__( - model=self._model) - - @since("1.5.0") - def setInitialWeights(self, initialWeights): - """ - Set the initial value of weights. - - This must be set before running trainOn and predictOn - """ - initialWeights = _convert_to_vector(initialWeights) - self._model = LinearRegressionModel(initialWeights, 0) - return self - - @since("1.5.0") - def trainOn(self, dstream): - """Train the model on the incoming dstream.""" - self._validate(dstream) - - def update(rdd): - # LinearRegressionWithSGD.train raises an error for an empty RDD. - if not rdd.isEmpty(): - self._model = LinearRegressionWithSGD.train( - rdd, self.numIterations, self.stepSize, - self.miniBatchFraction, self._model.weights, - intercept=self._model.intercept, convergenceTol=self.convergenceTol) - - dstream.foreachRDD(update) - - -def _test(): - import doctest - from pyspark.sql import SparkSession - import pyspark.mllib.regression - globs = pyspark.mllib.regression.__dict__.copy() - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("mllib.regression tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/KernelDensity.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/KernelDensity.py deleted file mode 100644 index 7250eab..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/KernelDensity.py +++ /dev/null @@ -1,59 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys - -if sys.version > '3': - xrange = range - -import numpy as np - -from pyspark.mllib.common import callMLlibFunc -from pyspark.rdd import RDD - - -class KernelDensity(object): - """ - Estimate probability density at required points given an RDD of samples - from the population. - - >>> kd = KernelDensity() - >>> sample = sc.parallelize([0.0, 1.0]) - >>> kd.setSample(sample) - >>> kd.estimate([0.0, 1.0]) - array([ 0.12938758, 0.12938758]) - """ - def __init__(self): - self._bandwidth = 1.0 - self._sample = None - - def setBandwidth(self, bandwidth): - """Set bandwidth of each sample. Defaults to 1.0""" - self._bandwidth = bandwidth - - def setSample(self, sample): - """Set sample points from the population. Should be a RDD""" - if not isinstance(sample, RDD): - raise TypeError("samples should be a RDD, received %s" % type(sample)) - self._sample = sample - - def estimate(self, points): - """Estimate the probability density at points""" - points = list(points) - densities = callMLlibFunc( - "estimateKernelDensity", self._sample, self._bandwidth, points) - return np.asarray(densities) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/__init__.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/__init__.py deleted file mode 100644 index c8a721d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Python package for statistical functions in MLlib. -""" - -from pyspark.mllib.stat._statistics import * -from pyspark.mllib.stat.distribution import MultivariateGaussian -from pyspark.mllib.stat.test import ChiSqTestResult -from pyspark.mllib.stat.KernelDensity import KernelDensity - -__all__ = ["Statistics", "MultivariateStatisticalSummary", "ChiSqTestResult", - "MultivariateGaussian", "KernelDensity"] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/_statistics.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/_statistics.py deleted file mode 100644 index 6e89bfd..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/_statistics.py +++ /dev/null @@ -1,326 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
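To complement the doctest in the KernelDensity class removed above, a small sketch with a non-default bandwidth (assuming an active SparkContext sc; the sample and evaluation points are arbitrary):

# Hedged sketch only, not part of this patch.
from pyspark.mllib.stat import KernelDensity

kd = KernelDensity()
kd.setSample(sc.parallelize([0.0, 1.0, 2.0, 4.0]))
kd.setBandwidth(3.0)                 # default bandwidth is 1.0
print(kd.estimate([0.0, 2.0, 5.0]))  # NumPy array of densities at those points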
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -if sys.version >= '3': - basestring = str - -from pyspark.rdd import RDD, ignore_unicode_prefix -from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper -from pyspark.mllib.linalg import Matrix, _convert_to_vector -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult - - -__all__ = ['MultivariateStatisticalSummary', 'Statistics'] - - -class MultivariateStatisticalSummary(JavaModelWrapper): - - """ - Trait for multivariate statistical summary of a data matrix. - """ - - def mean(self): - return self.call("mean").toArray() - - def variance(self): - return self.call("variance").toArray() - - def count(self): - return int(self.call("count")) - - def numNonzeros(self): - return self.call("numNonzeros").toArray() - - def max(self): - return self.call("max").toArray() - - def min(self): - return self.call("min").toArray() - - def normL1(self): - return self.call("normL1").toArray() - - def normL2(self): - return self.call("normL2").toArray() - - -class Statistics(object): - - @staticmethod - def colStats(rdd): - """ - Computes column-wise summary statistics for the input RDD[Vector]. - - :param rdd: an RDD[Vector] for which column-wise summary statistics - are to be computed. - :return: :class:`MultivariateStatisticalSummary` object containing - column-wise summary statistics. - - >>> from pyspark.mllib.linalg import Vectors - >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]), - ... Vectors.dense([4, 5, 0, 3]), - ... Vectors.dense([6, 7, 0, 8])]) - >>> cStats = Statistics.colStats(rdd) - >>> cStats.mean() - array([ 4., 4., 0., 3.]) - >>> cStats.variance() - array([ 4., 13., 0., 25.]) - >>> cStats.count() - 3 - >>> cStats.numNonzeros() - array([ 3., 2., 0., 3.]) - >>> cStats.max() - array([ 6., 7., 0., 8.]) - >>> cStats.min() - array([ 2., 0., 0., -2.]) - """ - cStats = callMLlibFunc("colStats", rdd.map(_convert_to_vector)) - return MultivariateStatisticalSummary(cStats) - - @staticmethod - def corr(x, y=None, method=None): - """ - Compute the correlation (matrix) for the input RDD(s) using the - specified method. - Methods currently supported: I{pearson (default), spearman}. - - If a single RDD of Vectors is passed in, a correlation matrix - comparing the columns in the input RDD is returned. Use C{method=} - to specify the method to be used for single RDD inout. - If two RDDs of floats are passed in, a single float is returned. - - :param x: an RDD of vector for which the correlation matrix is to be computed, - or an RDD of float of the same cardinality as y when y is specified. - :param y: an RDD of float of the same cardinality as x. - :param method: String specifying the method to use for computing correlation. - Supported: `pearson` (default), `spearman` - :return: Correlation matrix comparing columns in x. 
- - >>> x = sc.parallelize([1.0, 0.0, -2.0], 2) - >>> y = sc.parallelize([4.0, 5.0, 3.0], 2) - >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2) - >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7 - True - >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson") - True - >>> Statistics.corr(x, y, "spearman") - 0.5 - >>> from math import isnan - >>> isnan(Statistics.corr(x, zeros)) - True - >>> from pyspark.mllib.linalg import Vectors - >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]), - ... Vectors.dense([6, 7, 0, 8]), Vectors.dense([9, 0, 0, 1])]) - >>> pearsonCorr = Statistics.corr(rdd) - >>> print(str(pearsonCorr).replace('nan', 'NaN')) - [[ 1. 0.05564149 NaN 0.40047142] - [ 0.05564149 1. NaN 0.91359586] - [ NaN NaN 1. NaN] - [ 0.40047142 0.91359586 NaN 1. ]] - >>> spearmanCorr = Statistics.corr(rdd, method="spearman") - >>> print(str(spearmanCorr).replace('nan', 'NaN')) - [[ 1. 0.10540926 NaN 0.4 ] - [ 0.10540926 1. NaN 0.9486833 ] - [ NaN NaN 1. NaN] - [ 0.4 0.9486833 NaN 1. ]] - >>> try: - ... Statistics.corr(rdd, "spearman") - ... print("Method name as second argument without 'method=' shouldn't be allowed.") - ... except TypeError: - ... pass - """ - # Check inputs to determine whether a single value or a matrix is needed for output. - # Since it's legal for users to use the method name as the second argument, we need to - # check if y is used to specify the method name instead. - if type(y) == str: - raise TypeError("Use 'method=' to specify method name.") - - if not y: - return callMLlibFunc("corr", x.map(_convert_to_vector), method).toArray() - else: - return callMLlibFunc("corr", x.map(float), y.map(float), method) - - @staticmethod - @ignore_unicode_prefix - def chiSqTest(observed, expected=None): - """ - If `observed` is Vector, conduct Pearson's chi-squared goodness - of fit test of the observed data against the expected distribution, - or againt the uniform distribution (by default), with each category - having an expected frequency of `1 / len(observed)`. - - If `observed` is matrix, conduct Pearson's independence test on the - input contingency matrix, which cannot contain negative entries or - columns or rows that sum up to 0. - - If `observed` is an RDD of LabeledPoint, conduct Pearson's independence - test for every feature against the label across the input RDD. - For each feature, the (feature, label) pairs are converted into a - contingency matrix for which the chi-squared statistic is computed. - All label and feature values must be categorical. - - .. note:: `observed` cannot contain negative values - - :param observed: it could be a vector containing the observed categorical - counts/relative frequencies, or the contingency matrix - (containing either counts or relative frequencies), - or an RDD of LabeledPoint containing the labeled dataset - with categorical features. Real-valued features will be - treated as categorical for each distinct value. - :param expected: Vector containing the expected categorical counts/relative - frequencies. `expected` is rescaled if the `expected` sum - differs from the `observed` sum. - :return: ChiSquaredTest object containing the test statistic, degrees - of freedom, p-value, the method used, and the null hypothesis. 
- - >>> from pyspark.mllib.linalg import Vectors, Matrices - >>> observed = Vectors.dense([4, 6, 5]) - >>> pearson = Statistics.chiSqTest(observed) - >>> print(pearson.statistic) - 0.4 - >>> pearson.degreesOfFreedom - 2 - >>> print(round(pearson.pValue, 4)) - 0.8187 - >>> pearson.method - u'pearson' - >>> pearson.nullHypothesis - u'observed follows the same distribution as expected.' - - >>> observed = Vectors.dense([21, 38, 43, 80]) - >>> expected = Vectors.dense([3, 5, 7, 20]) - >>> pearson = Statistics.chiSqTest(observed, expected) - >>> print(round(pearson.pValue, 4)) - 0.0027 - - >>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0] - >>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data)) - >>> print(round(chi.statistic, 4)) - 21.9958 - - >>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])), - ... LabeledPoint(0.0, Vectors.dense([1.5, 20.0])), - ... LabeledPoint(1.0, Vectors.dense([1.5, 30.0])), - ... LabeledPoint(0.0, Vectors.dense([3.5, 30.0])), - ... LabeledPoint(0.0, Vectors.dense([3.5, 40.0])), - ... LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),] - >>> rdd = sc.parallelize(data, 4) - >>> chi = Statistics.chiSqTest(rdd) - >>> print(chi[0].statistic) - 0.75 - >>> print(chi[1].statistic) - 1.5 - """ - if isinstance(observed, RDD): - if not isinstance(observed.first(), LabeledPoint): - raise ValueError("observed should be an RDD of LabeledPoint") - jmodels = callMLlibFunc("chiSqTest", observed) - return [ChiSqTestResult(m) for m in jmodels] - - if isinstance(observed, Matrix): - jmodel = callMLlibFunc("chiSqTest", observed) - else: - if expected and len(expected) != len(observed): - raise ValueError("`expected` should have same length with `observed`") - jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected) - return ChiSqTestResult(jmodel) - - @staticmethod - @ignore_unicode_prefix - def kolmogorovSmirnovTest(data, distName="norm", *params): - """ - Performs the Kolmogorov-Smirnov (KS) test for data sampled from - a continuous distribution. It tests the null hypothesis that - the data is generated from a particular distribution. - - The given data is sorted and the Empirical Cumulative - Distribution Function (ECDF) is calculated - which for a given point is the number of points having a CDF - value lesser than it divided by the total number of points. - - Since the data is sorted, this is a step function - that rises by (1 / length of data) for every ordered point. - - The KS statistic gives us the maximum distance between the - ECDF and the CDF. Intuitively if this statistic is large, the - probability that the null hypothesis is true becomes small. - For specific details of the implementation, please have a look - at the Scala documentation. - - :param data: RDD, samples from the data - :param distName: string, currently only "norm" is supported. - (Normal distribution) to calculate the - theoretical distribution of the data. - :param params: additional values which need to be provided for - a certain distribution. - If not provided, the default values are used. - :return: KolmogorovSmirnovTestResult object containing the test - statistic, degrees of freedom, p-value, - the method used, and the null hypothesis. 
- - >>> kstest = Statistics.kolmogorovSmirnovTest - >>> data = sc.parallelize([-1.0, 0.0, 1.0]) - >>> ksmodel = kstest(data, "norm") - >>> print(round(ksmodel.pValue, 3)) - 1.0 - >>> print(round(ksmodel.statistic, 3)) - 0.175 - >>> ksmodel.nullHypothesis - u'Sample follows theoretical distribution' - - >>> data = sc.parallelize([2.0, 3.0, 4.0]) - >>> ksmodel = kstest(data, "norm", 3.0, 1.0) - >>> print(round(ksmodel.pValue, 3)) - 1.0 - >>> print(round(ksmodel.statistic, 3)) - 0.175 - """ - if not isinstance(data, RDD): - raise TypeError("data should be an RDD, got %s." % type(data)) - if not isinstance(distName, basestring): - raise TypeError("distName should be a string, got %s." % type(distName)) - - params = [float(param) for param in params] - return KolmogorovSmirnovTestResult( - callMLlibFunc("kolmogorovSmirnovTest", data, distName, params)) - - -def _test(): - import doctest - import numpy - from pyspark.sql import SparkSession - try: - # Numpy 1.14+ changed it's string format. - numpy.set_printoptions(legacy='1.13') - except TypeError: - pass - globs = globals().copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("mllib.stat.statistics tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/distribution.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/distribution.py deleted file mode 100644 index 46f7a1d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/distribution.py +++ /dev/null @@ -1,32 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from collections import namedtuple - -__all__ = ['MultivariateGaussian'] - - -class MultivariateGaussian(namedtuple('MultivariateGaussian', ['mu', 'sigma'])): - - """Represents a (mu, sigma) tuple - - >>> m = MultivariateGaussian(Vectors.dense([11,12]),DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0))) - >>> (m.mu, m.sigma.toArray()) - (DenseVector([11.0, 12.0]), array([[ 1., 5.],[ 3., 2.]])) - >>> (m[0], m[1]) - (DenseVector([11.0, 12.0]), array([[ 1., 5.],[ 3., 2.]])) - """ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/test.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/test.py deleted file mode 100644 index 0abe104..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/stat/test.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from pyspark.mllib.common import inherit_doc, JavaModelWrapper - - -__all__ = ["ChiSqTestResult", "KolmogorovSmirnovTestResult"] - - -class TestResult(JavaModelWrapper): - """ - Base class for all test results. - """ - - @property - def pValue(self): - """ - The probability of obtaining a test statistic result at least as - extreme as the one that was actually observed, assuming that the - null hypothesis is true. - """ - return self._java_model.pValue() - - @property - def degreesOfFreedom(self): - """ - Returns the degree(s) of freedom of the hypothesis test. - Return type should be Number(e.g. Int, Double) or tuples of Numbers. - """ - return self._java_model.degreesOfFreedom() - - @property - def statistic(self): - """ - Test statistic. - """ - return self._java_model.statistic() - - @property - def nullHypothesis(self): - """ - Null hypothesis of the test. - """ - return self._java_model.nullHypothesis() - - def __str__(self): - return self._java_model.toString() - - -@inherit_doc -class ChiSqTestResult(TestResult): - """ - Contains test results for the chi-squared hypothesis test. - """ - - @property - def method(self): - """ - Name of the test method - """ - return self._java_model.method() - - -@inherit_doc -class KolmogorovSmirnovTestResult(TestResult): - """ - Contains test results for the Kolmogorov-Smirnov test. - """ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/tests.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/tests.py deleted file mode 100644 index 4c2ce13..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/tests.py +++ /dev/null @@ -1,1787 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Fuller unit tests for Python MLlib. 
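The TestResult wrappers removed above only expose properties of the underlying JVM objects; as a short sketch of reading them from a chi-squared result (assuming an active SparkContext, since the call goes through the JVM bridge):

# Hedged sketch only, not part of this patch.
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

result = Statistics.chiSqTest(Vectors.dense([4.0, 6.0, 5.0]))
print(result.method, result.statistic, result.degreesOfFreedom, result.pValue)
print(result.nullHypothesis)
print(str(result))   # full human-readable summary from the JVM side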
-""" - -import os -import sys -import tempfile -import array as pyarray -from math import sqrt -from time import time, sleep -from shutil import rmtree - -from numpy import ( - array, array_equal, zeros, inf, random, exp, dot, all, mean, abs, arange, tile, ones) -from numpy import sum as array_sum - -from py4j.protocol import Py4JJavaError -try: - import xmlrunner -except ImportError: - xmlrunner = None - -if sys.version > '3': - basestring = str - -if sys.version_info[:2] <= (2, 6): - try: - import unittest2 as unittest - except ImportError: - sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') - sys.exit(1) -else: - import unittest - -from pyspark import SparkContext -import pyspark.ml.linalg as newlinalg -from pyspark.mllib.common import _to_java_object_rdd -from pyspark.mllib.clustering import StreamingKMeans, StreamingKMeansModel -from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\ - DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT -from pyspark.mllib.linalg.distributed import RowMatrix -from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD -from pyspark.mllib.fpm import FPGrowth -from pyspark.mllib.recommendation import Rating -from pyspark.mllib.regression import LabeledPoint, StreamingLinearRegressionWithSGD -from pyspark.mllib.random import RandomRDDs -from pyspark.mllib.stat import Statistics -from pyspark.mllib.feature import HashingTF -from pyspark.mllib.feature import Word2Vec -from pyspark.mllib.feature import IDF -from pyspark.mllib.feature import StandardScaler, ElementwiseProduct -from pyspark.mllib.util import LinearDataGenerator -from pyspark.mllib.util import MLUtils -from pyspark.serializers import PickleSerializer -from pyspark.streaming import StreamingContext -from pyspark.sql import SparkSession -from pyspark.sql.utils import IllegalArgumentException -from pyspark.streaming import StreamingContext - -_have_scipy = False -try: - import scipy.sparse - _have_scipy = True -except: - # No SciPy, but that's okay, we'll skip those tests - pass - -ser = PickleSerializer() - - -class MLlibTestCase(unittest.TestCase): - def setUp(self): - self.sc = SparkContext('local[4]', "MLlib tests") - self.spark = SparkSession(self.sc) - - def tearDown(self): - self.spark.stop() - - -class MLLibStreamingTestCase(unittest.TestCase): - def setUp(self): - self.sc = SparkContext('local[4]', "MLlib tests") - self.ssc = StreamingContext(self.sc, 1.0) - - def tearDown(self): - self.ssc.stop(False) - self.sc.stop() - - @staticmethod - def _eventually(condition, timeout=30.0, catch_assertions=False): - """ - Wait a given amount of time for a condition to pass, else fail with an error. - This is a helper utility for streaming ML tests. - :param condition: Function that checks for termination conditions. - condition() can return: - - True: Conditions met. Return without error. - - other value: Conditions not met yet. Continue. Upon timeout, - include last such value in error message. - Note that this method may be called at any time during - streaming execution (e.g., even before any results - have been created). - :param timeout: Number of seconds to wait. Default 30 seconds. - :param catch_assertions: If False (default), do not catch AssertionErrors. - If True, catch AssertionErrors; continue, but save - error to throw upon timeout. 
- """ - start_time = time() - lastValue = None - while time() - start_time < timeout: - if catch_assertions: - try: - lastValue = condition() - except AssertionError as e: - lastValue = e - else: - lastValue = condition() - if lastValue is True: - return - sleep(0.01) - if isinstance(lastValue, AssertionError): - raise lastValue - else: - raise AssertionError( - "Test failed due to timeout after %g sec, with last condition returning: %s" - % (timeout, lastValue)) - - -def _squared_distance(a, b): - if isinstance(a, Vector): - return a.squared_distance(b) - else: - return b.squared_distance(a) - - -class VectorTests(MLlibTestCase): - - def _test_serialize(self, v): - self.assertEqual(v, ser.loads(ser.dumps(v))) - jvec = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(v))) - nv = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvec))) - self.assertEqual(v, nv) - vs = [v] * 100 - jvecs = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(vs))) - nvs = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvecs))) - self.assertEqual(vs, nvs) - - def test_serialize(self): - self._test_serialize(DenseVector(range(10))) - self._test_serialize(DenseVector(array([1., 2., 3., 4.]))) - self._test_serialize(DenseVector(pyarray.array('d', range(10)))) - self._test_serialize(SparseVector(4, {1: 1, 3: 2})) - self._test_serialize(SparseVector(3, {})) - self._test_serialize(DenseMatrix(2, 3, range(6))) - sm1 = SparseMatrix( - 3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0]) - self._test_serialize(sm1) - - def test_dot(self): - sv = SparseVector(4, {1: 1, 3: 2}) - dv = DenseVector(array([1., 2., 3., 4.])) - lst = DenseVector([1, 2, 3, 4]) - mat = array([[1., 2., 3., 4.], - [1., 2., 3., 4.], - [1., 2., 3., 4.], - [1., 2., 3., 4.]]) - arr = pyarray.array('d', [0, 1, 2, 3]) - self.assertEqual(10.0, sv.dot(dv)) - self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat))) - self.assertEqual(30.0, dv.dot(dv)) - self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat))) - self.assertEqual(30.0, lst.dot(dv)) - self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat))) - self.assertEqual(7.0, sv.dot(arr)) - - def test_squared_distance(self): - sv = SparseVector(4, {1: 1, 3: 2}) - dv = DenseVector(array([1., 2., 3., 4.])) - lst = DenseVector([4, 3, 2, 1]) - lst1 = [4, 3, 2, 1] - arr = pyarray.array('d', [0, 2, 1, 3]) - narr = array([0, 2, 1, 3]) - self.assertEqual(15.0, _squared_distance(sv, dv)) - self.assertEqual(25.0, _squared_distance(sv, lst)) - self.assertEqual(20.0, _squared_distance(dv, lst)) - self.assertEqual(15.0, _squared_distance(dv, sv)) - self.assertEqual(25.0, _squared_distance(lst, sv)) - self.assertEqual(20.0, _squared_distance(lst, dv)) - self.assertEqual(0.0, _squared_distance(sv, sv)) - self.assertEqual(0.0, _squared_distance(dv, dv)) - self.assertEqual(0.0, _squared_distance(lst, lst)) - self.assertEqual(25.0, _squared_distance(sv, lst1)) - self.assertEqual(3.0, _squared_distance(sv, arr)) - self.assertEqual(3.0, _squared_distance(sv, narr)) - - def test_hash(self): - v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v4 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertEqual(hash(v1), hash(v2)) - self.assertEqual(hash(v1), hash(v3)) - self.assertEqual(hash(v2), hash(v3)) - self.assertFalse(hash(v1) == hash(v4)) - self.assertFalse(hash(v2) == hash(v4)) - - def 
test_eq(self): - v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v4 = SparseVector(6, [(1, 1.0), (3, 5.5)]) - v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) - v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertEqual(v1, v2) - self.assertEqual(v1, v3) - self.assertFalse(v2 == v4) - self.assertFalse(v1 == v5) - self.assertFalse(v1 == v6) - - def test_equals(self): - indices = [1, 2, 4] - values = [1., 3., 2.] - self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.])) - self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.])) - self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.])) - self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.])) - - def test_conversion(self): - # numpy arrays should be automatically upcast to float64 - # tests for fix of [SPARK-5089] - v = array([1, 2, 3, 4], dtype='float64') - dv = DenseVector(v) - self.assertTrue(dv.array.dtype == 'float64') - v = array([1, 2, 3, 4], dtype='float32') - dv = DenseVector(v) - self.assertTrue(dv.array.dtype == 'float64') - - def test_sparse_vector_indexing(self): - sv = SparseVector(5, {1: 1, 3: 2}) - self.assertEqual(sv[0], 0.) - self.assertEqual(sv[3], 2.) - self.assertEqual(sv[1], 1.) - self.assertEqual(sv[2], 0.) - self.assertEqual(sv[4], 0.) - self.assertEqual(sv[-1], 0.) - self.assertEqual(sv[-2], 2.) - self.assertEqual(sv[-3], 0.) - self.assertEqual(sv[-5], 0.) - for ind in [5, -6]: - self.assertRaises(IndexError, sv.__getitem__, ind) - for ind in [7.8, '1']: - self.assertRaises(TypeError, sv.__getitem__, ind) - - zeros = SparseVector(4, {}) - self.assertEqual(zeros[0], 0.0) - self.assertEqual(zeros[3], 0.0) - for ind in [4, -5]: - self.assertRaises(IndexError, zeros.__getitem__, ind) - - empty = SparseVector(0, {}) - for ind in [-1, 0, 1]: - self.assertRaises(IndexError, empty.__getitem__, ind) - - def test_sparse_vector_iteration(self): - self.assertListEqual(list(SparseVector(3, [], [])), [0.0, 0.0, 0.0]) - self.assertListEqual(list(SparseVector(5, [0, 3], [1.0, 2.0])), [1.0, 0.0, 0.0, 2.0, 0.0]) - - def test_matrix_indexing(self): - mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10]) - expected = [[0, 6], [1, 8], [4, 10]] - for i in range(3): - for j in range(2): - self.assertEqual(mat[i, j], expected[i][j]) - - for i, j in [(-1, 0), (4, 1), (3, 4)]: - self.assertRaises(IndexError, mat.__getitem__, (i, j)) - - def test_repr_dense_matrix(self): - mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10]) - self.assertTrue( - repr(mat), - 'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)') - - mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True) - self.assertTrue( - repr(mat), - 'DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)') - - mat = DenseMatrix(6, 3, zeros(18)) - self.assertTrue( - repr(mat), - 'DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., \ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)') - - def test_repr_sparse_matrix(self): - sm1t = SparseMatrix( - 3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], - isTransposed=True) - self.assertTrue( - repr(sm1t), - 'SparseMatrix(3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], True)') - - indices = tile(arange(6), 3) - values = ones(18) - sm = SparseMatrix(6, 3, [0, 6, 12, 18], indices, values) - self.assertTrue( - repr(sm), "SparseMatrix(6, 3, [0, 6, 12, 18], \ - [0, 1, 2, 3, 4, 5, 0, 1, ..., 4, 5, 0, 1, 2, 3, 4, 5], \ - [1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..., \ - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], False)") - - self.assertTrue( - str(sm), - "6 X 3 CSCMatrix\n\ - (0,0) 1.0\n(1,0) 1.0\n(2,0) 1.0\n(3,0) 1.0\n(4,0) 1.0\n(5,0) 1.0\n\ - (0,1) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(3,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n\ - (0,2) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(3,2) 1.0\n..\n..") - - sm = SparseMatrix(1, 18, zeros(19), [], []) - self.assertTrue( - repr(sm), - 'SparseMatrix(1, 18, \ - [0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0], [], [], False)') - - def test_sparse_matrix(self): - # Test sparse matrix creation. - sm1 = SparseMatrix( - 3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0]) - self.assertEqual(sm1.numRows, 3) - self.assertEqual(sm1.numCols, 4) - self.assertEqual(sm1.colPtrs.tolist(), [0, 2, 2, 4, 4]) - self.assertEqual(sm1.rowIndices.tolist(), [1, 2, 1, 2]) - self.assertEqual(sm1.values.tolist(), [1.0, 2.0, 4.0, 5.0]) - self.assertTrue( - repr(sm1), - 'SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0], False)') - - # Test indexing - expected = [ - [0, 0, 0, 0], - [1, 0, 4, 0], - [2, 0, 5, 0]] - - for i in range(3): - for j in range(4): - self.assertEqual(expected[i][j], sm1[i, j]) - self.assertTrue(array_equal(sm1.toArray(), expected)) - - for i, j in [(-1, 1), (4, 3), (3, 5)]: - self.assertRaises(IndexError, sm1.__getitem__, (i, j)) - - # Test conversion to dense and sparse. - smnew = sm1.toDense().toSparse() - self.assertEqual(sm1.numRows, smnew.numRows) - self.assertEqual(sm1.numCols, smnew.numCols) - self.assertTrue(array_equal(sm1.colPtrs, smnew.colPtrs)) - self.assertTrue(array_equal(sm1.rowIndices, smnew.rowIndices)) - self.assertTrue(array_equal(sm1.values, smnew.values)) - - sm1t = SparseMatrix( - 3, 4, [0, 2, 3, 5], [0, 1, 2, 0, 2], [3.0, 2.0, 4.0, 9.0, 8.0], - isTransposed=True) - self.assertEqual(sm1t.numRows, 3) - self.assertEqual(sm1t.numCols, 4) - self.assertEqual(sm1t.colPtrs.tolist(), [0, 2, 3, 5]) - self.assertEqual(sm1t.rowIndices.tolist(), [0, 1, 2, 0, 2]) - self.assertEqual(sm1t.values.tolist(), [3.0, 2.0, 4.0, 9.0, 8.0]) - - expected = [ - [3, 2, 0, 0], - [0, 0, 4, 0], - [9, 0, 8, 0]] - - for i in range(3): - for j in range(4): - self.assertEqual(expected[i][j], sm1t[i, j]) - self.assertTrue(array_equal(sm1t.toArray(), expected)) - - def test_dense_matrix_is_transposed(self): - mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True) - mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9]) - self.assertEqual(mat1, mat) - - expected = [[0, 4], [1, 6], [3, 9]] - for i in range(3): - for j in range(2): - self.assertEqual(mat1[i, j], expected[i][j]) - self.assertTrue(array_equal(mat1.toArray(), expected)) - - sm = mat1.toSparse() - self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2])) - self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5])) - self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9])) - - def test_parse_vector(self): - a = DenseVector([]) - self.assertEqual(str(a), '[]') - self.assertEqual(Vectors.parse(str(a)), a) - a = DenseVector([3, 4, 6, 7]) - self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]') - self.assertEqual(Vectors.parse(str(a)), a) - a = SparseVector(4, [], []) - self.assertEqual(str(a), '(4,[],[])') - self.assertEqual(SparseVector.parse(str(a)), a) - a = SparseVector(4, [0, 2], [3, 4]) - self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])') - self.assertEqual(Vectors.parse(str(a)), a) - a = SparseVector(10, [0, 1], [4, 5]) - self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a) - - def test_norms(self): - a = DenseVector([0, 2, 3, -1]) - 
self.assertAlmostEqual(a.norm(2), 3.742, 3) - self.assertTrue(a.norm(1), 6) - self.assertTrue(a.norm(inf), 3) - a = SparseVector(4, [0, 2], [3, -4]) - self.assertAlmostEqual(a.norm(2), 5) - self.assertTrue(a.norm(1), 7) - self.assertTrue(a.norm(inf), 4) - - tmp = SparseVector(4, [0, 2], [3, 0]) - self.assertEqual(tmp.numNonzeros(), 1) - - def test_ml_mllib_vector_conversion(self): - # to ml - # dense - mllibDV = Vectors.dense([1, 2, 3]) - mlDV1 = newlinalg.Vectors.dense([1, 2, 3]) - mlDV2 = mllibDV.asML() - self.assertEqual(mlDV2, mlDV1) - # sparse - mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5}) - mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) - mlSV2 = mllibSV.asML() - self.assertEqual(mlSV2, mlSV1) - # from ml - # dense - mllibDV1 = Vectors.dense([1, 2, 3]) - mlDV = newlinalg.Vectors.dense([1, 2, 3]) - mllibDV2 = Vectors.fromML(mlDV) - self.assertEqual(mllibDV1, mllibDV2) - # sparse - mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5}) - mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) - mllibSV2 = Vectors.fromML(mlSV) - self.assertEqual(mllibSV1, mllibSV2) - - def test_ml_mllib_matrix_conversion(self): - # to ml - # dense - mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3]) - mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3]) - mlDM2 = mllibDM.asML() - self.assertEqual(mlDM2, mlDM1) - # transposed - mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True) - mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True) - mlDMt2 = mllibDMt.asML() - self.assertEqual(mlDMt2, mlDMt1) - # sparse - mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - mlSM2 = mllibSM.asML() - self.assertEqual(mlSM2, mlSM1) - # transposed - mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - mlSMt2 = mllibSMt.asML() - self.assertEqual(mlSMt2, mlSMt1) - # from ml - # dense - mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4]) - mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4]) - mllibDM2 = Matrices.fromML(mlDM) - self.assertEqual(mllibDM1, mllibDM2) - # transposed - mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True) - mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True) - mllibDMt2 = Matrices.fromML(mlDMt) - self.assertEqual(mllibDMt1, mllibDMt2) - # sparse - mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - mllibSM2 = Matrices.fromML(mlSM) - self.assertEqual(mllibSM1, mllibSM2) - # transposed - mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - mllibSMt2 = Matrices.fromML(mlSMt) - self.assertEqual(mllibSMt1, mllibSMt2) - - -class ListTests(MLlibTestCase): - - """ - Test MLlib algorithms on plain lists, to make sure they're passed through - as NumPy arrays. 
- """ - - def test_bisecting_kmeans(self): - from pyspark.mllib.clustering import BisectingKMeans - data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2) - bskm = BisectingKMeans() - model = bskm.train(self.sc.parallelize(data, 2), k=4) - p = array([0.0, 0.0]) - rdd_p = self.sc.parallelize([p]) - self.assertEqual(model.predict(p), model.predict(rdd_p).first()) - self.assertEqual(model.computeCost(p), model.computeCost(rdd_p)) - self.assertEqual(model.k, len(model.clusterCenters)) - - def test_kmeans(self): - from pyspark.mllib.clustering import KMeans - data = [ - [0, 1.1], - [0, 1.2], - [1.1, 0], - [1.2, 0], - ] - clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||", - initializationSteps=7, epsilon=1e-4) - self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1])) - self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3])) - - def test_kmeans_deterministic(self): - from pyspark.mllib.clustering import KMeans - X = range(0, 100, 10) - Y = range(0, 100, 10) - data = [[x, y] for x, y in zip(X, Y)] - clusters1 = KMeans.train(self.sc.parallelize(data), - 3, initializationMode="k-means||", - seed=42, initializationSteps=7, epsilon=1e-4) - clusters2 = KMeans.train(self.sc.parallelize(data), - 3, initializationMode="k-means||", - seed=42, initializationSteps=7, epsilon=1e-4) - centers1 = clusters1.centers - centers2 = clusters2.centers - for c1, c2 in zip(centers1, centers2): - # TODO: Allow small numeric difference. - self.assertTrue(array_equal(c1, c2)) - - def test_gmm(self): - from pyspark.mllib.clustering import GaussianMixture - data = self.sc.parallelize([ - [1, 2], - [8, 9], - [-4, -3], - [-6, -7], - ]) - clusters = GaussianMixture.train(data, 2, convergenceTol=0.001, - maxIterations=10, seed=1) - labels = clusters.predict(data).collect() - self.assertEqual(labels[0], labels[1]) - self.assertEqual(labels[2], labels[3]) - - def test_gmm_deterministic(self): - from pyspark.mllib.clustering import GaussianMixture - x = range(0, 100, 10) - y = range(0, 100, 10) - data = self.sc.parallelize([[a, b] for a, b in zip(x, y)]) - clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001, - maxIterations=10, seed=63) - clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001, - maxIterations=10, seed=63) - for c1, c2 in zip(clusters1.weights, clusters2.weights): - self.assertEqual(round(c1, 7), round(c2, 7)) - - def test_gmm_with_initial_model(self): - from pyspark.mllib.clustering import GaussianMixture - data = self.sc.parallelize([ - (-10, -5), (-9, -4), (10, 5), (9, 4) - ]) - - gmm1 = GaussianMixture.train(data, 2, convergenceTol=0.001, - maxIterations=10, seed=63) - gmm2 = GaussianMixture.train(data, 2, convergenceTol=0.001, - maxIterations=10, seed=63, initialModel=gmm1) - self.assertAlmostEqual((gmm1.weights - gmm2.weights).sum(), 0.0) - - def test_classification(self): - from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes - from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\ - RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel - data = [ - LabeledPoint(0.0, [1, 0, 0]), - LabeledPoint(1.0, [0, 1, 1]), - LabeledPoint(0.0, [2, 0, 0]), - LabeledPoint(1.0, [0, 2, 1]) - ] - rdd = self.sc.parallelize(data) - features = [p.features.tolist() for p in data] - - temp_dir = tempfile.mkdtemp() - - lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10) - self.assertTrue(lr_model.predict(features[0]) <= 0) - 
self.assertTrue(lr_model.predict(features[1]) > 0) - self.assertTrue(lr_model.predict(features[2]) <= 0) - self.assertTrue(lr_model.predict(features[3]) > 0) - - svm_model = SVMWithSGD.train(rdd, iterations=10) - self.assertTrue(svm_model.predict(features[0]) <= 0) - self.assertTrue(svm_model.predict(features[1]) > 0) - self.assertTrue(svm_model.predict(features[2]) <= 0) - self.assertTrue(svm_model.predict(features[3]) > 0) - - nb_model = NaiveBayes.train(rdd) - self.assertTrue(nb_model.predict(features[0]) <= 0) - self.assertTrue(nb_model.predict(features[1]) > 0) - self.assertTrue(nb_model.predict(features[2]) <= 0) - self.assertTrue(nb_model.predict(features[3]) > 0) - - categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories - dt_model = DecisionTree.trainClassifier( - rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4) - self.assertTrue(dt_model.predict(features[0]) <= 0) - self.assertTrue(dt_model.predict(features[1]) > 0) - self.assertTrue(dt_model.predict(features[2]) <= 0) - self.assertTrue(dt_model.predict(features[3]) > 0) - - dt_model_dir = os.path.join(temp_dir, "dt") - dt_model.save(self.sc, dt_model_dir) - same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir) - self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString()) - - rf_model = RandomForest.trainClassifier( - rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, - maxBins=4, seed=1) - self.assertTrue(rf_model.predict(features[0]) <= 0) - self.assertTrue(rf_model.predict(features[1]) > 0) - self.assertTrue(rf_model.predict(features[2]) <= 0) - self.assertTrue(rf_model.predict(features[3]) > 0) - - rf_model_dir = os.path.join(temp_dir, "rf") - rf_model.save(self.sc, rf_model_dir) - same_rf_model = RandomForestModel.load(self.sc, rf_model_dir) - self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString()) - - gbt_model = GradientBoostedTrees.trainClassifier( - rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4) - self.assertTrue(gbt_model.predict(features[0]) <= 0) - self.assertTrue(gbt_model.predict(features[1]) > 0) - self.assertTrue(gbt_model.predict(features[2]) <= 0) - self.assertTrue(gbt_model.predict(features[3]) > 0) - - gbt_model_dir = os.path.join(temp_dir, "gbt") - gbt_model.save(self.sc, gbt_model_dir) - same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir) - self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString()) - - try: - rmtree(temp_dir) - except OSError: - pass - - def test_regression(self): - from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \ - RidgeRegressionWithSGD - from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees - data = [ - LabeledPoint(-1.0, [0, -1]), - LabeledPoint(1.0, [0, 1]), - LabeledPoint(-1.0, [0, -2]), - LabeledPoint(1.0, [0, 2]) - ] - rdd = self.sc.parallelize(data) - features = [p.features.tolist() for p in data] - - lr_model = LinearRegressionWithSGD.train(rdd, iterations=10) - self.assertTrue(lr_model.predict(features[0]) <= 0) - self.assertTrue(lr_model.predict(features[1]) > 0) - self.assertTrue(lr_model.predict(features[2]) <= 0) - self.assertTrue(lr_model.predict(features[3]) > 0) - - lasso_model = LassoWithSGD.train(rdd, iterations=10) - self.assertTrue(lasso_model.predict(features[0]) <= 0) - self.assertTrue(lasso_model.predict(features[1]) > 0) - self.assertTrue(lasso_model.predict(features[2]) <= 0) - self.assertTrue(lasso_model.predict(features[3]) > 0) - - rr_model = 
RidgeRegressionWithSGD.train(rdd, iterations=10) - self.assertTrue(rr_model.predict(features[0]) <= 0) - self.assertTrue(rr_model.predict(features[1]) > 0) - self.assertTrue(rr_model.predict(features[2]) <= 0) - self.assertTrue(rr_model.predict(features[3]) > 0) - - categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories - dt_model = DecisionTree.trainRegressor( - rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4) - self.assertTrue(dt_model.predict(features[0]) <= 0) - self.assertTrue(dt_model.predict(features[1]) > 0) - self.assertTrue(dt_model.predict(features[2]) <= 0) - self.assertTrue(dt_model.predict(features[3]) > 0) - - rf_model = RandomForest.trainRegressor( - rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1) - self.assertTrue(rf_model.predict(features[0]) <= 0) - self.assertTrue(rf_model.predict(features[1]) > 0) - self.assertTrue(rf_model.predict(features[2]) <= 0) - self.assertTrue(rf_model.predict(features[3]) > 0) - - gbt_model = GradientBoostedTrees.trainRegressor( - rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4) - self.assertTrue(gbt_model.predict(features[0]) <= 0) - self.assertTrue(gbt_model.predict(features[1]) > 0) - self.assertTrue(gbt_model.predict(features[2]) <= 0) - self.assertTrue(gbt_model.predict(features[3]) > 0) - - try: - LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10) - LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10) - RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10) - except ValueError: - self.fail() - - # Verify that maxBins is being passed through - GradientBoostedTrees.trainRegressor( - rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32) - with self.assertRaises(Exception) as cm: - GradientBoostedTrees.trainRegressor( - rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1) - - -class StatTests(MLlibTestCase): - # SPARK-4023 - def test_col_with_different_rdds(self): - # numpy - data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10) - summary = Statistics.colStats(data) - self.assertEqual(1000, summary.count()) - # array - data = self.sc.parallelize([range(10)] * 10) - summary = Statistics.colStats(data) - self.assertEqual(10, summary.count()) - # array - data = self.sc.parallelize([pyarray.array("d", range(10))] * 10) - summary = Statistics.colStats(data) - self.assertEqual(10, summary.count()) - - def test_col_norms(self): - data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10) - summary = Statistics.colStats(data) - self.assertEqual(10, len(summary.normL1())) - self.assertEqual(10, len(summary.normL2())) - - data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x)) - summary2 = Statistics.colStats(data2) - self.assertEqual(array([45.0]), summary2.normL1()) - import math - expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10)))) - self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14) - - -class VectorUDTTests(MLlibTestCase): - - dv0 = DenseVector([]) - dv1 = DenseVector([1.0, 2.0]) - sv0 = SparseVector(2, [], []) - sv1 = SparseVector(2, [1], [2.0]) - udt = VectorUDT() - - def test_json_schema(self): - self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt) - - def test_serialization(self): - for v in [self.dv0, self.dv1, self.sv0, self.sv1]: - self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v))) - - def test_infer_schema(self): - rdd = 
self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)]) - df = rdd.toDF() - schema = df.schema - field = [f for f in schema.fields if f.name == "features"][0] - self.assertEqual(field.dataType, self.udt) - vectors = df.rdd.map(lambda p: p.features).collect() - self.assertEqual(len(vectors), 2) - for v in vectors: - if isinstance(v, SparseVector): - self.assertEqual(v, self.sv1) - elif isinstance(v, DenseVector): - self.assertEqual(v, self.dv1) - else: - raise TypeError("expecting a vector but got %r of type %r" % (v, type(v))) - - -class MatrixUDTTests(MLlibTestCase): - - dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10]) - dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True) - sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0]) - sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True) - udt = MatrixUDT() - - def test_json_schema(self): - self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt) - - def test_serialization(self): - for m in [self.dm1, self.dm2, self.sm1, self.sm2]: - self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m))) - - def test_infer_schema(self): - rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)]) - df = rdd.toDF() - schema = df.schema - self.assertTrue(schema.fields[1].dataType, self.udt) - matrices = df.rdd.map(lambda x: x._2).collect() - self.assertEqual(len(matrices), 2) - for m in matrices: - if isinstance(m, DenseMatrix): - self.assertTrue(m, self.dm1) - elif isinstance(m, SparseMatrix): - self.assertTrue(m, self.sm1) - else: - raise ValueError("Expected a matrix but got type %r" % type(m)) - - -@unittest.skipIf(not _have_scipy, "SciPy not installed") -class SciPyTests(MLlibTestCase): - - """ - Test both vector operations and MLlib algorithms with SciPy sparse matrices, - if SciPy is available. 
- """ - - def test_serialize(self): - from scipy.sparse import lil_matrix - lil = lil_matrix((4, 1)) - lil[1, 0] = 1 - lil[3, 0] = 2 - sv = SparseVector(4, {1: 1, 3: 2}) - self.assertEqual(sv, _convert_to_vector(lil)) - self.assertEqual(sv, _convert_to_vector(lil.tocsc())) - self.assertEqual(sv, _convert_to_vector(lil.tocoo())) - self.assertEqual(sv, _convert_to_vector(lil.tocsr())) - self.assertEqual(sv, _convert_to_vector(lil.todok())) - - def serialize(l): - return ser.loads(ser.dumps(_convert_to_vector(l))) - self.assertEqual(sv, serialize(lil)) - self.assertEqual(sv, serialize(lil.tocsc())) - self.assertEqual(sv, serialize(lil.tocsr())) - self.assertEqual(sv, serialize(lil.todok())) - - def test_convert_to_vector(self): - from scipy.sparse import csc_matrix - # Create a CSC matrix with non-sorted indices - indptr = array([0, 2]) - indices = array([3, 1]) - data = array([2.0, 1.0]) - csc = csc_matrix((data, indices, indptr)) - self.assertFalse(csc.has_sorted_indices) - sv = SparseVector(4, {1: 1, 3: 2}) - self.assertEqual(sv, _convert_to_vector(csc)) - - def test_dot(self): - from scipy.sparse import lil_matrix - lil = lil_matrix((4, 1)) - lil[1, 0] = 1 - lil[3, 0] = 2 - dv = DenseVector(array([1., 2., 3., 4.])) - self.assertEqual(10.0, dv.dot(lil)) - - def test_squared_distance(self): - from scipy.sparse import lil_matrix - lil = lil_matrix((4, 1)) - lil[1, 0] = 3 - lil[3, 0] = 2 - dv = DenseVector(array([1., 2., 3., 4.])) - sv = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4}) - self.assertEqual(15.0, dv.squared_distance(lil)) - self.assertEqual(15.0, sv.squared_distance(lil)) - - def scipy_matrix(self, size, values): - """Create a column SciPy matrix from a dictionary of values""" - from scipy.sparse import lil_matrix - lil = lil_matrix((size, 1)) - for key, value in values.items(): - lil[key, 0] = value - return lil - - def test_clustering(self): - from pyspark.mllib.clustering import KMeans - data = [ - self.scipy_matrix(3, {1: 1.0}), - self.scipy_matrix(3, {1: 1.1}), - self.scipy_matrix(3, {2: 1.0}), - self.scipy_matrix(3, {2: 1.1}) - ] - clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||") - self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1])) - self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3])) - - def test_classification(self): - from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes - from pyspark.mllib.tree import DecisionTree - data = [ - LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})), - LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})), - LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})), - LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0})) - ] - rdd = self.sc.parallelize(data) - features = [p.features for p in data] - - lr_model = LogisticRegressionWithSGD.train(rdd) - self.assertTrue(lr_model.predict(features[0]) <= 0) - self.assertTrue(lr_model.predict(features[1]) > 0) - self.assertTrue(lr_model.predict(features[2]) <= 0) - self.assertTrue(lr_model.predict(features[3]) > 0) - - svm_model = SVMWithSGD.train(rdd) - self.assertTrue(svm_model.predict(features[0]) <= 0) - self.assertTrue(svm_model.predict(features[1]) > 0) - self.assertTrue(svm_model.predict(features[2]) <= 0) - self.assertTrue(svm_model.predict(features[3]) > 0) - - nb_model = NaiveBayes.train(rdd) - self.assertTrue(nb_model.predict(features[0]) <= 0) - self.assertTrue(nb_model.predict(features[1]) > 0) - self.assertTrue(nb_model.predict(features[2]) <= 0) - self.assertTrue(nb_model.predict(features[3]) > 
0) - - categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories - dt_model = DecisionTree.trainClassifier(rdd, numClasses=2, - categoricalFeaturesInfo=categoricalFeaturesInfo) - self.assertTrue(dt_model.predict(features[0]) <= 0) - self.assertTrue(dt_model.predict(features[1]) > 0) - self.assertTrue(dt_model.predict(features[2]) <= 0) - self.assertTrue(dt_model.predict(features[3]) > 0) - - def test_regression(self): - from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \ - RidgeRegressionWithSGD - from pyspark.mllib.tree import DecisionTree - data = [ - LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})), - LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})), - LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})), - LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0})) - ] - rdd = self.sc.parallelize(data) - features = [p.features for p in data] - - lr_model = LinearRegressionWithSGD.train(rdd) - self.assertTrue(lr_model.predict(features[0]) <= 0) - self.assertTrue(lr_model.predict(features[1]) > 0) - self.assertTrue(lr_model.predict(features[2]) <= 0) - self.assertTrue(lr_model.predict(features[3]) > 0) - - lasso_model = LassoWithSGD.train(rdd) - self.assertTrue(lasso_model.predict(features[0]) <= 0) - self.assertTrue(lasso_model.predict(features[1]) > 0) - self.assertTrue(lasso_model.predict(features[2]) <= 0) - self.assertTrue(lasso_model.predict(features[3]) > 0) - - rr_model = RidgeRegressionWithSGD.train(rdd) - self.assertTrue(rr_model.predict(features[0]) <= 0) - self.assertTrue(rr_model.predict(features[1]) > 0) - self.assertTrue(rr_model.predict(features[2]) <= 0) - self.assertTrue(rr_model.predict(features[3]) > 0) - - categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories - dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) - self.assertTrue(dt_model.predict(features[0]) <= 0) - self.assertTrue(dt_model.predict(features[1]) > 0) - self.assertTrue(dt_model.predict(features[2]) <= 0) - self.assertTrue(dt_model.predict(features[3]) > 0) - - -class ChiSqTestTests(MLlibTestCase): - def test_goodness_of_fit(self): - from numpy import inf - - observed = Vectors.dense([4, 6, 5]) - pearson = Statistics.chiSqTest(observed) - - # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))` - self.assertEqual(pearson.statistic, 0.4) - self.assertEqual(pearson.degreesOfFreedom, 2) - self.assertAlmostEqual(pearson.pValue, 0.8187, 4) - - # Different expected and observed sum - observed1 = Vectors.dense([21, 38, 43, 80]) - expected1 = Vectors.dense([3, 5, 7, 20]) - pearson1 = Statistics.chiSqTest(observed1, expected1) - - # Results validated against the R command - # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))` - self.assertAlmostEqual(pearson1.statistic, 14.1429, 4) - self.assertEqual(pearson1.degreesOfFreedom, 3) - self.assertAlmostEqual(pearson1.pValue, 0.002717, 4) - - # Vectors with different sizes - observed3 = Vectors.dense([1.0, 2.0, 3.0]) - expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0]) - self.assertRaises(ValueError, Statistics.chiSqTest, observed3, expected3) - - # Negative counts in observed - neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0]) - self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_obs, expected1) - - # Count = 0.0 in expected but not observed - zero_expected = Vectors.dense([1.0, 0.0, 3.0]) - pearson_inf = Statistics.chiSqTest(observed, zero_expected) - self.assertEqual(pearson_inf.statistic, inf) - self.assertEqual(pearson_inf.degreesOfFreedom, 2) - 
self.assertEqual(pearson_inf.pValue, 0.0) - - # 0.0 in expected and observed simultaneously - zero_observed = Vectors.dense([2.0, 0.0, 1.0]) - self.assertRaises( - IllegalArgumentException, Statistics.chiSqTest, zero_observed, zero_expected) - - def test_matrix_independence(self): - data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0] - chi = Statistics.chiSqTest(Matrices.dense(3, 4, data)) - - # Results validated against R command - # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))` - self.assertAlmostEqual(chi.statistic, 21.9958, 4) - self.assertEqual(chi.degreesOfFreedom, 6) - self.assertAlmostEqual(chi.pValue, 0.001213, 4) - - # Negative counts - neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0]) - self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_counts) - - # Row sum = 0.0 - row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0]) - self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, row_zero) - - # Column sum = 0.0 - col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0]) - self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, col_zero) - - def test_chi_sq_pearson(self): - data = [ - LabeledPoint(0.0, Vectors.dense([0.5, 10.0])), - LabeledPoint(0.0, Vectors.dense([1.5, 20.0])), - LabeledPoint(1.0, Vectors.dense([1.5, 30.0])), - LabeledPoint(0.0, Vectors.dense([3.5, 30.0])), - LabeledPoint(0.0, Vectors.dense([3.5, 40.0])), - LabeledPoint(1.0, Vectors.dense([3.5, 40.0])) - ] - - for numParts in [2, 4, 6, 8]: - chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts)) - feature1 = chi[0] - self.assertEqual(feature1.statistic, 0.75) - self.assertEqual(feature1.degreesOfFreedom, 2) - self.assertAlmostEqual(feature1.pValue, 0.6873, 4) - - feature2 = chi[1] - self.assertEqual(feature2.statistic, 1.5) - self.assertEqual(feature2.degreesOfFreedom, 3) - self.assertAlmostEqual(feature2.pValue, 0.6823, 4) - - def test_right_number_of_results(self): - num_cols = 1001 - sparse_data = [ - LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])), - LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)])) - ] - chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data)) - self.assertEqual(len(chi), num_cols) - self.assertIsNotNone(chi[1000]) - - -class KolmogorovSmirnovTest(MLlibTestCase): - - def test_R_implementation_equivalence(self): - data = self.sc.parallelize([ - 1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501, - -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555, - -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063, - -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691, - 0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942 - ]) - model = Statistics.kolmogorovSmirnovTest(data, "norm") - self.assertAlmostEqual(model.statistic, 0.189, 3) - self.assertAlmostEqual(model.pValue, 0.422, 3) - - model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) - self.assertAlmostEqual(model.statistic, 0.189, 3) - self.assertAlmostEqual(model.pValue, 0.422, 3) - - -class SerDeTest(MLlibTestCase): - def test_to_java_object_rdd(self): # SPARK-6660 - data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0) - self.assertEqual(_to_java_object_rdd(data).count(), 10) - - -class FeatureTest(MLlibTestCase): - def test_idf_model(self): - data = [ - Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]), - Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]), - Vectors.dense([1, 4, 1, 0, 0, 4, 9, 
0, 1, 2, 0]), - Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9]) - ] - model = IDF().fit(self.sc.parallelize(data, 2)) - idf = model.idf() - self.assertEqual(len(idf), 11) - - -class Word2VecTests(MLlibTestCase): - def test_word2vec_setters(self): - model = Word2Vec() \ - .setVectorSize(2) \ - .setLearningRate(0.01) \ - .setNumPartitions(2) \ - .setNumIterations(10) \ - .setSeed(1024) \ - .setMinCount(3) \ - .setWindowSize(6) - self.assertEqual(model.vectorSize, 2) - self.assertTrue(model.learningRate < 0.02) - self.assertEqual(model.numPartitions, 2) - self.assertEqual(model.numIterations, 10) - self.assertEqual(model.seed, 1024) - self.assertEqual(model.minCount, 3) - self.assertEqual(model.windowSize, 6) - - def test_word2vec_get_vectors(self): - data = [ - ["a", "b", "c", "d", "e", "f", "g"], - ["a", "b", "c", "d", "e", "f"], - ["a", "b", "c", "d", "e"], - ["a", "b", "c", "d"], - ["a", "b", "c"], - ["a", "b"], - ["a"] - ] - model = Word2Vec().fit(self.sc.parallelize(data)) - self.assertEqual(len(model.getVectors()), 3) - - -class StandardScalerTests(MLlibTestCase): - def test_model_setters(self): - data = [ - [1.0, 2.0, 3.0], - [2.0, 3.0, 4.0], - [3.0, 4.0, 5.0] - ] - model = StandardScaler().fit(self.sc.parallelize(data)) - self.assertIsNotNone(model.setWithMean(True)) - self.assertIsNotNone(model.setWithStd(True)) - self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([-1.0, -1.0, -1.0])) - - def test_model_transform(self): - data = [ - [1.0, 2.0, 3.0], - [2.0, 3.0, 4.0], - [3.0, 4.0, 5.0] - ] - model = StandardScaler().fit(self.sc.parallelize(data)) - self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0])) - - -class ElementwiseProductTests(MLlibTestCase): - def test_model_transform(self): - weight = Vectors.dense([3, 2, 1]) - - densevec = Vectors.dense([4, 5, 6]) - sparsevec = Vectors.sparse(3, [0], [1]) - eprod = ElementwiseProduct(weight) - self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6])) - self.assertEqual( - eprod.transform(sparsevec), SparseVector(3, [0], [3])) - - -class StreamingKMeansTest(MLLibStreamingTestCase): - def test_model_params(self): - """Test that the model params are set correctly""" - stkm = StreamingKMeans() - stkm.setK(5).setDecayFactor(0.0) - self.assertEqual(stkm._k, 5) - self.assertEqual(stkm._decayFactor, 0.0) - - # Model not set yet. 
- self.assertIsNone(stkm.latestModel()) - self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0]) - - stkm.setInitialCenters( - centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0]) - self.assertEqual( - stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]]) - self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0]) - - def test_accuracy_for_single_center(self): - """Test that parameters obtained are correct for a single center.""" - centers, batches = self.streamingKMeansDataGenerator( - batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0) - stkm = StreamingKMeans(1) - stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.]) - input_stream = self.ssc.queueStream( - [self.sc.parallelize(batch, 1) for batch in batches]) - stkm.trainOn(input_stream) - - self.ssc.start() - - def condition(): - self.assertEqual(stkm.latestModel().clusterWeights, [25.0]) - return True - self._eventually(condition, catch_assertions=True) - - realCenters = array_sum(array(centers), axis=0) - for i in range(5): - modelCenters = stkm.latestModel().centers[0][i] - self.assertAlmostEqual(centers[0][i], modelCenters, 1) - self.assertAlmostEqual(realCenters[i], modelCenters, 1) - - def streamingKMeansDataGenerator(self, batches, numPoints, - k, d, r, seed, centers=None): - rng = random.RandomState(seed) - - # Generate centers. - centers = [rng.randn(d) for i in range(k)] - - return centers, [[Vectors.dense(centers[j % k] + r * rng.randn(d)) - for j in range(numPoints)] - for i in range(batches)] - - def test_trainOn_model(self): - """Test the model on toy data with four clusters.""" - stkm = StreamingKMeans() - initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]] - stkm.setInitialCenters( - centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0]) - - # Create a toy dataset by setting a tiny offset for each point. - offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]] - batches = [] - for offset in offsets: - batches.append([[offset[0] + center[0], offset[1] + center[1]] - for center in initCenters]) - - batches = [self.sc.parallelize(batch, 1) for batch in batches] - input_stream = self.ssc.queueStream(batches) - stkm.trainOn(input_stream) - self.ssc.start() - - # Give enough time to train the model. 
- def condition(): - finalModel = stkm.latestModel() - self.assertTrue(all(finalModel.centers == array(initCenters))) - self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0]) - return True - self._eventually(condition, catch_assertions=True) - - def test_predictOn_model(self): - """Test that the model predicts correctly on toy data.""" - stkm = StreamingKMeans() - stkm._model = StreamingKMeansModel( - clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]], - clusterWeights=[1.0, 1.0, 1.0, 1.0]) - - predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]] - predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data] - predict_stream = self.ssc.queueStream(predict_data) - predict_val = stkm.predictOn(predict_stream) - - result = [] - - def update(rdd): - rdd_collect = rdd.collect() - if rdd_collect: - result.append(rdd_collect) - - predict_val.foreachRDD(update) - self.ssc.start() - - def condition(): - self.assertEqual(result, [[0], [1], [2], [3]]) - return True - - self._eventually(condition, catch_assertions=True) - - @unittest.skip("SPARK-10086: Flaky StreamingKMeans test in PySpark") - def test_trainOn_predictOn(self): - """Test that prediction happens on the updated model.""" - stkm = StreamingKMeans(decayFactor=0.0, k=2) - stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0]) - - # Since decay factor is set to zero, once the first batch - # is passed the clusterCenters are updated to [-0.5, 0.7] - # which causes 0.2 & 0.3 to be classified as 1, even though the - # classification based in the initial model would have been 0 - # proving that the model is updated. - batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]] - batches = [self.sc.parallelize(batch) for batch in batches] - input_stream = self.ssc.queueStream(batches) - predict_results = [] - - def collect(rdd): - rdd_collect = rdd.collect() - if rdd_collect: - predict_results.append(rdd_collect) - - stkm.trainOn(input_stream) - predict_stream = stkm.predictOn(input_stream) - predict_stream.foreachRDD(collect) - - self.ssc.start() - - def condition(): - self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]]) - return True - - self._eventually(condition, catch_assertions=True) - - -class LinearDataGeneratorTests(MLlibTestCase): - def test_dim(self): - linear_data = LinearDataGenerator.generateLinearInput( - intercept=0.0, weights=[0.0, 0.0, 0.0], - xMean=[0.0, 0.0, 0.0], xVariance=[0.33, 0.33, 0.33], - nPoints=4, seed=0, eps=0.1) - self.assertEqual(len(linear_data), 4) - for point in linear_data: - self.assertEqual(len(point.features), 3) - - linear_data = LinearDataGenerator.generateLinearRDD( - sc=self.sc, nexamples=6, nfeatures=2, eps=0.1, - nParts=2, intercept=0.0).collect() - self.assertEqual(len(linear_data), 6) - for point in linear_data: - self.assertEqual(len(point.features), 2) - - -class StreamingLogisticRegressionWithSGDTests(MLLibStreamingTestCase): - - @staticmethod - def generateLogisticInput(offset, scale, nPoints, seed): - """ - Generate 1 / (1 + exp(-x * scale + offset)) - - where, - x is randomnly distributed and the threshold - and labels for each sample in x is obtained from a random uniform - distribution. - """ - rng = random.RandomState(seed) - x = rng.randn(nPoints) - sigmoid = 1. 
/ (1 + exp(-(dot(x, scale) + offset))) - y_p = rng.rand(nPoints) - cut_off = y_p <= sigmoid - y_p[cut_off] = 1.0 - y_p[~cut_off] = 0.0 - return [ - LabeledPoint(y_p[i], Vectors.dense([x[i]])) - for i in range(nPoints)] - - def test_parameter_accuracy(self): - """ - Test that the final value of weights is close to the desired value. - """ - input_batches = [ - self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i)) - for i in range(20)] - input_stream = self.ssc.queueStream(input_batches) - - slr = StreamingLogisticRegressionWithSGD( - stepSize=0.2, numIterations=25) - slr.setInitialWeights([0.0]) - slr.trainOn(input_stream) - - self.ssc.start() - - def condition(): - rel = (1.5 - slr.latestModel().weights.array[0]) / 1.5 - self.assertAlmostEqual(rel, 0.1, 1) - return True - - self._eventually(condition, catch_assertions=True) - - def test_convergence(self): - """ - Test that weights converge to the required value on toy data. - """ - input_batches = [ - self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i)) - for i in range(20)] - input_stream = self.ssc.queueStream(input_batches) - models = [] - - slr = StreamingLogisticRegressionWithSGD( - stepSize=0.2, numIterations=25) - slr.setInitialWeights([0.0]) - slr.trainOn(input_stream) - input_stream.foreachRDD( - lambda x: models.append(slr.latestModel().weights[0])) - - self.ssc.start() - - def condition(): - self.assertEqual(len(models), len(input_batches)) - return True - - # We want all batches to finish for this test. - self._eventually(condition, 60.0, catch_assertions=True) - - t_models = array(models) - diff = t_models[1:] - t_models[:-1] - # Test that weights improve with a small tolerance - self.assertTrue(all(diff >= -0.1)) - self.assertTrue(array_sum(diff > 0) > 1) - - @staticmethod - def calculate_accuracy_error(true, predicted): - return sum(abs(array(true) - array(predicted))) / len(true) - - def test_predictions(self): - """Test predicted values on a toy model.""" - input_batches = [] - for i in range(20): - batch = self.sc.parallelize( - self.generateLogisticInput(0, 1.5, 100, 42 + i)) - input_batches.append(batch.map(lambda x: (x.label, x.features))) - input_stream = self.ssc.queueStream(input_batches) - - slr = StreamingLogisticRegressionWithSGD( - stepSize=0.2, numIterations=25) - slr.setInitialWeights([1.5]) - predict_stream = slr.predictOnValues(input_stream) - true_predicted = [] - predict_stream.foreachRDD(lambda x: true_predicted.append(x.collect())) - self.ssc.start() - - def condition(): - self.assertEqual(len(true_predicted), len(input_batches)) - return True - - self._eventually(condition, catch_assertions=True) - - # Test that the accuracy error is no more than 0.4 on each batch. - for batch in true_predicted: - true, predicted = zip(*batch) - self.assertTrue( - self.calculate_accuracy_error(true, predicted) < 0.4) - - def test_training_and_prediction(self): - """Test that the model improves on toy data with no. 
of batches""" - input_batches = [ - self.sc.parallelize(self.generateLogisticInput(0, 1.5, 100, 42 + i)) - for i in range(20)] - predict_batches = [ - b.map(lambda lp: (lp.label, lp.features)) for b in input_batches] - - slr = StreamingLogisticRegressionWithSGD( - stepSize=0.01, numIterations=25) - slr.setInitialWeights([-0.1]) - errors = [] - - def collect_errors(rdd): - true, predicted = zip(*rdd.collect()) - errors.append(self.calculate_accuracy_error(true, predicted)) - - true_predicted = [] - input_stream = self.ssc.queueStream(input_batches) - predict_stream = self.ssc.queueStream(predict_batches) - slr.trainOn(input_stream) - ps = slr.predictOnValues(predict_stream) - ps.foreachRDD(lambda x: collect_errors(x)) - - self.ssc.start() - - def condition(): - # Test that the improvement in error is > 0.3 - if len(errors) == len(predict_batches): - self.assertGreater(errors[1] - errors[-1], 0.3) - if len(errors) >= 3 and errors[1] - errors[-1] > 0.3: - return True - return "Latest errors: " + ", ".join(map(lambda x: str(x), errors)) - - self._eventually(condition) - - -class StreamingLinearRegressionWithTests(MLLibStreamingTestCase): - - def assertArrayAlmostEqual(self, array1, array2, dec): - for i, j in array1, array2: - self.assertAlmostEqual(i, j, dec) - - def test_parameter_accuracy(self): - """Test that coefs are predicted accurately by fitting on toy data.""" - - # Test that fitting (10*X1 + 10*X2), (X1, X2) gives coefficients - # (10, 10) - slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25) - slr.setInitialWeights([0.0, 0.0]) - xMean = [0.0, 0.0] - xVariance = [1.0 / 3.0, 1.0 / 3.0] - - # Create ten batches with 100 sample points in each. - batches = [] - for i in range(10): - batch = LinearDataGenerator.generateLinearInput( - 0.0, [10.0, 10.0], xMean, xVariance, 100, 42 + i, 0.1) - batches.append(self.sc.parallelize(batch)) - - input_stream = self.ssc.queueStream(batches) - slr.trainOn(input_stream) - self.ssc.start() - - def condition(): - self.assertArrayAlmostEqual( - slr.latestModel().weights.array, [10., 10.], 1) - self.assertAlmostEqual(slr.latestModel().intercept, 0.0, 1) - return True - - self._eventually(condition, catch_assertions=True) - - def test_parameter_convergence(self): - """Test that the model parameters improve with streaming data.""" - slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25) - slr.setInitialWeights([0.0]) - - # Create ten batches with 100 sample points in each. - batches = [] - for i in range(10): - batch = LinearDataGenerator.generateLinearInput( - 0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1) - batches.append(self.sc.parallelize(batch)) - - model_weights = [] - input_stream = self.ssc.queueStream(batches) - input_stream.foreachRDD( - lambda x: model_weights.append(slr.latestModel().weights[0])) - slr.trainOn(input_stream) - self.ssc.start() - - def condition(): - self.assertEqual(len(model_weights), len(batches)) - return True - - # We want all batches to finish for this test. - self._eventually(condition, catch_assertions=True) - - w = array(model_weights) - diff = w[1:] - w[:-1] - self.assertTrue(all(diff >= -0.1)) - - def test_prediction(self): - """Test prediction on a model with weights already set.""" - # Create a model with initial Weights equal to coefs - slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25) - slr.setInitialWeights([10.0, 10.0]) - - # Create ten batches with 100 sample points in each. 
- batches = [] - for i in range(10): - batch = LinearDataGenerator.generateLinearInput( - 0.0, [10.0, 10.0], [0.0, 0.0], [1.0 / 3.0, 1.0 / 3.0], - 100, 42 + i, 0.1) - batches.append( - self.sc.parallelize(batch).map(lambda lp: (lp.label, lp.features))) - - input_stream = self.ssc.queueStream(batches) - output_stream = slr.predictOnValues(input_stream) - samples = [] - output_stream.foreachRDD(lambda x: samples.append(x.collect())) - - self.ssc.start() - - def condition(): - self.assertEqual(len(samples), len(batches)) - return True - - # We want all batches to finish for this test. - self._eventually(condition, catch_assertions=True) - - # Test that mean absolute error on each batch is less than 0.1 - for batch in samples: - true, predicted = zip(*batch) - self.assertTrue(mean(abs(array(true) - array(predicted))) < 0.1) - - def test_train_prediction(self): - """Test that error on test data improves as model is trained.""" - slr = StreamingLinearRegressionWithSGD(stepSize=0.2, numIterations=25) - slr.setInitialWeights([0.0]) - - # Create ten batches with 100 sample points in each. - batches = [] - for i in range(10): - batch = LinearDataGenerator.generateLinearInput( - 0.0, [10.0], [0.0], [1.0 / 3.0], 100, 42 + i, 0.1) - batches.append(self.sc.parallelize(batch)) - - predict_batches = [ - b.map(lambda lp: (lp.label, lp.features)) for b in batches] - errors = [] - - def func(rdd): - true, predicted = zip(*rdd.collect()) - errors.append(mean(abs(true) - abs(predicted))) - - input_stream = self.ssc.queueStream(batches) - output_stream = self.ssc.queueStream(predict_batches) - slr.trainOn(input_stream) - output_stream = slr.predictOnValues(output_stream) - output_stream.foreachRDD(func) - self.ssc.start() - - def condition(): - if len(errors) == len(predict_batches): - self.assertGreater(errors[1] - errors[-1], 2) - if len(errors) >= 3 and errors[1] - errors[-1] > 2: - return True - return "Latest errors: " + ", ".join(map(lambda x: str(x), errors)) - - self._eventually(condition) - - -class MLUtilsTests(MLlibTestCase): - def test_append_bias(self): - data = [2.0, 2.0, 2.0] - ret = MLUtils.appendBias(data) - self.assertEqual(ret[3], 1.0) - self.assertEqual(type(ret), DenseVector) - - def test_append_bias_with_vector(self): - data = Vectors.dense([2.0, 2.0, 2.0]) - ret = MLUtils.appendBias(data) - self.assertEqual(ret[3], 1.0) - self.assertEqual(type(ret), DenseVector) - - def test_append_bias_with_sp_vector(self): - data = Vectors.sparse(3, {0: 2.0, 2: 2.0}) - expected = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0}) - # Returned value must be SparseVector - ret = MLUtils.appendBias(data) - self.assertEqual(ret, expected) - self.assertEqual(type(ret), SparseVector) - - def test_load_vectors(self): - import shutil - data = [ - [1.0, 2.0, 3.0], - [1.0, 2.0, 3.0] - ] - temp_dir = tempfile.mkdtemp() - load_vectors_path = os.path.join(temp_dir, "test_load_vectors") - try: - self.sc.parallelize(data).saveAsTextFile(load_vectors_path) - ret_rdd = MLUtils.loadVectors(self.sc, load_vectors_path) - ret = ret_rdd.collect() - self.assertEqual(len(ret), 2) - self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0])) - self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0])) - except: - self.fail() - finally: - shutil.rmtree(load_vectors_path) - - -class ALSTests(MLlibTestCase): - - def test_als_ratings_serialize(self): - r = Rating(7, 1123, 3.14) - jr = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(r))) - nr = 
ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jr))) - self.assertEqual(r.user, nr.user) - self.assertEqual(r.product, nr.product) - self.assertAlmostEqual(r.rating, nr.rating, 2) - - def test_als_ratings_id_long_error(self): - r = Rating(1205640308657491975, 50233468418, 1.0) - # rating user id exceeds max int value, should fail when pickled - self.assertRaises(Py4JJavaError, self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads, - bytearray(ser.dumps(r))) - - -class HashingTFTest(MLlibTestCase): - - def test_binary_term_freqs(self): - hashingTF = HashingTF(100).setBinary(True) - doc = "a a b c c c".split(" ") - n = hashingTF.numFeatures - output = hashingTF.transform(doc).toArray() - expected = Vectors.sparse(n, {hashingTF.indexOf("a"): 1.0, - hashingTF.indexOf("b"): 1.0, - hashingTF.indexOf("c"): 1.0}).toArray() - for i in range(0, n): - self.assertAlmostEqual(output[i], expected[i], 14, "Error at " + str(i) + - ": expected " + str(expected[i]) + ", got " + str(output[i])) - - -class DimensionalityReductionTests(MLlibTestCase): - - denseData = [ - Vectors.dense([0.0, 1.0, 2.0]), - Vectors.dense([3.0, 4.0, 5.0]), - Vectors.dense([6.0, 7.0, 8.0]), - Vectors.dense([9.0, 0.0, 1.0]) - ] - sparseData = [ - Vectors.sparse(3, [(1, 1.0), (2, 2.0)]), - Vectors.sparse(3, [(0, 3.0), (1, 4.0), (2, 5.0)]), - Vectors.sparse(3, [(0, 6.0), (1, 7.0), (2, 8.0)]), - Vectors.sparse(3, [(0, 9.0), (2, 1.0)]) - ] - - def assertEqualUpToSign(self, vecA, vecB): - eq1 = vecA - vecB - eq2 = vecA + vecB - self.assertTrue(sum(abs(eq1)) < 1e-6 or sum(abs(eq2)) < 1e-6) - - def test_svd(self): - denseMat = RowMatrix(self.sc.parallelize(self.denseData)) - sparseMat = RowMatrix(self.sc.parallelize(self.sparseData)) - m = 4 - n = 3 - for mat in [denseMat, sparseMat]: - for k in range(1, 4): - rm = mat.computeSVD(k, computeU=True) - self.assertEqual(rm.s.size, k) - self.assertEqual(rm.U.numRows(), m) - self.assertEqual(rm.U.numCols(), k) - self.assertEqual(rm.V.numRows, n) - self.assertEqual(rm.V.numCols, k) - - # Test that U returned is None if computeU is set to False. - self.assertEqual(mat.computeSVD(1).U, None) - - # Test that low rank matrices cannot have number of singular values - # greater than a limit. - rm = RowMatrix(self.sc.parallelize(tile([1, 2, 3], (3, 1)))) - self.assertEqual(rm.computeSVD(3, False, 1e-6).s.size, 1) - - def test_pca(self): - expected_pcs = array([ - [0.0, 1.0, 0.0], - [sqrt(2.0) / 2.0, 0.0, sqrt(2.0) / 2.0], - [sqrt(2.0) / 2.0, 0.0, -sqrt(2.0) / 2.0] - ]) - n = 3 - denseMat = RowMatrix(self.sc.parallelize(self.denseData)) - sparseMat = RowMatrix(self.sc.parallelize(self.sparseData)) - for mat in [denseMat, sparseMat]: - for k in range(1, 4): - pcs = mat.computePrincipalComponents(k) - self.assertEqual(pcs.numRows, n) - self.assertEqual(pcs.numCols, k) - - # We can just test the updated principal component for equality. 
- self.assertEqualUpToSign(pcs.toArray()[:, k - 1], expected_pcs[:, k - 1]) - - -class FPGrowthTest(MLlibTestCase): - - def test_fpgrowth(self): - data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]] - rdd = self.sc.parallelize(data, 2) - model1 = FPGrowth.train(rdd, 0.6, 2) - # use default data partition number when numPartitions is not specified - model2 = FPGrowth.train(rdd, 0.6) - self.assertEqual(sorted(model1.freqItemsets().collect()), - sorted(model2.freqItemsets().collect())) - -if __name__ == "__main__": - from pyspark.mllib.tests import * - if not _have_scipy: - print("NOTE: Skipping SciPy tests as it does not seem to be installed") - if xmlrunner: - unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2) - else: - unittest.main(verbosity=2) - if not _have_scipy: - print("NOTE: SciPy tests were skipped as it does not seem to be installed") - sc.stop() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/tree.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/tree.py deleted file mode 100644 index b05734c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/tree.py +++ /dev/null @@ -1,661 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import absolute_import - -import sys -import random - -from pyspark import SparkContext, RDD, since -from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper -from pyspark.mllib.linalg import _convert_to_vector -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.util import JavaLoader, JavaSaveable - -__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', - 'RandomForest', 'GradientBoostedTreesModel', 'GradientBoostedTrees'] - - -class TreeEnsembleModel(JavaModelWrapper, JavaSaveable): - """TreeEnsembleModel - - .. versionadded:: 1.3.0 - """ - @since("1.3.0") - def predict(self, x): - """ - Predict values for a single data point or an RDD of points using - the model trained. - - .. note:: In Python, predict cannot currently be used within an RDD - transformation or action. - Call predict directly on the RDD instead. - """ - if isinstance(x, RDD): - return self.call("predict", x.map(_convert_to_vector)) - - else: - return self.call("predict", _convert_to_vector(x)) - - @since("1.3.0") - def numTrees(self): - """ - Get number of trees in ensemble. - """ - return self.call("numTrees") - - @since("1.3.0") - def totalNumNodes(self): - """ - Get total number of nodes, summed over all trees in the ensemble. 
- """ - return self.call("totalNumNodes") - - def __repr__(self): - """ Summary of model """ - return self._java_model.toString() - - @since("1.3.0") - def toDebugString(self): - """ Full model """ - return self._java_model.toDebugString() - - -class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader): - """ - A decision tree model for classification or regression. - - .. versionadded:: 1.1.0 - """ - @since("1.1.0") - def predict(self, x): - """ - Predict the label of one or more examples. - - .. note:: In Python, predict cannot currently be used within an RDD - transformation or action. - Call predict directly on the RDD instead. - - :param x: - Data point (feature vector), or an RDD of data points (feature - vectors). - """ - if isinstance(x, RDD): - return self.call("predict", x.map(_convert_to_vector)) - - else: - return self.call("predict", _convert_to_vector(x)) - - @since("1.1.0") - def numNodes(self): - """Get number of nodes in tree, including leaf nodes.""" - return self._java_model.numNodes() - - @since("1.1.0") - def depth(self): - """ - Get depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - """ - return self._java_model.depth() - - def __repr__(self): - """ summary of model. """ - return self._java_model.toString() - - @since("1.2.0") - def toDebugString(self): - """ full model. """ - return self._java_model.toDebugString() - - @classmethod - def _java_loader_class(cls): - return "org.apache.spark.mllib.tree.model.DecisionTreeModel" - - -class DecisionTree(object): - """ - Learning algorithm for a decision tree model for classification or - regression. - - .. versionadded:: 1.1.0 - """ - - @classmethod - def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32, - minInstancesPerNode=1, minInfoGain=0.0): - first = data.first() - assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" - model = callMLlibFunc("trainDecisionTreeModel", data, type, numClasses, features, - impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) - return DecisionTreeModel(model) - - @classmethod - @since("1.1.0") - def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, - impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, - minInfoGain=0.0): - """ - Train a decision tree model for classification. - - :param data: - Training data: RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}. - :param numClasses: - Number of classes for classification. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param impurity: - Criterion used for information gain calculation. - Supported values: "gini" or "entropy". - (default: "gini") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 5) - :param maxBins: - Number of bins used for finding splits at each node. - (default: 32) - :param minInstancesPerNode: - Minimum number of instances required at child nodes to create - the parent split. - (default: 1) - :param minInfoGain: - Minimum info gain required to create a split. - (default: 0.0) - :return: - DecisionTreeModel. - - Example usage: - - >>> from numpy import array - >>> from pyspark.mllib.regression import LabeledPoint - >>> from pyspark.mllib.tree import DecisionTree - >>> - >>> data = [ - ... 
LabeledPoint(0.0, [0.0]), - ... LabeledPoint(1.0, [1.0]), - ... LabeledPoint(1.0, [2.0]), - ... LabeledPoint(1.0, [3.0]) - ... ] - >>> model = DecisionTree.trainClassifier(sc.parallelize(data), 2, {}) - >>> print(model) - DecisionTreeModel classifier of depth 1 with 3 nodes - - >>> print(model.toDebugString()) - DecisionTreeModel classifier of depth 1 with 3 nodes - If (feature 0 <= 0.5) - Predict: 0.0 - Else (feature 0 > 0.5) - Predict: 1.0 - - >>> model.predict(array([1.0])) - 1.0 - >>> model.predict(array([0.0])) - 0.0 - >>> rdd = sc.parallelize([[1.0], [0.0]]) - >>> model.predict(rdd).collect() - [1.0, 0.0] - """ - return cls._train(data, "classification", numClasses, categoricalFeaturesInfo, - impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) - - @classmethod - @since("1.1.0") - def trainRegressor(cls, data, categoricalFeaturesInfo, - impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1, - minInfoGain=0.0): - """ - Train a decision tree model for regression. - - :param data: - Training data: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param impurity: - Criterion used for information gain calculation. - The only supported value for regression is "variance". - (default: "variance") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 5) - :param maxBins: - Number of bins used for finding splits at each node. - (default: 32) - :param minInstancesPerNode: - Minimum number of instances required at child nodes to create - the parent split. - (default: 1) - :param minInfoGain: - Minimum info gain required to create a split. - (default: 0.0) - :return: - DecisionTreeModel. - - Example usage: - - >>> from pyspark.mllib.regression import LabeledPoint - >>> from pyspark.mllib.tree import DecisionTree - >>> from pyspark.mllib.linalg import SparseVector - >>> - >>> sparse_data = [ - ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), - ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) - ... ] - >>> - >>> model = DecisionTree.trainRegressor(sc.parallelize(sparse_data), {}) - >>> model.predict(SparseVector(2, {1: 1.0})) - 1.0 - >>> model.predict(SparseVector(2, {1: 0.0})) - 0.0 - >>> rdd = sc.parallelize([[0.0, 1.0], [0.0, 0.0]]) - >>> model.predict(rdd).collect() - [1.0, 0.0] - """ - return cls._train(data, "regression", 0, categoricalFeaturesInfo, - impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) - - -@inherit_doc -class RandomForestModel(TreeEnsembleModel, JavaLoader): - """ - Represents a random forest model. - - .. versionadded:: 1.2.0 - """ - - @classmethod - def _java_loader_class(cls): - return "org.apache.spark.mllib.tree.model.RandomForestModel" - - -class RandomForest(object): - """ - Learning algorithm for a random forest model for classification or - regression. - - .. 
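# A minimal usage sketch for the DecisionTree API shown above, assuming an existing
# SparkContext `sc` (as in the doctests). Data and expected outputs mirror the
# trainClassifier doctest.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

points = [LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0]),
          LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])]
training = sc.parallelize(points)

# numClasses=2; an empty categoricalFeaturesInfo means all features are continuous.
model = DecisionTree.trainClassifier(training, 2, {}, impurity="gini", maxDepth=5)

# predict() accepts a single feature vector or an RDD of vectors.
print(model.predict([2.0]))                               # expected: 1.0
print(model.predict(sc.parallelize([[0.0], [3.0]])).collect())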
versionadded:: 1.2.0 - """ - - supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") - - @classmethod - def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees, - featureSubsetStrategy, impurity, maxDepth, maxBins, seed): - first = data.first() - assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" - if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies: - raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy) - if seed is None: - seed = random.randint(0, 1 << 30) - model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses, - categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, - maxDepth, maxBins, seed) - return RandomForestModel(model) - - @classmethod - @since("1.2.0") - def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, - featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, - seed=None): - """ - Train a random forest model for binary or multiclass - classification. - - :param data: - Training dataset: RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}. - :param numClasses: - Number of classes for classification. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param numTrees: - Number of trees in the random forest. - :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported values: "auto", "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt". - (default: "auto") - :param impurity: - Criterion used for information gain calculation. - Supported values: "gini" or "entropy". - (default: "gini") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 4) - :param maxBins: - Maximum number of bins used for splitting features. - (default: 32) - :param seed: - Random seed for bootstrapping and choosing feature subsets. - Set as None to generate seed based on system time. - (default: None) - :return: - RandomForestModel that can be used for prediction. - - Example usage: - - >>> from pyspark.mllib.regression import LabeledPoint - >>> from pyspark.mllib.tree import RandomForest - >>> - >>> data = [ - ... LabeledPoint(0.0, [0.0]), - ... LabeledPoint(0.0, [1.0]), - ... LabeledPoint(1.0, [2.0]), - ... LabeledPoint(1.0, [3.0]) - ... 
] - >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42) - >>> model.numTrees() - 3 - >>> model.totalNumNodes() - 7 - >>> print(model) - TreeEnsembleModel classifier with 3 trees - - >>> print(model.toDebugString()) - TreeEnsembleModel classifier with 3 trees - - Tree 0: - Predict: 1.0 - Tree 1: - If (feature 0 <= 1.5) - Predict: 0.0 - Else (feature 0 > 1.5) - Predict: 1.0 - Tree 2: - If (feature 0 <= 1.5) - Predict: 0.0 - Else (feature 0 > 1.5) - Predict: 1.0 - - >>> model.predict([2.0]) - 1.0 - >>> model.predict([0.0]) - 0.0 - >>> rdd = sc.parallelize([[3.0], [1.0]]) - >>> model.predict(rdd).collect() - [1.0, 0.0] - """ - return cls._train(data, "classification", numClasses, - categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, - maxDepth, maxBins, seed) - - @classmethod - @since("1.2.0") - def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", - impurity="variance", maxDepth=4, maxBins=32, seed=None): - """ - Train a random forest model for regression. - - :param data: - Training dataset: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param numTrees: - Number of trees in the random forest. - :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported values: "auto", "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "onethird" for regression. - (default: "auto") - :param impurity: - Criterion used for information gain calculation. - The only supported value for regression is "variance". - (default: "variance") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 4) - :param maxBins: - Maximum number of bins used for splitting features. - (default: 32) - :param seed: - Random seed for bootstrapping and choosing feature subsets. - Set as None to generate seed based on system time. - (default: None) - :return: - RandomForestModel that can be used for prediction. - - Example usage: - - >>> from pyspark.mllib.regression import LabeledPoint - >>> from pyspark.mllib.tree import RandomForest - >>> from pyspark.mllib.linalg import SparseVector - >>> - >>> sparse_data = [ - ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), - ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) - ... ] - >>> - >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42) - >>> model.numTrees() - 2 - >>> model.totalNumNodes() - 4 - >>> model.predict(SparseVector(2, {1: 1.0})) - 1.0 - >>> model.predict(SparseVector(2, {0: 1.0})) - 0.5 - >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]]) - >>> model.predict(rdd).collect() - [1.0, 0.5] - """ - return cls._train(data, "regression", 0, categoricalFeaturesInfo, numTrees, - featureSubsetStrategy, impurity, maxDepth, maxBins, seed) - - -@inherit_doc -class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader): - """ - Represents a gradient-boosted tree model. - - .. 
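# A minimal usage sketch for RandomForest.trainClassifier as documented above, assuming
# an existing SparkContext `sc`; the data and seed mirror the doctest.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
                       LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])])

# Three trees; featureSubsetStrategy defaults to "auto" ("sqrt" when numTrees > 1).
model = RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, seed=42)
print(model.numTrees())         # 3
print(model.totalNumNodes())    # total nodes summed over all trees
print(model.predict([2.0]))     # expected: 1.0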
versionadded:: 1.3.0 - """ - - @classmethod - def _java_loader_class(cls): - return "org.apache.spark.mllib.tree.model.GradientBoostedTreesModel" - - -class GradientBoostedTrees(object): - """ - Learning algorithm for a gradient boosted trees model for - classification or regression. - - .. versionadded:: 1.3.0 - """ - - @classmethod - def _train(cls, data, algo, categoricalFeaturesInfo, - loss, numIterations, learningRate, maxDepth, maxBins): - first = data.first() - assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" - model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo, - loss, numIterations, learningRate, maxDepth, maxBins) - return GradientBoostedTreesModel(model) - - @classmethod - @since("1.3.0") - def trainClassifier(cls, data, categoricalFeaturesInfo, - loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3, - maxBins=32): - """ - Train a gradient-boosted trees model for classification. - - :param data: - Training dataset: RDD of LabeledPoint. Labels should take values - {0, 1}. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param loss: - Loss function used for minimization during gradient boosting. - Supported values: "logLoss", "leastSquaresError", - "leastAbsoluteError". - (default: "logLoss") - :param numIterations: - Number of iterations of boosting. - (default: 100) - :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. - (default: 0.1) - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 3) - :param maxBins: - Maximum number of bins used for splitting features. DecisionTree - requires maxBins >= max categories. - (default: 32) - :return: - GradientBoostedTreesModel that can be used for prediction. - - Example usage: - - >>> from pyspark.mllib.regression import LabeledPoint - >>> from pyspark.mllib.tree import GradientBoostedTrees - >>> - >>> data = [ - ... LabeledPoint(0.0, [0.0]), - ... LabeledPoint(0.0, [1.0]), - ... LabeledPoint(1.0, [2.0]), - ... LabeledPoint(1.0, [3.0]) - ... ] - >>> - >>> model = GradientBoostedTrees.trainClassifier(sc.parallelize(data), {}, numIterations=10) - >>> model.numTrees() - 10 - >>> model.totalNumNodes() - 30 - >>> print(model) # it already has newline - TreeEnsembleModel classifier with 10 trees - - >>> model.predict([2.0]) - 1.0 - >>> model.predict([0.0]) - 0.0 - >>> rdd = sc.parallelize([[2.0], [0.0]]) - >>> model.predict(rdd).collect() - [1.0, 0.0] - """ - return cls._train(data, "classification", categoricalFeaturesInfo, - loss, numIterations, learningRate, maxDepth, maxBins) - - @classmethod - @since("1.3.0") - def trainRegressor(cls, data, categoricalFeaturesInfo, - loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3, - maxBins=32): - """ - Train a gradient-boosted trees model for regression. - - :param data: - Training dataset: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param loss: - Loss function used for minimization during gradient boosting. 
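# A minimal usage sketch for GradientBoostedTrees.trainClassifier as in the doctest
# above, assuming an existing SparkContext `sc`.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees

data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
                       LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])])

# Fewer iterations than the default (100) keeps this toy example fast.
model = GradientBoostedTrees.trainClassifier(data, categoricalFeaturesInfo={},
                                             numIterations=10)
print(model.numTrees())          # 10: one tree per boosting iteration
print(model.predict([2.0]))      # expected: 1.0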
- Supported values: "logLoss", "leastSquaresError", - "leastAbsoluteError". - (default: "leastSquaresError") - :param numIterations: - Number of iterations of boosting. - (default: 100) - :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. - (default: 0.1) - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 3) - :param maxBins: - Maximum number of bins used for splitting features. DecisionTree - requires maxBins >= max categories. - (default: 32) - :return: - GradientBoostedTreesModel that can be used for prediction. - - Example usage: - - >>> from pyspark.mllib.regression import LabeledPoint - >>> from pyspark.mllib.tree import GradientBoostedTrees - >>> from pyspark.mllib.linalg import SparseVector - >>> - >>> sparse_data = [ - ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), - ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), - ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) - ... ] - >>> - >>> data = sc.parallelize(sparse_data) - >>> model = GradientBoostedTrees.trainRegressor(data, {}, numIterations=10) - >>> model.numTrees() - 10 - >>> model.totalNumNodes() - 12 - >>> model.predict(SparseVector(2, {1: 1.0})) - 1.0 - >>> model.predict(SparseVector(2, {0: 1.0})) - 0.0 - >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]]) - >>> model.predict(rdd).collect() - [1.0, 0.0] - """ - return cls._train(data, "regression", categoricalFeaturesInfo, - loss, numIterations, learningRate, maxDepth, maxBins) - - -def _test(): - import doctest - globs = globals().copy() - from pyspark.sql import SparkSession - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("mllib.tree tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/util.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/util.py deleted file mode 100644 index fc78093..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/mllib/util.py +++ /dev/null @@ -1,528 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import sys -import numpy as np -import warnings - -if sys.version > '3': - xrange = range - basestring = str - -from pyspark import SparkContext, since -from pyspark.mllib.common import callMLlibFunc, inherit_doc -from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector -from pyspark.sql import DataFrame - - -class MLUtils(object): - - """ - Helper methods to load, save and pre-process data used in MLlib. - - .. versionadded:: 1.0.0 - """ - - @staticmethod - def _parse_libsvm_line(line, multiclass=None): - """ - Parses a line in LIBSVM format into (label, indices, values). - """ - if multiclass is not None: - warnings.warn("deprecated", DeprecationWarning) - items = line.split(None) - label = float(items[0]) - nnz = len(items) - 1 - indices = np.zeros(nnz, dtype=np.int32) - values = np.zeros(nnz) - for i in xrange(nnz): - index, value = items[1 + i].split(":") - indices[i] = int(index) - 1 - values[i] = float(value) - return label, indices, values - - @staticmethod - def _convert_labeled_point_to_libsvm(p): - """Converts a LabeledPoint to a string in LIBSVM format.""" - from pyspark.mllib.regression import LabeledPoint - assert isinstance(p, LabeledPoint) - items = [str(p.label)] - v = _convert_to_vector(p.features) - if isinstance(v, SparseVector): - nnz = len(v.indices) - for i in xrange(nnz): - items.append(str(v.indices[i] + 1) + ":" + str(v.values[i])) - else: - for i in xrange(len(v)): - items.append(str(i + 1) + ":" + str(v[i])) - return " ".join(items) - - @staticmethod - @since("1.0.0") - def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None, multiclass=None): - """ - Loads labeled data in the LIBSVM format into an RDD of - LabeledPoint. The LIBSVM format is a text-based format used by - LIBSVM and LIBLINEAR. Each line represents a labeled sparse - feature vector using the following format: - - label index1:value1 index2:value2 ... - - where the indices are one-based and in ascending order. This - method parses each line into a LabeledPoint, where the feature - indices are converted to zero-based. - - :param sc: Spark context - :param path: file or directory path in any Hadoop-supported file - system URI - :param numFeatures: number of features, which will be determined - from the input data if a nonpositive value - is given. This is useful when the dataset is - already split into multiple files and you - want to load them separately, because some - features may not present in certain files, - which leads to inconsistent feature - dimensions. 
- :param minPartitions: min number of partitions - @return: labeled data stored as an RDD of LabeledPoint - - >>> from tempfile import NamedTemporaryFile - >>> from pyspark.mllib.util import MLUtils - >>> from pyspark.mllib.regression import LabeledPoint - >>> tempFile = NamedTemporaryFile(delete=True) - >>> _ = tempFile.write(b"+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0") - >>> tempFile.flush() - >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect() - >>> tempFile.close() - >>> examples[0] - LabeledPoint(1.0, (6,[0,2,4],[1.0,2.0,3.0])) - >>> examples[1] - LabeledPoint(-1.0, (6,[],[])) - >>> examples[2] - LabeledPoint(-1.0, (6,[1,3,5],[4.0,5.0,6.0])) - """ - from pyspark.mllib.regression import LabeledPoint - if multiclass is not None: - warnings.warn("deprecated", DeprecationWarning) - - lines = sc.textFile(path, minPartitions) - parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l)) - if numFeatures <= 0: - parsed.cache() - numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1 - return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) - - @staticmethod - @since("1.0.0") - def saveAsLibSVMFile(data, dir): - """ - Save labeled data in LIBSVM format. - - :param data: an RDD of LabeledPoint to be saved - :param dir: directory to save the data - - >>> from tempfile import NamedTemporaryFile - >>> from fileinput import input - >>> from pyspark.mllib.regression import LabeledPoint - >>> from glob import glob - >>> from pyspark.mllib.util import MLUtils - >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), - ... LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))] - >>> tempFile = NamedTemporaryFile(delete=True) - >>> tempFile.close() - >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name) - >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*")))) - '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n' - """ - lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p)) - lines.saveAsTextFile(dir) - - @staticmethod - @since("1.1.0") - def loadLabeledPoints(sc, path, minPartitions=None): - """ - Load labeled points saved using RDD.saveAsTextFile. - - :param sc: Spark context - :param path: file or directory path in any Hadoop-supported file - system URI - :param minPartitions: min number of partitions - @return: labeled data stored as an RDD of LabeledPoint - - >>> from tempfile import NamedTemporaryFile - >>> from pyspark.mllib.util import MLUtils - >>> from pyspark.mllib.regression import LabeledPoint - >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), - ... LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))] - >>> tempFile = NamedTemporaryFile(delete=True) - >>> tempFile.close() - >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name) - >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect() - [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])] - """ - minPartitions = minPartitions or min(sc.defaultParallelism, 2) - return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions) - - @staticmethod - @since("1.5.0") - def appendBias(data): - """ - Returns a new vector with `1.0` (bias) appended to - the end of the input vector. 
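# A minimal sketch of the LIBSVM helpers documented above, assuming an existing
# SparkContext `sc`; the temp-file pattern follows the loadLibSVMFile doctest.
from tempfile import NamedTemporaryFile
from pyspark.mllib.util import MLUtils

tmp = NamedTemporaryFile(delete=True)
_ = tmp.write(b"+1 1:1.0 3:2.0 5:3.0\n-1\n-1 2:4.0 4:5.0 6:6.0")
tmp.flush()

# Indices in the file are one-based; the loader converts them to zero-based.
examples = MLUtils.loadLibSVMFile(sc, tmp.name)
print(examples.first())          # LabeledPoint(1.0, (6,[0,2,4],[1.0,2.0,3.0]))

# Round-trip: write the parsed points back out in LIBSVM format to a new directory.
MLUtils.saveAsLibSVMFile(examples, tmp.name + "_out")
tmp.close()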
- """ - vec = _convert_to_vector(data) - if isinstance(vec, SparseVector): - newIndices = np.append(vec.indices, len(vec)) - newValues = np.append(vec.values, 1.0) - return SparseVector(len(vec) + 1, newIndices, newValues) - else: - return _convert_to_vector(np.append(vec.toArray(), 1.0)) - - @staticmethod - @since("1.5.0") - def loadVectors(sc, path): - """ - Loads vectors saved using `RDD[Vector].saveAsTextFile` - with the default number of partitions. - """ - return callMLlibFunc("loadVectors", sc, path) - - @staticmethod - @since("2.0.0") - def convertVectorColumnsToML(dataset, *cols): - """ - Converts vector columns in an input DataFrame from the - :py:class:`pyspark.mllib.linalg.Vector` type to the new - :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml` - package. - - :param dataset: - input dataset - :param cols: - a list of vector columns to be converted. - New vector columns will be ignored. If unspecified, all old - vector columns will be converted excepted nested ones. - :return: - the input dataset with old vector columns converted to the - new vector type - - >>> import pyspark - >>> from pyspark.mllib.linalg import Vectors - >>> from pyspark.mllib.util import MLUtils - >>> df = spark.createDataFrame( - ... [(0, Vectors.sparse(2, [1], [1.0]), Vectors.dense(2.0, 3.0))], - ... ["id", "x", "y"]) - >>> r1 = MLUtils.convertVectorColumnsToML(df).first() - >>> isinstance(r1.x, pyspark.ml.linalg.SparseVector) - True - >>> isinstance(r1.y, pyspark.ml.linalg.DenseVector) - True - >>> r2 = MLUtils.convertVectorColumnsToML(df, "x").first() - >>> isinstance(r2.x, pyspark.ml.linalg.SparseVector) - True - >>> isinstance(r2.y, pyspark.mllib.linalg.DenseVector) - True - """ - if not isinstance(dataset, DataFrame): - raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) - return callMLlibFunc("convertVectorColumnsToML", dataset, list(cols)) - - @staticmethod - @since("2.0.0") - def convertVectorColumnsFromML(dataset, *cols): - """ - Converts vector columns in an input DataFrame to the - :py:class:`pyspark.mllib.linalg.Vector` type from the new - :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml` - package. - - :param dataset: - input dataset - :param cols: - a list of vector columns to be converted. - Old vector columns will be ignored. If unspecified, all new - vector columns will be converted except nested ones. - :return: - the input dataset with new vector columns converted to the - old vector type - - >>> import pyspark - >>> from pyspark.ml.linalg import Vectors - >>> from pyspark.mllib.util import MLUtils - >>> df = spark.createDataFrame( - ... [(0, Vectors.sparse(2, [1], [1.0]), Vectors.dense(2.0, 3.0))], - ... 
["id", "x", "y"]) - >>> r1 = MLUtils.convertVectorColumnsFromML(df).first() - >>> isinstance(r1.x, pyspark.mllib.linalg.SparseVector) - True - >>> isinstance(r1.y, pyspark.mllib.linalg.DenseVector) - True - >>> r2 = MLUtils.convertVectorColumnsFromML(df, "x").first() - >>> isinstance(r2.x, pyspark.mllib.linalg.SparseVector) - True - >>> isinstance(r2.y, pyspark.ml.linalg.DenseVector) - True - """ - if not isinstance(dataset, DataFrame): - raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) - return callMLlibFunc("convertVectorColumnsFromML", dataset, list(cols)) - - @staticmethod - @since("2.0.0") - def convertMatrixColumnsToML(dataset, *cols): - """ - Converts matrix columns in an input DataFrame from the - :py:class:`pyspark.mllib.linalg.Matrix` type to the new - :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` - package. - - :param dataset: - input dataset - :param cols: - a list of matrix columns to be converted. - New matrix columns will be ignored. If unspecified, all old - matrix columns will be converted excepted nested ones. - :return: - the input dataset with old matrix columns converted to the - new matrix type - - >>> import pyspark - >>> from pyspark.mllib.linalg import Matrices - >>> from pyspark.mllib.util import MLUtils - >>> df = spark.createDataFrame( - ... [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]), - ... Matrices.dense(2, 2, range(4)))], ["id", "x", "y"]) - >>> r1 = MLUtils.convertMatrixColumnsToML(df).first() - >>> isinstance(r1.x, pyspark.ml.linalg.SparseMatrix) - True - >>> isinstance(r1.y, pyspark.ml.linalg.DenseMatrix) - True - >>> r2 = MLUtils.convertMatrixColumnsToML(df, "x").first() - >>> isinstance(r2.x, pyspark.ml.linalg.SparseMatrix) - True - >>> isinstance(r2.y, pyspark.mllib.linalg.DenseMatrix) - True - """ - if not isinstance(dataset, DataFrame): - raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) - return callMLlibFunc("convertMatrixColumnsToML", dataset, list(cols)) - - @staticmethod - @since("2.0.0") - def convertMatrixColumnsFromML(dataset, *cols): - """ - Converts matrix columns in an input DataFrame to the - :py:class:`pyspark.mllib.linalg.Matrix` type from the new - :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` - package. - - :param dataset: - input dataset - :param cols: - a list of matrix columns to be converted. - Old matrix columns will be ignored. If unspecified, all new - matrix columns will be converted except nested ones. - :return: - the input dataset with new matrix columns converted to the - old matrix type - - >>> import pyspark - >>> from pyspark.ml.linalg import Matrices - >>> from pyspark.mllib.util import MLUtils - >>> df = spark.createDataFrame( - ... [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]), - ... 
Matrices.dense(2, 2, range(4)))], ["id", "x", "y"]) - >>> r1 = MLUtils.convertMatrixColumnsFromML(df).first() - >>> isinstance(r1.x, pyspark.mllib.linalg.SparseMatrix) - True - >>> isinstance(r1.y, pyspark.mllib.linalg.DenseMatrix) - True - >>> r2 = MLUtils.convertMatrixColumnsFromML(df, "x").first() - >>> isinstance(r2.x, pyspark.mllib.linalg.SparseMatrix) - True - >>> isinstance(r2.y, pyspark.ml.linalg.DenseMatrix) - True - """ - if not isinstance(dataset, DataFrame): - raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) - return callMLlibFunc("convertMatrixColumnsFromML", dataset, list(cols)) - - -class Saveable(object): - """ - Mixin for models and transformers which may be saved as files. - - .. versionadded:: 1.3.0 - """ - - def save(self, sc, path): - """ - Save this model to the given path. - - This saves: - * human-readable (JSON) model metadata to path/metadata/ - * Parquet formatted data to path/data/ - - The model may be loaded using py:meth:`Loader.load`. - - :param sc: Spark context used to save model data. - :param path: Path specifying the directory in which to save - this model. If the directory already exists, - this method throws an exception. - """ - raise NotImplementedError - - -@inherit_doc -class JavaSaveable(Saveable): - """ - Mixin for models that provide save() through their Scala - implementation. - - .. versionadded:: 1.3.0 - """ - - @since("1.3.0") - def save(self, sc, path): - """Save this model to the given path.""" - if not isinstance(sc, SparkContext): - raise TypeError("sc should be a SparkContext, got type %s" % type(sc)) - if not isinstance(path, basestring): - raise TypeError("path should be a basestring, got type %s" % type(path)) - self._java_model.save(sc._jsc.sc(), path) - - -class Loader(object): - """ - Mixin for classes which can load saved models from files. - - .. versionadded:: 1.3.0 - """ - - @classmethod - def load(cls, sc, path): - """ - Load a model from the given path. The model should have been - saved using py:meth:`Saveable.save`. - - :param sc: Spark context used for loading model files. - :param path: Path specifying the directory to which the model - was saved. - :return: model instance - """ - raise NotImplemented - - -@inherit_doc -class JavaLoader(Loader): - """ - Mixin for classes which can load saved models using its Scala - implementation. - - .. versionadded:: 1.3.0 - """ - - @classmethod - def _java_loader_class(cls): - """ - Returns the full class name of the Java loader. The default - implementation replaces "pyspark" by "org.apache.spark" in - the Python full class name. - """ - java_package = cls.__module__.replace("pyspark", "org.apache.spark") - return ".".join([java_package, cls.__name__]) - - @classmethod - def _load_java(cls, sc, path): - """ - Load a Java model from the given path. - """ - java_class = cls._java_loader_class() - java_obj = sc._jvm - for name in java_class.split("."): - java_obj = getattr(java_obj, name) - return java_obj.load(sc._jsc.sc(), path) - - @classmethod - @since("1.3.0") - def load(cls, sc, path): - """Load a model from the given path.""" - java_model = cls._load_java(sc, path) - return cls(java_model) - - -class LinearDataGenerator(object): - """Utils for generating linear data. - - .. 
versionadded:: 1.5.0 - """ - - @staticmethod - @since("1.5.0") - def generateLinearInput(intercept, weights, xMean, xVariance, - nPoints, seed, eps): - """ - :param: intercept bias factor, the term c in X'w + c - :param: weights feature vector, the term w in X'w + c - :param: xMean Point around which the data X is centered. - :param: xVariance Variance of the given data - :param: nPoints Number of points to be generated - :param: seed Random Seed - :param: eps Used to scale the noise. If eps is set high, - the amount of gaussian noise added is more. - - Returns a list of LabeledPoints of length nPoints - """ - weights = [float(weight) for weight in weights] - xMean = [float(mean) for mean in xMean] - xVariance = [float(var) for var in xVariance] - return list(callMLlibFunc( - "generateLinearInputWrapper", float(intercept), weights, xMean, - xVariance, int(nPoints), int(seed), float(eps))) - - @staticmethod - @since("1.5.0") - def generateLinearRDD(sc, nexamples, nfeatures, eps, - nParts=2, intercept=0.0): - """ - Generate an RDD of LabeledPoints. - """ - return callMLlibFunc( - "generateLinearRDDWrapper", sc, int(nexamples), int(nfeatures), - float(eps), int(nParts), float(intercept)) - - -def _test(): - import doctest - from pyspark.sql import SparkSession - globs = globals().copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - spark = SparkSession.builder\ - .master("local[2]")\ - .appName("mllib.util tests")\ - .getOrCreate() - globs['spark'] = spark - globs['sc'] = spark.sparkContext - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/profiler.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/profiler.py deleted file mode 100644 index 3c7656a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/profiler.py +++ /dev/null @@ -1,177 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import cProfile -import pstats -import os -import atexit -import sys - -from pyspark.accumulators import AccumulatorParam - - -class ProfilerCollector(object): - """ - This class keeps track of different profilers on a per - stage basis. Also this is used to create new profilers for - the different stages. 
- """ - - def __init__(self, profiler_cls, dump_path=None): - self.profiler_cls = profiler_cls - self.profile_dump_path = dump_path - self.profilers = [] - - def new_profiler(self, ctx): - """ Create a new profiler using class `profiler_cls` """ - return self.profiler_cls(ctx) - - def add_profiler(self, id, profiler): - """ Add a profiler for RDD `id` """ - if not self.profilers: - if self.profile_dump_path: - atexit.register(self.dump_profiles, self.profile_dump_path) - else: - atexit.register(self.show_profiles) - - self.profilers.append([id, profiler, False]) - - def dump_profiles(self, path): - """ Dump the profile stats into directory `path` """ - for id, profiler, _ in self.profilers: - profiler.dump(id, path) - self.profilers = [] - - def show_profiles(self): - """ Print the profile stats to stdout """ - for i, (id, profiler, showed) in enumerate(self.profilers): - if not showed and profiler: - profiler.show(id) - # mark it as showed - self.profilers[i][2] = True - - -class Profiler(object): - """ - .. note:: DeveloperApi - - PySpark supports custom profilers, this is to allow for different profilers to - be used as well as outputting to different formats than what is provided in the - BasicProfiler. - - A custom profiler has to define or inherit the following methods: - profile - will produce a system profile of some sort. - stats - return the collected stats. - dump - dumps the profiles to a path - add - adds a profile to the existing accumulated profile - - The profiler class is chosen when creating a SparkContext - - >>> from pyspark import SparkConf, SparkContext - >>> from pyspark import BasicProfiler - >>> class MyCustomProfiler(BasicProfiler): - ... def show(self, id): - ... print("My custom profiles for RDD:%s" % id) - ... - >>> conf = SparkConf().set("spark.python.profile", "true") - >>> sc = SparkContext('local', 'test', conf=conf, profiler_cls=MyCustomProfiler) - >>> sc.parallelize(range(1000)).map(lambda x: 2 * x).take(10) - [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] - >>> sc.parallelize(range(1000)).count() - 1000 - >>> sc.show_profiles() - My custom profiles for RDD:1 - My custom profiles for RDD:3 - >>> sc.stop() - """ - - def __init__(self, ctx): - pass - - def profile(self, func): - """ Do profiling on the function `func`""" - raise NotImplemented - - def stats(self): - """ Return the collected profiling stats (pstats.Stats)""" - raise NotImplemented - - def show(self, id): - """ Print the profile stats to stdout, id is the RDD id """ - stats = self.stats() - if stats: - print("=" * 60) - print("Profile of RDD" % id) - print("=" * 60) - stats.sort_stats("time", "cumulative").print_stats() - - def dump(self, id, path): - """ Dump the profile into path, id is the RDD id """ - if not os.path.exists(path): - os.makedirs(path) - stats = self.stats() - if stats: - p = os.path.join(path, "rdd_%d.pstats" % id) - stats.dump_stats(p) - - -class PStatsParam(AccumulatorParam): - """PStatsParam is used to merge pstats.Stats""" - - @staticmethod - def zero(value): - return None - - @staticmethod - def addInPlace(value1, value2): - if value1 is None: - return value2 - value1.add(value2) - return value1 - - -class BasicProfiler(Profiler): - """ - BasicProfiler is the default profiler, which is implemented based on - cProfile and Accumulator - """ - def __init__(self, ctx): - Profiler.__init__(self, ctx) - # Creates a new accumulator for combining the profiles of different - # partitions of a stage - self._accumulator = ctx.accumulator(None, PStatsParam) - - def profile(self, func): - """ 
Runs and profiles the method to_profile passed in. A profile object is returned. """ - pr = cProfile.Profile() - pr.runcall(func) - st = pstats.Stats(pr) - st.stream = None # make it picklable - st.strip_dirs() - - # Adds a new profile to the existing accumulated value - self._accumulator.add(st) - - def stats(self): - return self._accumulator.value - - -if __name__ == "__main__": - import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/python/pyspark/shell.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/python/pyspark/shell.py deleted file mode 100644 index 65e3bdb..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/python/pyspark/shell.py +++ /dev/null @@ -1,76 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -An interactive shell. - -This file is designed to be launched as a PYTHONSTARTUP script. -""" - -import atexit -import os -import platform -import warnings - -import py4j - -from pyspark import SparkConf -from pyspark.context import SparkContext -from pyspark.sql import SparkSession, SQLContext - -if os.environ.get("SPARK_EXECUTOR_URI"): - SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) - -SparkContext._ensure_initialized() - -try: - spark = SparkSession._create_shell_session() -except Exception: - import sys - import traceback - warnings.warn("Failed to initialize Spark session.") - traceback.print_exc(file=sys.stderr) - sys.exit(1) - -sc = spark.sparkContext -sql = spark.sql -atexit.register(lambda: sc.stop()) - -# for compatibility -sqlContext = spark._wrapped -sqlCtx = sqlContext - -print(r"""Welcome to - ____ __ - / __/__ ___ _____/ /__ - _\ \/ _ \/ _ `/ __/ '_/ - /__ / .__/\_,_/_/ /_/\_\ version %s - /_/ -""" % sc.version) -print("Using Python version %s (%s, %s)" % ( - platform.python_version(), - platform.python_build()[0], - platform.python_build()[1])) -print("SparkSession available as 'spark'.") - -# The ./bin/pyspark script stores the old PYTHONSTARTUP value in OLD_PYTHONSTARTUP, -# which allows us to execute the user's PYTHONSTARTUP file: -_pythonstartup = os.environ.get('OLD_PYTHONSTARTUP') -if _pythonstartup and os.path.isfile(_pythonstartup): - with open(_pythonstartup) as f: - code = compile(f.read(), _pythonstartup, 'exec') - exec(code) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/rdd.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/rdd.py deleted file mode 100644 index ccf39e1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/rdd.py +++ /dev/null @@ -1,2569 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import copy -import sys -import os -import re -import operator -import shlex -import warnings -import heapq -import bisect -import random -import socket -from subprocess import Popen, PIPE -from tempfile import NamedTemporaryFile -from threading import Thread -from collections import defaultdict -from itertools import chain -from functools import reduce -from math import sqrt, log, isinf, isnan, pow, ceil - -if sys.version > '3': - basestring = unicode = str -else: - from itertools import imap as map, ifilter as filter - -from pyspark.java_gateway import local_connect_and_auth -from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \ - BatchedSerializer, CloudPickleSerializer, PairDeserializer, \ - PickleSerializer, pack_long, AutoBatchedSerializer, write_with_length, \ - UTF8Deserializer -from pyspark.join import python_join, python_left_outer_join, \ - python_right_outer_join, python_full_outer_join, python_cogroup -from pyspark.statcounter import StatCounter -from pyspark.rddsampler import RDDSampler, RDDRangeSampler, RDDStratifiedSampler -from pyspark.storagelevel import StorageLevel -from pyspark.resultiterable import ResultIterable -from pyspark.shuffle import Aggregator, ExternalMerger, \ - get_used_memory, ExternalSorter, ExternalGroupBy -from pyspark.traceback_utils import SCCallSiteSync -from pyspark.util import fail_on_stopiteration, _exception_message - - -__all__ = ["RDD"] - - -class PythonEvalType(object): - """ - Evaluation type of python rdd. - - These values are internal to PySpark. - - These values should match values in org.apache.spark.api.python.PythonEvalType. - """ - NON_UDF = 0 - - SQL_BATCHED_UDF = 100 - - SQL_SCALAR_PANDAS_UDF = 200 - SQL_GROUPED_MAP_PANDAS_UDF = 201 - SQL_GROUPED_AGG_PANDAS_UDF = 202 - SQL_WINDOW_AGG_PANDAS_UDF = 203 - - -def portable_hash(x): - """ - This function returns consistent hash code for builtin types, especially - for None and tuple with None. - - The algorithm is similar to that one used by CPython 2.7 - - >>> portable_hash(None) - 0 - >>> portable_hash((None, 1)) & 0xffffffff - 219750521 - """ - - if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ: - raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED") - - if x is None: - return 0 - if isinstance(x, tuple): - h = 0x345678 - for i in x: - h ^= portable_hash(i) - h *= 1000003 - h &= sys.maxsize - h ^= len(x) - if h == -1: - h = -2 - return int(h) - return hash(x) - - -class BoundedFloat(float): - """ - Bounded value is generated by approximate job, with confidence and low - bound and high bound. 
- - >>> BoundedFloat(100.0, 0.95, 95.0, 105.0) - 100.0 - """ - def __new__(cls, mean, confidence, low, high): - obj = float.__new__(cls, mean) - obj.confidence = confidence - obj.low = low - obj.high = high - return obj - - -def _parse_memory(s): - """ - Parse a memory string in the format supported by Java (e.g. 1g, 200m) and - return the value in MB - - >>> _parse_memory("256m") - 256 - >>> _parse_memory("2g") - 2048 - """ - units = {'g': 1024, 'm': 1, 't': 1 << 20, 'k': 1.0 / 1024} - if s[-1].lower() not in units: - raise ValueError("invalid format: " + s) - return int(float(s[:-1]) * units[s[-1].lower()]) - - -def _load_from_socket(sock_info, serializer): - (sockfile, sock) = local_connect_and_auth(*sock_info) - # The RDD materialization time is unpredicable, if we set a timeout for socket reading - # operation, it will very possibly fail. See SPARK-18281. - sock.settimeout(None) - # The socket will be automatically closed when garbage-collected. - return serializer.load_stream(sockfile) - - -def ignore_unicode_prefix(f): - """ - Ignore the 'u' prefix of string in doc tests, to make it works - in both python 2 and 3 - """ - if sys.version >= '3': - # the representation of unicode string in Python 3 does not have prefix 'u', - # so remove the prefix 'u' for doc tests - literal_re = re.compile(r"(\W|^)[uU](['])", re.UNICODE) - f.__doc__ = literal_re.sub(r'\1\2', f.__doc__) - return f - - -class Partitioner(object): - def __init__(self, numPartitions, partitionFunc): - self.numPartitions = numPartitions - self.partitionFunc = partitionFunc - - def __eq__(self, other): - return (isinstance(other, Partitioner) and self.numPartitions == other.numPartitions - and self.partitionFunc == other.partitionFunc) - - def __call__(self, k): - return self.partitionFunc(k) % self.numPartitions - - -class RDD(object): - - """ - A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. - Represents an immutable, partitioned collection of elements that can be - operated on in parallel. - """ - - def __init__(self, jrdd, ctx, jrdd_deserializer=AutoBatchedSerializer(PickleSerializer())): - self._jrdd = jrdd - self.is_cached = False - self.is_checkpointed = False - self.ctx = ctx - self._jrdd_deserializer = jrdd_deserializer - self._id = jrdd.id() - self.partitioner = None - - def _pickled(self): - return self._reserialize(AutoBatchedSerializer(PickleSerializer())) - - def id(self): - """ - A unique ID for this RDD (within its SparkContext). - """ - return self._id - - def __repr__(self): - return self._jrdd.toString() - - def __getnewargs__(self): - # This method is called when attempting to pickle an RDD, which is always an error: - raise Exception( - "It appears that you are attempting to broadcast an RDD or reference an RDD from an " - "action or transformation. RDD transformations and actions can only be invoked by the " - "driver, not inside of other transformations; for example, " - "rdd1.map(lambda x: rdd2.values.count() * x) is invalid because the values " - "transformation and count action cannot be performed inside of the rdd1.map " - "transformation. For more information, see SPARK-5063." - ) - - @property - def context(self): - """ - The L{SparkContext} that this RDD was created on. - """ - return self.ctx - - def cache(self): - """ - Persist this RDD with the default storage level (C{MEMORY_ONLY}). 
- """ - self.is_cached = True - self.persist(StorageLevel.MEMORY_ONLY) - return self - - def persist(self, storageLevel=StorageLevel.MEMORY_ONLY): - """ - Set this RDD's storage level to persist its values across operations - after the first time it is computed. This can only be used to assign - a new storage level if the RDD does not have a storage level set yet. - If no storage level is specified defaults to (C{MEMORY_ONLY}). - - >>> rdd = sc.parallelize(["b", "a", "c"]) - >>> rdd.persist().is_cached - True - """ - self.is_cached = True - javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel) - self._jrdd.persist(javaStorageLevel) - return self - - def unpersist(self): - """ - Mark the RDD as non-persistent, and remove all blocks for it from - memory and disk. - """ - self.is_cached = False - self._jrdd.unpersist() - return self - - def checkpoint(self): - """ - Mark this RDD for checkpointing. It will be saved to a file inside the - checkpoint directory set with L{SparkContext.setCheckpointDir()} and - all references to its parent RDDs will be removed. This function must - be called before any job has been executed on this RDD. It is strongly - recommended that this RDD is persisted in memory, otherwise saving it - on a file will require recomputation. - """ - self.is_checkpointed = True - self._jrdd.rdd().checkpoint() - - def isCheckpointed(self): - """ - Return whether this RDD is checkpointed and materialized, either reliably or locally. - """ - return self._jrdd.rdd().isCheckpointed() - - def localCheckpoint(self): - """ - Mark this RDD for local checkpointing using Spark's existing caching layer. - - This method is for users who wish to truncate RDD lineages while skipping the expensive - step of replicating the materialized data in a reliable distributed file system. This is - useful for RDDs with long lineages that need to be truncated periodically (e.g. GraphX). - - Local checkpointing sacrifices fault-tolerance for performance. In particular, checkpointed - data is written to ephemeral local storage in the executors instead of to a reliable, - fault-tolerant storage. The effect is that if an executor fails during the computation, - the checkpointed data may no longer be accessible, causing an irrecoverable job failure. - - This is NOT safe to use with dynamic allocation, which removes executors along - with their cached blocks. If you must use both features, you are advised to set - L{spark.dynamicAllocation.cachedExecutorIdleTimeout} to a high value. - - The checkpoint directory set through L{SparkContext.setCheckpointDir()} is not used. - """ - self._jrdd.rdd().localCheckpoint() - - def isLocallyCheckpointed(self): - """ - Return whether this RDD is marked for local checkpointing. - - Exposed for testing. - """ - return self._jrdd.rdd().isLocallyCheckpointed() - - def getCheckpointFile(self): - """ - Gets the name of the file to which this RDD was checkpointed - - Not defined if RDD is checkpointed locally. - """ - checkpointFile = self._jrdd.rdd().getCheckpointFile() - if checkpointFile.isDefined(): - return checkpointFile.get() - - def map(self, f, preservesPartitioning=False): - """ - Return a new RDD by applying a function to each element of this RDD. 
- - >>> rdd = sc.parallelize(["b", "a", "c"]) - >>> sorted(rdd.map(lambda x: (x, 1)).collect()) - [('a', 1), ('b', 1), ('c', 1)] - """ - def func(_, iterator): - return map(fail_on_stopiteration(f), iterator) - return self.mapPartitionsWithIndex(func, preservesPartitioning) - - def flatMap(self, f, preservesPartitioning=False): - """ - Return a new RDD by first applying a function to all elements of this - RDD, and then flattening the results. - - >>> rdd = sc.parallelize([2, 3, 4]) - >>> sorted(rdd.flatMap(lambda x: range(1, x)).collect()) - [1, 1, 1, 2, 2, 3] - >>> sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect()) - [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)] - """ - def func(s, iterator): - return chain.from_iterable(map(fail_on_stopiteration(f), iterator)) - return self.mapPartitionsWithIndex(func, preservesPartitioning) - - def mapPartitions(self, f, preservesPartitioning=False): - """ - Return a new RDD by applying a function to each partition of this RDD. - - >>> rdd = sc.parallelize([1, 2, 3, 4], 2) - >>> def f(iterator): yield sum(iterator) - >>> rdd.mapPartitions(f).collect() - [3, 7] - """ - def func(s, iterator): - return f(iterator) - return self.mapPartitionsWithIndex(func, preservesPartitioning) - - def mapPartitionsWithIndex(self, f, preservesPartitioning=False): - """ - Return a new RDD by applying a function to each partition of this RDD, - while tracking the index of the original partition. - - >>> rdd = sc.parallelize([1, 2, 3, 4], 4) - >>> def f(splitIndex, iterator): yield splitIndex - >>> rdd.mapPartitionsWithIndex(f).sum() - 6 - """ - return PipelinedRDD(self, f, preservesPartitioning) - - def mapPartitionsWithSplit(self, f, preservesPartitioning=False): - """ - Deprecated: use mapPartitionsWithIndex instead. - - Return a new RDD by applying a function to each partition of this RDD, - while tracking the index of the original partition. - - >>> rdd = sc.parallelize([1, 2, 3, 4], 4) - >>> def f(splitIndex, iterator): yield splitIndex - >>> rdd.mapPartitionsWithSplit(f).sum() - 6 - """ - warnings.warn("mapPartitionsWithSplit is deprecated; " - "use mapPartitionsWithIndex instead", DeprecationWarning, stacklevel=2) - return self.mapPartitionsWithIndex(f, preservesPartitioning) - - def getNumPartitions(self): - """ - Returns the number of partitions in RDD - - >>> rdd = sc.parallelize([1, 2, 3, 4], 2) - >>> rdd.getNumPartitions() - 2 - """ - return self._jrdd.partitions().size() - - def filter(self, f): - """ - Return a new RDD containing only the elements that satisfy a predicate. - - >>> rdd = sc.parallelize([1, 2, 3, 4, 5]) - >>> rdd.filter(lambda x: x % 2 == 0).collect() - [2, 4] - """ - def func(iterator): - return filter(fail_on_stopiteration(f), iterator) - return self.mapPartitions(func, True) - - def distinct(self, numPartitions=None): - """ - Return a new RDD containing the distinct elements in this RDD. - - >>> sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect()) - [1, 2, 3] - """ - return self.map(lambda x: (x, None)) \ - .reduceByKey(lambda x, _: x, numPartitions) \ - .map(lambda x: x[0]) - - def sample(self, withReplacement, fraction, seed=None): - """ - Return a sampled subset of this RDD. 
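# A minimal sketch chaining the basic RDD transformations documented above
# (distinct, map, filter, flatMap), assuming an existing SparkContext `sc`.
rdd = sc.parallelize([1, 2, 2, 3, 4, 5], 2)

doubled = rdd.distinct().map(lambda x: x * 2)         # 2, 4, 6, 8, 10 in some order
evens = doubled.filter(lambda x: x % 4 == 0)          # keep multiples of 4
pairs = evens.flatMap(lambda x: [(x, 1), (x, 2)])     # expand each element to two pairs
print(sorted(pairs.collect()))                        # [(4, 1), (4, 2), (8, 1), (8, 2)]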
- - :param withReplacement: can elements be sampled multiple times (replaced when sampled out) - :param fraction: expected size of the sample as a fraction of this RDD's size - without replacement: probability that each element is chosen; fraction must be [0, 1] - with replacement: expected number of times each element is chosen; fraction must be >= 0 - :param seed: seed for the random number generator - - .. note:: This is not guaranteed to provide exactly the fraction specified of the total - count of the given :class:`DataFrame`. - - >>> rdd = sc.parallelize(range(100), 4) - >>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14 - True - """ - assert fraction >= 0.0, "Negative fraction value: %s" % fraction - return self.mapPartitionsWithIndex(RDDSampler(withReplacement, fraction, seed).func, True) - - def randomSplit(self, weights, seed=None): - """ - Randomly splits this RDD with the provided weights. - - :param weights: weights for splits, will be normalized if they don't sum to 1 - :param seed: random seed - :return: split RDDs in a list - - >>> rdd = sc.parallelize(range(500), 1) - >>> rdd1, rdd2 = rdd.randomSplit([2, 3], 17) - >>> len(rdd1.collect() + rdd2.collect()) - 500 - >>> 150 < rdd1.count() < 250 - True - >>> 250 < rdd2.count() < 350 - True - """ - s = float(sum(weights)) - cweights = [0.0] - for w in weights: - cweights.append(cweights[-1] + w / s) - if seed is None: - seed = random.randint(0, 2 ** 32 - 1) - return [self.mapPartitionsWithIndex(RDDRangeSampler(lb, ub, seed).func, True) - for lb, ub in zip(cweights, cweights[1:])] - - # this is ported from scala/spark/RDD.scala - def takeSample(self, withReplacement, num, seed=None): - """ - Return a fixed-size sampled subset of this RDD. - - .. note:: This method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. - - >>> rdd = sc.parallelize(range(0, 10)) - >>> len(rdd.takeSample(True, 20, 1)) - 20 - >>> len(rdd.takeSample(False, 5, 2)) - 5 - >>> len(rdd.takeSample(False, 15, 3)) - 10 - """ - numStDev = 10.0 - - if num < 0: - raise ValueError("Sample size cannot be negative.") - elif num == 0: - return [] - - initialCount = self.count() - if initialCount == 0: - return [] - - rand = random.Random(seed) - - if (not withReplacement) and num >= initialCount: - # shuffle current RDD and return - samples = self.collect() - rand.shuffle(samples) - return samples - - maxSampleSize = sys.maxsize - int(numStDev * sqrt(sys.maxsize)) - if num > maxSampleSize: - raise ValueError( - "Sample size cannot be greater than %d." % maxSampleSize) - - fraction = RDD._computeFractionForSampleSize( - num, initialCount, withReplacement) - samples = self.sample(withReplacement, fraction, seed).collect() - - # If the first sample didn't turn out large enough, keep trying to take samples; - # this shouldn't happen often because we use a big multiplier for their initial size. - # See: scala/spark/RDD.scala - while len(samples) < num: - # TODO: add log warning for when more than one iteration was run - seed = rand.randint(0, sys.maxsize) - samples = self.sample(withReplacement, fraction, seed).collect() - - rand.shuffle(samples) - - return samples[0:num] - - @staticmethod - def _computeFractionForSampleSize(sampleSizeLowerBound, total, withReplacement): - """ - Returns a sampling rate that guarantees a sample of - size >= sampleSizeLowerBound 99.99% of the time. 
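# A minimal sketch of the sampling APIs documented above (sample, takeSample,
# randomSplit), assuming an existing SparkContext `sc`; values mirror the doctests.
rdd = sc.parallelize(range(100), 4)

approx = rdd.sample(False, 0.1, seed=81)           # roughly 10 elements, not exact
exact = rdd.takeSample(False, 5, seed=2)           # exactly 5 elements, collected to the driver
left, right = rdd.randomSplit([2, 3], seed=17)     # weights are normalized before splitting
print(len(exact), left.count() + right.count())    # 5 100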
- - How the sampling rate is determined: - Let p = num / total, where num is the sample size and total is the - total number of data points in the RDD. We're trying to compute - q > p such that - - when sampling with replacement, we're drawing each data point - with prob_i ~ Pois(q), where we want to guarantee - Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to - total), i.e. the failure rate of not having a sufficiently large - sample < 0.0001. Setting q = p + 5 * sqrt(p/total) is sufficient - to guarantee 0.9999 success rate for num > 12, but we need a - slightly larger q (9 empirically determined). - - when sampling without replacement, we're drawing each data point - with prob_i ~ Binomial(total, fraction) and our choice of q - guarantees 1-delta, or 0.9999 success rate, where success rate is - defined the same as in sampling with replacement. - """ - fraction = float(sampleSizeLowerBound) / total - if withReplacement: - numStDev = 5 - if (sampleSizeLowerBound < 12): - numStDev = 9 - return fraction + numStDev * sqrt(fraction / total) - else: - delta = 0.00005 - gamma = - log(delta) / total - return min(1, fraction + gamma + sqrt(gamma * gamma + 2 * gamma * fraction)) - - def union(self, other): - """ - Return the union of this RDD and another one. - - >>> rdd = sc.parallelize([1, 1, 2, 3]) - >>> rdd.union(rdd).collect() - [1, 1, 2, 3, 1, 1, 2, 3] - """ - if self._jrdd_deserializer == other._jrdd_deserializer: - rdd = RDD(self._jrdd.union(other._jrdd), self.ctx, - self._jrdd_deserializer) - else: - # These RDDs contain data in different serialized formats, so we - # must normalize them to the default serializer. - self_copy = self._reserialize() - other_copy = other._reserialize() - rdd = RDD(self_copy._jrdd.union(other_copy._jrdd), self.ctx, - self.ctx.serializer) - if (self.partitioner == other.partitioner and - self.getNumPartitions() == rdd.getNumPartitions()): - rdd.partitioner = self.partitioner - return rdd - - def intersection(self, other): - """ - Return the intersection of this RDD and another one. The output will - not contain any duplicate elements, even if the input RDDs did. - - .. note:: This method performs a shuffle internally. - - >>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5]) - >>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8]) - >>> rdd1.intersection(rdd2).collect() - [1, 2, 3] - """ - return self.map(lambda v: (v, None)) \ - .cogroup(other.map(lambda v: (v, None))) \ - .filter(lambda k_vs: all(k_vs[1])) \ - .keys() - - def _reserialize(self, serializer=None): - serializer = serializer or self.ctx.serializer - if self._jrdd_deserializer != serializer: - self = self.map(lambda x: x, preservesPartitioning=True) - self._jrdd_deserializer = serializer - return self - - def __add__(self, other): - """ - Return the union of this RDD and another one. - - >>> rdd = sc.parallelize([1, 1, 2, 3]) - >>> (rdd + rdd).collect() - [1, 1, 2, 3, 1, 1, 2, 3] - """ - if not isinstance(other, RDD): - raise TypeError - return self.union(other) - - def repartitionAndSortWithinPartitions(self, numPartitions=None, partitionFunc=portable_hash, - ascending=True, keyfunc=lambda x: x): - """ - Repartition the RDD according to the given partitioner and, within each resulting partition, - sort records by their keys. 
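Returning to _computeFractionForSampleSize, a standalone sketch of the over-sampling rule described above; pure Python, no SparkContext needed:

from math import log, sqrt

def compute_fraction(sample_size, total, with_replacement):
    # over-sample so the chance of coming up short stays below 0.01%
    fraction = float(sample_size) / total
    if with_replacement:
        num_std_dev = 9 if sample_size < 12 else 5
        return fraction + num_std_dev * sqrt(fraction / total)
    delta = 0.00005
    gamma = -log(delta) / total
    return min(1, fraction + gamma + sqrt(gamma * gamma + 2 * gamma * fraction))

print(round(compute_fraction(10, 1000, False), 4))   # ~0.0371
print(round(compute_fraction(10, 1000, True), 4))    # ~0.0385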
- - >>> rdd = sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)]) - >>> rdd2 = rdd.repartitionAndSortWithinPartitions(2, lambda x: x % 2, True) - >>> rdd2.glom().collect() - [[(0, 5), (0, 8), (2, 6)], [(1, 3), (3, 8), (3, 8)]] - """ - if numPartitions is None: - numPartitions = self._defaultReducePartitions() - - memory = _parse_memory(self.ctx._conf.get("spark.python.worker.memory", "512m")) - serializer = self._jrdd_deserializer - - def sortPartition(iterator): - sort = ExternalSorter(memory * 0.9, serializer).sorted - return iter(sort(iterator, key=lambda k_v: keyfunc(k_v[0]), reverse=(not ascending))) - - return self.partitionBy(numPartitions, partitionFunc).mapPartitions(sortPartition, True) - - def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x): - """ - Sorts this RDD, which is assumed to consist of (key, value) pairs. - - >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)] - >>> sc.parallelize(tmp).sortByKey().first() - ('1', 3) - >>> sc.parallelize(tmp).sortByKey(True, 1).collect() - [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)] - >>> sc.parallelize(tmp).sortByKey(True, 2).collect() - [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)] - >>> tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)] - >>> tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)]) - >>> sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect() - [('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5),...('white', 9), ('whose', 6)] - """ - if numPartitions is None: - numPartitions = self._defaultReducePartitions() - - memory = self._memory_limit() - serializer = self._jrdd_deserializer - - def sortPartition(iterator): - sort = ExternalSorter(memory * 0.9, serializer).sorted - return iter(sort(iterator, key=lambda kv: keyfunc(kv[0]), reverse=(not ascending))) - - if numPartitions == 1: - if self.getNumPartitions() > 1: - self = self.coalesce(1) - return self.mapPartitions(sortPartition, True) - - # first compute the boundary of each part via sampling: we want to partition - # the key-space into bins such that the bins have roughly the same - # number of (key, value) pairs falling into them - rddSize = self.count() - if not rddSize: - return self # empty RDD - maxSampleSize = numPartitions * 20.0 # constant from Spark's RangePartitioner - fraction = min(maxSampleSize / max(rddSize, 1), 1.0) - samples = self.sample(False, fraction, 1).map(lambda kv: kv[0]).collect() - samples = sorted(samples, key=keyfunc) - - # we have numPartitions many parts but one of the them has - # an implicit boundary - bounds = [samples[int(len(samples) * (i + 1) / numPartitions)] - for i in range(0, numPartitions - 1)] - - def rangePartitioner(k): - p = bisect.bisect_left(bounds, keyfunc(k)) - if ascending: - return p - else: - return numPartitions - 1 - p - - return self.partitionBy(numPartitions, rangePartitioner).mapPartitions(sortPartition, True) - - def sortBy(self, keyfunc, ascending=True, numPartitions=None): - """ - Sorts this RDD by the given keyfunc - - >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)] - >>> sc.parallelize(tmp).sortBy(lambda x: x[0]).collect() - [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)] - >>> sc.parallelize(tmp).sortBy(lambda x: x[1]).collect() - [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)] - """ - return self.keyBy(keyfunc).sortByKey(ascending, numPartitions).values() - - def glom(self): - """ - Return an RDD created by coalescing all elements within each partition - into a list. 
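A plain-Python sketch of the boundary selection sortByKey uses above: sort a sample of the keys, cut it into numPartitions ranges, then route each key with bisect:

import bisect

samples = sorted(['d', 'a', 'c', 'f', 'b', 'e'])      # sampled keys, sorted
numPartitions = 3
bounds = [samples[int(len(samples) * (i + 1) / numPartitions)]
          for i in range(numPartitions - 1)]
print(bounds)                                          # ['c', 'e']
print(bisect.bisect_left(bounds, 'b'))                 # 0 -> first partition
print(bisect.bisect_left(bounds, 'd'))                 # 1 -> middle partition
print(bisect.bisect_left(bounds, 'f'))                 # 2 -> last partition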
- - >>> rdd = sc.parallelize([1, 2, 3, 4], 2) - >>> sorted(rdd.glom().collect()) - [[1, 2], [3, 4]] - """ - def func(iterator): - yield list(iterator) - return self.mapPartitions(func) - - def cartesian(self, other): - """ - Return the Cartesian product of this RDD and another one, that is, the - RDD of all pairs of elements C{(a, b)} where C{a} is in C{self} and - C{b} is in C{other}. - - >>> rdd = sc.parallelize([1, 2]) - >>> sorted(rdd.cartesian(rdd).collect()) - [(1, 1), (1, 2), (2, 1), (2, 2)] - """ - # Due to batching, we can't use the Java cartesian method. - deserializer = CartesianDeserializer(self._jrdd_deserializer, - other._jrdd_deserializer) - return RDD(self._jrdd.cartesian(other._jrdd), self.ctx, deserializer) - - def groupBy(self, f, numPartitions=None, partitionFunc=portable_hash): - """ - Return an RDD of grouped items. - - >>> rdd = sc.parallelize([1, 1, 2, 3, 5, 8]) - >>> result = rdd.groupBy(lambda x: x % 2).collect() - >>> sorted([(x, sorted(y)) for (x, y) in result]) - [(0, [2, 8]), (1, [1, 1, 3, 5])] - """ - return self.map(lambda x: (f(x), x)).groupByKey(numPartitions, partitionFunc) - - @ignore_unicode_prefix - def pipe(self, command, env=None, checkCode=False): - """ - Return an RDD created by piping elements to a forked external process. - - >>> sc.parallelize(['1', '2', '', '3']).pipe('cat').collect() - [u'1', u'2', u'', u'3'] - - :param checkCode: whether or not to check the return value of the shell command. - """ - if env is None: - env = dict() - - def func(iterator): - pipe = Popen( - shlex.split(command), env=env, stdin=PIPE, stdout=PIPE) - - def pipe_objs(out): - for obj in iterator: - s = unicode(obj).rstrip('\n') + '\n' - out.write(s.encode('utf-8')) - out.close() - Thread(target=pipe_objs, args=[pipe.stdin]).start() - - def check_return_code(): - pipe.wait() - if checkCode and pipe.returncode: - raise Exception("Pipe function `%s' exited " - "with error code %d" % (command, pipe.returncode)) - else: - for i in range(0): - yield i - return (x.rstrip(b'\n').decode('utf-8') for x in - chain(iter(pipe.stdout.readline, b''), check_return_code())) - return self.mapPartitions(func) - - def foreach(self, f): - """ - Applies a function to all elements of this RDD. - - >>> def f(x): print(x) - >>> sc.parallelize([1, 2, 3, 4, 5]).foreach(f) - """ - f = fail_on_stopiteration(f) - - def processPartition(iterator): - for x in iterator: - f(x) - return iter([]) - self.mapPartitions(processPartition).count() # Force evaluation - - def foreachPartition(self, f): - """ - Applies a function to each partition of this RDD. - - >>> def f(iterator): - ... for x in iterator: - ... print(x) - >>> sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(f) - """ - def func(it): - r = f(it) - try: - return iter(r) - except TypeError: - return iter([]) - self.mapPartitions(func).count() # Force evaluation - - def collect(self): - """ - Return a list that contains all of the elements in this RDD. - - .. note:: This method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. - """ - with SCCallSiteSync(self.context) as css: - sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) - return list(_load_from_socket(sock_info, self._jrdd_deserializer)) - - def reduce(self, f): - """ - Reduces the elements of this RDD using the specified commutative and - associative binary operator. Currently reduces partitions locally. 
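A plain-Python sketch of that two-level scheme, with partitions modelled as lists: each partition is reduced locally, then the partial results are reduced on the driver:

from functools import reduce
from operator import add

partitions = [[1, 2], [3, 4, 5]]                 # partitions modelled as lists
partials = [reduce(add, p) for p in partitions]  # [3, 12]
print(reduce(add, partials))                     # 15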
- - >>> from operator import add - >>> sc.parallelize([1, 2, 3, 4, 5]).reduce(add) - 15 - >>> sc.parallelize((2 for _ in range(10))).map(lambda x: 1).cache().reduce(add) - 10 - >>> sc.parallelize([]).reduce(add) - Traceback (most recent call last): - ... - ValueError: Can not reduce() empty RDD - """ - f = fail_on_stopiteration(f) - - def func(iterator): - iterator = iter(iterator) - try: - initial = next(iterator) - except StopIteration: - return - yield reduce(f, iterator, initial) - - vals = self.mapPartitions(func).collect() - if vals: - return reduce(f, vals) - raise ValueError("Can not reduce() empty RDD") - - def treeReduce(self, f, depth=2): - """ - Reduces the elements of this RDD in a multi-level tree pattern. - - :param depth: suggested depth of the tree (default: 2) - - >>> add = lambda x, y: x + y - >>> rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10) - >>> rdd.treeReduce(add) - -5 - >>> rdd.treeReduce(add, 1) - -5 - >>> rdd.treeReduce(add, 2) - -5 - >>> rdd.treeReduce(add, 5) - -5 - >>> rdd.treeReduce(add, 10) - -5 - """ - if depth < 1: - raise ValueError("Depth cannot be smaller than 1 but got %d." % depth) - - zeroValue = None, True # Use the second entry to indicate whether this is a dummy value. - - def op(x, y): - if x[1]: - return y - elif y[1]: - return x - else: - return f(x[0], y[0]), False - - reduced = self.map(lambda x: (x, False)).treeAggregate(zeroValue, op, op, depth) - if reduced[1]: - raise ValueError("Cannot reduce empty RDD.") - return reduced[0] - - def fold(self, zeroValue, op): - """ - Aggregate the elements of each partition, and then the results for all - the partitions, using a given associative function and a neutral "zero value." - - The function C{op(t1, t2)} is allowed to modify C{t1} and return it - as its result value to avoid object allocation; however, it should not - modify C{t2}. - - This behaves somewhat differently from fold operations implemented - for non-distributed collections in functional languages like Scala. - This fold operation may be applied to partitions individually, and then - fold those results into the final result, rather than apply the fold - to each element sequentially in some defined ordering. For functions - that are not commutative, the result may differ from that of a fold - applied to a non-distributed collection. - - >>> from operator import add - >>> sc.parallelize([1, 2, 3, 4, 5]).fold(0, add) - 15 - """ - op = fail_on_stopiteration(op) - - def func(iterator): - acc = zeroValue - for obj in iterator: - acc = op(acc, obj) - yield acc - # collecting result of mapPartitions here ensures that the copy of - # zeroValue provided to each partition is unique from the one provided - # to the final reduce call - vals = self.mapPartitions(func).collect() - return reduce(op, vals, zeroValue) - - def aggregate(self, zeroValue, seqOp, combOp): - """ - Aggregate the elements of each partition, and then the results for all - the partitions, using a given combine functions and a neutral "zero - value." - - The functions C{op(t1, t2)} is allowed to modify C{t1} and return it - as its result value to avoid object allocation; however, it should not - modify C{t2}. - - The first function (seqOp) can return a different result type, U, than - the type of this RDD. 
Thus, we need one operation for merging a T into - an U and one operation for merging two U - - >>> seqOp = (lambda x, y: (x[0] + y, x[1] + 1)) - >>> combOp = (lambda x, y: (x[0] + y[0], x[1] + y[1])) - >>> sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp) - (10, 4) - >>> sc.parallelize([]).aggregate((0, 0), seqOp, combOp) - (0, 0) - """ - seqOp = fail_on_stopiteration(seqOp) - combOp = fail_on_stopiteration(combOp) - - def func(iterator): - acc = zeroValue - for obj in iterator: - acc = seqOp(acc, obj) - yield acc - # collecting result of mapPartitions here ensures that the copy of - # zeroValue provided to each partition is unique from the one provided - # to the final reduce call - vals = self.mapPartitions(func).collect() - return reduce(combOp, vals, zeroValue) - - def treeAggregate(self, zeroValue, seqOp, combOp, depth=2): - """ - Aggregates the elements of this RDD in a multi-level tree - pattern. - - :param depth: suggested depth of the tree (default: 2) - - >>> add = lambda x, y: x + y - >>> rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10) - >>> rdd.treeAggregate(0, add, add) - -5 - >>> rdd.treeAggregate(0, add, add, 1) - -5 - >>> rdd.treeAggregate(0, add, add, 2) - -5 - >>> rdd.treeAggregate(0, add, add, 5) - -5 - >>> rdd.treeAggregate(0, add, add, 10) - -5 - """ - if depth < 1: - raise ValueError("Depth cannot be smaller than 1 but got %d." % depth) - - if self.getNumPartitions() == 0: - return zeroValue - - def aggregatePartition(iterator): - acc = zeroValue - for obj in iterator: - acc = seqOp(acc, obj) - yield acc - - partiallyAggregated = self.mapPartitions(aggregatePartition) - numPartitions = partiallyAggregated.getNumPartitions() - scale = max(int(ceil(pow(numPartitions, 1.0 / depth))), 2) - # If creating an extra level doesn't help reduce the wall-clock time, we stop the tree - # aggregation. - while numPartitions > scale + numPartitions / scale: - numPartitions /= scale - curNumPartitions = int(numPartitions) - - def mapPartition(i, iterator): - for obj in iterator: - yield (i % curNumPartitions, obj) - - partiallyAggregated = partiallyAggregated \ - .mapPartitionsWithIndex(mapPartition) \ - .reduceByKey(combOp, curNumPartitions) \ - .values() - - return partiallyAggregated.reduce(combOp) - - def max(self, key=None): - """ - Find the maximum item in this RDD. - - :param key: A function used to generate key for comparing - - >>> rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0]) - >>> rdd.max() - 43.0 - >>> rdd.max(key=str) - 5.0 - """ - if key is None: - return self.reduce(max) - return self.reduce(lambda a, b: max(a, b, key=key)) - - def min(self, key=None): - """ - Find the minimum item in this RDD. - - :param key: A function used to generate key for comparing - - >>> rdd = sc.parallelize([2.0, 5.0, 43.0, 10.0]) - >>> rdd.min() - 2.0 - >>> rdd.min(key=str) - 10.0 - """ - if key is None: - return self.reduce(min) - return self.reduce(lambda a, b: min(a, b, key=key)) - - def sum(self): - """ - Add up the elements in this RDD. - - >>> sc.parallelize([1.0, 2.0, 3.0]).sum() - 6.0 - """ - return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add) - - def count(self): - """ - Return the number of elements in this RDD. - - >>> sc.parallelize([2, 3, 4]).count() - 3 - """ - return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() - - def stats(self): - """ - Return a L{StatCounter} object that captures the mean, variance - and count of the RDD's elements in one operation. 
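For treeAggregate above, a plain-Python sketch of how the partition count shrinks level by level before the final reduce:

from math import ceil

numPartitions, depth = 100, 2
scale = max(int(ceil(numPartitions ** (1.0 / depth))), 2)
levels = []
# keep adding a level only while it actually shrinks the work
while numPartitions > scale + numPartitions / scale:
    numPartitions /= scale
    levels.append(int(numPartitions))
print(scale, levels)                             # 10 [10]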
- """ - def redFunc(left_counter, right_counter): - return left_counter.mergeStats(right_counter) - - return self.mapPartitions(lambda i: [StatCounter(i)]).reduce(redFunc) - - def histogram(self, buckets): - """ - Compute a histogram using the provided buckets. The buckets - are all open to the right except for the last which is closed. - e.g. [1,10,20,50] means the buckets are [1,10) [10,20) [20,50], - which means 1<=x<10, 10<=x<20, 20<=x<=50. And on the input of 1 - and 50 we would have a histogram of 1,0,1. - - If your histogram is evenly spaced (e.g. [0, 10, 20, 30]), - this can be switched from an O(log n) inseration to O(1) per - element (where n is the number of buckets). - - Buckets must be sorted, not contain any duplicates, and have - at least two elements. - - If `buckets` is a number, it will generate buckets which are - evenly spaced between the minimum and maximum of the RDD. For - example, if the min value is 0 and the max is 100, given `buckets` - as 2, the resulting buckets will be [0,50) [50,100]. `buckets` must - be at least 1. An exception is raised if the RDD contains infinity. - If the elements in the RDD do not vary (max == min), a single bucket - will be used. - - The return value is a tuple of buckets and histogram. - - >>> rdd = sc.parallelize(range(51)) - >>> rdd.histogram(2) - ([0, 25, 50], [25, 26]) - >>> rdd.histogram([0, 5, 25, 50]) - ([0, 5, 25, 50], [5, 20, 26]) - >>> rdd.histogram([0, 15, 30, 45, 60]) # evenly spaced buckets - ([0, 15, 30, 45, 60], [15, 15, 15, 6]) - >>> rdd = sc.parallelize(["ab", "ac", "b", "bd", "ef"]) - >>> rdd.histogram(("a", "b", "c")) - (('a', 'b', 'c'), [2, 2]) - """ - - if isinstance(buckets, int): - if buckets < 1: - raise ValueError("number of buckets must be >= 1") - - # filter out non-comparable elements - def comparable(x): - if x is None: - return False - if type(x) is float and isnan(x): - return False - return True - - filtered = self.filter(comparable) - - # faster than stats() - def minmax(a, b): - return min(a[0], b[0]), max(a[1], b[1]) - try: - minv, maxv = filtered.map(lambda x: (x, x)).reduce(minmax) - except TypeError as e: - if " empty " in str(e): - raise ValueError("can not generate buckets from empty RDD") - raise - - if minv == maxv or buckets == 1: - return [minv, maxv], [filtered.count()] - - try: - inc = (maxv - minv) / buckets - except TypeError: - raise TypeError("Can not generate buckets with non-number in RDD") - - if isinf(inc): - raise ValueError("Can not generate buckets with infinite value") - - # keep them as integer if possible - inc = int(inc) - if inc * buckets != maxv - minv: - inc = (maxv - minv) * 1.0 / buckets - - buckets = [i * inc + minv for i in range(buckets)] - buckets.append(maxv) # fix accumulated error - even = True - - elif isinstance(buckets, (list, tuple)): - if len(buckets) < 2: - raise ValueError("buckets should have more than one value") - - if any(i is None or isinstance(i, float) and isnan(i) for i in buckets): - raise ValueError("can not have None or NaN in buckets") - - if sorted(buckets) != list(buckets): - raise ValueError("buckets should be sorted") - - if len(set(buckets)) != len(buckets): - raise ValueError("buckets should not contain duplicated values") - - minv = buckets[0] - maxv = buckets[-1] - even = False - inc = None - try: - steps = [buckets[i + 1] - buckets[i] for i in range(len(buckets) - 1)] - except TypeError: - pass # objects in buckets do not support '-' - else: - if max(steps) - min(steps) < 1e-10: # handle precision errors - even = True - inc = (maxv - 
minv) / (len(buckets) - 1) - - else: - raise TypeError("buckets should be a list or tuple or number(int or long)") - - def histogram(iterator): - counters = [0] * len(buckets) - for i in iterator: - if i is None or (type(i) is float and isnan(i)) or i > maxv or i < minv: - continue - t = (int((i - minv) / inc) if even - else bisect.bisect_right(buckets, i) - 1) - counters[t] += 1 - # add last two together - last = counters.pop() - counters[-1] += last - return [counters] - - def mergeCounters(a, b): - return [i + j for i, j in zip(a, b)] - - return buckets, self.mapPartitions(histogram).reduce(mergeCounters) - - def mean(self): - """ - Compute the mean of this RDD's elements. - - >>> sc.parallelize([1, 2, 3]).mean() - 2.0 - """ - return self.stats().mean() - - def variance(self): - """ - Compute the variance of this RDD's elements. - - >>> sc.parallelize([1, 2, 3]).variance() - 0.666... - """ - return self.stats().variance() - - def stdev(self): - """ - Compute the standard deviation of this RDD's elements. - - >>> sc.parallelize([1, 2, 3]).stdev() - 0.816... - """ - return self.stats().stdev() - - def sampleStdev(self): - """ - Compute the sample standard deviation of this RDD's elements (which - corrects for bias in estimating the standard deviation by dividing by - N-1 instead of N). - - >>> sc.parallelize([1, 2, 3]).sampleStdev() - 1.0 - """ - return self.stats().sampleStdev() - - def sampleVariance(self): - """ - Compute the sample variance of this RDD's elements (which corrects - for bias in estimating the variance by dividing by N-1 instead of N). - - >>> sc.parallelize([1, 2, 3]).sampleVariance() - 1.0 - """ - return self.stats().sampleVariance() - - def countByValue(self): - """ - Return the count of each unique value in this RDD as a dictionary of - (value, count) pairs. - - >>> sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items()) - [(1, 2), (2, 3)] - """ - def countPartition(iterator): - counts = defaultdict(int) - for obj in iterator: - counts[obj] += 1 - yield counts - - def mergeMaps(m1, m2): - for k, v in m2.items(): - m1[k] += v - return m1 - return self.mapPartitions(countPartition).reduce(mergeMaps) - - def top(self, num, key=None): - """ - Get the top N elements from an RDD. - - .. note:: This method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. - - .. note:: It returns the list sorted in descending order. - - >>> sc.parallelize([10, 4, 2, 12, 3]).top(1) - [12] - >>> sc.parallelize([2, 3, 4, 5, 6], 2).top(2) - [6, 5] - >>> sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str) - [4, 3, 2] - """ - def topIterator(iterator): - yield heapq.nlargest(num, iterator, key=key) - - def merge(a, b): - return heapq.nlargest(num, a + b, key=key) - - return self.mapPartitions(topIterator).reduce(merge) - - def takeOrdered(self, num, key=None): - """ - Get the N elements from an RDD ordered in ascending order or as - specified by the optional key function. - - .. note:: this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. 
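Returning to histogram above, a plain-Python sketch of the two bucket-lookup paths: constant-time division for evenly spaced buckets, bisect otherwise:

import bisect

buckets = [0, 5, 25, 50]                   # [0,5) [5,25) [25,50]
even_buckets, inc = [0, 25, 50], 25        # evenly spaced variant

def bucket_index(value):
    # general path: binary search; the last two counters are folded together later
    return bisect.bisect_right(buckets, value) - 1

print(bucket_index(3), bucket_index(25), bucket_index(50))   # 0 2 3 (3 folds into 2)
print(int((30 - even_buckets[0]) / inc))                     # 1 -> O(1) path for even buckets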
- - >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6) - [1, 2, 3, 4, 5, 6] - >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x) - [10, 9, 7, 6, 5, 4] - """ - - def merge(a, b): - return heapq.nsmallest(num, a + b, key) - - return self.mapPartitions(lambda it: [heapq.nsmallest(num, it, key)]).reduce(merge) - - def take(self, num): - """ - Take the first num elements of the RDD. - - It works by first scanning one partition, and use the results from - that partition to estimate the number of additional partitions needed - to satisfy the limit. - - Translated from the Scala implementation in RDD#take(). - - .. note:: this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. - - >>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2) - [2, 3] - >>> sc.parallelize([2, 3, 4, 5, 6]).take(10) - [2, 3, 4, 5, 6] - >>> sc.parallelize(range(100), 100).filter(lambda x: x > 90).take(3) - [91, 92, 93] - """ - items = [] - totalParts = self.getNumPartitions() - partsScanned = 0 - - while len(items) < num and partsScanned < totalParts: - # The number of partitions to try in this iteration. - # It is ok for this number to be greater than totalParts because - # we actually cap it at totalParts in runJob. - numPartsToTry = 1 - if partsScanned > 0: - # If we didn't find any rows after the previous iteration, - # quadruple and retry. Otherwise, interpolate the number of - # partitions we need to try, but overestimate it by 50%. - # We also cap the estimation in the end. - if len(items) == 0: - numPartsToTry = partsScanned * 4 - else: - # the first parameter of max is >=1 whenever partsScanned >= 2 - numPartsToTry = int(1.5 * num * partsScanned / len(items)) - partsScanned - numPartsToTry = min(max(numPartsToTry, 1), partsScanned * 4) - - left = num - len(items) - - def takeUpToNumLeft(iterator): - iterator = iter(iterator) - taken = 0 - while taken < left: - try: - yield next(iterator) - except StopIteration: - return - taken += 1 - - p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts)) - res = self.context.runJob(self, takeUpToNumLeft, p) - - items += res - partsScanned += numPartsToTry - - return items[:num] - - def first(self): - """ - Return the first element in this RDD. - - >>> sc.parallelize([2, 3, 4]).first() - 2 - >>> sc.parallelize([]).first() - Traceback (most recent call last): - ... - ValueError: RDD is empty - """ - rs = self.take(1) - if rs: - return rs[0] - raise ValueError("RDD is empty") - - def isEmpty(self): - """ - Returns true if and only if the RDD contains no elements at all. - - .. note:: an RDD may be empty even when it has at least 1 partition. - - >>> sc.parallelize([]).isEmpty() - True - >>> sc.parallelize([1]).isEmpty() - False - """ - return self.getNumPartitions() == 0 or len(self.take(1)) == 0 - - def saveAsNewAPIHadoopDataset(self, conf, keyConverter=None, valueConverter=None): - """ - Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file - system, using the new Hadoop OutputFormat API (mapreduce package). Keys/values are - converted for output using either user specified converters or, by default, - L{org.apache.spark.api.python.JavaToWritableConverter}. 
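For take above, a plain-Python sketch of how the number of partitions scanned per iteration is chosen:

def next_parts_to_try(num, items_so_far, parts_scanned):
    if parts_scanned == 0:
        return 1                                     # first pass: one partition
    if items_so_far == 0:
        return parts_scanned * 4                     # found nothing: quadruple
    estimate = int(1.5 * num * parts_scanned / items_so_far) - parts_scanned
    return min(max(estimate, 1), parts_scanned * 4)  # extrapolate, capped

print(next_parts_to_try(10, 0, 0))   # 1
print(next_parts_to_try(10, 0, 1))   # 4
print(next_parts_to_try(10, 2, 5))   # 20 (capped at parts_scanned * 4)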
- - :param conf: Hadoop job configuration, passed in as a dict - :param keyConverter: (None by default) - :param valueConverter: (None by default) - """ - jconf = self.ctx._dictToJavaMap(conf) - pickledRDD = self._pickled() - self.ctx._jvm.PythonRDD.saveAsHadoopDataset(pickledRDD._jrdd, True, jconf, - keyConverter, valueConverter, True) - - def saveAsNewAPIHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None, - keyConverter=None, valueConverter=None, conf=None): - """ - Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file - system, using the new Hadoop OutputFormat API (mapreduce package). Key and value types - will be inferred if not specified. Keys and values are converted for output using either - user specified converters or L{org.apache.spark.api.python.JavaToWritableConverter}. The - C{conf} is applied on top of the base Hadoop conf associated with the SparkContext - of this RDD to create a merged Hadoop MapReduce job configuration for saving the data. - - :param path: path to Hadoop file - :param outputFormatClass: fully qualified classname of Hadoop OutputFormat - (e.g. "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat") - :param keyClass: fully qualified classname of key Writable class - (e.g. "org.apache.hadoop.io.IntWritable", None by default) - :param valueClass: fully qualified classname of value Writable class - (e.g. "org.apache.hadoop.io.Text", None by default) - :param keyConverter: (None by default) - :param valueConverter: (None by default) - :param conf: Hadoop job configuration, passed in as a dict (None by default) - """ - jconf = self.ctx._dictToJavaMap(conf) - pickledRDD = self._pickled() - self.ctx._jvm.PythonRDD.saveAsNewAPIHadoopFile(pickledRDD._jrdd, True, path, - outputFormatClass, - keyClass, valueClass, - keyConverter, valueConverter, jconf) - - def saveAsHadoopDataset(self, conf, keyConverter=None, valueConverter=None): - """ - Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file - system, using the old Hadoop OutputFormat API (mapred package). Keys/values are - converted for output using either user specified converters or, by default, - L{org.apache.spark.api.python.JavaToWritableConverter}. - - :param conf: Hadoop job configuration, passed in as a dict - :param keyConverter: (None by default) - :param valueConverter: (None by default) - """ - jconf = self.ctx._dictToJavaMap(conf) - pickledRDD = self._pickled() - self.ctx._jvm.PythonRDD.saveAsHadoopDataset(pickledRDD._jrdd, True, jconf, - keyConverter, valueConverter, False) - - def saveAsHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None, - keyConverter=None, valueConverter=None, conf=None, - compressionCodecClass=None): - """ - Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file - system, using the old Hadoop OutputFormat API (mapred package). Key and value types - will be inferred if not specified. Keys and values are converted for output using either - user specified converters or L{org.apache.spark.api.python.JavaToWritableConverter}. The - C{conf} is applied on top of the base Hadoop conf associated with the SparkContext - of this RDD to create a merged Hadoop MapReduce job configuration for saving the data. - - :param path: path to Hadoop file - :param outputFormatClass: fully qualified classname of Hadoop OutputFormat - (e.g. "org.apache.hadoop.mapred.SequenceFileOutputFormat") - :param keyClass: fully qualified classname of key Writable class - (e.g. 
"org.apache.hadoop.io.IntWritable", None by default) - :param valueClass: fully qualified classname of value Writable class - (e.g. "org.apache.hadoop.io.Text", None by default) - :param keyConverter: (None by default) - :param valueConverter: (None by default) - :param conf: (None by default) - :param compressionCodecClass: (None by default) - """ - jconf = self.ctx._dictToJavaMap(conf) - pickledRDD = self._pickled() - self.ctx._jvm.PythonRDD.saveAsHadoopFile(pickledRDD._jrdd, True, path, - outputFormatClass, - keyClass, valueClass, - keyConverter, valueConverter, - jconf, compressionCodecClass) - - def saveAsSequenceFile(self, path, compressionCodecClass=None): - """ - Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file - system, using the L{org.apache.hadoop.io.Writable} types that we convert from the - RDD's key and value types. The mechanism is as follows: - - 1. Pyrolite is used to convert pickled Python RDD into RDD of Java objects. - 2. Keys and values of this Java RDD are converted to Writables and written out. - - :param path: path to sequence file - :param compressionCodecClass: (None by default) - """ - pickledRDD = self._pickled() - self.ctx._jvm.PythonRDD.saveAsSequenceFile(pickledRDD._jrdd, True, - path, compressionCodecClass) - - def saveAsPickleFile(self, path, batchSize=10): - """ - Save this RDD as a SequenceFile of serialized objects. The serializer - used is L{pyspark.serializers.PickleSerializer}, default batch size - is 10. - - >>> tmpFile = NamedTemporaryFile(delete=True) - >>> tmpFile.close() - >>> sc.parallelize([1, 2, 'spark', 'rdd']).saveAsPickleFile(tmpFile.name, 3) - >>> sorted(sc.pickleFile(tmpFile.name, 5).map(str).collect()) - ['1', '2', 'rdd', 'spark'] - """ - if batchSize == 0: - ser = AutoBatchedSerializer(PickleSerializer()) - else: - ser = BatchedSerializer(PickleSerializer(), batchSize) - self._reserialize(ser)._jrdd.saveAsObjectFile(path) - - @ignore_unicode_prefix - def saveAsTextFile(self, path, compressionCodecClass=None): - """ - Save this RDD as a text file, using string representations of elements. - - @param path: path to text file - @param compressionCodecClass: (None by default) string i.e. - "org.apache.hadoop.io.compress.GzipCodec" - - >>> tempFile = NamedTemporaryFile(delete=True) - >>> tempFile.close() - >>> sc.parallelize(range(10)).saveAsTextFile(tempFile.name) - >>> from fileinput import input - >>> from glob import glob - >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*")))) - '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n' - - Empty lines are tolerated when saving to text files. 
- - >>> tempFile2 = NamedTemporaryFile(delete=True) - >>> tempFile2.close() - >>> sc.parallelize(['', 'foo', '', 'bar', '']).saveAsTextFile(tempFile2.name) - >>> ''.join(sorted(input(glob(tempFile2.name + "/part-0000*")))) - '\\n\\n\\nbar\\nfoo\\n' - - Using compressionCodecClass - - >>> tempFile3 = NamedTemporaryFile(delete=True) - >>> tempFile3.close() - >>> codec = "org.apache.hadoop.io.compress.GzipCodec" - >>> sc.parallelize(['foo', 'bar']).saveAsTextFile(tempFile3.name, codec) - >>> from fileinput import input, hook_compressed - >>> result = sorted(input(glob(tempFile3.name + "/part*.gz"), openhook=hook_compressed)) - >>> b''.join(result).decode('utf-8') - u'bar\\nfoo\\n' - """ - def func(split, iterator): - for x in iterator: - if not isinstance(x, (unicode, bytes)): - x = unicode(x) - if isinstance(x, unicode): - x = x.encode("utf-8") - yield x - keyed = self.mapPartitionsWithIndex(func) - keyed._bypass_serializer = True - if compressionCodecClass: - compressionCodec = self.ctx._jvm.java.lang.Class.forName(compressionCodecClass) - keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path, compressionCodec) - else: - keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path) - - # Pair functions - - def collectAsMap(self): - """ - Return the key-value pairs in this RDD to the master as a dictionary. - - .. note:: this method should only be used if the resulting data is expected - to be small, as all the data is loaded into the driver's memory. - - >>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap() - >>> m[1] - 2 - >>> m[3] - 4 - """ - return dict(self.collect()) - - def keys(self): - """ - Return an RDD with the keys of each tuple. - - >>> m = sc.parallelize([(1, 2), (3, 4)]).keys() - >>> m.collect() - [1, 3] - """ - return self.map(lambda x: x[0]) - - def values(self): - """ - Return an RDD with the values of each tuple. - - >>> m = sc.parallelize([(1, 2), (3, 4)]).values() - >>> m.collect() - [2, 4] - """ - return self.map(lambda x: x[1]) - - def reduceByKey(self, func, numPartitions=None, partitionFunc=portable_hash): - """ - Merge the values for each key using an associative and commutative reduce function. - - This will also perform the merging locally on each mapper before - sending results to a reducer, similarly to a "combiner" in MapReduce. - - Output will be partitioned with C{numPartitions} partitions, or - the default parallelism level if C{numPartitions} is not specified. - Default partitioner is hash-partition. - - >>> from operator import add - >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) - >>> sorted(rdd.reduceByKey(add).collect()) - [('a', 2), ('b', 1)] - """ - return self.combineByKey(lambda x: x, func, func, numPartitions, partitionFunc) - - def reduceByKeyLocally(self, func): - """ - Merge the values for each key using an associative and commutative reduce function, but - return the results immediately to the master as a dictionary. - - This will also perform the merging locally on each mapper before - sending results to a reducer, similarly to a "combiner" in MapReduce. 
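A plain-Python sketch of that map-side combining, with partitions modelled as lists of (key, value) pairs:

partitions = [[("a", 1), ("b", 1)], [("a", 1)]]   # partitions as lists of pairs

def combine(partition):
    # partition-local ("combiner") step
    m = {}
    for k, v in partition:
        m[k] = m[k] + v if k in m else v
    return m

maps = [combine(p) for p in partitions]           # [{'a': 1, 'b': 1}, {'a': 1}]
merged = maps[0]
for m in maps[1:]:
    for k, v in m.items():
        merged[k] = merged[k] + v if k in merged else v
print(sorted(merged.items()))                     # [('a', 2), ('b', 1)]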
- - >>> from operator import add - >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) - >>> sorted(rdd.reduceByKeyLocally(add).items()) - [('a', 2), ('b', 1)] - """ - func = fail_on_stopiteration(func) - - def reducePartition(iterator): - m = {} - for k, v in iterator: - m[k] = func(m[k], v) if k in m else v - yield m - - def mergeMaps(m1, m2): - for k, v in m2.items(): - m1[k] = func(m1[k], v) if k in m1 else v - return m1 - return self.mapPartitions(reducePartition).reduce(mergeMaps) - - def countByKey(self): - """ - Count the number of elements for each key, and return the result to the - master as a dictionary. - - >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) - >>> sorted(rdd.countByKey().items()) - [('a', 2), ('b', 1)] - """ - return self.map(lambda x: x[0]).countByValue() - - def join(self, other, numPartitions=None): - """ - Return an RDD containing all pairs of elements with matching keys in - C{self} and C{other}. - - Each pair of elements will be returned as a (k, (v1, v2)) tuple, where - (k, v1) is in C{self} and (k, v2) is in C{other}. - - Performs a hash join across the cluster. - - >>> x = sc.parallelize([("a", 1), ("b", 4)]) - >>> y = sc.parallelize([("a", 2), ("a", 3)]) - >>> sorted(x.join(y).collect()) - [('a', (1, 2)), ('a', (1, 3))] - """ - return python_join(self, other, numPartitions) - - def leftOuterJoin(self, other, numPartitions=None): - """ - Perform a left outer join of C{self} and C{other}. - - For each element (k, v) in C{self}, the resulting RDD will either - contain all pairs (k, (v, w)) for w in C{other}, or the pair - (k, (v, None)) if no elements in C{other} have key k. - - Hash-partitions the resulting RDD into the given number of partitions. - - >>> x = sc.parallelize([("a", 1), ("b", 4)]) - >>> y = sc.parallelize([("a", 2)]) - >>> sorted(x.leftOuterJoin(y).collect()) - [('a', (1, 2)), ('b', (4, None))] - """ - return python_left_outer_join(self, other, numPartitions) - - def rightOuterJoin(self, other, numPartitions=None): - """ - Perform a right outer join of C{self} and C{other}. - - For each element (k, w) in C{other}, the resulting RDD will either - contain all pairs (k, (v, w)) for v in this, or the pair (k, (None, w)) - if no elements in C{self} have key k. - - Hash-partitions the resulting RDD into the given number of partitions. - - >>> x = sc.parallelize([("a", 1), ("b", 4)]) - >>> y = sc.parallelize([("a", 2)]) - >>> sorted(y.rightOuterJoin(x).collect()) - [('a', (2, 1)), ('b', (None, 4))] - """ - return python_right_outer_join(self, other, numPartitions) - - def fullOuterJoin(self, other, numPartitions=None): - """ - Perform a right outer join of C{self} and C{other}. - - For each element (k, v) in C{self}, the resulting RDD will either - contain all pairs (k, (v, w)) for w in C{other}, or the pair - (k, (v, None)) if no elements in C{other} have key k. - - Similarly, for each element (k, w) in C{other}, the resulting RDD will - either contain all pairs (k, (v, w)) for v in C{self}, or the pair - (k, (None, w)) if no elements in C{self} have key k. - - Hash-partitions the resulting RDD into the given number of partitions. 
- - >>> x = sc.parallelize([("a", 1), ("b", 4)]) - >>> y = sc.parallelize([("a", 2), ("c", 8)]) - >>> sorted(x.fullOuterJoin(y).collect()) - [('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))] - """ - return python_full_outer_join(self, other, numPartitions) - - # TODO: add option to control map-side combining - # portable_hash is used as default, because builtin hash of None is different - # cross machines. - def partitionBy(self, numPartitions, partitionFunc=portable_hash): - """ - Return a copy of the RDD partitioned using the specified partitioner. - - >>> pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x)) - >>> sets = pairs.partitionBy(2).glom().collect() - >>> len(set(sets[0]).intersection(set(sets[1]))) - 0 - """ - if numPartitions is None: - numPartitions = self._defaultReducePartitions() - partitioner = Partitioner(numPartitions, partitionFunc) - if self.partitioner == partitioner: - return self - - # Transferring O(n) objects to Java is too expensive. - # Instead, we'll form the hash buckets in Python, - # transferring O(numPartitions) objects to Java. - # Each object is a (splitNumber, [objects]) pair. - # In order to avoid too huge objects, the objects are - # grouped into chunks. - outputSerializer = self.ctx._unbatched_serializer - - limit = (_parse_memory(self.ctx._conf.get( - "spark.python.worker.memory", "512m")) / 2) - - def add_shuffle_key(split, iterator): - - buckets = defaultdict(list) - c, batch = 0, min(10 * numPartitions, 1000) - - for k, v in iterator: - buckets[partitionFunc(k) % numPartitions].append((k, v)) - c += 1 - - # check used memory and avg size of chunk of objects - if (c % 1000 == 0 and get_used_memory() > limit - or c > batch): - n, size = len(buckets), 0 - for split in list(buckets.keys()): - yield pack_long(split) - d = outputSerializer.dumps(buckets[split]) - del buckets[split] - yield d - size += len(d) - - avg = int(size / n) >> 20 - # let 1M < avg < 10M - if avg < 1: - batch *= 1.5 - elif avg > 10: - batch = max(int(batch / 1.5), 1) - c = 0 - - for split, items in buckets.items(): - yield pack_long(split) - yield outputSerializer.dumps(items) - - keyed = self.mapPartitionsWithIndex(add_shuffle_key, preservesPartitioning=True) - keyed._bypass_serializer = True - with SCCallSiteSync(self.context) as css: - pairRDD = self.ctx._jvm.PairwiseRDD( - keyed._jrdd.rdd()).asJavaPairRDD() - jpartitioner = self.ctx._jvm.PythonPartitioner(numPartitions, - id(partitionFunc)) - jrdd = self.ctx._jvm.PythonRDD.valueOfPair(pairRDD.partitionBy(jpartitioner)) - rdd = RDD(jrdd, self.ctx, BatchedSerializer(outputSerializer)) - rdd.partitioner = partitioner - return rdd - - # TODO: add control over map-side aggregation - def combineByKey(self, createCombiner, mergeValue, mergeCombiners, - numPartitions=None, partitionFunc=portable_hash): - """ - Generic function to combine the elements for each key using a custom - set of aggregation functions. - - Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined - type" C. - - Users provide three functions: - - - C{createCombiner}, which turns a V into a C (e.g., creates - a one-element list) - - C{mergeValue}, to merge a V into a C (e.g., adds it to the end of - a list) - - C{mergeCombiners}, to combine two C's into a single one (e.g., merges - the lists) - - To avoid memory allocation, both mergeValue and mergeCombiners are allowed to - modify and return their first argument instead of creating a new C. - - In addition, users can control the partitioning of the output RDD. - - .. 
note:: V and C can be different -- for example, one might group an RDD of type - (Int, Int) into an RDD of type (Int, List[Int]). - - >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 2)]) - >>> def to_list(a): - ... return [a] - ... - >>> def append(a, b): - ... a.append(b) - ... return a - ... - >>> def extend(a, b): - ... a.extend(b) - ... return a - ... - >>> sorted(x.combineByKey(to_list, append, extend).collect()) - [('a', [1, 2]), ('b', [1])] - """ - if numPartitions is None: - numPartitions = self._defaultReducePartitions() - - serializer = self.ctx.serializer - memory = self._memory_limit() - agg = Aggregator(createCombiner, mergeValue, mergeCombiners) - - def combineLocally(iterator): - merger = ExternalMerger(agg, memory * 0.9, serializer) - merger.mergeValues(iterator) - return merger.items() - - locally_combined = self.mapPartitions(combineLocally, preservesPartitioning=True) - shuffled = locally_combined.partitionBy(numPartitions, partitionFunc) - - def _mergeCombiners(iterator): - merger = ExternalMerger(agg, memory, serializer) - merger.mergeCombiners(iterator) - return merger.items() - - return shuffled.mapPartitions(_mergeCombiners, preservesPartitioning=True) - - def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None, - partitionFunc=portable_hash): - """ - Aggregate the values of each key, using given combine functions and a neutral - "zero value". This function can return a different result type, U, than the type - of the values in this RDD, V. Thus, we need one operation for merging a V into - a U and one operation for merging two U's, The former operation is used for merging - values within a partition, and the latter is used for merging values between - partitions. To avoid memory allocation, both of these functions are - allowed to modify and return their first argument instead of creating a new U. - """ - def createZero(): - return copy.deepcopy(zeroValue) - - return self.combineByKey( - lambda v: seqFunc(createZero(), v), seqFunc, combFunc, numPartitions, partitionFunc) - - def foldByKey(self, zeroValue, func, numPartitions=None, partitionFunc=portable_hash): - """ - Merge the values for each key using an associative function "func" - and a neutral "zeroValue" which may be added to the result an - arbitrary number of times, and must not change the result - (e.g., 0 for addition, or 1 for multiplication.). - - >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) - >>> from operator import add - >>> sorted(rdd.foldByKey(0, add).collect()) - [('a', 2), ('b', 1)] - """ - def createZero(): - return copy.deepcopy(zeroValue) - - return self.combineByKey(lambda v: func(createZero(), v), func, func, numPartitions, - partitionFunc) - - def _memory_limit(self): - return _parse_memory(self.ctx._conf.get("spark.python.worker.memory", "512m")) - - # TODO: support variant with custom partitioner - def groupByKey(self, numPartitions=None, partitionFunc=portable_hash): - """ - Group the values for each key in the RDD into a single sequence. - Hash-partitions the resulting RDD with numPartitions partitions. - - .. note:: If you are grouping in order to perform an aggregation (such as a - sum or average) over each key, using reduceByKey or aggregateByKey will - provide much better performance. 
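For aggregateByKey above, a hedged usage sketch computing a per-key (sum, count) pair, assuming a live SparkContext sc as the doctests do:

# assumes a live SparkContext `sc`
rdd = sc.parallelize([("a", 1), ("a", 3), ("b", 2)])
seqFunc = lambda acc, v: (acc[0] + v, acc[1] + 1)      # fold a value into (sum, count)
combFunc = lambda a, b: (a[0] + b[0], a[1] + b[1])     # merge two (sum, count) pairs
print(sorted(rdd.aggregateByKey((0, 0), seqFunc, combFunc).collect()))
# [('a', (4, 2)), ('b', (2, 1))]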
- - >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) - >>> sorted(rdd.groupByKey().mapValues(len).collect()) - [('a', 2), ('b', 1)] - >>> sorted(rdd.groupByKey().mapValues(list).collect()) - [('a', [1, 1]), ('b', [1])] - """ - def createCombiner(x): - return [x] - - def mergeValue(xs, x): - xs.append(x) - return xs - - def mergeCombiners(a, b): - a.extend(b) - return a - - memory = self._memory_limit() - serializer = self._jrdd_deserializer - agg = Aggregator(createCombiner, mergeValue, mergeCombiners) - - def combine(iterator): - merger = ExternalMerger(agg, memory * 0.9, serializer) - merger.mergeValues(iterator) - return merger.items() - - locally_combined = self.mapPartitions(combine, preservesPartitioning=True) - shuffled = locally_combined.partitionBy(numPartitions, partitionFunc) - - def groupByKey(it): - merger = ExternalGroupBy(agg, memory, serializer) - merger.mergeCombiners(it) - return merger.items() - - return shuffled.mapPartitions(groupByKey, True).mapValues(ResultIterable) - - def flatMapValues(self, f): - """ - Pass each value in the key-value pair RDD through a flatMap function - without changing the keys; this also retains the original RDD's - partitioning. - - >>> x = sc.parallelize([("a", ["x", "y", "z"]), ("b", ["p", "r"])]) - >>> def f(x): return x - >>> x.flatMapValues(f).collect() - [('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')] - """ - flat_map_fn = lambda kv: ((kv[0], x) for x in f(kv[1])) - return self.flatMap(flat_map_fn, preservesPartitioning=True) - - def mapValues(self, f): - """ - Pass each value in the key-value pair RDD through a map function - without changing the keys; this also retains the original RDD's - partitioning. - - >>> x = sc.parallelize([("a", ["apple", "banana", "lemon"]), ("b", ["grapes"])]) - >>> def f(x): return len(x) - >>> x.mapValues(f).collect() - [('a', 3), ('b', 1)] - """ - map_values_fn = lambda kv: (kv[0], f(kv[1])) - return self.map(map_values_fn, preservesPartitioning=True) - - def groupWith(self, other, *others): - """ - Alias for cogroup but with support for multiple RDDs. - - >>> w = sc.parallelize([("a", 5), ("b", 6)]) - >>> x = sc.parallelize([("a", 1), ("b", 4)]) - >>> y = sc.parallelize([("a", 2)]) - >>> z = sc.parallelize([("b", 42)]) - >>> [(x, tuple(map(list, y))) for x, y in sorted(list(w.groupWith(x, y, z).collect()))] - [('a', ([5], [1], [2], [])), ('b', ([6], [4], [], [42]))] - - """ - return python_cogroup((self, other) + others, numPartitions=None) - - # TODO: add variant with custom parittioner - def cogroup(self, other, numPartitions=None): - """ - For each key k in C{self} or C{other}, return a resulting RDD that - contains a tuple with the list of values for that key in C{self} as - well as C{other}. - - >>> x = sc.parallelize([("a", 1), ("b", 4)]) - >>> y = sc.parallelize([("a", 2)]) - >>> [(x, tuple(map(list, y))) for x, y in sorted(list(x.cogroup(y).collect()))] - [('a', ([1], [2])), ('b', ([4], []))] - """ - return python_cogroup((self, other), numPartitions) - - def sampleByKey(self, withReplacement, fractions, seed=None): - """ - Return a subset of this RDD sampled by key (via stratified sampling). - Create a sample of this RDD using variable sampling rates for - different keys as specified by fractions, a key to sampling rate map. 
- - >>> fractions = {"a": 0.2, "b": 0.1} - >>> rdd = sc.parallelize(fractions.keys()).cartesian(sc.parallelize(range(0, 1000))) - >>> sample = dict(rdd.sampleByKey(False, fractions, 2).groupByKey().collect()) - >>> 100 < len(sample["a"]) < 300 and 50 < len(sample["b"]) < 150 - True - >>> max(sample["a"]) <= 999 and min(sample["a"]) >= 0 - True - >>> max(sample["b"]) <= 999 and min(sample["b"]) >= 0 - True - """ - for fraction in fractions.values(): - assert fraction >= 0.0, "Negative fraction value: %s" % fraction - return self.mapPartitionsWithIndex( - RDDStratifiedSampler(withReplacement, fractions, seed).func, True) - - def subtractByKey(self, other, numPartitions=None): - """ - Return each (key, value) pair in C{self} that has no pair with matching - key in C{other}. - - >>> x = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 2)]) - >>> y = sc.parallelize([("a", 3), ("c", None)]) - >>> sorted(x.subtractByKey(y).collect()) - [('b', 4), ('b', 5)] - """ - def filter_func(pair): - key, (val1, val2) = pair - return val1 and not val2 - return self.cogroup(other, numPartitions).filter(filter_func).flatMapValues(lambda x: x[0]) - - def subtract(self, other, numPartitions=None): - """ - Return each value in C{self} that is not contained in C{other}. - - >>> x = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 3)]) - >>> y = sc.parallelize([("a", 3), ("c", None)]) - >>> sorted(x.subtract(y).collect()) - [('a', 1), ('b', 4), ('b', 5)] - """ - # note: here 'True' is just a placeholder - rdd = other.map(lambda x: (x, True)) - return self.map(lambda x: (x, True)).subtractByKey(rdd, numPartitions).keys() - - def keyBy(self, f): - """ - Creates tuples of the elements in this RDD by applying C{f}. - - >>> x = sc.parallelize(range(0,3)).keyBy(lambda x: x*x) - >>> y = sc.parallelize(zip(range(0,5), range(0,5))) - >>> [(x, list(map(list, y))) for x, y in sorted(x.cogroup(y).collect())] - [(0, [[0], [0]]), (1, [[1], [1]]), (2, [[], [2]]), (3, [[], [3]]), (4, [[2], [4]])] - """ - return self.map(lambda x: (f(x), x)) - - def repartition(self, numPartitions): - """ - Return a new RDD that has exactly numPartitions partitions. - - Can increase or decrease the level of parallelism in this RDD. - Internally, this uses a shuffle to redistribute data. - If you are decreasing the number of partitions in this RDD, consider - using `coalesce`, which can avoid performing a shuffle. - - >>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4) - >>> sorted(rdd.glom().collect()) - [[1], [2, 3], [4, 5], [6, 7]] - >>> len(rdd.repartition(2).glom().collect()) - 2 - >>> len(rdd.repartition(10).glom().collect()) - 10 - """ - return self.coalesce(numPartitions, shuffle=True) - - def coalesce(self, numPartitions, shuffle=False): - """ - Return a new RDD that is reduced into `numPartitions` partitions. - - >>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect() - [[1], [2, 3], [4, 5]] - >>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect() - [[1, 2, 3, 4, 5]] - """ - if shuffle: - # Decrease the batch size in order to distribute evenly the elements across output - # partitions. Otherwise, repartition will possibly produce highly skewed partitions. 
- batchSize = min(10, self.ctx._batchSize or 1024) - ser = BatchedSerializer(PickleSerializer(), batchSize) - selfCopy = self._reserialize(ser) - jrdd_deserializer = selfCopy._jrdd_deserializer - jrdd = selfCopy._jrdd.coalesce(numPartitions, shuffle) - else: - jrdd_deserializer = self._jrdd_deserializer - jrdd = self._jrdd.coalesce(numPartitions, shuffle) - return RDD(jrdd, self.ctx, jrdd_deserializer) - - def zip(self, other): - """ - Zips this RDD with another one, returning key-value pairs with the - first element in each RDD second element in each RDD, etc. Assumes - that the two RDDs have the same number of partitions and the same - number of elements in each partition (e.g. one was made through - a map on the other). - - >>> x = sc.parallelize(range(0,5)) - >>> y = sc.parallelize(range(1000, 1005)) - >>> x.zip(y).collect() - [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)] - """ - def get_batch_size(ser): - if isinstance(ser, BatchedSerializer): - return ser.batchSize - return 1 # not batched - - def batch_as(rdd, batchSize): - return rdd._reserialize(BatchedSerializer(PickleSerializer(), batchSize)) - - my_batch = get_batch_size(self._jrdd_deserializer) - other_batch = get_batch_size(other._jrdd_deserializer) - if my_batch != other_batch or not my_batch: - # use the smallest batchSize for both of them - batchSize = min(my_batch, other_batch) - if batchSize <= 0: - # auto batched or unlimited - batchSize = 100 - other = batch_as(other, batchSize) - self = batch_as(self, batchSize) - - if self.getNumPartitions() != other.getNumPartitions(): - raise ValueError("Can only zip with RDD which has the same number of partitions") - - # There will be an Exception in JVM if there are different number - # of items in each partitions. - pairRDD = self._jrdd.zip(other._jrdd) - deserializer = PairDeserializer(self._jrdd_deserializer, - other._jrdd_deserializer) - return RDD(pairRDD, self.ctx, deserializer) - - def zipWithIndex(self): - """ - Zips this RDD with its element indices. - - The ordering is first based on the partition index and then the - ordering of items within each partition. So the first item in - the first partition gets index 0, and the last item in the last - partition receives the largest index. - - This method needs to trigger a spark job when this RDD contains - more than one partitions. - - >>> sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect() - [('a', 0), ('b', 1), ('c', 2), ('d', 3)] - """ - starts = [0] - if self.getNumPartitions() > 1: - nums = self.mapPartitions(lambda it: [sum(1 for i in it)]).collect() - for i in range(len(nums) - 1): - starts.append(starts[-1] + nums[i]) - - def func(k, it): - for i, v in enumerate(it, starts[k]): - yield v, i - - return self.mapPartitionsWithIndex(func) - - def zipWithUniqueId(self): - """ - Zips this RDD with generated unique Long ids. - - Items in the kth partition will get ids k, n+k, 2*n+k, ..., where - n is the number of partitions. So there may exist gaps, but this - method won't trigger a spark job, which is different from - L{zipWithIndex} - - >>> sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect() - [('a', 0), ('b', 1), ('c', 4), ('d', 2), ('e', 5)] - """ - n = self.getNumPartitions() - - def func(k, it): - for i, v in enumerate(it): - yield v, i * n + k - - return self.mapPartitionsWithIndex(func) - - def name(self): - """ - Return the name of this RDD. 
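For zipWithUniqueId above, a plain-Python sketch of the id formula i * n + k, with partitions modelled as lists:

partitions = [["a", "b"], ["c", "d"], ["e"]]     # partitions modelled as lists
n = len(partitions)
ids = [(v, i * n + k)
       for k, part in enumerate(partitions)
       for i, v in enumerate(part)]
print(ids)   # [('a', 0), ('b', 3), ('c', 1), ('d', 4), ('e', 2)]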
- """ - n = self._jrdd.name() - if n: - return n - - @ignore_unicode_prefix - def setName(self, name): - """ - Assign a name to this RDD. - - >>> rdd1 = sc.parallelize([1, 2]) - >>> rdd1.setName('RDD1').name() - u'RDD1' - """ - self._jrdd.setName(name) - return self - - def toDebugString(self): - """ - A description of this RDD and its recursive dependencies for debugging. - """ - debug_string = self._jrdd.toDebugString() - if debug_string: - return debug_string.encode('utf-8') - - def getStorageLevel(self): - """ - Get the RDD's current storage level. - - >>> rdd1 = sc.parallelize([1,2]) - >>> rdd1.getStorageLevel() - StorageLevel(False, False, False, False, 1) - >>> print(rdd1.getStorageLevel()) - Serialized 1x Replicated - """ - java_storage_level = self._jrdd.getStorageLevel() - storage_level = StorageLevel(java_storage_level.useDisk(), - java_storage_level.useMemory(), - java_storage_level.useOffHeap(), - java_storage_level.deserialized(), - java_storage_level.replication()) - return storage_level - - def _defaultReducePartitions(self): - """ - Returns the default number of partitions to use during reduce tasks (e.g., groupBy). - If spark.default.parallelism is set, then we'll use the value from SparkContext - defaultParallelism, otherwise we'll use the number of partitions in this RDD. - - This mirrors the behavior of the Scala Partitioner#defaultPartitioner, intended to reduce - the likelihood of OOMs. Once PySpark adopts Partitioner-based APIs, this behavior will - be inherent. - """ - if self.ctx._conf.contains("spark.default.parallelism"): - return self.ctx.defaultParallelism - else: - return self.getNumPartitions() - - def lookup(self, key): - """ - Return the list of values in the RDD for key `key`. This operation - is done efficiently if the RDD has a known partitioner by only - searching the partition that the key maps to. - - >>> l = range(1000) - >>> rdd = sc.parallelize(zip(l, l), 10) - >>> rdd.lookup(42) # slow - [42] - >>> sorted = rdd.sortByKey() - >>> sorted.lookup(42) # fast - [42] - >>> sorted.lookup(1024) - [] - >>> rdd2 = sc.parallelize([(('a', 'b'), 'c')]).groupByKey() - >>> list(rdd2.lookup(('a', 'b'))[0]) - ['c'] - """ - values = self.filter(lambda kv: kv[0] == key).values() - - if self.partitioner is not None: - return self.ctx.runJob(values, lambda x: x, [self.partitioner(key)]) - - return values.collect() - - def _to_java_object_rdd(self): - """ Return a JavaRDD of Object by unpickling - - It will convert each Python object into Java object by Pyrolite, whenever the - RDD is serialized in batch or not. - """ - rdd = self._pickled() - return self.ctx._jvm.SerDeUtil.pythonToJava(rdd._jrdd, True) - - def countApprox(self, timeout, confidence=0.95): - """ - .. note:: Experimental - - Approximate version of count() that returns a potentially incomplete - result within a timeout, even if not all tasks have finished. - - >>> rdd = sc.parallelize(range(1000), 10) - >>> rdd.countApprox(1000, 1.0) - 1000 - """ - drdd = self.mapPartitions(lambda it: [float(sum(1 for i in it))]) - return int(drdd.sumApprox(timeout, confidence)) - - def sumApprox(self, timeout, confidence=0.95): - """ - .. note:: Experimental - - Approximate operation to return the sum within a timeout - or meet the confidence. 
- - >>> rdd = sc.parallelize(range(1000), 10) - >>> r = sum(range(1000)) - >>> abs(rdd.sumApprox(1000) - r) / r < 0.05 - True - """ - jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd() - jdrdd = self.ctx._jvm.JavaDoubleRDD.fromRDD(jrdd.rdd()) - r = jdrdd.sumApprox(timeout, confidence).getFinalValue() - return BoundedFloat(r.mean(), r.confidence(), r.low(), r.high()) - - def meanApprox(self, timeout, confidence=0.95): - """ - .. note:: Experimental - - Approximate operation to return the mean within a timeout - or meet the confidence. - - >>> rdd = sc.parallelize(range(1000), 10) - >>> r = sum(range(1000)) / 1000.0 - >>> abs(rdd.meanApprox(1000) - r) / r < 0.05 - True - """ - jrdd = self.map(float)._to_java_object_rdd() - jdrdd = self.ctx._jvm.JavaDoubleRDD.fromRDD(jrdd.rdd()) - r = jdrdd.meanApprox(timeout, confidence).getFinalValue() - return BoundedFloat(r.mean(), r.confidence(), r.low(), r.high()) - - def countApproxDistinct(self, relativeSD=0.05): - """ - .. note:: Experimental - - Return approximate number of distinct elements in the RDD. - - The algorithm used is based on streamlib's implementation of - `"HyperLogLog in Practice: Algorithmic Engineering of a State - of The Art Cardinality Estimation Algorithm", available here - `_. - - :param relativeSD: Relative accuracy. Smaller values create - counters that require more space. - It must be greater than 0.000017. - - >>> n = sc.parallelize(range(1000)).map(str).countApproxDistinct() - >>> 900 < n < 1100 - True - >>> n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct() - >>> 16 < n < 24 - True - """ - if relativeSD < 0.000017: - raise ValueError("relativeSD should be greater than 0.000017") - # the hash space in Java is 2^32 - hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF) - return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD) - - def toLocalIterator(self): - """ - Return an iterator that contains all of the elements in this RDD. - The iterator will consume as much memory as the largest partition in this RDD. - - >>> rdd = sc.parallelize(range(10)) - >>> [x for x in rdd.toLocalIterator()] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - """ - with SCCallSiteSync(self.context) as css: - sock_info = self.ctx._jvm.PythonRDD.toLocalIteratorAndServe(self._jrdd.rdd()) - return _load_from_socket(sock_info, self._jrdd_deserializer) - - def barrier(self): - """ - .. note:: Experimental - - Marks the current stage as a barrier stage, where Spark must launch all tasks together. - In case of a task failure, instead of only restarting the failed task, Spark will abort the - entire stage and relaunch all tasks for this stage. - The barrier execution mode feature is experimental and it only handles limited scenarios. - Please read the linked SPIP and design docs to understand the limitations and future plans. - - :return: an :class:`RDDBarrier` instance that provides actions within a barrier stage. - - .. seealso:: :class:`BarrierTaskContext` - .. seealso:: `SPIP: Barrier Execution Mode - `_ - .. seealso:: `Design Doc `_ - - .. versionadded:: 2.4.0 - """ - return RDDBarrier(self) - - def _is_barrier(self): - """ - Whether this RDD is in a barrier stage. 
- """ - return self._jrdd.rdd().isBarrier() - - -def _prepare_for_python_RDD(sc, command): - # the serialized command will be compressed by broadcast - ser = CloudPickleSerializer() - pickled_command = ser.dumps(command) - if len(pickled_command) > (1 << 20): # 1M - # The broadcast will have same life cycle as created PythonRDD - broadcast = sc.broadcast(pickled_command) - pickled_command = ser.dumps(broadcast) - broadcast_vars = [x._jbroadcast for x in sc._pickled_broadcast_vars] - sc._pickled_broadcast_vars.clear() - return pickled_command, broadcast_vars, sc.environment, sc._python_includes - - -def _wrap_function(sc, func, deserializer, serializer, profiler=None): - assert deserializer, "deserializer should not be empty" - assert serializer, "serializer should not be empty" - command = (func, profiler, deserializer, serializer) - pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command) - return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec, - sc.pythonVer, broadcast_vars, sc._javaAccumulator) - - -class RDDBarrier(object): - - """ - .. note:: Experimental - - Wraps an RDD in a barrier stage, which forces Spark to launch tasks of this stage together. - :class:`RDDBarrier` instances are created by :func:`RDD.barrier`. - - .. versionadded:: 2.4.0 - """ - - def __init__(self, rdd): - self.rdd = rdd - - def mapPartitions(self, f, preservesPartitioning=False): - """ - .. note:: Experimental - - Returns a new RDD by applying a function to each partition of the wrapped RDD, - where tasks are launched together in a barrier stage. - The interface is the same as :func:`RDD.mapPartitions`. - Please see the API doc there. - - .. versionadded:: 2.4.0 - """ - def func(s, iterator): - return f(iterator) - return PipelinedRDD(self.rdd, func, preservesPartitioning, isFromBarrier=True) - - -class PipelinedRDD(RDD): - - """ - Pipelined maps: - - >>> rdd = sc.parallelize([1, 2, 3, 4]) - >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect() - [4, 8, 12, 16] - >>> rdd.map(lambda x: 2 * x).map(lambda x: 2 * x).collect() - [4, 8, 12, 16] - - Pipelined reduces: - >>> from operator import add - >>> rdd.map(lambda x: 2 * x).reduce(add) - 20 - >>> rdd.flatMap(lambda x: [x, x]).reduce(add) - 20 - """ - - def __init__(self, prev, func, preservesPartitioning=False, isFromBarrier=False): - if not isinstance(prev, PipelinedRDD) or not prev._is_pipelinable(): - # This transformation is the first in its stage: - self.func = func - self.preservesPartitioning = preservesPartitioning - self._prev_jrdd = prev._jrdd - self._prev_jrdd_deserializer = prev._jrdd_deserializer - else: - prev_func = prev.func - - def pipeline_func(split, iterator): - return func(split, prev_func(split, iterator)) - self.func = pipeline_func - self.preservesPartitioning = \ - prev.preservesPartitioning and preservesPartitioning - self._prev_jrdd = prev._prev_jrdd # maintain the pipeline - self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer - self.is_cached = False - self.is_checkpointed = False - self.ctx = prev.ctx - self.prev = prev - self._jrdd_val = None - self._id = None - self._jrdd_deserializer = self.ctx.serializer - self._bypass_serializer = False - self.partitioner = prev.partitioner if self.preservesPartitioning else None - self.is_barrier = prev._is_barrier() or isFromBarrier - - def getNumPartitions(self): - return self._prev_jrdd.partitions().size() - - @property - def _jrdd(self): - if self._jrdd_val: - return self._jrdd_val - if 
self._bypass_serializer: - self._jrdd_deserializer = NoOpSerializer() - - if self.ctx.profiler_collector: - profiler = self.ctx.profiler_collector.new_profiler(self.ctx) - else: - profiler = None - - wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer, - self._jrdd_deserializer, profiler) - python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func, - self.preservesPartitioning, self.is_barrier) - self._jrdd_val = python_rdd.asJavaRDD() - - if profiler: - self._id = self._jrdd_val.id() - self.ctx.profiler_collector.add_profiler(self._id, profiler) - return self._jrdd_val - - def id(self): - if self._id is None: - self._id = self._jrdd.id() - return self._id - - def _is_pipelinable(self): - return not (self.is_cached or self.is_checkpointed) - - def _is_barrier(self): - return self.is_barrier - - -def _test(): - import doctest - from pyspark.context import SparkContext - globs = globals().copy() - # The small batch size here ensures that we see multiple batches, - # even in these small test examples: - globs['sc'] = SparkContext('local[4]', 'PythonTest') - (failure_count, test_count) = doctest.testmod( - globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/rddsampler.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/rddsampler.py deleted file mode 100644 index fe8f873..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/rddsampler.py +++ /dev/null @@ -1,119 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import sys -import random -import math - - -class RDDSamplerBase(object): - - def __init__(self, withReplacement, seed=None): - self._seed = seed if seed is not None else random.randint(0, sys.maxsize) - self._withReplacement = withReplacement - self._random = None - - def initRandomGenerator(self, split): - self._random = random.Random(self._seed ^ split) - - # mixing because the initial seeds are close to each other - for _ in range(10): - self._random.randint(0, 1) - - def getUniformSample(self): - return self._random.random() - - def getPoissonSample(self, mean): - # Using Knuth's algorithm described in - # http://en.wikipedia.org/wiki/Poisson_distribution - if mean < 20.0: - # one exp and k+1 random calls - l = math.exp(-mean) - p = self._random.random() - k = 0 - while p > l: - k += 1 - p *= self._random.random() - else: - # switch to the log domain, k+1 expovariate (random + log) calls - p = self._random.expovariate(mean) - k = 0 - while p < 1.0: - k += 1 - p += self._random.expovariate(mean) - return k - - def func(self, split, iterator): - raise NotImplementedError - - -class RDDSampler(RDDSamplerBase): - - def __init__(self, withReplacement, fraction, seed=None): - RDDSamplerBase.__init__(self, withReplacement, seed) - self._fraction = fraction - - def func(self, split, iterator): - self.initRandomGenerator(split) - if self._withReplacement: - for obj in iterator: - # For large datasets, the expected number of occurrences of each element in - # a sample with replacement is Poisson(frac). We use that to get a count for - # each element. - count = self.getPoissonSample(self._fraction) - for _ in range(0, count): - yield obj - else: - for obj in iterator: - if self.getUniformSample() < self._fraction: - yield obj - - -class RDDRangeSampler(RDDSamplerBase): - - def __init__(self, lowerBound, upperBound, seed=None): - RDDSamplerBase.__init__(self, False, seed) - self._lowerBound = lowerBound - self._upperBound = upperBound - - def func(self, split, iterator): - self.initRandomGenerator(split) - for obj in iterator: - if self._lowerBound <= self.getUniformSample() < self._upperBound: - yield obj - - -class RDDStratifiedSampler(RDDSamplerBase): - - def __init__(self, withReplacement, fractions, seed=None): - RDDSamplerBase.__init__(self, withReplacement, seed) - self._fractions = fractions - - def func(self, split, iterator): - self.initRandomGenerator(split) - if self._withReplacement: - for key, val in iterator: - # For large datasets, the expected number of occurrences of each element in - # a sample with replacement is Poisson(frac). We use that to get a count for - # each element. - count = self.getPoissonSample(self._fractions[key]) - for _ in range(0, count): - yield key, val - else: - for key, val in iterator: - if self.getUniformSample() < self._fractions[key]: - yield key, val diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/resultiterable.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/resultiterable.py deleted file mode 100644 index 1ab5ce1..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/resultiterable.py +++ /dev/null @@ -1,39 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import collections - -__all__ = ["ResultIterable"] - - -class ResultIterable(collections.Iterable): - - """ - A special result iterable. This is used because the standard - iterator can not be pickled - """ - - def __init__(self, data): - self.data = data - self.index = 0 - self.maxindex = len(data) - - def __iter__(self): - return iter(self.data) - - def __len__(self): - return len(self.data) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/serializers.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/serializers.py deleted file mode 100644 index 5398bf4..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/serializers.py +++ /dev/null @@ -1,792 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -PySpark supports custom serializers for transferring data; this can improve -performance. - -By default, PySpark uses L{PickleSerializer} to serialize objects using Python's -C{cPickle} serializer, which can serialize nearly any Python object. -Other serializers, like L{MarshalSerializer}, support fewer datatypes but can be -faster. 
- -The serializer is chosen when creating L{SparkContext}: - ->>> from pyspark.context import SparkContext ->>> from pyspark.serializers import MarshalSerializer ->>> sc = SparkContext('local', 'test', serializer=MarshalSerializer()) ->>> sc.parallelize(list(range(1000))).map(lambda x: 2 * x).take(10) -[0, 2, 4, 6, 8, 10, 12, 14, 16, 18] ->>> sc.stop() - -PySpark serializes objects in batches; by default, the batch size is chosen based -on the size of objects and is also configurable by SparkContext's C{batchSize} -parameter: - ->>> sc = SparkContext('local', 'test', batchSize=2) ->>> rdd = sc.parallelize(range(16), 4).map(lambda x: x) - -Behind the scenes, this creates a JavaRDD with four partitions, each of -which contains two batches of two objects: - ->>> rdd.glom().collect() -[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]] ->>> int(rdd._jrdd.count()) -8 ->>> sc.stop() -""" - -import sys -from itertools import chain, product -import marshal -import struct -import types -import collections -import zlib -import itertools - -if sys.version < '3': - import cPickle as pickle - protocol = 2 - from itertools import izip as zip, imap as map -else: - import pickle - protocol = 3 - xrange = range - -from pyspark import cloudpickle -from pyspark.util import _exception_message - - -__all__ = ["PickleSerializer", "MarshalSerializer", "UTF8Deserializer"] - - -class SpecialLengths(object): - END_OF_DATA_SECTION = -1 - PYTHON_EXCEPTION_THROWN = -2 - TIMING_DATA = -3 - END_OF_STREAM = -4 - NULL = -5 - START_ARROW_STREAM = -6 - - -class Serializer(object): - - def dump_stream(self, iterator, stream): - """ - Serialize an iterator of objects to the output stream. - """ - raise NotImplementedError - - def load_stream(self, stream): - """ - Return an iterator of deserialized objects from the input stream. - """ - raise NotImplementedError - - def _load_stream_without_unbatching(self, stream): - """ - Return an iterator of deserialized batches (iterable) of objects from the input stream. - If the serializer does not operate on batches the default implementation returns an - iterator of single element lists. - """ - return map(lambda x: [x], self.load_stream(stream)) - - # Note: our notion of "equality" is that output generated by - # equal serializers can be deserialized using the same serializer. - - # This default implementation handles the simple cases; - # subclasses should override __eq__ as appropriate. - - def __eq__(self, other): - return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ - - def __ne__(self, other): - return not self.__eq__(other) - - def __repr__(self): - return "%s()" % self.__class__.__name__ - - def __hash__(self): - return hash(str(self)) - - -class FramedSerializer(Serializer): - - """ - Serializer that writes objects as a stream of (length, data) pairs, - where C{length} is a 32-bit integer and data is C{length} bytes. - """ - - def __init__(self): - # On Python 2.6, we can't write bytearrays to streams, so we need to convert them - # to strings first. Check if the version number is that old. 
- self._only_write_strings = sys.version_info[0:2] <= (2, 6) - - def dump_stream(self, iterator, stream): - for obj in iterator: - self._write_with_length(obj, stream) - - def load_stream(self, stream): - while True: - try: - yield self._read_with_length(stream) - except EOFError: - return - - def _write_with_length(self, obj, stream): - serialized = self.dumps(obj) - if serialized is None: - raise ValueError("serialized value should not be None") - if len(serialized) > (1 << 31): - raise ValueError("can not serialize object larger than 2G") - write_int(len(serialized), stream) - if self._only_write_strings: - stream.write(str(serialized)) - else: - stream.write(serialized) - - def _read_with_length(self, stream): - length = read_int(stream) - if length == SpecialLengths.END_OF_DATA_SECTION: - raise EOFError - elif length == SpecialLengths.NULL: - return None - obj = stream.read(length) - if len(obj) < length: - raise EOFError - return self.loads(obj) - - def dumps(self, obj): - """ - Serialize an object into a byte array. - When batching is used, this will be called with an array of objects. - """ - raise NotImplementedError - - def loads(self, obj): - """ - Deserialize an object from a byte array. - """ - raise NotImplementedError - - -class ArrowStreamSerializer(Serializer): - """ - Serializes Arrow record batches as a stream. - """ - - def dump_stream(self, iterator, stream): - import pyarrow as pa - writer = None - try: - for batch in iterator: - if writer is None: - writer = pa.RecordBatchStreamWriter(stream, batch.schema) - writer.write_batch(batch) - finally: - if writer is not None: - writer.close() - - def load_stream(self, stream): - import pyarrow as pa - reader = pa.open_stream(stream) - for batch in reader: - yield batch - - def __repr__(self): - return "ArrowStreamSerializer" - - -def _create_batch(series, timezone): - """ - Create an Arrow record batch from the given pandas.Series or list of Series, with optional type. - - :param series: A single pandas.Series, list of Series, or list of (series, arrow_type) - :param timezone: A timezone to respect when handling timestamp values - :return: Arrow RecordBatch - """ - import decimal - from distutils.version import LooseVersion - import pyarrow as pa - from pyspark.sql.types import _check_series_convert_timestamps_internal - # Make input conform to [(series1, type1), (series2, type2), ...] - if not isinstance(series, (list, tuple)) or \ - (len(series) == 2 and isinstance(series[1], pa.DataType)): - series = [series] - series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series) - - def create_array(s, t): - mask = s.isnull() - # Ensure timestamp series are in expected form for Spark internal representation - # TODO: maybe don't need None check anymore as of Arrow 0.9.1 - if t is not None and pa.types.is_timestamp(t): - s = _check_series_convert_timestamps_internal(s.fillna(0), timezone) - # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2 - return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False) - elif t is not None and pa.types.is_string(t) and sys.version < '3': - # TODO: need decode before converting to Arrow in Python 2 - # TODO: don't need as of Arrow 0.9.1 - return pa.Array.from_pandas(s.apply( - lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t) - elif t is not None and pa.types.is_decimal(t) and \ - LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"): - # TODO: see ARROW-2432. 
Remove when the minimum PyArrow version becomes 0.10.0. - return pa.Array.from_pandas(s.apply( - lambda v: decimal.Decimal('NaN') if v is None else v), mask=mask, type=t) - elif LooseVersion(pa.__version__) < LooseVersion("0.11.0"): - # TODO: see ARROW-1949. Remove when the minimum PyArrow version becomes 0.11.0. - return pa.Array.from_pandas(s, mask=mask, type=t) - return pa.Array.from_pandas(s, mask=mask, type=t, safe=False) - - arrs = [create_array(s, t) for s, t in series] - return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))]) - - -class ArrowStreamPandasSerializer(Serializer): - """ - Serializes Pandas.Series as Arrow data with Arrow streaming format. - """ - - def __init__(self, timezone): - super(ArrowStreamPandasSerializer, self).__init__() - self._timezone = timezone - - def arrow_to_pandas(self, arrow_column): - from pyspark.sql.types import from_arrow_type, \ - _check_series_convert_date, _check_series_localize_timestamps - - s = arrow_column.to_pandas() - s = _check_series_convert_date(s, from_arrow_type(arrow_column.type)) - s = _check_series_localize_timestamps(s, self._timezone) - return s - - def dump_stream(self, iterator, stream): - """ - Make ArrowRecordBatches from Pandas Series and serialize. Input is a single series or - a list of series accompanied by an optional pyarrow type to coerce the data to. - """ - import pyarrow as pa - writer = None - try: - for series in iterator: - batch = _create_batch(series, self._timezone) - if writer is None: - write_int(SpecialLengths.START_ARROW_STREAM, stream) - writer = pa.RecordBatchStreamWriter(stream, batch.schema) - writer.write_batch(batch) - finally: - if writer is not None: - writer.close() - - def load_stream(self, stream): - """ - Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series. - """ - import pyarrow as pa - reader = pa.open_stream(stream) - - for batch in reader: - yield [self.arrow_to_pandas(c) for c in pa.Table.from_batches([batch]).itercolumns()] - - def __repr__(self): - return "ArrowStreamPandasSerializer" - - -class BatchedSerializer(Serializer): - - """ - Serializes a stream of objects in batches by calling its wrapped - Serializer with streams of objects. - """ - - UNLIMITED_BATCH_SIZE = -1 - UNKNOWN_BATCH_SIZE = 0 - - def __init__(self, serializer, batchSize=UNLIMITED_BATCH_SIZE): - self.serializer = serializer - self.batchSize = batchSize - - def _batched(self, iterator): - if self.batchSize == self.UNLIMITED_BATCH_SIZE: - yield list(iterator) - elif hasattr(iterator, "__len__") and hasattr(iterator, "__getslice__"): - n = len(iterator) - for i in xrange(0, n, self.batchSize): - yield iterator[i: i + self.batchSize] - else: - items = [] - count = 0 - for item in iterator: - items.append(item) - count += 1 - if count == self.batchSize: - yield items - items = [] - count = 0 - if items: - yield items - - def dump_stream(self, iterator, stream): - self.serializer.dump_stream(self._batched(iterator), stream) - - def load_stream(self, stream): - return chain.from_iterable(self._load_stream_without_unbatching(stream)) - - def _load_stream_without_unbatching(self, stream): - return self.serializer.load_stream(stream) - - def __repr__(self): - return "BatchedSerializer(%s, %d)" % (str(self.serializer), self.batchSize) - - -class FlattenedValuesSerializer(BatchedSerializer): - - """ - Serializes a stream of list of pairs, split the list of values - which contain more than a certain number of objects to make them - have similar sizes. 
- """ - def __init__(self, serializer, batchSize=10): - BatchedSerializer.__init__(self, serializer, batchSize) - - def _batched(self, iterator): - n = self.batchSize - for key, values in iterator: - for i in range(0, len(values), n): - yield key, values[i:i + n] - - def load_stream(self, stream): - return self.serializer.load_stream(stream) - - def __repr__(self): - return "FlattenedValuesSerializer(%s, %d)" % (self.serializer, self.batchSize) - - -class AutoBatchedSerializer(BatchedSerializer): - """ - Choose the size of batch automatically based on the size of object - """ - - def __init__(self, serializer, bestSize=1 << 16): - BatchedSerializer.__init__(self, serializer, self.UNKNOWN_BATCH_SIZE) - self.bestSize = bestSize - - def dump_stream(self, iterator, stream): - batch, best = 1, self.bestSize - iterator = iter(iterator) - while True: - vs = list(itertools.islice(iterator, batch)) - if not vs: - break - - bytes = self.serializer.dumps(vs) - write_int(len(bytes), stream) - stream.write(bytes) - - size = len(bytes) - if size < best: - batch *= 2 - elif size > best * 10 and batch > 1: - batch //= 2 - - def __repr__(self): - return "AutoBatchedSerializer(%s)" % self.serializer - - -class CartesianDeserializer(Serializer): - - """ - Deserializes the JavaRDD cartesian() of two PythonRDDs. - Due to pyspark batching we cannot simply use the result of the Java RDD cartesian, - we additionally need to do the cartesian within each pair of batches. - """ - - def __init__(self, key_ser, val_ser): - self.key_ser = key_ser - self.val_ser = val_ser - - def _load_stream_without_unbatching(self, stream): - key_batch_stream = self.key_ser._load_stream_without_unbatching(stream) - val_batch_stream = self.val_ser._load_stream_without_unbatching(stream) - for (key_batch, val_batch) in zip(key_batch_stream, val_batch_stream): - # for correctness with repeated cartesian/zip this must be returned as one batch - yield product(key_batch, val_batch) - - def load_stream(self, stream): - return chain.from_iterable(self._load_stream_without_unbatching(stream)) - - def __repr__(self): - return "CartesianDeserializer(%s, %s)" % \ - (str(self.key_ser), str(self.val_ser)) - - -class PairDeserializer(Serializer): - - """ - Deserializes the JavaRDD zip() of two PythonRDDs. - Due to pyspark batching we cannot simply use the result of the Java RDD zip, - we additionally need to do the zip within each pair of batches. - """ - - def __init__(self, key_ser, val_ser): - self.key_ser = key_ser - self.val_ser = val_ser - - def _load_stream_without_unbatching(self, stream): - key_batch_stream = self.key_ser._load_stream_without_unbatching(stream) - val_batch_stream = self.val_ser._load_stream_without_unbatching(stream) - for (key_batch, val_batch) in zip(key_batch_stream, val_batch_stream): - # For double-zipped RDDs, the batches can be iterators from other PairDeserializer, - # instead of lists. We need to convert them to lists if needed. 
- key_batch = key_batch if hasattr(key_batch, '__len__') else list(key_batch) - val_batch = val_batch if hasattr(val_batch, '__len__') else list(val_batch) - if len(key_batch) != len(val_batch): - raise ValueError("Can not deserialize PairRDD with different number of items" - " in batches: (%d, %d)" % (len(key_batch), len(val_batch))) - # for correctness with repeated cartesian/zip this must be returned as one batch - yield zip(key_batch, val_batch) - - def load_stream(self, stream): - return chain.from_iterable(self._load_stream_without_unbatching(stream)) - - def __repr__(self): - return "PairDeserializer(%s, %s)" % (str(self.key_ser), str(self.val_ser)) - - -class NoOpSerializer(FramedSerializer): - - def loads(self, obj): - return obj - - def dumps(self, obj): - return obj - - -# Hack namedtuple, make it picklable - -__cls = {} - - -def _restore(name, fields, value): - """ Restore an object of namedtuple""" - k = (name, fields) - cls = __cls.get(k) - if cls is None: - cls = collections.namedtuple(name, fields) - __cls[k] = cls - return cls(*value) - - -def _hack_namedtuple(cls): - """ Make class generated by namedtuple picklable """ - name = cls.__name__ - fields = cls._fields - - def __reduce__(self): - return (_restore, (name, fields, tuple(self))) - cls.__reduce__ = __reduce__ - cls._is_namedtuple_ = True - return cls - - -def _hijack_namedtuple(): - """ Hack namedtuple() to make it picklable """ - # hijack only one time - if hasattr(collections.namedtuple, "__hijack"): - return - - global _old_namedtuple # or it will put in closure - global _old_namedtuple_kwdefaults # or it will put in closure too - - def _copy_func(f): - return types.FunctionType(f.__code__, f.__globals__, f.__name__, - f.__defaults__, f.__closure__) - - def _kwdefaults(f): - # __kwdefaults__ contains the default values of keyword-only arguments which are - # introduced from Python 3. The possible cases for __kwdefaults__ in namedtuple - # are as below: - # - # - Does not exist in Python 2. - # - Returns None in <= Python 3.5.x. - # - Returns a dictionary containing the default values to the keys from Python 3.6.x - # (See https://bugs.python.org/issue25628). - kargs = getattr(f, "__kwdefaults__", None) - if kargs is None: - return {} - else: - return kargs - - _old_namedtuple = _copy_func(collections.namedtuple) - _old_namedtuple_kwdefaults = _kwdefaults(collections.namedtuple) - - def namedtuple(*args, **kwargs): - for k, v in _old_namedtuple_kwdefaults.items(): - kwargs[k] = kwargs.get(k, v) - cls = _old_namedtuple(*args, **kwargs) - return _hack_namedtuple(cls) - - # replace namedtuple with the new one - collections.namedtuple.__globals__["_old_namedtuple_kwdefaults"] = _old_namedtuple_kwdefaults - collections.namedtuple.__globals__["_old_namedtuple"] = _old_namedtuple - collections.namedtuple.__globals__["_hack_namedtuple"] = _hack_namedtuple - collections.namedtuple.__code__ = namedtuple.__code__ - collections.namedtuple.__hijack = 1 - - # hack the cls already generated by namedtuple. 
- # Those created in other modules can be pickled as normal, - # so only hack those in __main__ module - for n, o in sys.modules["__main__"].__dict__.items(): - if (type(o) is type and o.__base__ is tuple - and hasattr(o, "_fields") - and "__reduce__" not in o.__dict__): - _hack_namedtuple(o) # hack inplace - - -_hijack_namedtuple() - - -class PickleSerializer(FramedSerializer): - - """ - Serializes objects using Python's pickle serializer: - - http://docs.python.org/2/library/pickle.html - - This serializer supports nearly any Python object, but may - not be as fast as more specialized serializers. - """ - - def dumps(self, obj): - return pickle.dumps(obj, protocol) - - if sys.version >= '3': - def loads(self, obj, encoding="bytes"): - return pickle.loads(obj, encoding=encoding) - else: - def loads(self, obj, encoding=None): - return pickle.loads(obj) - - -class CloudPickleSerializer(PickleSerializer): - - def dumps(self, obj): - try: - return cloudpickle.dumps(obj, 2) - except pickle.PickleError: - raise - except Exception as e: - emsg = _exception_message(e) - if "'i' format requires" in emsg: - msg = "Object too large to serialize: %s" % emsg - else: - msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg) - cloudpickle.print_exec(sys.stderr) - raise pickle.PicklingError(msg) - - -class MarshalSerializer(FramedSerializer): - - """ - Serializes objects using Python's Marshal serializer: - - http://docs.python.org/2/library/marshal.html - - This serializer is faster than PickleSerializer but supports fewer datatypes. - """ - - def dumps(self, obj): - return marshal.dumps(obj) - - def loads(self, obj): - return marshal.loads(obj) - - -class AutoSerializer(FramedSerializer): - - """ - Choose marshal or pickle as serialization protocol automatically - """ - - def __init__(self): - FramedSerializer.__init__(self) - self._type = None - - def dumps(self, obj): - if self._type is not None: - return b'P' + pickle.dumps(obj, -1) - try: - return b'M' + marshal.dumps(obj) - except Exception: - self._type = b'P' - return b'P' + pickle.dumps(obj, -1) - - def loads(self, obj): - _type = obj[0] - if _type == b'M': - return marshal.loads(obj[1:]) - elif _type == b'P': - return pickle.loads(obj[1:]) - else: - raise ValueError("invalid serialization type: %s" % _type) - - -class CompressedSerializer(FramedSerializer): - """ - Compress the serialized data - """ - def __init__(self, serializer): - FramedSerializer.__init__(self) - assert isinstance(serializer, FramedSerializer), "serializer must be a FramedSerializer" - self.serializer = serializer - - def dumps(self, obj): - return zlib.compress(self.serializer.dumps(obj), 1) - - def loads(self, obj): - return self.serializer.loads(zlib.decompress(obj)) - - def __repr__(self): - return "CompressedSerializer(%s)" % self.serializer - - -class UTF8Deserializer(Serializer): - - """ - Deserializes streams written by String.getBytes. 
- """ - - def __init__(self, use_unicode=True): - self.use_unicode = use_unicode - - def loads(self, stream): - length = read_int(stream) - if length == SpecialLengths.END_OF_DATA_SECTION: - raise EOFError - elif length == SpecialLengths.NULL: - return None - s = stream.read(length) - return s.decode("utf-8") if self.use_unicode else s - - def load_stream(self, stream): - try: - while True: - yield self.loads(stream) - except struct.error: - return - except EOFError: - return - - def __repr__(self): - return "UTF8Deserializer(%s)" % self.use_unicode - - -def read_long(stream): - length = stream.read(8) - if not length: - raise EOFError - return struct.unpack("!q", length)[0] - - -def write_long(value, stream): - stream.write(struct.pack("!q", value)) - - -def pack_long(value): - return struct.pack("!q", value) - - -def read_int(stream): - length = stream.read(4) - if not length: - raise EOFError - return struct.unpack("!i", length)[0] - - -def write_int(value, stream): - stream.write(struct.pack("!i", value)) - - -def read_bool(stream): - length = stream.read(1) - if not length: - raise EOFError - return struct.unpack("!?", length)[0] - - -def write_with_length(obj, stream): - write_int(len(obj), stream) - stream.write(obj) - - -class ChunkedStream(object): - - """ - This is a file-like object takes a stream of data, of unknown length, and breaks it into fixed - length frames. The intended use case is serializing large data and sending it immediately over - a socket -- we do not want to buffer the entire data before sending it, but the receiving end - needs to know whether or not there is more data coming. - - It works by buffering the incoming data in some fixed-size chunks. If the buffer is full, it - first sends the buffer size, then the data. This repeats as long as there is more data to send. - When this is closed, it sends the length of whatever data is in the buffer, then that data, and - finally a "length" of -1 to indicate the stream has completed. - """ - - def __init__(self, wrapped, buffer_size): - self.buffer_size = buffer_size - self.buffer = bytearray(buffer_size) - self.current_pos = 0 - self.wrapped = wrapped - - def write(self, bytes): - byte_pos = 0 - byte_remaining = len(bytes) - while byte_remaining > 0: - new_pos = byte_remaining + self.current_pos - if new_pos < self.buffer_size: - # just put it in our buffer - self.buffer[self.current_pos:new_pos] = bytes[byte_pos:] - self.current_pos = new_pos - byte_remaining = 0 - else: - # fill the buffer, send the length then the contents, and start filling again - space_left = self.buffer_size - self.current_pos - new_byte_pos = byte_pos + space_left - self.buffer[self.current_pos:self.buffer_size] = bytes[byte_pos:new_byte_pos] - write_int(self.buffer_size, self.wrapped) - self.wrapped.write(self.buffer) - byte_remaining -= space_left - byte_pos = new_byte_pos - self.current_pos = 0 - - def close(self): - # if there is anything left in the buffer, write it out first - if self.current_pos > 0: - write_int(self.current_pos, self.wrapped) - self.wrapped.write(self.buffer[:self.current_pos]) - # -1 length indicates to the receiving end that we're done. 
- write_int(-1, self.wrapped) - self.wrapped.close() - - -if __name__ == '__main__': - import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/shell.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/shell.py deleted file mode 100644 index 65e3bdb..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/shell.py +++ /dev/null @@ -1,76 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -An interactive shell. - -This file is designed to be launched as a PYTHONSTARTUP script. -""" - -import atexit -import os -import platform -import warnings - -import py4j - -from pyspark import SparkConf -from pyspark.context import SparkContext -from pyspark.sql import SparkSession, SQLContext - -if os.environ.get("SPARK_EXECUTOR_URI"): - SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) - -SparkContext._ensure_initialized() - -try: - spark = SparkSession._create_shell_session() -except Exception: - import sys - import traceback - warnings.warn("Failed to initialize Spark session.") - traceback.print_exc(file=sys.stderr) - sys.exit(1) - -sc = spark.sparkContext -sql = spark.sql -atexit.register(lambda: sc.stop()) - -# for compatibility -sqlContext = spark._wrapped -sqlCtx = sqlContext - -print(r"""Welcome to - ____ __ - / __/__ ___ _____/ /__ - _\ \/ _ \/ _ `/ __/ '_/ - /__ / .__/\_,_/_/ /_/\_\ version %s - /_/ -""" % sc.version) -print("Using Python version %s (%s, %s)" % ( - platform.python_version(), - platform.python_build()[0], - platform.python_build()[1])) -print("SparkSession available as 'spark'.") - -# The ./bin/pyspark script stores the old PYTHONSTARTUP value in OLD_PYTHONSTARTUP, -# which allows us to execute the user's PYTHONSTARTUP file: -_pythonstartup = os.environ.get('OLD_PYTHONSTARTUP') -if _pythonstartup and os.path.isfile(_pythonstartup): - with open(_pythonstartup) as f: - code = compile(f.read(), _pythonstartup, 'exec') - exec(code) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/shuffle.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/shuffle.py deleted file mode 100644 index bd0ac00..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/shuffle.py +++ /dev/null @@ -1,815 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import platform -import shutil -import warnings -import gc -import itertools -import operator -import random -import sys - -import pyspark.heapq3 as heapq -from pyspark.serializers import BatchedSerializer, PickleSerializer, FlattenedValuesSerializer, \ - CompressedSerializer, AutoBatchedSerializer -from pyspark.util import fail_on_stopiteration - - -try: - import psutil - - process = None - - def get_used_memory(): - """ Return the used memory in MB """ - global process - if process is None or process._pid != os.getpid(): - process = psutil.Process(os.getpid()) - if hasattr(process, "memory_info"): - info = process.memory_info() - else: - info = process.get_memory_info() - return info.rss >> 20 - -except ImportError: - - def get_used_memory(): - """ Return the used memory in MB """ - if platform.system() == 'Linux': - for line in open('/proc/self/status'): - if line.startswith('VmRSS:'): - return int(line.split()[1]) >> 10 - - else: - warnings.warn("Please install psutil to have better " - "support with spilling") - if platform.system() == "Darwin": - import resource - rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss - return rss >> 20 - # TODO: support windows - - return 0 - - -def _get_local_dirs(sub): - """ Get all the directories """ - path = os.environ.get("SPARK_LOCAL_DIRS", "/tmp") - dirs = path.split(",") - if len(dirs) > 1: - # different order in different processes and instances - rnd = random.Random(os.getpid() + id(dirs)) - random.shuffle(dirs, rnd.random) - return [os.path.join(d, "python", str(os.getpid()), sub) for d in dirs] - - -# global stats -MemoryBytesSpilled = 0 -DiskBytesSpilled = 0 - - -class Aggregator(object): - - """ - Aggregator has tree functions to merge values into combiner. 
- - createCombiner: (value) -> combiner - mergeValue: (combine, value) -> combiner - mergeCombiners: (combiner, combiner) -> combiner - """ - - def __init__(self, createCombiner, mergeValue, mergeCombiners): - self.createCombiner = fail_on_stopiteration(createCombiner) - self.mergeValue = fail_on_stopiteration(mergeValue) - self.mergeCombiners = fail_on_stopiteration(mergeCombiners) - - -class SimpleAggregator(Aggregator): - - """ - SimpleAggregator is useful for the cases that combiners have - same type with values - """ - - def __init__(self, combiner): - Aggregator.__init__(self, lambda x: x, combiner, combiner) - - -class Merger(object): - - """ - Merge shuffled data together by aggregator - """ - - def __init__(self, aggregator): - self.agg = aggregator - - def mergeValues(self, iterator): - """ Combine the items by creator and combiner """ - raise NotImplementedError - - def mergeCombiners(self, iterator): - """ Merge the combined items by mergeCombiner """ - raise NotImplementedError - - def items(self): - """ Return the merged items ad iterator """ - raise NotImplementedError - - -def _compressed_serializer(self, serializer=None): - # always use PickleSerializer to simplify implementation - ser = PickleSerializer() - return AutoBatchedSerializer(CompressedSerializer(ser)) - - -class ExternalMerger(Merger): - - """ - External merger will dump the aggregated data into disks when - memory usage goes above the limit, then merge them together. - - This class works as follows: - - - It repeatedly combine the items and save them in one dict in - memory. - - - When the used memory goes above memory limit, it will split - the combined data into partitions by hash code, dump them - into disk, one file per partition. - - - Then it goes through the rest of the iterator, combine items - into different dict by hash. Until the used memory goes over - memory limit, it dump all the dicts into disks, one file per - dict. Repeat this again until combine all the items. - - - Before return any items, it will load each partition and - combine them separately. Yield them before loading next - partition. - - - During loading a partition, if the memory goes over limit, - it will partition the loaded data and dump them into disks - and load them partition by partition again. - - `data` and `pdata` are used to hold the merged items in memory. - At first, all the data are merged into `data`. Once the used - memory goes over limit, the items in `data` are dumped into - disks, `data` will be cleared, all rest of items will be merged - into `pdata` and then dumped into disks. Before returning, all - the items in `pdata` will be dumped into disks. - - Finally, if any items were spilled into disks, each partition - will be merged into `data` and be yielded, then cleared. 
- - >>> agg = SimpleAggregator(lambda x, y: x + y) - >>> merger = ExternalMerger(agg, 10) - >>> N = 10000 - >>> merger.mergeValues(zip(range(N), range(N))) - >>> assert merger.spills > 0 - >>> sum(v for k,v in merger.items()) - 49995000 - - >>> merger = ExternalMerger(agg, 10) - >>> merger.mergeCombiners(zip(range(N), range(N))) - >>> assert merger.spills > 0 - >>> sum(v for k,v in merger.items()) - 49995000 - """ - - # the max total partitions created recursively - MAX_TOTAL_PARTITIONS = 4096 - - def __init__(self, aggregator, memory_limit=512, serializer=None, - localdirs=None, scale=1, partitions=59, batch=1000): - Merger.__init__(self, aggregator) - self.memory_limit = memory_limit - self.serializer = _compressed_serializer(serializer) - self.localdirs = localdirs or _get_local_dirs(str(id(self))) - # number of partitions when spill data into disks - self.partitions = partitions - # check the memory after # of items merged - self.batch = batch - # scale is used to scale down the hash of key for recursive hash map - self.scale = scale - # un-partitioned merged data - self.data = {} - # partitioned merged data, list of dicts - self.pdata = [] - # number of chunks dumped into disks - self.spills = 0 - # randomize the hash of key, id(o) is the address of o (aligned by 8) - self._seed = id(self) + 7 - - def _get_spill_dir(self, n): - """ Choose one directory for spill by number n """ - return os.path.join(self.localdirs[n % len(self.localdirs)], str(n)) - - def _next_limit(self): - """ - Return the next memory limit. If the memory is not released - after spilling, it will dump the data only when the used memory - starts to increase. - """ - return max(self.memory_limit, get_used_memory() * 1.05) - - def mergeValues(self, iterator): - """ Combine the items by creator and combiner """ - # speedup attribute lookup - creator, comb = self.agg.createCombiner, self.agg.mergeValue - c, data, pdata, hfun, batch = 0, self.data, self.pdata, self._partition, self.batch - limit = self.memory_limit - - for k, v in iterator: - d = pdata[hfun(k)] if pdata else data - d[k] = comb(d[k], v) if k in d else creator(v) - - c += 1 - if c >= batch: - if get_used_memory() >= limit: - self._spill() - limit = self._next_limit() - batch /= 2 - c = 0 - else: - batch *= 1.5 - - if get_used_memory() >= limit: - self._spill() - - def _partition(self, key): - """ Return the partition for key """ - return hash((key, self._seed)) % self.partitions - - def _object_size(self, obj): - """ How much of memory for this obj, assume that all the objects - consume similar bytes of memory - """ - return 1 - - def mergeCombiners(self, iterator, limit=None): - """ Merge (K,V) pair by mergeCombiner """ - if limit is None: - limit = self.memory_limit - # speedup attribute lookup - comb, hfun, objsize = self.agg.mergeCombiners, self._partition, self._object_size - c, data, pdata, batch = 0, self.data, self.pdata, self.batch - for k, v in iterator: - d = pdata[hfun(k)] if pdata else data - d[k] = comb(d[k], v) if k in d else v - if not limit: - continue - - c += objsize(v) - if c > batch: - if get_used_memory() > limit: - self._spill() - limit = self._next_limit() - batch /= 2 - c = 0 - else: - batch *= 1.5 - - if limit and get_used_memory() >= limit: - self._spill() - - def _spill(self): - """ - dump already partitioned data into disks. - - It will dump the data in batch for better performance. 
- """ - global MemoryBytesSpilled, DiskBytesSpilled - path = self._get_spill_dir(self.spills) - if not os.path.exists(path): - os.makedirs(path) - - used_memory = get_used_memory() - if not self.pdata: - # The data has not been partitioned, it will iterator the - # dataset once, write them into different files, has no - # additional memory. It only called when the memory goes - # above limit at the first time. - - # open all the files for writing - streams = [open(os.path.join(path, str(i)), 'wb') - for i in range(self.partitions)] - - for k, v in self.data.items(): - h = self._partition(k) - # put one item in batch, make it compatible with load_stream - # it will increase the memory if dump them in batch - self.serializer.dump_stream([(k, v)], streams[h]) - - for s in streams: - DiskBytesSpilled += s.tell() - s.close() - - self.data.clear() - self.pdata.extend([{} for i in range(self.partitions)]) - - else: - for i in range(self.partitions): - p = os.path.join(path, str(i)) - with open(p, "wb") as f: - # dump items in batch - self.serializer.dump_stream(iter(self.pdata[i].items()), f) - self.pdata[i].clear() - DiskBytesSpilled += os.path.getsize(p) - - self.spills += 1 - gc.collect() # release the memory as much as possible - MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20 - - def items(self): - """ Return all merged items as iterator """ - if not self.pdata and not self.spills: - return iter(self.data.items()) - return self._external_items() - - def _external_items(self): - """ Return all partitioned items as iterator """ - assert not self.data - if any(self.pdata): - self._spill() - # disable partitioning and spilling when merge combiners from disk - self.pdata = [] - - try: - for i in range(self.partitions): - for v in self._merged_items(i): - yield v - self.data.clear() - - # remove the merged partition - for j in range(self.spills): - path = self._get_spill_dir(j) - os.remove(os.path.join(path, str(i))) - finally: - self._cleanup() - - def _merged_items(self, index): - self.data = {} - limit = self._next_limit() - for j in range(self.spills): - path = self._get_spill_dir(j) - p = os.path.join(path, str(index)) - # do not check memory during merging - with open(p, "rb") as f: - self.mergeCombiners(self.serializer.load_stream(f), 0) - - # limit the total partitions - if (self.scale * self.partitions < self.MAX_TOTAL_PARTITIONS - and j < self.spills - 1 - and get_used_memory() > limit): - self.data.clear() # will read from disk again - gc.collect() # release the memory as much as possible - return self._recursive_merged_items(index) - - return self.data.items() - - def _recursive_merged_items(self, index): - """ - merge the partitioned items and return the as iterator - - If one partition can not be fit in memory, then them will be - partitioned and merged recursively. 
- """ - subdirs = [os.path.join(d, "parts", str(index)) for d in self.localdirs] - m = ExternalMerger(self.agg, self.memory_limit, self.serializer, subdirs, - self.scale * self.partitions, self.partitions, self.batch) - m.pdata = [{} for _ in range(self.partitions)] - limit = self._next_limit() - - for j in range(self.spills): - path = self._get_spill_dir(j) - p = os.path.join(path, str(index)) - with open(p, 'rb') as f: - m.mergeCombiners(self.serializer.load_stream(f), 0) - - if get_used_memory() > limit: - m._spill() - limit = self._next_limit() - - return m._external_items() - - def _cleanup(self): - """ Clean up all the files in disks """ - for d in self.localdirs: - shutil.rmtree(d, True) - - -class ExternalSorter(object): - """ - ExtenalSorter will divide the elements into chunks, sort them in - memory and dump them into disks, finally merge them back. - - The spilling will only happen when the used memory goes above - the limit. - - - >>> sorter = ExternalSorter(1) # 1M - >>> import random - >>> l = list(range(1024)) - >>> random.shuffle(l) - >>> sorted(l) == list(sorter.sorted(l)) - True - >>> sorted(l) == list(sorter.sorted(l, key=lambda x: -x, reverse=True)) - True - """ - def __init__(self, memory_limit, serializer=None): - self.memory_limit = memory_limit - self.local_dirs = _get_local_dirs("sort") - self.serializer = _compressed_serializer(serializer) - - def _get_path(self, n): - """ Choose one directory for spill by number n """ - d = self.local_dirs[n % len(self.local_dirs)] - if not os.path.exists(d): - os.makedirs(d) - return os.path.join(d, str(n)) - - def _next_limit(self): - """ - Return the next memory limit. If the memory is not released - after spilling, it will dump the data only when the used memory - starts to increase. - """ - return max(self.memory_limit, get_used_memory() * 1.05) - - def sorted(self, iterator, key=None, reverse=False): - """ - Sort the elements in iterator, do external sort when the memory - goes above the limit. - """ - global MemoryBytesSpilled, DiskBytesSpilled - batch, limit = 100, self._next_limit() - chunks, current_chunk = [], [] - iterator = iter(iterator) - while True: - # pick elements in batch - chunk = list(itertools.islice(iterator, batch)) - current_chunk.extend(chunk) - if len(chunk) < batch: - break - - used_memory = get_used_memory() - if used_memory > limit: - # sort them inplace will save memory - current_chunk.sort(key=key, reverse=reverse) - path = self._get_path(len(chunks)) - with open(path, 'wb') as f: - self.serializer.dump_stream(current_chunk, f) - - def load(f): - for v in self.serializer.load_stream(f): - yield v - # close the file explicit once we consume all the items - # to avoid ResourceWarning in Python3 - f.close() - chunks.append(load(open(path, 'rb'))) - current_chunk = [] - MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20 - DiskBytesSpilled += os.path.getsize(path) - os.unlink(path) # data will be deleted after close - - elif not chunks: - batch = min(int(batch * 1.5), 10000) - - current_chunk.sort(key=key, reverse=reverse) - if not chunks: - return current_chunk - - if current_chunk: - chunks.append(iter(current_chunk)) - - return heapq.merge(chunks, key=key, reverse=reverse) - - -class ExternalList(object): - """ - ExternalList can have many items which cannot be hold in memory in - the same time. - - >>> l = ExternalList(list(range(100))) - >>> len(l) - 100 - >>> l.append(10) - >>> len(l) - 101 - >>> for i in range(20240): - ... 
l.append(i) - >>> len(l) - 20341 - >>> import pickle - >>> l2 = pickle.loads(pickle.dumps(l)) - >>> len(l2) - 20341 - >>> list(l2)[100] - 10 - """ - LIMIT = 10240 - - def __init__(self, values): - self.values = values - self.count = len(values) - self._file = None - self._ser = None - - def __getstate__(self): - if self._file is not None: - self._file.flush() - with os.fdopen(os.dup(self._file.fileno()), "rb") as f: - f.seek(0) - serialized = f.read() - else: - serialized = b'' - return self.values, self.count, serialized - - def __setstate__(self, item): - self.values, self.count, serialized = item - if serialized: - self._open_file() - self._file.write(serialized) - else: - self._file = None - self._ser = None - - def __iter__(self): - if self._file is not None: - self._file.flush() - # read all items from disks first - with os.fdopen(os.dup(self._file.fileno()), 'rb') as f: - f.seek(0) - for v in self._ser.load_stream(f): - yield v - - for v in self.values: - yield v - - def __len__(self): - return self.count - - def append(self, value): - self.values.append(value) - self.count += 1 - # dump them into disk if the key is huge - if len(self.values) >= self.LIMIT: - self._spill() - - def _open_file(self): - dirs = _get_local_dirs("objects") - d = dirs[id(self) % len(dirs)] - if not os.path.exists(d): - os.makedirs(d) - p = os.path.join(d, str(id(self))) - self._file = open(p, "w+b", 65536) - self._ser = BatchedSerializer(CompressedSerializer(PickleSerializer()), 1024) - os.unlink(p) - - def __del__(self): - if self._file: - self._file.close() - self._file = None - - def _spill(self): - """ dump the values into disk """ - global MemoryBytesSpilled, DiskBytesSpilled - if self._file is None: - self._open_file() - - used_memory = get_used_memory() - pos = self._file.tell() - self._ser.dump_stream(self.values, self._file) - self.values = [] - gc.collect() - DiskBytesSpilled += self._file.tell() - pos - MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20 - - -class ExternalListOfList(ExternalList): - """ - An external list for list. - - >>> l = ExternalListOfList([[i, i] for i in range(100)]) - >>> len(l) - 200 - >>> l.append(range(10)) - >>> len(l) - 210 - >>> len(list(l)) - 210 - """ - - def __init__(self, values): - ExternalList.__init__(self, values) - self.count = sum(len(i) for i in values) - - def append(self, value): - ExternalList.append(self, value) - # already counted 1 in ExternalList.append - self.count += len(value) - 1 - - def __iter__(self): - for values in ExternalList.__iter__(self): - for v in values: - yield v - - -class GroupByKey(object): - """ - Group a sorted iterator as [(k1, it1), (k2, it2), ...] - - >>> k = [i // 3 for i in range(6)] - >>> v = [[i] for i in range(6)] - >>> g = GroupByKey(zip(k, v)) - >>> [(k, list(it)) for k, it in g] - [(0, [0, 1, 2]), (1, [3, 4, 5])] - """ - - def __init__(self, iterator): - self.iterator = iterator - - def __iter__(self): - key, values = None, None - for k, v in self.iterator: - if values is not None and k == key: - values.append(v) - else: - if values is not None: - yield (key, values) - key = k - values = ExternalListOfList([v]) - if values is not None: - yield (key, values) - - -class ExternalGroupBy(ExternalMerger): - - """ - Group by the items by key. If any partition of them can not been - hold in memory, it will do sort based group by. - - This class works as follows: - - - It repeatedly group the items by key and save them in one dict in - memory. 
- - - When the used memory goes above memory limit, it will split - the combined data into partitions by hash code, dump them - into disk, one file per partition. If the number of keys - in one partitions is smaller than 1000, it will sort them - by key before dumping into disk. - - - Then it goes through the rest of the iterator, group items - by key into different dict by hash. Until the used memory goes over - memory limit, it dump all the dicts into disks, one file per - dict. Repeat this again until combine all the items. It - also will try to sort the items by key in each partition - before dumping into disks. - - - It will yield the grouped items partitions by partitions. - If the data in one partitions can be hold in memory, then it - will load and combine them in memory and yield. - - - If the dataset in one partition cannot be hold in memory, - it will sort them first. If all the files are already sorted, - it merge them by heap.merge(), so it will do external sort - for all the files. - - - After sorting, `GroupByKey` class will put all the continuous - items with the same key as a group, yield the values as - an iterator. - """ - SORT_KEY_LIMIT = 1000 - - def flattened_serializer(self): - assert isinstance(self.serializer, BatchedSerializer) - ser = self.serializer - return FlattenedValuesSerializer(ser, 20) - - def _object_size(self, obj): - return len(obj) - - def _spill(self): - """ - dump already partitioned data into disks. - """ - global MemoryBytesSpilled, DiskBytesSpilled - path = self._get_spill_dir(self.spills) - if not os.path.exists(path): - os.makedirs(path) - - used_memory = get_used_memory() - if not self.pdata: - # The data has not been partitioned, it will iterator the - # data once, write them into different files, has no - # additional memory. It only called when the memory goes - # above limit at the first time. - - # open all the files for writing - streams = [open(os.path.join(path, str(i)), 'wb') - for i in range(self.partitions)] - - # If the number of keys is small, then the overhead of sort is small - # sort them before dumping into disks - self._sorted = len(self.data) < self.SORT_KEY_LIMIT - if self._sorted: - self.serializer = self.flattened_serializer() - for k in sorted(self.data.keys()): - h = self._partition(k) - self.serializer.dump_stream([(k, self.data[k])], streams[h]) - else: - for k, v in self.data.items(): - h = self._partition(k) - self.serializer.dump_stream([(k, v)], streams[h]) - - for s in streams: - DiskBytesSpilled += s.tell() - s.close() - - self.data.clear() - # self.pdata is cached in `mergeValues` and `mergeCombiners` - self.pdata.extend([{} for i in range(self.partitions)]) - - else: - for i in range(self.partitions): - p = os.path.join(path, str(i)) - with open(p, "wb") as f: - # dump items in batch - if self._sorted: - # sort by key only (stable) - sorted_items = sorted(self.pdata[i].items(), key=operator.itemgetter(0)) - self.serializer.dump_stream(sorted_items, f) - else: - self.serializer.dump_stream(self.pdata[i].items(), f) - self.pdata[i].clear() - DiskBytesSpilled += os.path.getsize(p) - - self.spills += 1 - gc.collect() # release the memory as much as possible - MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20 - - def _merged_items(self, index): - size = sum(os.path.getsize(os.path.join(self._get_spill_dir(j), str(index))) - for j in range(self.spills)) - # if the memory can not hold all the partition, - # then use sort based merge. 
Because of compression, - # the data on disks will be much smaller than needed memory - if size >= self.memory_limit << 17: # * 1M / 8 - return self._merge_sorted_items(index) - - self.data = {} - for j in range(self.spills): - path = self._get_spill_dir(j) - p = os.path.join(path, str(index)) - # do not check memory during merging - with open(p, "rb") as f: - self.mergeCombiners(self.serializer.load_stream(f), 0) - return self.data.items() - - def _merge_sorted_items(self, index): - """ load a partition from disk, then sort and group by key """ - def load_partition(j): - path = self._get_spill_dir(j) - p = os.path.join(path, str(index)) - with open(p, 'rb', 65536) as f: - for v in self.serializer.load_stream(f): - yield v - - disk_items = [load_partition(j) for j in range(self.spills)] - - if self._sorted: - # all the partitions are already sorted - sorted_items = heapq.merge(disk_items, key=operator.itemgetter(0)) - - else: - # Flatten the combined values, so it will not consume huge - # memory during merging sort. - ser = self.flattened_serializer() - sorter = ExternalSorter(self.memory_limit, ser) - sorted_items = sorter.sorted(itertools.chain(*disk_items), - key=operator.itemgetter(0)) - return ((k, vs) for k, vs in GroupByKey(sorted_items)) - - -if __name__ == "__main__": - import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/__init__.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/__init__.py deleted file mode 100644 index c3c06c8..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Important classes of Spark SQL and DataFrames: - - - :class:`pyspark.sql.SparkSession` - Main entry point for :class:`DataFrame` and SQL functionality. - - :class:`pyspark.sql.DataFrame` - A distributed collection of data grouped into named columns. - - :class:`pyspark.sql.Column` - A column expression in a :class:`DataFrame`. - - :class:`pyspark.sql.Row` - A row of data in a :class:`DataFrame`. - - :class:`pyspark.sql.GroupedData` - Aggregation methods, returned by :func:`DataFrame.groupBy`. - - :class:`pyspark.sql.DataFrameNaFunctions` - Methods for handling missing data (null values). - - :class:`pyspark.sql.DataFrameStatFunctions` - Methods for statistics functionality. - - :class:`pyspark.sql.functions` - List of built-in functions available for :class:`DataFrame`. - - :class:`pyspark.sql.types` - List of data types available. - - :class:`pyspark.sql.Window` - For working with window functions. 
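The GroupByKey helper above walks a key-sorted iterator and yields (key, values) groups; outside Spark the same behaviour can be sketched with itertools.groupby (a plain-Python illustration, not the Spark code path):

    import itertools
    import operator

    def group_by_key(sorted_pairs):
        """Group a (key, value) iterator already sorted by key, yielding
        (key, iterator_of_values) pairs, like GroupByKey above."""
        for key, group in itertools.groupby(sorted_pairs, key=operator.itemgetter(0)):
            yield key, (value for _, value in group)

    pairs = [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
    print([(k, list(vs)) for k, vs in group_by_key(pairs)])
    # [(0, [0, 1, 2]), (1, [3, 4, 5])]

GroupByKey additionally backs each group with ExternalListOfList so that very large groups can spill to disk instead of being held in memory.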
-""" -from __future__ import absolute_import - - -from pyspark.sql.types import Row -from pyspark.sql.context import SQLContext, HiveContext, UDFRegistration -from pyspark.sql.session import SparkSession -from pyspark.sql.column import Column -from pyspark.sql.catalog import Catalog -from pyspark.sql.dataframe import DataFrame, DataFrameNaFunctions, DataFrameStatFunctions -from pyspark.sql.group import GroupedData -from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter -from pyspark.sql.window import Window, WindowSpec - - -__all__ = [ - 'SparkSession', 'SQLContext', 'HiveContext', 'UDFRegistration', - 'DataFrame', 'GroupedData', 'Column', 'Catalog', 'Row', - 'DataFrameNaFunctions', 'DataFrameStatFunctions', 'Window', 'WindowSpec', - 'DataFrameReader', 'DataFrameWriter' -] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/catalog.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/catalog.py deleted file mode 100644 index 974251f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/catalog.py +++ /dev/null @@ -1,312 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import warnings -from collections import namedtuple - -from pyspark import since -from pyspark.rdd import ignore_unicode_prefix, PythonEvalType -from pyspark.sql.dataframe import DataFrame -from pyspark.sql.udf import UserDefinedFunction -from pyspark.sql.types import IntegerType, StringType, StructType - - -Database = namedtuple("Database", "name description locationUri") -Table = namedtuple("Table", "name database description tableType isTemporary") -Column = namedtuple("Column", "name description dataType nullable isPartition isBucket") -Function = namedtuple("Function", "name description className isTemporary") - - -class Catalog(object): - """User-facing catalog API, accessible through `SparkSession.catalog`. - - This is a thin wrapper around its Scala implementation org.apache.spark.sql.catalog.Catalog. 
- """ - - def __init__(self, sparkSession): - """Create a new Catalog that wraps the underlying JVM object.""" - self._sparkSession = sparkSession - self._jsparkSession = sparkSession._jsparkSession - self._jcatalog = sparkSession._jsparkSession.catalog() - - @ignore_unicode_prefix - @since(2.0) - def currentDatabase(self): - """Returns the current default database in this session.""" - return self._jcatalog.currentDatabase() - - @ignore_unicode_prefix - @since(2.0) - def setCurrentDatabase(self, dbName): - """Sets the current default database in this session.""" - return self._jcatalog.setCurrentDatabase(dbName) - - @ignore_unicode_prefix - @since(2.0) - def listDatabases(self): - """Returns a list of databases available across all sessions.""" - iter = self._jcatalog.listDatabases().toLocalIterator() - databases = [] - while iter.hasNext(): - jdb = iter.next() - databases.append(Database( - name=jdb.name(), - description=jdb.description(), - locationUri=jdb.locationUri())) - return databases - - @ignore_unicode_prefix - @since(2.0) - def listTables(self, dbName=None): - """Returns a list of tables/views in the specified database. - - If no database is specified, the current database is used. - This includes all temporary views. - """ - if dbName is None: - dbName = self.currentDatabase() - iter = self._jcatalog.listTables(dbName).toLocalIterator() - tables = [] - while iter.hasNext(): - jtable = iter.next() - tables.append(Table( - name=jtable.name(), - database=jtable.database(), - description=jtable.description(), - tableType=jtable.tableType(), - isTemporary=jtable.isTemporary())) - return tables - - @ignore_unicode_prefix - @since(2.0) - def listFunctions(self, dbName=None): - """Returns a list of functions registered in the specified database. - - If no database is specified, the current database is used. - This includes all temporary functions. - """ - if dbName is None: - dbName = self.currentDatabase() - iter = self._jcatalog.listFunctions(dbName).toLocalIterator() - functions = [] - while iter.hasNext(): - jfunction = iter.next() - functions.append(Function( - name=jfunction.name(), - description=jfunction.description(), - className=jfunction.className(), - isTemporary=jfunction.isTemporary())) - return functions - - @ignore_unicode_prefix - @since(2.0) - def listColumns(self, tableName, dbName=None): - """Returns a list of columns for the given table/view in the specified database. - - If no database is specified, the current database is used. - - Note: the order of arguments here is different from that of its JVM counterpart - because Python does not support method overloading. - """ - if dbName is None: - dbName = self.currentDatabase() - iter = self._jcatalog.listColumns(dbName, tableName).toLocalIterator() - columns = [] - while iter.hasNext(): - jcolumn = iter.next() - columns.append(Column( - name=jcolumn.name(), - description=jcolumn.description(), - dataType=jcolumn.dataType(), - nullable=jcolumn.nullable(), - isPartition=jcolumn.isPartition(), - isBucket=jcolumn.isBucket())) - return columns - - @since(2.0) - def createExternalTable(self, tableName, path=None, source=None, schema=None, **options): - """Creates a table based on the dataset in a data source. - - It returns the DataFrame associated with the external table. - - The data source is specified by the ``source`` and a set of ``options``. - If ``source`` is not specified, the default data source configured by - ``spark.sql.sources.default`` will be used. 
- - Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and - created external table. - - :return: :class:`DataFrame` - """ - warnings.warn( - "createExternalTable is deprecated since Spark 2.2, please use createTable instead.", - DeprecationWarning) - return self.createTable(tableName, path, source, schema, **options) - - @since(2.2) - def createTable(self, tableName, path=None, source=None, schema=None, **options): - """Creates a table based on the dataset in a data source. - - It returns the DataFrame associated with the table. - - The data source is specified by the ``source`` and a set of ``options``. - If ``source`` is not specified, the default data source configured by - ``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is - created from the data at the given path. Otherwise a managed table is created. - - Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and - created table. - - :return: :class:`DataFrame` - """ - if path is not None: - options["path"] = path - if source is None: - source = self._sparkSession._wrapped._conf.defaultDataSourceName() - if schema is None: - df = self._jcatalog.createTable(tableName, source, options) - else: - if not isinstance(schema, StructType): - raise TypeError("schema should be StructType") - scala_datatype = self._jsparkSession.parseDataType(schema.json()) - df = self._jcatalog.createTable(tableName, source, scala_datatype, options) - return DataFrame(df, self._sparkSession._wrapped) - - @since(2.0) - def dropTempView(self, viewName): - """Drops the local temporary view with the given view name in the catalog. - If the view has been cached before, then it will also be uncached. - Returns true if this view is dropped successfully, false otherwise. - - Note that, the return type of this method was None in Spark 2.0, but changed to Boolean - in Spark 2.1. - - >>> spark.createDataFrame([(1, 1)]).createTempView("my_table") - >>> spark.table("my_table").collect() - [Row(_1=1, _2=1)] - >>> spark.catalog.dropTempView("my_table") - >>> spark.table("my_table") # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - AnalysisException: ... - """ - self._jcatalog.dropTempView(viewName) - - @since(2.1) - def dropGlobalTempView(self, viewName): - """Drops the global temporary view with the given view name in the catalog. - If the view has been cached before, then it will also be uncached. - Returns true if this view is dropped successfully, false otherwise. - - >>> spark.createDataFrame([(1, 1)]).createGlobalTempView("my_table") - >>> spark.table("global_temp.my_table").collect() - [Row(_1=1, _2=1)] - >>> spark.catalog.dropGlobalTempView("my_table") - >>> spark.table("global_temp.my_table") # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - AnalysisException: ... - """ - self._jcatalog.dropGlobalTempView(viewName) - - @since(2.0) - def registerFunction(self, name, f, returnType=None): - """An alias for :func:`spark.udf.register`. - See :meth:`pyspark.sql.UDFRegistration.register`. - - .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.register` instead. - """ - warnings.warn( - "Deprecated in 2.3.0. 
Use spark.udf.register instead.", - DeprecationWarning) - return self._sparkSession.udf.register(name, f, returnType) - - @since(2.0) - def isCached(self, tableName): - """Returns true if the table is currently cached in-memory.""" - return self._jcatalog.isCached(tableName) - - @since(2.0) - def cacheTable(self, tableName): - """Caches the specified table in-memory.""" - self._jcatalog.cacheTable(tableName) - - @since(2.0) - def uncacheTable(self, tableName): - """Removes the specified table from the in-memory cache.""" - self._jcatalog.uncacheTable(tableName) - - @since(2.0) - def clearCache(self): - """Removes all cached tables from the in-memory cache.""" - self._jcatalog.clearCache() - - @since(2.0) - def refreshTable(self, tableName): - """Invalidates and refreshes all the cached data and metadata of the given table.""" - self._jcatalog.refreshTable(tableName) - - @since('2.1.1') - def recoverPartitions(self, tableName): - """Recovers all the partitions of the given table and update the catalog. - - Only works with a partitioned table, and not a view. - """ - self._jcatalog.recoverPartitions(tableName) - - @since('2.2.0') - def refreshByPath(self, path): - """Invalidates and refreshes all the cached data (and the associated metadata) for any - DataFrame that contains the given data source path. - """ - self._jcatalog.refreshByPath(path) - - def _reset(self): - """(Internal use only) Drop all existing databases (except "default"), tables, - partitions and functions, and set the current database to "default". - - This is mainly used for tests. - """ - self._jsparkSession.sessionState().catalog().reset() - - -def _test(): - import os - import doctest - from pyspark.sql import SparkSession - import pyspark.sql.catalog - - os.chdir(os.environ["SPARK_HOME"]) - - globs = pyspark.sql.catalog.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("sql.catalog tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - globs['spark'] = spark - (failure_count, test_count) = doctest.testmod( - pyspark.sql.catalog, - globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) - spark.stop() - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/column.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/column.py deleted file mode 100644 index e7dec11..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/column.py +++ /dev/null @@ -1,714 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
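The operator helpers that follow build Column expressions by delegating to the underlying JVM column; in user code they surface as ordinary Python operators. A small illustrative session, assuming a SparkSession named spark and a toy DataFrame:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.appName("column-demo").getOrCreate()
    df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])

    # Arithmetic and comparison operators return new Column expressions.
    df.select((df.age + 1).alias("age_plus_one")).show()

    # Boolean columns are combined with &, | and ~ (not `and`/`or`/`not`).
    df.filter((df.age > 2) & df.name.startswith("B")).show()

    # lit() wraps a Python literal so it can participate in expressions.
    df.select(df.name, (df.age * F.lit(10)).alias("age_x10")).show()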
-# - -import sys -import json - -if sys.version >= '3': - basestring = str - long = int - -from pyspark import copy_func, since -from pyspark.context import SparkContext -from pyspark.rdd import ignore_unicode_prefix -from pyspark.sql.types import * - -__all__ = ["Column"] - - -def _create_column_from_literal(literal): - sc = SparkContext._active_spark_context - return sc._jvm.functions.lit(literal) - - -def _create_column_from_name(name): - sc = SparkContext._active_spark_context - return sc._jvm.functions.col(name) - - -def _to_java_column(col): - if isinstance(col, Column): - jcol = col._jc - elif isinstance(col, basestring): - jcol = _create_column_from_name(col) - else: - raise TypeError( - "Invalid argument, not a string or column: " - "{0} of type {1}. " - "For column literals, use 'lit', 'array', 'struct' or 'create_map' " - "function.".format(col, type(col))) - return jcol - - -def _to_seq(sc, cols, converter=None): - """ - Convert a list of Column (or names) into a JVM Seq of Column. - - An optional `converter` could be used to convert items in `cols` - into JVM Column objects. - """ - if converter: - cols = [converter(c) for c in cols] - return sc._jvm.PythonUtils.toSeq(cols) - - -def _to_list(sc, cols, converter=None): - """ - Convert a list of Column (or names) into a JVM (Scala) List of Column. - - An optional `converter` could be used to convert items in `cols` - into JVM Column objects. - """ - if converter: - cols = [converter(c) for c in cols] - return sc._jvm.PythonUtils.toList(cols) - - -def _unary_op(name, doc="unary operator"): - """ Create a method for given unary operator """ - def _(self): - jc = getattr(self._jc, name)() - return Column(jc) - _.__doc__ = doc - return _ - - -def _func_op(name, doc=''): - def _(self): - sc = SparkContext._active_spark_context - jc = getattr(sc._jvm.functions, name)(self._jc) - return Column(jc) - _.__doc__ = doc - return _ - - -def _bin_func_op(name, reverse=False, doc="binary function"): - def _(self, other): - sc = SparkContext._active_spark_context - fn = getattr(sc._jvm.functions, name) - jc = other._jc if isinstance(other, Column) else _create_column_from_literal(other) - njc = fn(self._jc, jc) if not reverse else fn(jc, self._jc) - return Column(njc) - _.__doc__ = doc - return _ - - -def _bin_op(name, doc="binary operator"): - """ Create a method for given binary operator - """ - def _(self, other): - jc = other._jc if isinstance(other, Column) else other - njc = getattr(self._jc, name)(jc) - return Column(njc) - _.__doc__ = doc - return _ - - -def _reverse_op(name, doc="binary operator"): - """ Create a method for binary operator (this object is on right side) - """ - def _(self, other): - jother = _create_column_from_literal(other) - jc = getattr(jother, name)(self._jc) - return Column(jc) - _.__doc__ = doc - return _ - - -class Column(object): - - """ - A column in a DataFrame. - - :class:`Column` instances can be created by:: - - # 1. Select a column out of a DataFrame - - df.colName - df["colName"] - - # 2. Create from an expression - df.colName + 1 - 1 / df.colName - - .. 
versionadded:: 1.3 - """ - - def __init__(self, jc): - self._jc = jc - - # arithmetic operators - __neg__ = _func_op("negate") - __add__ = _bin_op("plus") - __sub__ = _bin_op("minus") - __mul__ = _bin_op("multiply") - __div__ = _bin_op("divide") - __truediv__ = _bin_op("divide") - __mod__ = _bin_op("mod") - __radd__ = _bin_op("plus") - __rsub__ = _reverse_op("minus") - __rmul__ = _bin_op("multiply") - __rdiv__ = _reverse_op("divide") - __rtruediv__ = _reverse_op("divide") - __rmod__ = _reverse_op("mod") - __pow__ = _bin_func_op("pow") - __rpow__ = _bin_func_op("pow", reverse=True) - - # logistic operators - __eq__ = _bin_op("equalTo") - __ne__ = _bin_op("notEqual") - __lt__ = _bin_op("lt") - __le__ = _bin_op("leq") - __ge__ = _bin_op("geq") - __gt__ = _bin_op("gt") - - _eqNullSafe_doc = """ - Equality test that is safe for null values. - - :param other: a value or :class:`Column` - - >>> from pyspark.sql import Row - >>> df1 = spark.createDataFrame([ - ... Row(id=1, value='foo'), - ... Row(id=2, value=None) - ... ]) - >>> df1.select( - ... df1['value'] == 'foo', - ... df1['value'].eqNullSafe('foo'), - ... df1['value'].eqNullSafe(None) - ... ).show() - +-------------+---------------+----------------+ - |(value = foo)|(value <=> foo)|(value <=> NULL)| - +-------------+---------------+----------------+ - | true| true| false| - | null| false| true| - +-------------+---------------+----------------+ - >>> df2 = spark.createDataFrame([ - ... Row(value = 'bar'), - ... Row(value = None) - ... ]) - >>> df1.join(df2, df1["value"] == df2["value"]).count() - 0 - >>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count() - 1 - >>> df2 = spark.createDataFrame([ - ... Row(id=1, value=float('NaN')), - ... Row(id=2, value=42.0), - ... Row(id=3, value=None) - ... ]) - >>> df2.select( - ... df2['value'].eqNullSafe(None), - ... df2['value'].eqNullSafe(float('NaN')), - ... df2['value'].eqNullSafe(42.0) - ... ).show() - +----------------+---------------+----------------+ - |(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)| - +----------------+---------------+----------------+ - | false| true| false| - | false| false| true| - | true| false| false| - +----------------+---------------+----------------+ - - .. note:: Unlike Pandas, PySpark doesn't consider NaN values to be NULL. - See the `NaN Semantics`_ for details. - .. _NaN Semantics: - https://spark.apache.org/docs/latest/sql-programming-guide.html#nan-semantics - .. versionadded:: 2.3.0 - """ - eqNullSafe = _bin_op("eqNullSafe", _eqNullSafe_doc) - - # `and`, `or`, `not` cannot be overloaded in Python, - # so use bitwise operators as boolean operators - __and__ = _bin_op('and') - __or__ = _bin_op('or') - __invert__ = _func_op('not') - __rand__ = _bin_op("and") - __ror__ = _bin_op("or") - - # container operators - def __contains__(self, item): - raise ValueError("Cannot apply 'in' operator against a column: please use 'contains' " - "in a string column or 'array_contains' function for an array column.") - - # bitwise operators - _bitwiseOR_doc = """ - Compute bitwise OR of this expression with another expression. - - :param other: a value or :class:`Column` to calculate bitwise or(|) against - this :class:`Column`. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(a=170, b=75)]) - >>> df.select(df.a.bitwiseOR(df.b)).collect() - [Row((a | b)=235)] - """ - _bitwiseAND_doc = """ - Compute bitwise AND of this expression with another expression. 
- - :param other: a value or :class:`Column` to calculate bitwise and(&) against - this :class:`Column`. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(a=170, b=75)]) - >>> df.select(df.a.bitwiseAND(df.b)).collect() - [Row((a & b)=10)] - """ - _bitwiseXOR_doc = """ - Compute bitwise XOR of this expression with another expression. - - :param other: a value or :class:`Column` to calculate bitwise xor(^) against - this :class:`Column`. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(a=170, b=75)]) - >>> df.select(df.a.bitwiseXOR(df.b)).collect() - [Row((a ^ b)=225)] - """ - - bitwiseOR = _bin_op("bitwiseOR", _bitwiseOR_doc) - bitwiseAND = _bin_op("bitwiseAND", _bitwiseAND_doc) - bitwiseXOR = _bin_op("bitwiseXOR", _bitwiseXOR_doc) - - @since(1.3) - def getItem(self, key): - """ - An expression that gets an item at position ``ordinal`` out of a list, - or gets an item by key out of a dict. - - >>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"]) - >>> df.select(df.l.getItem(0), df.d.getItem("key")).show() - +----+------+ - |l[0]|d[key]| - +----+------+ - | 1| value| - +----+------+ - >>> df.select(df.l[0], df.d["key"]).show() - +----+------+ - |l[0]|d[key]| - +----+------+ - | 1| value| - +----+------+ - """ - return self[key] - - @since(1.3) - def getField(self, name): - """ - An expression that gets a field by name in a StructField. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(r=Row(a=1, b="b"))]) - >>> df.select(df.r.getField("b")).show() - +---+ - |r.b| - +---+ - | b| - +---+ - >>> df.select(df.r.a).show() - +---+ - |r.a| - +---+ - | 1| - +---+ - """ - return self[name] - - def __getattr__(self, item): - if item.startswith("__"): - raise AttributeError(item) - return self.getField(item) - - def __getitem__(self, k): - if isinstance(k, slice): - if k.step is not None: - raise ValueError("slice with step is not supported.") - return self.substr(k.start, k.stop) - else: - return _bin_op("apply")(self, k) - - def __iter__(self): - raise TypeError("Column is not iterable") - - # string methods - _contains_doc = """ - Contains the other element. Returns a boolean :class:`Column` based on a string match. - - :param other: string in line - - >>> df.filter(df.name.contains('o')).collect() - [Row(age=5, name=u'Bob')] - """ - _rlike_doc = """ - SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex - match. - - :param other: an extended regex expression - - >>> df.filter(df.name.rlike('ice$')).collect() - [Row(age=2, name=u'Alice')] - """ - _like_doc = """ - SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match. - - :param other: a SQL LIKE pattern - - See :func:`rlike` for a regex version - - >>> df.filter(df.name.like('Al%')).collect() - [Row(age=2, name=u'Alice')] - """ - _startswith_doc = """ - String starts with. Returns a boolean :class:`Column` based on a string match. - - :param other: string at start of line (do not use a regex `^`) - - >>> df.filter(df.name.startswith('Al')).collect() - [Row(age=2, name=u'Alice')] - >>> df.filter(df.name.startswith('^Al')).collect() - [] - """ - _endswith_doc = """ - String ends with. Returns a boolean :class:`Column` based on a string match. 
- - :param other: string at end of line (do not use a regex `$`) - - >>> df.filter(df.name.endswith('ice')).collect() - [Row(age=2, name=u'Alice')] - >>> df.filter(df.name.endswith('ice$')).collect() - [] - """ - - contains = ignore_unicode_prefix(_bin_op("contains", _contains_doc)) - rlike = ignore_unicode_prefix(_bin_op("rlike", _rlike_doc)) - like = ignore_unicode_prefix(_bin_op("like", _like_doc)) - startswith = ignore_unicode_prefix(_bin_op("startsWith", _startswith_doc)) - endswith = ignore_unicode_prefix(_bin_op("endsWith", _endswith_doc)) - - @ignore_unicode_prefix - @since(1.3) - def substr(self, startPos, length): - """ - Return a :class:`Column` which is a substring of the column. - - :param startPos: start position (int or Column) - :param length: length of the substring (int or Column) - - >>> df.select(df.name.substr(1, 3).alias("col")).collect() - [Row(col=u'Ali'), Row(col=u'Bob')] - """ - if type(startPos) != type(length): - raise TypeError( - "startPos and length must be the same type. " - "Got {startPos_t} and {length_t}, respectively." - .format( - startPos_t=type(startPos), - length_t=type(length), - )) - if isinstance(startPos, int): - jc = self._jc.substr(startPos, length) - elif isinstance(startPos, Column): - jc = self._jc.substr(startPos._jc, length._jc) - else: - raise TypeError("Unexpected type: %s" % type(startPos)) - return Column(jc) - - @ignore_unicode_prefix - @since(1.5) - def isin(self, *cols): - """ - A boolean expression that is evaluated to true if the value of this - expression is contained by the evaluated values of the arguments. - - >>> df[df.name.isin("Bob", "Mike")].collect() - [Row(age=5, name=u'Bob')] - >>> df[df.age.isin([1, 2, 3])].collect() - [Row(age=2, name=u'Alice')] - """ - if len(cols) == 1 and isinstance(cols[0], (list, set)): - cols = cols[0] - cols = [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols] - sc = SparkContext._active_spark_context - jc = getattr(self._jc, "isin")(_to_seq(sc, cols)) - return Column(jc) - - # order - _asc_doc = """ - Returns a sort expression based on ascending order of the column. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.asc()).collect() - [Row(name=u'Alice'), Row(name=u'Tom')] - """ - _asc_nulls_first_doc = """ - Returns a sort expression based on ascending order of the column, and null values - return before non-null values. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect() - [Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')] - - .. versionadded:: 2.4 - """ - _asc_nulls_last_doc = """ - Returns a sort expression based on ascending order of the column, and null values - appear after non-null values. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect() - [Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)] - - .. versionadded:: 2.4 - """ - _desc_doc = """ - Returns a sort expression based on the descending order of the column. 
- - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.desc()).collect() - [Row(name=u'Tom'), Row(name=u'Alice')] - """ - _desc_nulls_first_doc = """ - Returns a sort expression based on the descending order of the column, and null values - appear before non-null values. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect() - [Row(name=None), Row(name=u'Tom'), Row(name=u'Alice')] - - .. versionadded:: 2.4 - """ - _desc_nulls_last_doc = """ - Returns a sort expression based on the descending order of the column, and null values - appear after non-null values. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"]) - >>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect() - [Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)] - - .. versionadded:: 2.4 - """ - - asc = ignore_unicode_prefix(_unary_op("asc", _asc_doc)) - asc_nulls_first = ignore_unicode_prefix(_unary_op("asc_nulls_first", _asc_nulls_first_doc)) - asc_nulls_last = ignore_unicode_prefix(_unary_op("asc_nulls_last", _asc_nulls_last_doc)) - desc = ignore_unicode_prefix(_unary_op("desc", _desc_doc)) - desc_nulls_first = ignore_unicode_prefix(_unary_op("desc_nulls_first", _desc_nulls_first_doc)) - desc_nulls_last = ignore_unicode_prefix(_unary_op("desc_nulls_last", _desc_nulls_last_doc)) - - _isNull_doc = """ - True if the current expression is null. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) - >>> df.filter(df.height.isNull()).collect() - [Row(height=None, name=u'Alice')] - """ - _isNotNull_doc = """ - True if the current expression is NOT null. - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) - >>> df.filter(df.height.isNotNull()).collect() - [Row(height=80, name=u'Tom')] - """ - - isNull = ignore_unicode_prefix(_unary_op("isNull", _isNull_doc)) - isNotNull = ignore_unicode_prefix(_unary_op("isNotNull", _isNotNull_doc)) - - @since(1.3) - def alias(self, *alias, **kwargs): - """ - Returns this column aliased with a new name or names (in the case of expressions that - return more than one column, such as explode). - - :param alias: strings of desired column names (collects all positional arguments passed) - :param metadata: a dict of information to be stored in ``metadata`` attribute of the - corresponding :class: `StructField` (optional, keyword only argument) - - .. versionchanged:: 2.2 - Added optional ``metadata`` argument. 
- - >>> df.select(df.age.alias("age2")).collect() - [Row(age2=2), Row(age2=5)] - >>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max'] - 99 - """ - - metadata = kwargs.pop('metadata', None) - assert not kwargs, 'Unexpected kwargs where passed: %s' % kwargs - - sc = SparkContext._active_spark_context - if len(alias) == 1: - if metadata: - jmeta = sc._jvm.org.apache.spark.sql.types.Metadata.fromJson( - json.dumps(metadata)) - return Column(getattr(self._jc, "as")(alias[0], jmeta)) - else: - return Column(getattr(self._jc, "as")(alias[0])) - else: - if metadata: - raise ValueError('metadata can only be provided for a single column') - return Column(getattr(self._jc, "as")(_to_seq(sc, list(alias)))) - - name = copy_func(alias, sinceversion=2.0, doc=":func:`name` is an alias for :func:`alias`.") - - @ignore_unicode_prefix - @since(1.3) - def cast(self, dataType): - """ Convert the column into type ``dataType``. - - >>> df.select(df.age.cast("string").alias('ages')).collect() - [Row(ages=u'2'), Row(ages=u'5')] - >>> df.select(df.age.cast(StringType()).alias('ages')).collect() - [Row(ages=u'2'), Row(ages=u'5')] - """ - if isinstance(dataType, basestring): - jc = self._jc.cast(dataType) - elif isinstance(dataType, DataType): - from pyspark.sql import SparkSession - spark = SparkSession.builder.getOrCreate() - jdt = spark._jsparkSession.parseDataType(dataType.json()) - jc = self._jc.cast(jdt) - else: - raise TypeError("unexpected type: %s" % type(dataType)) - return Column(jc) - - astype = copy_func(cast, sinceversion=1.4, doc=":func:`astype` is an alias for :func:`cast`.") - - @since(1.3) - def between(self, lowerBound, upperBound): - """ - A boolean expression that is evaluated to true if the value of this - expression is between the given columns. - - >>> df.select(df.name, df.age.between(2, 4)).show() - +-----+---------------------------+ - | name|((age >= 2) AND (age <= 4))| - +-----+---------------------------+ - |Alice| true| - | Bob| false| - +-----+---------------------------+ - """ - return (self >= lowerBound) & (self <= upperBound) - - @since(1.4) - def when(self, condition, value): - """ - Evaluates a list of conditions and returns one of multiple possible result expressions. - If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions. - - See :func:`pyspark.sql.functions.when` for example usage. - - :param condition: a boolean :class:`Column` expression. - :param value: a literal value, or a :class:`Column` expression. - - >>> from pyspark.sql import functions as F - >>> df.select(df.name, F.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show() - +-----+------------------------------------------------------------+ - | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END| - +-----+------------------------------------------------------------+ - |Alice| -1| - | Bob| 1| - +-----+------------------------------------------------------------+ - """ - if not isinstance(condition, Column): - raise TypeError("condition should be a Column") - v = value._jc if isinstance(value, Column) else value - jc = self._jc.when(condition._jc, v) - return Column(jc) - - @since(1.4) - def otherwise(self, value): - """ - Evaluates a list of conditions and returns one of multiple possible result expressions. - If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions. - - See :func:`pyspark.sql.functions.when` for example usage. - - :param value: a literal value, or a :class:`Column` expression. 
- - >>> from pyspark.sql import functions as F - >>> df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show() - +-----+-------------------------------------+ - | name|CASE WHEN (age > 3) THEN 1 ELSE 0 END| - +-----+-------------------------------------+ - |Alice| 0| - | Bob| 1| - +-----+-------------------------------------+ - """ - v = value._jc if isinstance(value, Column) else value - jc = self._jc.otherwise(v) - return Column(jc) - - @since(1.4) - def over(self, window): - """ - Define a windowing column. - - :param window: a :class:`WindowSpec` - :return: a Column - - >>> from pyspark.sql import Window - >>> window = Window.partitionBy("name").orderBy("age").rowsBetween(-1, 1) - >>> from pyspark.sql.functions import rank, min - >>> # df.select(rank().over(window), min('age').over(window)) - """ - from pyspark.sql.window import WindowSpec - if not isinstance(window, WindowSpec): - raise TypeError("window should be WindowSpec") - jc = self._jc.over(window._jspec) - return Column(jc) - - def __nonzero__(self): - raise ValueError("Cannot convert column into bool: please use '&' for 'and', '|' for 'or', " - "'~' for 'not' when building DataFrame boolean expressions.") - __bool__ = __nonzero__ - - def __repr__(self): - return 'Column<%s>' % self._jc.toString().encode('utf8') - - -def _test(): - import doctest - from pyspark.sql import SparkSession - import pyspark.sql.column - globs = pyspark.sql.column.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("sql.column tests")\ - .getOrCreate() - sc = spark.sparkContext - globs['spark'] = spark - globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ - .toDF(StructType([StructField('age', IntegerType()), - StructField('name', StringType())])) - - (failure_count, test_count) = doctest.testmod( - pyspark.sql.column, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/conf.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/conf.py deleted file mode 100644 index 71ea163..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/conf.py +++ /dev/null @@ -1,99 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys - -from pyspark import since, _NoValue -from pyspark.rdd import ignore_unicode_prefix - -if sys.version_info[0] >= 3: - basestring = str - - -class RuntimeConfig(object): - """User-facing configuration API, accessible through `SparkSession.conf`. - - Options set here are automatically propagated to the Hadoop configuration during I/O. 
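In practice the RuntimeConfig wrapper defined below is reached through SparkSession.conf; a brief, illustrative use assuming an active session named spark (the unset key is hypothetical):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("conf-demo").getOrCreate()

    spark.conf.set("spark.sql.shuffle.partitions", "50")
    print(spark.conf.get("spark.sql.shuffle.partitions"))           # '50'
    print(spark.conf.get("spark.sql.nonexistent.key", "fallback"))  # 'fallback'
    print(spark.conf.isModifiable("spark.sql.shuffle.partitions"))  # True
    spark.conf.unset("spark.sql.shuffle.partitions")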
- """ - - def __init__(self, jconf): - """Create a new RuntimeConfig that wraps the underlying JVM object.""" - self._jconf = jconf - - @ignore_unicode_prefix - @since(2.0) - def set(self, key, value): - """Sets the given Spark runtime configuration property.""" - self._jconf.set(key, value) - - @ignore_unicode_prefix - @since(2.0) - def get(self, key, default=_NoValue): - """Returns the value of Spark runtime configuration property for the given key, - assuming it is set. - """ - self._checkType(key, "key") - if default is _NoValue: - return self._jconf.get(key) - else: - if default is not None: - self._checkType(default, "default") - return self._jconf.get(key, default) - - @ignore_unicode_prefix - @since(2.0) - def unset(self, key): - """Resets the configuration property for the given key.""" - self._jconf.unset(key) - - def _checkType(self, obj, identifier): - """Assert that an object is of type str.""" - if not isinstance(obj, basestring): - raise TypeError("expected %s '%s' to be a string (was '%s')" % - (identifier, obj, type(obj).__name__)) - - @ignore_unicode_prefix - @since(2.4) - def isModifiable(self, key): - """Indicates whether the configuration property with the given key - is modifiable in the current session. - """ - return self._jconf.isModifiable(key) - - -def _test(): - import os - import doctest - from pyspark.sql.session import SparkSession - import pyspark.sql.conf - - os.chdir(os.environ["SPARK_HOME"]) - - globs = pyspark.sql.conf.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("sql.conf tests")\ - .getOrCreate() - globs['sc'] = spark.sparkContext - globs['spark'] = spark - (failure_count, test_count) = doctest.testmod(pyspark.sql.conf, globs=globs) - spark.stop() - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/context.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/context.py deleted file mode 100644 index 9c094dd..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/context.py +++ /dev/null @@ -1,555 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function -import sys -import warnings - -if sys.version >= '3': - basestring = unicode = str - -from pyspark import since, _NoValue -from pyspark.rdd import ignore_unicode_prefix -from pyspark.sql.session import _monkey_patch_RDD, SparkSession -from pyspark.sql.dataframe import DataFrame -from pyspark.sql.readwriter import DataFrameReader -from pyspark.sql.streaming import DataStreamReader -from pyspark.sql.types import IntegerType, Row, StringType -from pyspark.sql.udf import UDFRegistration -from pyspark.sql.utils import install_exception_handler - -__all__ = ["SQLContext", "HiveContext"] - - -class SQLContext(object): - """The entry point for working with structured data (rows and columns) in Spark, in Spark 1.x. - - As of Spark 2.0, this is replaced by :class:`SparkSession`. However, we are keeping the class - here for backward compatibility. - - A SQLContext can be used create :class:`DataFrame`, register :class:`DataFrame` as - tables, execute SQL over tables, cache tables, and read parquet files. - - :param sparkContext: The :class:`SparkContext` backing this SQLContext. - :param sparkSession: The :class:`SparkSession` around which this SQLContext wraps. - :param jsqlContext: An optional JVM Scala SQLContext. If set, we do not instantiate a new - SQLContext in the JVM, instead we make all calls to this object. - """ - - _instantiatedContext = None - - @ignore_unicode_prefix - def __init__(self, sparkContext, sparkSession=None, jsqlContext=None): - """Creates a new SQLContext. - - >>> from datetime import datetime - >>> sqlContext = SQLContext(sc) - >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1, - ... b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), - ... time=datetime(2014, 8, 1, 14, 1, 5))]) - >>> df = allTypes.toDF() - >>> df.createOrReplaceTempView("allTypes") - >>> sqlContext.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a ' - ... 'from allTypes where b and i > 0').collect() - [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \ - dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)] - >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect() - [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] - """ - self._sc = sparkContext - self._jsc = self._sc._jsc - self._jvm = self._sc._jvm - if sparkSession is None: - sparkSession = SparkSession.builder.getOrCreate() - if jsqlContext is None: - jsqlContext = sparkSession._jwrapped - self.sparkSession = sparkSession - self._jsqlContext = jsqlContext - _monkey_patch_RDD(self.sparkSession) - install_exception_handler() - if SQLContext._instantiatedContext is None: - SQLContext._instantiatedContext = self - - @property - def _ssql_ctx(self): - """Accessor for the JVM Spark SQL context. - - Subclasses can override this property to provide their own - JVM Contexts. - """ - return self._jsqlContext - - @property - def _conf(self): - """Accessor for the JVM SQL-specific configurations""" - return self.sparkSession._jsparkSession.sessionState().conf() - - @classmethod - @since(1.6) - def getOrCreate(cls, sc): - """ - Get the existing SQLContext or create a new one with given SparkContext. 
- - :param sc: SparkContext - """ - if cls._instantiatedContext is None: - jsqlContext = sc._jvm.SQLContext.getOrCreate(sc._jsc.sc()) - sparkSession = SparkSession(sc, jsqlContext.sparkSession()) - cls(sc, sparkSession, jsqlContext) - return cls._instantiatedContext - - @since(1.6) - def newSession(self): - """ - Returns a new SQLContext as new session, that has separate SQLConf, - registered temporary views and UDFs, but shared SparkContext and - table cache. - """ - return self.__class__(self._sc, self.sparkSession.newSession()) - - @since(1.3) - def setConf(self, key, value): - """Sets the given Spark SQL configuration property. - """ - self.sparkSession.conf.set(key, value) - - @ignore_unicode_prefix - @since(1.3) - def getConf(self, key, defaultValue=_NoValue): - """Returns the value of Spark SQL configuration property for the given key. - - If the key is not set and defaultValue is set, return - defaultValue. If the key is not set and defaultValue is not set, return - the system default value. - - >>> sqlContext.getConf("spark.sql.shuffle.partitions") - u'200' - >>> sqlContext.getConf("spark.sql.shuffle.partitions", u"10") - u'10' - >>> sqlContext.setConf("spark.sql.shuffle.partitions", u"50") - >>> sqlContext.getConf("spark.sql.shuffle.partitions", u"10") - u'50' - """ - return self.sparkSession.conf.get(key, defaultValue) - - @property - @since("1.3.1") - def udf(self): - """Returns a :class:`UDFRegistration` for UDF registration. - - :return: :class:`UDFRegistration` - """ - return self.sparkSession.udf - - @since(1.4) - def range(self, start, end=None, step=1, numPartitions=None): - """ - Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named - ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with - step value ``step``. - - :param start: the start value - :param end: the end value (exclusive) - :param step: the incremental step (default: 1) - :param numPartitions: the number of partitions of the DataFrame - :return: :class:`DataFrame` - - >>> sqlContext.range(1, 7, 2).collect() - [Row(id=1), Row(id=3), Row(id=5)] - - If only one argument is specified, it will be used as the end value. - - >>> sqlContext.range(3).collect() - [Row(id=0), Row(id=1), Row(id=2)] - """ - return self.sparkSession.range(start, end, step, numPartitions) - - @since(1.2) - def registerFunction(self, name, f, returnType=None): - """An alias for :func:`spark.udf.register`. - See :meth:`pyspark.sql.UDFRegistration.register`. - - .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.register` instead. - """ - warnings.warn( - "Deprecated in 2.3.0. Use spark.udf.register instead.", - DeprecationWarning) - return self.sparkSession.udf.register(name, f, returnType) - - @since(2.1) - def registerJavaFunction(self, name, javaClassName, returnType=None): - """An alias for :func:`spark.udf.registerJavaFunction`. - See :meth:`pyspark.sql.UDFRegistration.registerJavaFunction`. - - .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead. - """ - warnings.warn( - "Deprecated in 2.3.0. Use spark.udf.registerJavaFunction instead.", - DeprecationWarning) - return self.sparkSession.udf.registerJavaFunction(name, javaClassName, returnType) - - # TODO(andrew): delete this once we refactor things to take in SparkSession - def _inferSchema(self, rdd, samplingRatio=None): - """ - Infer schema from an RDD of Row or tuple. 
- - :param rdd: an RDD of Row or tuple - :param samplingRatio: sampling ratio, or no sampling (default) - :return: :class:`pyspark.sql.types.StructType` - """ - return self.sparkSession._inferSchema(rdd, samplingRatio) - - @since(1.3) - @ignore_unicode_prefix - def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): - """ - Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. - - When ``schema`` is a list of column names, the type of each column - will be inferred from ``data``. - - When ``schema`` is ``None``, it will try to infer the schema (column names and types) - from ``data``, which should be an RDD of :class:`Row`, - or :class:`namedtuple`, or :class:`dict`. - - When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string it must match - the real data, or an exception will be thrown at runtime. If the given schema is not - :class:`pyspark.sql.types.StructType`, it will be wrapped into a - :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", - each record will also be wrapped into a tuple, which can be converted to row later. - - If schema inference is needed, ``samplingRatio`` is used to determined the ratio of - rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. - - :param data: an RDD of any kind of SQL data representation(e.g. :class:`Row`, - :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`, or - :class:`pandas.DataFrame`. - :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of - column names, default is None. The data type string format equals to - :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can - omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use - ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. - We can also use ``int`` as a short name for :class:`pyspark.sql.types.IntegerType`. - :param samplingRatio: the sample ratio of rows used for inferring - :param verifySchema: verify data types of every row against schema. - :return: :class:`DataFrame` - - .. versionchanged:: 2.0 - The ``schema`` parameter can be a :class:`pyspark.sql.types.DataType` or a - datatype string after 2.0. - If it's not a :class:`pyspark.sql.types.StructType`, it will be wrapped into a - :class:`pyspark.sql.types.StructType` and each record will also be wrapped into a tuple. - - .. versionchanged:: 2.1 - Added verifySchema. - - >>> l = [('Alice', 1)] - >>> sqlContext.createDataFrame(l).collect() - [Row(_1=u'Alice', _2=1)] - >>> sqlContext.createDataFrame(l, ['name', 'age']).collect() - [Row(name=u'Alice', age=1)] - - >>> d = [{'name': 'Alice', 'age': 1}] - >>> sqlContext.createDataFrame(d).collect() - [Row(age=1, name=u'Alice')] - - >>> rdd = sc.parallelize(l) - >>> sqlContext.createDataFrame(rdd).collect() - [Row(_1=u'Alice', _2=1)] - >>> df = sqlContext.createDataFrame(rdd, ['name', 'age']) - >>> df.collect() - [Row(name=u'Alice', age=1)] - - >>> from pyspark.sql import Row - >>> Person = Row('name', 'age') - >>> person = rdd.map(lambda r: Person(*r)) - >>> df2 = sqlContext.createDataFrame(person) - >>> df2.collect() - [Row(name=u'Alice', age=1)] - - >>> from pyspark.sql.types import * - >>> schema = StructType([ - ... StructField("name", StringType(), True), - ... 
StructField("age", IntegerType(), True)]) - >>> df3 = sqlContext.createDataFrame(rdd, schema) - >>> df3.collect() - [Row(name=u'Alice', age=1)] - - >>> sqlContext.createDataFrame(df.toPandas()).collect() # doctest: +SKIP - [Row(name=u'Alice', age=1)] - >>> sqlContext.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP - [Row(0=1, 1=2)] - - >>> sqlContext.createDataFrame(rdd, "a: string, b: int").collect() - [Row(a=u'Alice', b=1)] - >>> rdd = rdd.map(lambda row: row[1]) - >>> sqlContext.createDataFrame(rdd, "int").collect() - [Row(value=1)] - >>> sqlContext.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - Py4JJavaError: ... - """ - return self.sparkSession.createDataFrame(data, schema, samplingRatio, verifySchema) - - @since(1.3) - def registerDataFrameAsTable(self, df, tableName): - """Registers the given :class:`DataFrame` as a temporary table in the catalog. - - Temporary tables exist only during the lifetime of this instance of :class:`SQLContext`. - - >>> sqlContext.registerDataFrameAsTable(df, "table1") - """ - df.createOrReplaceTempView(tableName) - - @since(1.6) - def dropTempTable(self, tableName): - """ Remove the temp table from catalog. - - >>> sqlContext.registerDataFrameAsTable(df, "table1") - >>> sqlContext.dropTempTable("table1") - """ - self.sparkSession.catalog.dropTempView(tableName) - - @since(1.3) - def createExternalTable(self, tableName, path=None, source=None, schema=None, **options): - """Creates an external table based on the dataset in a data source. - - It returns the DataFrame associated with the external table. - - The data source is specified by the ``source`` and a set of ``options``. - If ``source`` is not specified, the default data source configured by - ``spark.sql.sources.default`` will be used. - - Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and - created external table. - - :return: :class:`DataFrame` - """ - return self.sparkSession.catalog.createExternalTable( - tableName, path, source, schema, **options) - - @ignore_unicode_prefix - @since(1.0) - def sql(self, sqlQuery): - """Returns a :class:`DataFrame` representing the result of the given query. - - :return: :class:`DataFrame` - - >>> sqlContext.registerDataFrameAsTable(df, "table1") - >>> df2 = sqlContext.sql("SELECT field1 AS f1, field2 as f2 from table1") - >>> df2.collect() - [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] - """ - return self.sparkSession.sql(sqlQuery) - - @since(1.0) - def table(self, tableName): - """Returns the specified table or view as a :class:`DataFrame`. - - :return: :class:`DataFrame` - - >>> sqlContext.registerDataFrameAsTable(df, "table1") - >>> df2 = sqlContext.table("table1") - >>> sorted(df.collect()) == sorted(df2.collect()) - True - """ - return self.sparkSession.table(tableName) - - @ignore_unicode_prefix - @since(1.3) - def tables(self, dbName=None): - """Returns a :class:`DataFrame` containing names of tables in the given database. - - If ``dbName`` is not specified, the current database will be used. - - The returned DataFrame has two columns: ``tableName`` and ``isTemporary`` - (a column with :class:`BooleanType` indicating if a table is a temporary one or not). - - :param dbName: string, name of the database to use. 
- :return: :class:`DataFrame` - - >>> sqlContext.registerDataFrameAsTable(df, "table1") - >>> df2 = sqlContext.tables() - >>> df2.filter("tableName = 'table1'").first() - Row(database=u'', tableName=u'table1', isTemporary=True) - """ - if dbName is None: - return DataFrame(self._ssql_ctx.tables(), self) - else: - return DataFrame(self._ssql_ctx.tables(dbName), self) - - @since(1.3) - def tableNames(self, dbName=None): - """Returns a list of names of tables in the database ``dbName``. - - :param dbName: string, name of the database to use. Default to the current database. - :return: list of table names, in string - - >>> sqlContext.registerDataFrameAsTable(df, "table1") - >>> "table1" in sqlContext.tableNames() - True - >>> "table1" in sqlContext.tableNames("default") - True - """ - if dbName is None: - return [name for name in self._ssql_ctx.tableNames()] - else: - return [name for name in self._ssql_ctx.tableNames(dbName)] - - @since(1.0) - def cacheTable(self, tableName): - """Caches the specified table in-memory.""" - self._ssql_ctx.cacheTable(tableName) - - @since(1.0) - def uncacheTable(self, tableName): - """Removes the specified table from the in-memory cache.""" - self._ssql_ctx.uncacheTable(tableName) - - @since(1.3) - def clearCache(self): - """Removes all cached tables from the in-memory cache. """ - self._ssql_ctx.clearCache() - - @property - @since(1.4) - def read(self): - """ - Returns a :class:`DataFrameReader` that can be used to read data - in as a :class:`DataFrame`. - - :return: :class:`DataFrameReader` - """ - return DataFrameReader(self) - - @property - @since(2.0) - def readStream(self): - """ - Returns a :class:`DataStreamReader` that can be used to read data streams - as a streaming :class:`DataFrame`. - - .. note:: Evolving. - - :return: :class:`DataStreamReader` - - >>> text_sdf = sqlContext.readStream.text(tempfile.mkdtemp()) - >>> text_sdf.isStreaming - True - """ - return DataStreamReader(self) - - @property - @since(2.0) - def streams(self): - """Returns a :class:`StreamingQueryManager` that allows managing all the - :class:`StreamingQuery` StreamingQueries active on `this` context. - - .. note:: Evolving. - """ - from pyspark.sql.streaming import StreamingQueryManager - return StreamingQueryManager(self._ssql_ctx.streams()) - - -class HiveContext(SQLContext): - """A variant of Spark SQL that integrates with data stored in Hive. - - Configuration for Hive is read from ``hive-site.xml`` on the classpath. - It supports running both SQL and HiveQL commands. - - :param sparkContext: The SparkContext to wrap. - :param jhiveContext: An optional JVM Scala HiveContext. If set, we do not instantiate a new - :class:`HiveContext` in the JVM, instead we make all calls to this object. - - .. note:: Deprecated in 2.0.0. Use SparkSession.builder.enableHiveSupport().getOrCreate(). - """ - - def __init__(self, sparkContext, jhiveContext=None): - warnings.warn( - "HiveContext is deprecated in Spark 2.0.0. Please use " + - "SparkSession.builder.enableHiveSupport().getOrCreate() instead.", - DeprecationWarning) - if jhiveContext is None: - sparkSession = SparkSession.builder.enableHiveSupport().getOrCreate() - else: - sparkSession = SparkSession(sparkContext, jhiveContext.sparkSession()) - SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext) - - @classmethod - def _createForTesting(cls, sparkContext): - """(Internal use only) Create a new HiveContext for testing. - - All test code that touches HiveContext *must* go through this method. 
Otherwise, - you may end up launching multiple derby instances and encounter with incredibly - confusing error messages. - """ - jsc = sparkContext._jsc.sc() - jtestHive = sparkContext._jvm.org.apache.spark.sql.hive.test.TestHiveContext(jsc, False) - return cls(sparkContext, jtestHive) - - def refreshTable(self, tableName): - """Invalidate and refresh all the cached the metadata of the given - table. For performance reasons, Spark SQL or the external data source - library it uses might cache certain metadata about a table, such as the - location of blocks. When those change outside of Spark SQL, users should - call this function to invalidate the cache. - """ - self._ssql_ctx.refreshTable(tableName) - - -def _test(): - import os - import doctest - import tempfile - from pyspark.context import SparkContext - from pyspark.sql import Row, SQLContext - import pyspark.sql.context - - os.chdir(os.environ["SPARK_HOME"]) - - globs = pyspark.sql.context.__dict__.copy() - sc = SparkContext('local[4]', 'PythonTest') - globs['tempfile'] = tempfile - globs['os'] = os - globs['sc'] = sc - globs['sqlContext'] = SQLContext(sc) - globs['rdd'] = rdd = sc.parallelize( - [Row(field1=1, field2="row1"), - Row(field1=2, field2="row2"), - Row(field1=3, field2="row3")] - ) - globs['df'] = rdd.toDF() - jsonStrings = [ - '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', - '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' - '"field6":[{"field7": "row2"}]}', - '{"field1" : null, "field2": "row3", ' - '"field3":{"field4":33, "field5": []}}' - ] - globs['jsonStrings'] = jsonStrings - globs['json'] = sc.parallelize(jsonStrings) - (failure_count, test_count) = doctest.testmod( - pyspark.sql.context, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) - globs['sc'].stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/dataframe.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/dataframe.py deleted file mode 100644 index 1affc9b..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/dataframe.py +++ /dev/null @@ -1,2327 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import sys -import random - -if sys.version >= '3': - basestring = unicode = str - long = int - from functools import reduce -else: - from itertools import imap as map - -import warnings - -from pyspark import copy_func, since, _NoValue -from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix -from pyspark.serializers import ArrowStreamSerializer, BatchedSerializer, PickleSerializer, \ - UTF8Deserializer -from pyspark.storagelevel import StorageLevel -from pyspark.traceback_utils import SCCallSiteSync -from pyspark.sql.types import _parse_datatype_json_string -from pyspark.sql.column import Column, _to_seq, _to_list, _to_java_column -from pyspark.sql.readwriter import DataFrameWriter -from pyspark.sql.streaming import DataStreamWriter -from pyspark.sql.types import IntegralType -from pyspark.sql.types import * -from pyspark.util import _exception_message - -__all__ = ["DataFrame", "DataFrameNaFunctions", "DataFrameStatFunctions"] - - -class DataFrame(object): - """A distributed collection of data grouped into named columns. - - A :class:`DataFrame` is equivalent to a relational table in Spark SQL, - and can be created using various functions in :class:`SparkSession`:: - - people = spark.read.parquet("...") - - Once created, it can be manipulated using the various domain-specific-language - (DSL) functions defined in: :class:`DataFrame`, :class:`Column`. - - To select a column from the data frame, use the apply method:: - - ageCol = people.age - - A more concrete example:: - - # To create DataFrame using SparkSession - people = spark.read.parquet("...") - department = spark.read.parquet("...") - - people.filter(people.age > 30).join(department, people.deptId == department.id) \\ - .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"}) - - .. versionadded:: 1.3 - """ - - def __init__(self, jdf, sql_ctx): - self._jdf = jdf - self.sql_ctx = sql_ctx - self._sc = sql_ctx and sql_ctx._sc - self.is_cached = False - self._schema = None # initialized lazily - self._lazy_rdd = None - # Check whether _repr_html is supported or not, we use it to avoid calling _jdf twice - # by __repr__ and _repr_html_ while eager evaluation opened. - self._support_repr_html = False - - @property - @since(1.3) - def rdd(self): - """Returns the content as an :class:`pyspark.RDD` of :class:`Row`. - """ - if self._lazy_rdd is None: - jrdd = self._jdf.javaToPython() - self._lazy_rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer())) - return self._lazy_rdd - - @property - @since("1.3.1") - def na(self): - """Returns a :class:`DataFrameNaFunctions` for handling missing values. - """ - return DataFrameNaFunctions(self) - - @property - @since(1.4) - def stat(self): - """Returns a :class:`DataFrameStatFunctions` for statistic functions. - """ - return DataFrameStatFunctions(self) - - @ignore_unicode_prefix - @since(1.3) - def toJSON(self, use_unicode=True): - """Converts a :class:`DataFrame` into a :class:`RDD` of string. - - Each row is turned into a JSON document as one element in the returned RDD. - - >>> df.toJSON().first() - u'{"age":2,"name":"Alice"}' - """ - rdd = self._jdf.toJSON() - return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode)) - - @since(1.3) - def registerTempTable(self, name): - """Registers this DataFrame as a temporary table using the given name. - - The lifetime of this temporary table is tied to the :class:`SparkSession` - that was used to create this :class:`DataFrame`. 
- - >>> df.registerTempTable("people") - >>> df2 = spark.sql("select * from people") - >>> sorted(df.collect()) == sorted(df2.collect()) - True - >>> spark.catalog.dropTempView("people") - - .. note:: Deprecated in 2.0, use createOrReplaceTempView instead. - """ - warnings.warn( - "Deprecated in 2.0, use createOrReplaceTempView instead.", DeprecationWarning) - self._jdf.createOrReplaceTempView(name) - - @since(2.0) - def createTempView(self, name): - """Creates a local temporary view with this DataFrame. - - The lifetime of this temporary table is tied to the :class:`SparkSession` - that was used to create this :class:`DataFrame`. - throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the - catalog. - - >>> df.createTempView("people") - >>> df2 = spark.sql("select * from people") - >>> sorted(df.collect()) == sorted(df2.collect()) - True - >>> df.createTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - AnalysisException: u"Temporary table 'people' already exists;" - >>> spark.catalog.dropTempView("people") - - """ - self._jdf.createTempView(name) - - @since(2.0) - def createOrReplaceTempView(self, name): - """Creates or replaces a local temporary view with this DataFrame. - - The lifetime of this temporary table is tied to the :class:`SparkSession` - that was used to create this :class:`DataFrame`. - - >>> df.createOrReplaceTempView("people") - >>> df2 = df.filter(df.age > 3) - >>> df2.createOrReplaceTempView("people") - >>> df3 = spark.sql("select * from people") - >>> sorted(df3.collect()) == sorted(df2.collect()) - True - >>> spark.catalog.dropTempView("people") - - """ - self._jdf.createOrReplaceTempView(name) - - @since(2.1) - def createGlobalTempView(self, name): - """Creates a global temporary view with this DataFrame. - - The lifetime of this temporary view is tied to this Spark application. - throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the - catalog. - - >>> df.createGlobalTempView("people") - >>> df2 = spark.sql("select * from global_temp.people") - >>> sorted(df.collect()) == sorted(df2.collect()) - True - >>> df.createGlobalTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - AnalysisException: u"Temporary table 'people' already exists;" - >>> spark.catalog.dropGlobalTempView("people") - - """ - self._jdf.createGlobalTempView(name) - - @since(2.2) - def createOrReplaceGlobalTempView(self, name): - """Creates or replaces a global temporary view using the given name. - - The lifetime of this temporary view is tied to this Spark application. - - >>> df.createOrReplaceGlobalTempView("people") - >>> df2 = df.filter(df.age > 3) - >>> df2.createOrReplaceGlobalTempView("people") - >>> df3 = spark.sql("select * from global_temp.people") - >>> sorted(df3.collect()) == sorted(df2.collect()) - True - >>> spark.catalog.dropGlobalTempView("people") - - """ - self._jdf.createOrReplaceGlobalTempView(name) - - @property - @since(1.4) - def write(self): - """ - Interface for saving the content of the non-streaming :class:`DataFrame` out into external - storage. - - :return: :class:`DataFrameWriter` - """ - return DataFrameWriter(self) - - @property - @since(2.0) - def writeStream(self): - """ - Interface for saving the content of the streaming :class:`DataFrame` out into external - storage. - - .. note:: Evolving. 
- - :return: :class:`DataStreamWriter` - """ - return DataStreamWriter(self) - - @property - @since(1.3) - def schema(self): - """Returns the schema of this :class:`DataFrame` as a :class:`pyspark.sql.types.StructType`. - - >>> df.schema - StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true))) - """ - if self._schema is None: - try: - self._schema = _parse_datatype_json_string(self._jdf.schema().json()) - except AttributeError as e: - raise Exception( - "Unable to parse datatype from schema. %s" % e) - return self._schema - - @since(1.3) - def printSchema(self): - """Prints out the schema in the tree format. - - >>> df.printSchema() - root - |-- age: integer (nullable = true) - |-- name: string (nullable = true) - - """ - print(self._jdf.schema().treeString()) - - @since(1.3) - def explain(self, extended=False): - """Prints the (logical and physical) plans to the console for debugging purpose. - - :param extended: boolean, default ``False``. If ``False``, prints only the physical plan. - - >>> df.explain() - == Physical Plan == - Scan ExistingRDD[age#0,name#1] - - >>> df.explain(True) - == Parsed Logical Plan == - ... - == Analyzed Logical Plan == - ... - == Optimized Logical Plan == - ... - == Physical Plan == - ... - """ - if extended: - print(self._jdf.queryExecution().toString()) - else: - print(self._jdf.queryExecution().simpleString()) - - @since(2.4) - def exceptAll(self, other): - """Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but - not in another :class:`DataFrame` while preserving duplicates. - - This is equivalent to `EXCEPT ALL` in SQL. - - >>> df1 = spark.createDataFrame( - ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"]) - >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"]) - - >>> df1.exceptAll(df2).show() - +---+---+ - | C1| C2| - +---+---+ - | a| 1| - | a| 1| - | a| 2| - | c| 4| - +---+---+ - - Also as standard in SQL, this function resolves columns by position (not by name). - """ - return DataFrame(self._jdf.exceptAll(other._jdf), self.sql_ctx) - - @since(1.3) - def isLocal(self): - """Returns ``True`` if the :func:`collect` and :func:`take` methods can be run locally - (without any Spark executors). - """ - return self._jdf.isLocal() - - @property - @since(2.0) - def isStreaming(self): - """Returns true if this :class:`Dataset` contains one or more sources that continuously - return data as it arrives. A :class:`Dataset` that reads data from a streaming source - must be executed as a :class:`StreamingQuery` using the :func:`start` method in - :class:`DataStreamWriter`. Methods that return a single answer, (e.g., :func:`count` or - :func:`collect`) will throw an :class:`AnalysisException` when there is a streaming - source present. - - .. note:: Evolving - """ - return self._jdf.isStreaming() - - @since(1.3) - def show(self, n=20, truncate=True, vertical=False): - """Prints the first ``n`` rows to the console. - - :param n: Number of rows to show. - :param truncate: If set to True, truncate strings longer than 20 chars by default. - If set to a number greater than one, truncates long strings to length ``truncate`` - and align cells right. - :param vertical: If set to True, print output rows vertically (one line - per column value). 
- - >>> df - DataFrame[age: int, name: string] - >>> df.show() - +---+-----+ - |age| name| - +---+-----+ - | 2|Alice| - | 5| Bob| - +---+-----+ - >>> df.show(truncate=3) - +---+----+ - |age|name| - +---+----+ - | 2| Ali| - | 5| Bob| - +---+----+ - >>> df.show(vertical=True) - -RECORD 0----- - age | 2 - name | Alice - -RECORD 1----- - age | 5 - name | Bob - """ - if isinstance(truncate, bool) and truncate: - print(self._jdf.showString(n, 20, vertical)) - else: - print(self._jdf.showString(n, int(truncate), vertical)) - - def __repr__(self): - if not self._support_repr_html and self.sql_ctx._conf.isReplEagerEvalEnabled(): - vertical = False - return self._jdf.showString( - self.sql_ctx._conf.replEagerEvalMaxNumRows(), - self.sql_ctx._conf.replEagerEvalTruncate(), vertical) - else: - return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes)) - - def _repr_html_(self): - """Returns a dataframe with html code when you enabled eager evaluation - by 'spark.sql.repl.eagerEval.enabled', this only called by REPL you are - using support eager evaluation with HTML. - """ - import cgi - if not self._support_repr_html: - self._support_repr_html = True - if self.sql_ctx._conf.isReplEagerEvalEnabled(): - max_num_rows = max(self.sql_ctx._conf.replEagerEvalMaxNumRows(), 0) - sock_info = self._jdf.getRowsToPython( - max_num_rows, self.sql_ctx._conf.replEagerEvalTruncate()) - rows = list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer()))) - head = rows[0] - row_data = rows[1:] - has_more_data = len(row_data) > max_num_rows - row_data = row_data[:max_num_rows] - - html = "<table border='1'>\n" - # generate table head - html += "<tr><th>%s</th></tr>\n" % "</th><th>".join(map(lambda x: cgi.escape(x), head)) - # generate table rows - for row in row_data: - html += "<tr><td>%s</td></tr>\n" % "</td><td>".join( - map(lambda x: cgi.escape(x), row)) - html += "</table>
      \n" - if has_more_data: - html += "only showing top %d %s\n" % ( - max_num_rows, "row" if max_num_rows == 1 else "rows") - return html - else: - return None - - @since(2.1) - def checkpoint(self, eager=True): - """Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the - logical plan of this DataFrame, which is especially useful in iterative algorithms where the - plan may grow exponentially. It will be saved to files inside the checkpoint - directory set with L{SparkContext.setCheckpointDir()}. - - :param eager: Whether to checkpoint this DataFrame immediately - - .. note:: Experimental - """ - jdf = self._jdf.checkpoint(eager) - return DataFrame(jdf, self.sql_ctx) - - @since(2.3) - def localCheckpoint(self, eager=True): - """Returns a locally checkpointed version of this Dataset. Checkpointing can be used to - truncate the logical plan of this DataFrame, which is especially useful in iterative - algorithms where the plan may grow exponentially. Local checkpoints are stored in the - executors using the caching subsystem and therefore they are not reliable. - - :param eager: Whether to checkpoint this DataFrame immediately - - .. note:: Experimental - """ - jdf = self._jdf.localCheckpoint(eager) - return DataFrame(jdf, self.sql_ctx) - - @since(2.1) - def withWatermark(self, eventTime, delayThreshold): - """Defines an event time watermark for this :class:`DataFrame`. A watermark tracks a point - in time before which we assume no more late data is going to arrive. - - Spark will use this watermark for several purposes: - - To know when a given time window aggregation can be finalized and thus can be emitted - when using output modes that do not allow updates. - - - To minimize the amount of state that we need to keep for on-going aggregations. - - The current watermark is computed by looking at the `MAX(eventTime)` seen across - all of the partitions in the query minus a user specified `delayThreshold`. Due to the cost - of coordinating this value across partitions, the actual watermark used is only guaranteed - to be at least `delayThreshold` behind the actual event time. In some cases we may still - process records that arrive more than `delayThreshold` late. - - :param eventTime: the name of the column that contains the event time of the row. - :param delayThreshold: the minimum delay to wait to data to arrive late, relative to the - latest record that has been processed in the form of an interval - (e.g. "1 minute" or "5 hours"). - - .. note:: Evolving - - >>> sdf.select('name', sdf.time.cast('timestamp')).withWatermark('time', '10 minutes') - DataFrame[name: string, time: timestamp] - """ - if not eventTime or type(eventTime) is not str: - raise TypeError("eventTime should be provided as a string") - if not delayThreshold or type(delayThreshold) is not str: - raise TypeError("delayThreshold should be provided as a string interval") - jdf = self._jdf.withWatermark(eventTime, delayThreshold) - return DataFrame(jdf, self.sql_ctx) - - @since(2.2) - def hint(self, name, *parameters): - """Specifies some hint on the current DataFrame. - - :param name: A name of the hint. - :param parameters: Optional parameters. 
- :return: :class:`DataFrame` - - >>> df.join(df2.hint("broadcast"), "name").show() - +----+---+------+ - |name|age|height| - +----+---+------+ - | Bob| 5| 85| - +----+---+------+ - """ - if len(parameters) == 1 and isinstance(parameters[0], list): - parameters = parameters[0] - - if not isinstance(name, str): - raise TypeError("name should be provided as str, got {0}".format(type(name))) - - for p in parameters: - if not isinstance(p, str): - raise TypeError( - "all parameters should be str, got {0} of type {1}".format(p, type(p))) - - jdf = self._jdf.hint(name, self._jseq(parameters)) - return DataFrame(jdf, self.sql_ctx) - - @since(1.3) - def count(self): - """Returns the number of rows in this :class:`DataFrame`. - - >>> df.count() - 2 - """ - return int(self._jdf.count()) - - @ignore_unicode_prefix - @since(1.3) - def collect(self): - """Returns all the records as a list of :class:`Row`. - - >>> df.collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - """ - with SCCallSiteSync(self._sc) as css: - sock_info = self._jdf.collectToPython() - return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer()))) - - @ignore_unicode_prefix - @since(2.0) - def toLocalIterator(self): - """ - Returns an iterator that contains all of the rows in this :class:`DataFrame`. - The iterator will consume as much memory as the largest partition in this DataFrame. - - >>> list(df.toLocalIterator()) - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - """ - with SCCallSiteSync(self._sc) as css: - sock_info = self._jdf.toPythonIterator() - return _load_from_socket(sock_info, BatchedSerializer(PickleSerializer())) - - @ignore_unicode_prefix - @since(1.3) - def limit(self, num): - """Limits the result count to the number specified. - - >>> df.limit(1).collect() - [Row(age=2, name=u'Alice')] - >>> df.limit(0).collect() - [] - """ - jdf = self._jdf.limit(num) - return DataFrame(jdf, self.sql_ctx) - - @ignore_unicode_prefix - @since(1.3) - def take(self, num): - """Returns the first ``num`` rows as a :class:`list` of :class:`Row`. - - >>> df.take(2) - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - """ - return self.limit(num).collect() - - @since(1.3) - def foreach(self, f): - """Applies the ``f`` function to all :class:`Row` of this :class:`DataFrame`. - - This is a shorthand for ``df.rdd.foreach()``. - - >>> def f(person): - ... print(person.name) - >>> df.foreach(f) - """ - self.rdd.foreach(f) - - @since(1.3) - def foreachPartition(self, f): - """Applies the ``f`` function to each partition of this :class:`DataFrame`. - - This a shorthand for ``df.rdd.foreachPartition()``. - - >>> def f(people): - ... for person in people: - ... print(person.name) - >>> df.foreachPartition(f) - """ - self.rdd.foreachPartition(f) - - @since(1.3) - def cache(self): - """Persists the :class:`DataFrame` with the default storage level (C{MEMORY_AND_DISK}). - - .. note:: The default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. - """ - self.is_cached = True - self._jdf.cache() - return self - - @since(1.3) - def persist(self, storageLevel=StorageLevel.MEMORY_AND_DISK): - """Sets the storage level to persist the contents of the :class:`DataFrame` across - operations after the first time it is computed. This can only be used to assign - a new storage level if the :class:`DataFrame` does not have a storage level set yet. - If no storage level is specified defaults to (C{MEMORY_AND_DISK}). - - .. 
note:: The default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. - """ - self.is_cached = True - javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) - self._jdf.persist(javaStorageLevel) - return self - - @property - @since(2.1) - def storageLevel(self): - """Get the :class:`DataFrame`'s current storage level. - - >>> df.storageLevel - StorageLevel(False, False, False, False, 1) - >>> df.cache().storageLevel - StorageLevel(True, True, False, True, 1) - >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel - StorageLevel(True, False, False, False, 2) - """ - java_storage_level = self._jdf.storageLevel() - storage_level = StorageLevel(java_storage_level.useDisk(), - java_storage_level.useMemory(), - java_storage_level.useOffHeap(), - java_storage_level.deserialized(), - java_storage_level.replication()) - return storage_level - - @since(1.3) - def unpersist(self, blocking=False): - """Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from - memory and disk. - - .. note:: `blocking` default has changed to False to match Scala in 2.0. - """ - self.is_cached = False - self._jdf.unpersist(blocking) - return self - - @since(1.4) - def coalesce(self, numPartitions): - """ - Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions. - - :param numPartitions: int, to specify the target number of partitions - - Similar to coalesce defined on an :class:`RDD`, this operation results in a - narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, - there will not be a shuffle, instead each of the 100 new partitions will - claim 10 of the current partitions. If a larger number of partitions is requested, - it will stay at the current number of partitions. - - However, if you're doing a drastic coalesce, e.g. to numPartitions = 1, - this may result in your computation taking place on fewer nodes than - you like (e.g. one node in the case of numPartitions = 1). To avoid this, - you can call repartition(). This will add a shuffle step, but means the - current upstream partitions will be executed in parallel (per whatever - the current partitioning is). - - >>> df.coalesce(1).rdd.getNumPartitions() - 1 - """ - return DataFrame(self._jdf.coalesce(numPartitions), self.sql_ctx) - - @since(1.3) - def repartition(self, numPartitions, *cols): - """ - Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The - resulting DataFrame is hash partitioned. - - :param numPartitions: - can be an int to specify the target number of partitions or a Column. - If it is a Column, it will be used as the first partitioning column. If not specified, - the default number of partitions is used. - - .. versionchanged:: 1.6 - Added optional arguments to specify the partitioning columns. Also made numPartitions - optional if partitioning columns are specified. 
- - >>> df.repartition(10).rdd.getNumPartitions() - 10 - >>> data = df.union(df).repartition("age") - >>> data.show() - +---+-----+ - |age| name| - +---+-----+ - | 5| Bob| - | 5| Bob| - | 2|Alice| - | 2|Alice| - +---+-----+ - >>> data = data.repartition(7, "age") - >>> data.show() - +---+-----+ - |age| name| - +---+-----+ - | 2|Alice| - | 5| Bob| - | 2|Alice| - | 5| Bob| - +---+-----+ - >>> data.rdd.getNumPartitions() - 7 - >>> data = data.repartition("name", "age") - >>> data.show() - +---+-----+ - |age| name| - +---+-----+ - | 5| Bob| - | 5| Bob| - | 2|Alice| - | 2|Alice| - +---+-----+ - """ - if isinstance(numPartitions, int): - if len(cols) == 0: - return DataFrame(self._jdf.repartition(numPartitions), self.sql_ctx) - else: - return DataFrame( - self._jdf.repartition(numPartitions, self._jcols(*cols)), self.sql_ctx) - elif isinstance(numPartitions, (basestring, Column)): - cols = (numPartitions, ) + cols - return DataFrame(self._jdf.repartition(self._jcols(*cols)), self.sql_ctx) - else: - raise TypeError("numPartitions should be an int or Column") - - @since("2.4.0") - def repartitionByRange(self, numPartitions, *cols): - """ - Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The - resulting DataFrame is range partitioned. - - :param numPartitions: - can be an int to specify the target number of partitions or a Column. - If it is a Column, it will be used as the first partitioning column. If not specified, - the default number of partitions is used. - - At least one partition-by expression must be specified. - When no explicit sort order is specified, "ascending nulls first" is assumed. - - >>> df.repartitionByRange(2, "age").rdd.getNumPartitions() - 2 - >>> df.show() - +---+-----+ - |age| name| - +---+-----+ - | 2|Alice| - | 5| Bob| - +---+-----+ - >>> df.repartitionByRange(1, "age").rdd.getNumPartitions() - 1 - >>> data = df.repartitionByRange("age") - >>> df.show() - +---+-----+ - |age| name| - +---+-----+ - | 2|Alice| - | 5| Bob| - +---+-----+ - """ - if isinstance(numPartitions, int): - if len(cols) == 0: - return ValueError("At least one partition-by expression must be specified.") - else: - return DataFrame( - self._jdf.repartitionByRange(numPartitions, self._jcols(*cols)), self.sql_ctx) - elif isinstance(numPartitions, (basestring, Column)): - cols = (numPartitions,) + cols - return DataFrame(self._jdf.repartitionByRange(self._jcols(*cols)), self.sql_ctx) - else: - raise TypeError("numPartitions should be an int, string or Column") - - @since(1.3) - def distinct(self): - """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`. - - >>> df.distinct().count() - 2 - """ - return DataFrame(self._jdf.distinct(), self.sql_ctx) - - @since(1.3) - def sample(self, withReplacement=None, fraction=None, seed=None): - """Returns a sampled subset of this :class:`DataFrame`. - - :param withReplacement: Sample with replacement or not (default False). - :param fraction: Fraction of rows to generate, range [0.0, 1.0]. - :param seed: Seed for sampling (default a random seed). - - .. note:: This is not guaranteed to provide exactly the fraction specified of the total - count of the given :class:`DataFrame`. - - .. note:: `fraction` is required and, `withReplacement` and `seed` are optional. 
- - >>> df = spark.range(10) - >>> df.sample(0.5, 3).count() - 4 - >>> df.sample(fraction=0.5, seed=3).count() - 4 - >>> df.sample(withReplacement=True, fraction=0.5, seed=3).count() - 1 - >>> df.sample(1.0).count() - 10 - >>> df.sample(fraction=1.0).count() - 10 - >>> df.sample(False, fraction=1.0).count() - 10 - """ - - # For the cases below: - # sample(True, 0.5 [, seed]) - # sample(True, fraction=0.5 [, seed]) - # sample(withReplacement=False, fraction=0.5 [, seed]) - is_withReplacement_set = \ - type(withReplacement) == bool and isinstance(fraction, float) - - # For the case below: - # sample(faction=0.5 [, seed]) - is_withReplacement_omitted_kwargs = \ - withReplacement is None and isinstance(fraction, float) - - # For the case below: - # sample(0.5 [, seed]) - is_withReplacement_omitted_args = isinstance(withReplacement, float) - - if not (is_withReplacement_set - or is_withReplacement_omitted_kwargs - or is_withReplacement_omitted_args): - argtypes = [ - str(type(arg)) for arg in [withReplacement, fraction, seed] if arg is not None] - raise TypeError( - "withReplacement (optional), fraction (required) and seed (optional)" - " should be a bool, float and number; however, " - "got [%s]." % ", ".join(argtypes)) - - if is_withReplacement_omitted_args: - if fraction is not None: - seed = fraction - fraction = withReplacement - withReplacement = None - - seed = long(seed) if seed is not None else None - args = [arg for arg in [withReplacement, fraction, seed] if arg is not None] - jdf = self._jdf.sample(*args) - return DataFrame(jdf, self.sql_ctx) - - @since(1.5) - def sampleBy(self, col, fractions, seed=None): - """ - Returns a stratified sample without replacement based on the - fraction given on each stratum. - - :param col: column that defines strata - :param fractions: - sampling fraction for each stratum. If a stratum is not - specified, we treat its fraction as zero. - :param seed: random seed - :return: a new DataFrame that represents the stratified sample - - >>> from pyspark.sql.functions import col - >>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key")) - >>> sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0) - >>> sampled.groupBy("key").count().orderBy("key").show() - +---+-----+ - |key|count| - +---+-----+ - | 0| 5| - | 1| 9| - +---+-----+ - - """ - if not isinstance(col, basestring): - raise ValueError("col must be a string, but got %r" % type(col)) - if not isinstance(fractions, dict): - raise ValueError("fractions must be a dict but got %r" % type(fractions)) - for k, v in fractions.items(): - if not isinstance(k, (float, int, long, basestring)): - raise ValueError("key must be float, int, long, or string, but got %r" % type(k)) - fractions[k] = float(v) - seed = seed if seed is not None else random.randint(0, sys.maxsize) - return DataFrame(self._jdf.stat().sampleBy(col, self._jmap(fractions), seed), self.sql_ctx) - - @since(1.4) - def randomSplit(self, weights, seed=None): - """Randomly splits this :class:`DataFrame` with the provided weights. - - :param weights: list of doubles as weights with which to split the DataFrame. Weights will - be normalized if they don't sum up to 1.0. - :param seed: The seed for sampling. - - >>> splits = df4.randomSplit([1.0, 2.0], 24) - >>> splits[0].count() - 1 - - >>> splits[1].count() - 3 - """ - for w in weights: - if w < 0.0: - raise ValueError("Weights must be positive. 
Found weight value: %s" % w) - seed = seed if seed is not None else random.randint(0, sys.maxsize) - rdd_array = self._jdf.randomSplit(_to_list(self.sql_ctx._sc, weights), long(seed)) - return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array] - - @property - @since(1.3) - def dtypes(self): - """Returns all column names and their data types as a list. - - >>> df.dtypes - [('age', 'int'), ('name', 'string')] - """ - return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] - - @property - @since(1.3) - def columns(self): - """Returns all column names as a list. - - >>> df.columns - ['age', 'name'] - """ - return [f.name for f in self.schema.fields] - - @since(2.3) - def colRegex(self, colName): - """ - Selects column based on the column name specified as a regex and returns it - as :class:`Column`. - - :param colName: string, column name specified as a regex. - - >>> df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["Col1", "Col2"]) - >>> df.select(df.colRegex("`(Col1)?+.+`")).show() - +----+ - |Col2| - +----+ - | 1| - | 2| - | 3| - +----+ - """ - if not isinstance(colName, basestring): - raise ValueError("colName should be provided as string") - jc = self._jdf.colRegex(colName) - return Column(jc) - - @ignore_unicode_prefix - @since(1.3) - def alias(self, alias): - """Returns a new :class:`DataFrame` with an alias set. - - :param alias: string, an alias name to be set for the DataFrame. - - >>> from pyspark.sql.functions import * - >>> df_as1 = df.alias("df_as1") - >>> df_as2 = df.alias("df_as2") - >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner') - >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age").collect() - [Row(name=u'Bob', name=u'Bob', age=5), Row(name=u'Alice', name=u'Alice', age=2)] - """ - assert isinstance(alias, basestring), "alias should be a string" - return DataFrame(getattr(self._jdf, "as")(alias), self.sql_ctx) - - @ignore_unicode_prefix - @since(2.1) - def crossJoin(self, other): - """Returns the cartesian product with another :class:`DataFrame`. - - :param other: Right side of the cartesian product. - - >>> df.select("age", "name").collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - >>> df2.select("name", "height").collect() - [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85)] - >>> df.crossJoin(df2.select("height")).select("age", "name", "height").collect() - [Row(age=2, name=u'Alice', height=80), Row(age=2, name=u'Alice', height=85), - Row(age=5, name=u'Bob', height=80), Row(age=5, name=u'Bob', height=85)] - """ - - jdf = self._jdf.crossJoin(other._jdf) - return DataFrame(jdf, self.sql_ctx) - - @ignore_unicode_prefix - @since(1.3) - def join(self, other, on=None, how=None): - """Joins with another :class:`DataFrame`, using the given join expression. - - :param other: Right side of the join - :param on: a string for the join column name, a list of column names, - a join expression (Column), or a list of Columns. - If `on` is a string or a list of strings indicating the name of the join column(s), - the column(s) must exist on both sides, and this performs an equi-join. - :param how: str, default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``, - ``full``, ``full_outer``, ``left``, ``left_outer``, ``right``, ``right_outer``, - ``left_semi``, and ``left_anti``. - - The following performs a full outer join between ``df1`` and ``df2``. 
- - >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).collect() - [Row(name=None, height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)] - - >>> df.join(df2, 'name', 'outer').select('name', 'height').collect() - [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)] - - >>> cond = [df.name == df3.name, df.age == df3.age] - >>> df.join(df3, cond, 'outer').select(df.name, df3.age).collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] - - >>> df.join(df2, 'name').select(df.name, df2.height).collect() - [Row(name=u'Bob', height=85)] - - >>> df.join(df4, ['name', 'age']).select(df.name, df.age).collect() - [Row(name=u'Bob', age=5)] - """ - - if on is not None and not isinstance(on, list): - on = [on] - - if on is not None: - if isinstance(on[0], basestring): - on = self._jseq(on) - else: - assert isinstance(on[0], Column), "on should be Column or list of Column" - on = reduce(lambda x, y: x.__and__(y), on) - on = on._jc - - if on is None and how is None: - jdf = self._jdf.join(other._jdf) - else: - if how is None: - how = "inner" - if on is None: - on = self._jseq([]) - assert isinstance(how, basestring), "how should be basestring" - jdf = self._jdf.join(other._jdf, on, how) - return DataFrame(jdf, self.sql_ctx) - - @since(1.6) - def sortWithinPartitions(self, *cols, **kwargs): - """Returns a new :class:`DataFrame` with each partition sorted by the specified column(s). - - :param cols: list of :class:`Column` or column names to sort by. - :param ascending: boolean or list of boolean (default True). - Sort ascending vs. descending. Specify list for multiple sort orders. - If a list is specified, length of the list must equal length of the `cols`. - - >>> df.sortWithinPartitions("age", ascending=False).show() - +---+-----+ - |age| name| - +---+-----+ - | 2|Alice| - | 5| Bob| - +---+-----+ - """ - jdf = self._jdf.sortWithinPartitions(self._sort_cols(cols, kwargs)) - return DataFrame(jdf, self.sql_ctx) - - @ignore_unicode_prefix - @since(1.3) - def sort(self, *cols, **kwargs): - """Returns a new :class:`DataFrame` sorted by the specified column(s). - - :param cols: list of :class:`Column` or column names to sort by. - :param ascending: boolean or list of boolean (default True). - Sort ascending vs. descending. Specify list for multiple sort orders. - If a list is specified, length of the list must equal length of the `cols`. 
- - >>> df.sort(df.age.desc()).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] - >>> df.sort("age", ascending=False).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] - >>> df.orderBy(df.age.desc()).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] - >>> from pyspark.sql.functions import * - >>> df.sort(asc("age")).collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - >>> df.orderBy(desc("age"), "name").collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] - >>> df.orderBy(["age", "name"], ascending=[0, 1]).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] - """ - jdf = self._jdf.sort(self._sort_cols(cols, kwargs)) - return DataFrame(jdf, self.sql_ctx) - - orderBy = sort - - def _jseq(self, cols, converter=None): - """Return a JVM Seq of Columns from a list of Column or names""" - return _to_seq(self.sql_ctx._sc, cols, converter) - - def _jmap(self, jm): - """Return a JVM Scala Map from a dict""" - return _to_scala_map(self.sql_ctx._sc, jm) - - def _jcols(self, *cols): - """Return a JVM Seq of Columns from a list of Column or column names - - If `cols` has only one list in it, cols[0] will be used as the list. - """ - if len(cols) == 1 and isinstance(cols[0], list): - cols = cols[0] - return self._jseq(cols, _to_java_column) - - def _sort_cols(self, cols, kwargs): - """ Return a JVM Seq of Columns that describes the sort order - """ - if not cols: - raise ValueError("should sort by at least one column") - if len(cols) == 1 and isinstance(cols[0], list): - cols = cols[0] - jcols = [_to_java_column(c) for c in cols] - ascending = kwargs.get('ascending', True) - if isinstance(ascending, (bool, int)): - if not ascending: - jcols = [jc.desc() for jc in jcols] - elif isinstance(ascending, list): - jcols = [jc if asc else jc.desc() - for asc, jc in zip(ascending, jcols)] - else: - raise TypeError("ascending can only be boolean or list, but got %s" % type(ascending)) - return self._jseq(jcols) - - @since("1.3.1") - def describe(self, *cols): - """Computes basic statistics for numeric and string columns. - - This include count, mean, stddev, min, and max. If no columns are - given, this function computes statistics for all numerical or string columns. - - .. note:: This function is meant for exploratory data analysis, as we make no - guarantee about the backward compatibility of the schema of the resulting DataFrame. - - >>> df.describe(['age']).show() - +-------+------------------+ - |summary| age| - +-------+------------------+ - | count| 2| - | mean| 3.5| - | stddev|2.1213203435596424| - | min| 2| - | max| 5| - +-------+------------------+ - >>> df.describe().show() - +-------+------------------+-----+ - |summary| age| name| - +-------+------------------+-----+ - | count| 2| 2| - | mean| 3.5| null| - | stddev|2.1213203435596424| null| - | min| 2|Alice| - | max| 5| Bob| - +-------+------------------+-----+ - - Use summary for expanded statistics and control over which statistics to compute. - """ - if len(cols) == 1 and isinstance(cols[0], list): - cols = cols[0] - jdf = self._jdf.describe(self._jseq(cols)) - return DataFrame(jdf, self.sql_ctx) - - @since("2.3.0") - def summary(self, *statistics): - """Computes specified statistics for numeric and string columns. 
Available statistics are: - - count - - mean - - stddev - - min - - max - - arbitrary approximate percentiles specified as a percentage (eg, 75%) - - If no statistics are given, this function computes count, mean, stddev, min, - approximate quartiles (percentiles at 25%, 50%, and 75%), and max. - - .. note:: This function is meant for exploratory data analysis, as we make no - guarantee about the backward compatibility of the schema of the resulting DataFrame. - - >>> df.summary().show() - +-------+------------------+-----+ - |summary| age| name| - +-------+------------------+-----+ - | count| 2| 2| - | mean| 3.5| null| - | stddev|2.1213203435596424| null| - | min| 2|Alice| - | 25%| 2| null| - | 50%| 2| null| - | 75%| 5| null| - | max| 5| Bob| - +-------+------------------+-----+ - - >>> df.summary("count", "min", "25%", "75%", "max").show() - +-------+---+-----+ - |summary|age| name| - +-------+---+-----+ - | count| 2| 2| - | min| 2|Alice| - | 25%| 2| null| - | 75%| 5| null| - | max| 5| Bob| - +-------+---+-----+ - - To do a summary for specific columns first select them: - - >>> df.select("age", "name").summary("count").show() - +-------+---+----+ - |summary|age|name| - +-------+---+----+ - | count| 2| 2| - +-------+---+----+ - - See also describe for basic statistics. - """ - if len(statistics) == 1 and isinstance(statistics[0], list): - statistics = statistics[0] - jdf = self._jdf.summary(self._jseq(statistics)) - return DataFrame(jdf, self.sql_ctx) - - @ignore_unicode_prefix - @since(1.3) - def head(self, n=None): - """Returns the first ``n`` rows. - - .. note:: This method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. - - :param n: int, default 1. Number of rows to return. - :return: If n is greater than 1, return a list of :class:`Row`. - If n is 1, return a single Row. - - >>> df.head() - Row(age=2, name=u'Alice') - >>> df.head(1) - [Row(age=2, name=u'Alice')] - """ - if n is None: - rs = self.head(1) - return rs[0] if rs else None - return self.take(n) - - @ignore_unicode_prefix - @since(1.3) - def first(self): - """Returns the first row as a :class:`Row`. - - >>> df.first() - Row(age=2, name=u'Alice') - """ - return self.head() - - @ignore_unicode_prefix - @since(1.3) - def __getitem__(self, item): - """Returns the column as a :class:`Column`. - - >>> df.select(df['age']).collect() - [Row(age=2), Row(age=5)] - >>> df[ ["name", "age"]].collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] - >>> df[ df.age > 3 ].collect() - [Row(age=5, name=u'Bob')] - >>> df[df[0] > 3].collect() - [Row(age=5, name=u'Bob')] - """ - if isinstance(item, basestring): - jc = self._jdf.apply(item) - return Column(jc) - elif isinstance(item, Column): - return self.filter(item) - elif isinstance(item, (list, tuple)): - return self.select(*item) - elif isinstance(item, int): - jc = self._jdf.apply(self.columns[item]) - return Column(jc) - else: - raise TypeError("unexpected item type: %s" % type(item)) - - @since(1.3) - def __getattr__(self, name): - """Returns the :class:`Column` denoted by ``name``. - - >>> df.select(df.age).collect() - [Row(age=2), Row(age=5)] - """ - if name not in self.columns: - raise AttributeError( - "'%s' object has no attribute '%s'" % (self.__class__.__name__, name)) - jc = self._jdf.apply(name) - return Column(jc) - - @ignore_unicode_prefix - @since(1.3) - def select(self, *cols): - """Projects a set of expressions and returns a new :class:`DataFrame`. 
- - :param cols: list of column names (string) or expressions (:class:`Column`). - If one of the column names is '*', that column is expanded to include all columns - in the current DataFrame. - - >>> df.select('*').collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - >>> df.select('name', 'age').collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] - >>> df.select(df.name, (df.age + 10).alias('age')).collect() - [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)] - """ - jdf = self._jdf.select(self._jcols(*cols)) - return DataFrame(jdf, self.sql_ctx) - - @since(1.3) - def selectExpr(self, *expr): - """Projects a set of SQL expressions and returns a new :class:`DataFrame`. - - This is a variant of :func:`select` that accepts SQL expressions. - - >>> df.selectExpr("age * 2", "abs(age)").collect() - [Row((age * 2)=4, abs(age)=2), Row((age * 2)=10, abs(age)=5)] - """ - if len(expr) == 1 and isinstance(expr[0], list): - expr = expr[0] - jdf = self._jdf.selectExpr(self._jseq(expr)) - return DataFrame(jdf, self.sql_ctx) - - @ignore_unicode_prefix - @since(1.3) - def filter(self, condition): - """Filters rows using the given condition. - - :func:`where` is an alias for :func:`filter`. - - :param condition: a :class:`Column` of :class:`types.BooleanType` - or a string of SQL expression. - - >>> df.filter(df.age > 3).collect() - [Row(age=5, name=u'Bob')] - >>> df.where(df.age == 2).collect() - [Row(age=2, name=u'Alice')] - - >>> df.filter("age > 3").collect() - [Row(age=5, name=u'Bob')] - >>> df.where("age = 2").collect() - [Row(age=2, name=u'Alice')] - """ - if isinstance(condition, basestring): - jdf = self._jdf.filter(condition) - elif isinstance(condition, Column): - jdf = self._jdf.filter(condition._jc) - else: - raise TypeError("condition should be string or Column") - return DataFrame(jdf, self.sql_ctx) - - @ignore_unicode_prefix - @since(1.3) - def groupBy(self, *cols): - """Groups the :class:`DataFrame` using the specified columns, - so we can run aggregation on them. See :class:`GroupedData` - for all the available aggregate functions. - - :func:`groupby` is an alias for :func:`groupBy`. - - :param cols: list of columns to group by. - Each element should be a column name (string) or an expression (:class:`Column`). - - >>> df.groupBy().avg().collect() - [Row(avg(age)=3.5)] - >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect()) - [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)] - >>> sorted(df.groupBy(df.name).avg().collect()) - [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)] - >>> sorted(df.groupBy(['name', df.age]).count().collect()) - [Row(name=u'Alice', age=2, count=1), Row(name=u'Bob', age=5, count=1)] - """ - jgd = self._jdf.groupBy(self._jcols(*cols)) - from pyspark.sql.group import GroupedData - return GroupedData(jgd, self) - - @since(1.4) - def rollup(self, *cols): - """ - Create a multi-dimensional rollup for the current :class:`DataFrame` using - the specified columns, so we can run aggregation on them. 
- - >>> df.rollup("name", df.age).count().orderBy("name", "age").show() - +-----+----+-----+ - | name| age|count| - +-----+----+-----+ - | null|null| 2| - |Alice|null| 1| - |Alice| 2| 1| - | Bob|null| 1| - | Bob| 5| 1| - +-----+----+-----+ - """ - jgd = self._jdf.rollup(self._jcols(*cols)) - from pyspark.sql.group import GroupedData - return GroupedData(jgd, self) - - @since(1.4) - def cube(self, *cols): - """ - Create a multi-dimensional cube for the current :class:`DataFrame` using - the specified columns, so we can run aggregation on them. - - >>> df.cube("name", df.age).count().orderBy("name", "age").show() - +-----+----+-----+ - | name| age|count| - +-----+----+-----+ - | null|null| 2| - | null| 2| 1| - | null| 5| 1| - |Alice|null| 1| - |Alice| 2| 1| - | Bob|null| 1| - | Bob| 5| 1| - +-----+----+-----+ - """ - jgd = self._jdf.cube(self._jcols(*cols)) - from pyspark.sql.group import GroupedData - return GroupedData(jgd, self) - - @since(1.3) - def agg(self, *exprs): - """ Aggregate on the entire :class:`DataFrame` without groups - (shorthand for ``df.groupBy.agg()``). - - >>> df.agg({"age": "max"}).collect() - [Row(max(age)=5)] - >>> from pyspark.sql import functions as F - >>> df.agg(F.min(df.age)).collect() - [Row(min(age)=2)] - """ - return self.groupBy().agg(*exprs) - - @since(2.0) - def union(self, other): - """ Return a new :class:`DataFrame` containing union of rows in this and another frame. - - This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union - (that does deduplication of elements), use this function followed by :func:`distinct`. - - Also as standard in SQL, this function resolves columns by position (not by name). - """ - return DataFrame(self._jdf.union(other._jdf), self.sql_ctx) - - @since(1.3) - def unionAll(self, other): - """ Return a new :class:`DataFrame` containing union of rows in this and another frame. - - This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union - (that does deduplication of elements), use this function followed by :func:`distinct`. - - Also as standard in SQL, this function resolves columns by position (not by name). - - .. note:: Deprecated in 2.0, use :func:`union` instead. - """ - warnings.warn("Deprecated in 2.0, use union instead.", DeprecationWarning) - return self.union(other) - - @since(2.3) - def unionByName(self, other): - """ Returns a new :class:`DataFrame` containing union of rows in this and another frame. - - This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set - union (that does deduplication of elements), use this function followed by :func:`distinct`. - - The difference between this function and :func:`union` is that this function - resolves columns by name (not by position): - - >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"]) - >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"]) - >>> df1.unionByName(df2).show() - +----+----+----+ - |col0|col1|col2| - +----+----+----+ - | 1| 2| 3| - | 6| 4| 5| - +----+----+----+ - """ - return DataFrame(self._jdf.unionByName(other._jdf), self.sql_ctx) - - @since(1.3) - def intersect(self, other): - """ Return a new :class:`DataFrame` containing rows only in - both this frame and another frame. - - This is equivalent to `INTERSECT` in SQL. - """ - return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx) - - @since(2.4) - def intersectAll(self, other): - """ Return a new :class:`DataFrame` containing rows in both this dataframe and other - dataframe while preserving duplicates. 
- - This is equivalent to `INTERSECT ALL` in SQL. - >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"]) - >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"]) - - >>> df1.intersectAll(df2).sort("C1", "C2").show() - +---+---+ - | C1| C2| - +---+---+ - | a| 1| - | a| 1| - | b| 3| - +---+---+ - - Also as standard in SQL, this function resolves columns by position (not by name). - """ - return DataFrame(self._jdf.intersectAll(other._jdf), self.sql_ctx) - - @since(1.3) - def subtract(self, other): - """ Return a new :class:`DataFrame` containing rows in this frame - but not in another frame. - - This is equivalent to `EXCEPT DISTINCT` in SQL. - - """ - return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx) - - @since(1.4) - def dropDuplicates(self, subset=None): - """Return a new :class:`DataFrame` with duplicate rows removed, - optionally only considering certain columns. - - For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming - :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop - duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can - be and system will accordingly limit the state. In addition, too late data older than - watermark will be dropped to avoid any possibility of duplicates. - - :func:`drop_duplicates` is an alias for :func:`dropDuplicates`. - - >>> from pyspark.sql import Row - >>> df = sc.parallelize([ \\ - ... Row(name='Alice', age=5, height=80), \\ - ... Row(name='Alice', age=5, height=80), \\ - ... Row(name='Alice', age=10, height=80)]).toDF() - >>> df.dropDuplicates().show() - +---+------+-----+ - |age|height| name| - +---+------+-----+ - | 5| 80|Alice| - | 10| 80|Alice| - +---+------+-----+ - - >>> df.dropDuplicates(['name', 'height']).show() - +---+------+-----+ - |age|height| name| - +---+------+-----+ - | 5| 80|Alice| - +---+------+-----+ - """ - if subset is None: - jdf = self._jdf.dropDuplicates() - else: - jdf = self._jdf.dropDuplicates(self._jseq(subset)) - return DataFrame(jdf, self.sql_ctx) - - @since("1.3.1") - def dropna(self, how='any', thresh=None, subset=None): - """Returns a new :class:`DataFrame` omitting rows with null values. - :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other. - - :param how: 'any' or 'all'. - If 'any', drop a row if it contains any nulls. - If 'all', drop a row only if all its values are null. - :param thresh: int, default None - If specified, drop rows that have less than `thresh` non-null values. - This overwrites the `how` parameter. - :param subset: optional list of column names to consider. - - >>> df4.na.drop().show() - +---+------+-----+ - |age|height| name| - +---+------+-----+ - | 10| 80|Alice| - +---+------+-----+ - """ - if how is not None and how not in ['any', 'all']: - raise ValueError("how ('" + how + "') should be 'any' or 'all'") - - if subset is None: - subset = self.columns - elif isinstance(subset, basestring): - subset = [subset] - elif not isinstance(subset, (list, tuple)): - raise ValueError("subset should be a list or tuple of column names") - - if thresh is None: - thresh = len(subset) if how == 'any' else 1 - - return DataFrame(self._jdf.na().drop(thresh, self._jseq(subset)), self.sql_ctx) - - @since("1.3.1") - def fillna(self, value, subset=None): - """Replace null values, alias for ``na.fill()``. - :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other. 
- - :param value: int, long, float, string, bool or dict. - Value to replace null values with. - If the value is a dict, then `subset` is ignored and `value` must be a mapping - from column name (string) to replacement value. The replacement value must be - an int, long, float, boolean, or string. - :param subset: optional list of column names to consider. - Columns specified in subset that do not have matching data type are ignored. - For example, if `value` is a string, and subset contains a non-string column, - then the non-string column is simply ignored. - - >>> df4.na.fill(50).show() - +---+------+-----+ - |age|height| name| - +---+------+-----+ - | 10| 80|Alice| - | 5| 50| Bob| - | 50| 50| Tom| - | 50| 50| null| - +---+------+-----+ - - >>> df5.na.fill(False).show() - +----+-------+-----+ - | age| name| spy| - +----+-------+-----+ - | 10| Alice|false| - | 5| Bob|false| - |null|Mallory| true| - +----+-------+-----+ - - >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show() - +---+------+-------+ - |age|height| name| - +---+------+-------+ - | 10| 80| Alice| - | 5| null| Bob| - | 50| null| Tom| - | 50| null|unknown| - +---+------+-------+ - """ - if not isinstance(value, (float, int, long, basestring, bool, dict)): - raise ValueError("value should be a float, int, long, string, bool or dict") - - # Note that bool validates isinstance(int), but we don't want to - # convert bools to floats - - if not isinstance(value, bool) and isinstance(value, (int, long)): - value = float(value) - - if isinstance(value, dict): - return DataFrame(self._jdf.na().fill(value), self.sql_ctx) - elif subset is None: - return DataFrame(self._jdf.na().fill(value), self.sql_ctx) - else: - if isinstance(subset, basestring): - subset = [subset] - elif not isinstance(subset, (list, tuple)): - raise ValueError("subset should be a list or tuple of column names") - - return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) - - @since(1.4) - def replace(self, to_replace, value=_NoValue, subset=None): - """Returns a new :class:`DataFrame` replacing a value with another value. - :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are - aliases of each other. - Values to_replace and value must have the same type and can only be numerics, booleans, - or strings. Value can have None. When replacing, the new value will be cast - to the type of the existing column. - For numeric replacements all values to be replaced should have unique - floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`) - and arbitrary replacement will be used. - - :param to_replace: bool, int, long, float, string, list or dict. - Value to be replaced. - If the value is a dict, then `value` is ignored or can be omitted, and `to_replace` - must be a mapping between a value and a replacement. - :param value: bool, int, long, float, string, list or None. - The replacement value must be a bool, int, long, float, string or None. If `value` is a - list, `value` should be of the same length and type as `to_replace`. - If `value` is a scalar and `to_replace` is a sequence, then `value` is - used as a replacement for each item in `to_replace`. - :param subset: optional list of column names to consider. - Columns specified in subset that do not have matching data type are ignored. - For example, if `value` is a string, and subset contains a non-string column, - then the non-string column is simply ignored. 
- - >>> df4.na.replace(10, 20).show() - +----+------+-----+ - | age|height| name| - +----+------+-----+ - | 20| 80|Alice| - | 5| null| Bob| - |null| null| Tom| - |null| null| null| - +----+------+-----+ - - >>> df4.na.replace('Alice', None).show() - +----+------+----+ - | age|height|name| - +----+------+----+ - | 10| 80|null| - | 5| null| Bob| - |null| null| Tom| - |null| null|null| - +----+------+----+ - - >>> df4.na.replace({'Alice': None}).show() - +----+------+----+ - | age|height|name| - +----+------+----+ - | 10| 80|null| - | 5| null| Bob| - |null| null| Tom| - |null| null|null| - +----+------+----+ - - >>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show() - +----+------+----+ - | age|height|name| - +----+------+----+ - | 10| 80| A| - | 5| null| B| - |null| null| Tom| - |null| null|null| - +----+------+----+ - """ - if value is _NoValue: - if isinstance(to_replace, dict): - value = None - else: - raise TypeError("value argument is required when to_replace is not a dictionary.") - - # Helper functions - def all_of(types): - """Given a type or tuple of types and a sequence of xs - check if each x is instance of type(s) - - >>> all_of(bool)([True, False]) - True - >>> all_of(basestring)(["a", 1]) - False - """ - def all_of_(xs): - return all(isinstance(x, types) for x in xs) - return all_of_ - - all_of_bool = all_of(bool) - all_of_str = all_of(basestring) - all_of_numeric = all_of((float, int, long)) - - # Validate input types - valid_types = (bool, float, int, long, basestring, list, tuple) - if not isinstance(to_replace, valid_types + (dict, )): - raise ValueError( - "to_replace should be a bool, float, int, long, string, list, tuple, or dict. " - "Got {0}".format(type(to_replace))) - - if not isinstance(value, valid_types) and value is not None \ - and not isinstance(to_replace, dict): - raise ValueError("If to_replace is not a dict, value should be " - "a bool, float, int, long, string, list, tuple or None. " - "Got {0}".format(type(value))) - - if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)): - if len(to_replace) != len(value): - raise ValueError("to_replace and value lists should be of the same length. " - "Got {0} and {1}".format(len(to_replace), len(value))) - - if not (subset is None or isinstance(subset, (list, tuple, basestring))): - raise ValueError("subset should be a list or tuple of column names, " - "column name or None. Got {0}".format(type(subset))) - - # Reshape input arguments if necessary - if isinstance(to_replace, (float, int, long, basestring)): - to_replace = [to_replace] - - if isinstance(to_replace, dict): - rep_dict = to_replace - if value is not None: - warnings.warn("to_replace is a dict and value is not None. value will be ignored.") - else: - if isinstance(value, (float, int, long, basestring)) or value is None: - value = [value for _ in range(len(to_replace))] - rep_dict = dict(zip(to_replace, value)) - - if isinstance(subset, basestring): - subset = [subset] - - # Verify we were not passed in mixed type generics. 
- if not any(all_of_type(rep_dict.keys()) - and all_of_type(x for x in rep_dict.values() if x is not None) - for all_of_type in [all_of_bool, all_of_str, all_of_numeric]): - raise ValueError("Mixed type replacements are not supported") - - if subset is None: - return DataFrame(self._jdf.na().replace('*', rep_dict), self.sql_ctx) - else: - return DataFrame( - self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx) - - @since(2.0) - def approxQuantile(self, col, probabilities, relativeError): - """ - Calculates the approximate quantiles of numerical columns of a - DataFrame. - - The result of this algorithm has the following deterministic bound: - If the DataFrame has N elements and if we request the quantile at - probability `p` up to error `err`, then the algorithm will return - a sample `x` from the DataFrame so that the *exact* rank of `x` is - close to (p * N). More precisely, - - floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). - - This method implements a variation of the Greenwald-Khanna - algorithm (with some speed optimizations). The algorithm was first - present in [[http://dx.doi.org/10.1145/375663.375670 - Space-efficient Online Computation of Quantile Summaries]] - by Greenwald and Khanna. - - Note that null values will be ignored in numerical columns before calculation. - For columns only containing null values, an empty list is returned. - - :param col: str, list. - Can be a single column name, or a list of names for multiple columns. - :param probabilities: a list of quantile probabilities - Each number must belong to [0, 1]. - For example 0 is the minimum, 0.5 is the median, 1 is the maximum. - :param relativeError: The relative target precision to achieve - (>= 0). If set to zero, the exact quantiles are computed, which - could be very expensive. Note that values greater than 1 are - accepted but give the same result as 1. - :return: the approximate quantiles at the given probabilities. If - the input `col` is a string, the output is a list of floats. If the - input `col` is a list or tuple of strings, the output is also a - list, but each element in it is a list of floats, i.e., the output - is a list of list of floats. - - .. versionchanged:: 2.2 - Added support for multiple columns. 
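A brief illustration of the dict form of `replace` described above (value omitted, optional `subset`), reusing the `df4` doctest fixture; an illustrative sketch only:

# Sketch only: when to_replace is a dict, the value argument may be omitted.
df4.na.replace({'Alice': 'A', 'Bob': 'B'}, subset=['name']).select('name').collect()
# Mixing key types (e.g. {10: -1, 'Alice': 'A'}) hits the "Mixed type replacements" ValueError above.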
- """ - - if not isinstance(col, (basestring, list, tuple)): - raise ValueError("col should be a string, list or tuple, but got %r" % type(col)) - - isStr = isinstance(col, basestring) - - if isinstance(col, tuple): - col = list(col) - elif isStr: - col = [col] - - for c in col: - if not isinstance(c, basestring): - raise ValueError("columns should be strings, but got %r" % type(c)) - col = _to_list(self._sc, col) - - if not isinstance(probabilities, (list, tuple)): - raise ValueError("probabilities should be a list or tuple") - if isinstance(probabilities, tuple): - probabilities = list(probabilities) - for p in probabilities: - if not isinstance(p, (float, int, long)) or p < 0 or p > 1: - raise ValueError("probabilities should be numerical (float, int, long) in [0,1].") - probabilities = _to_list(self._sc, probabilities) - - if not isinstance(relativeError, (float, int, long)) or relativeError < 0: - raise ValueError("relativeError should be numerical (float, int, long) >= 0.") - relativeError = float(relativeError) - - jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError) - jaq_list = [list(j) for j in jaq] - return jaq_list[0] if isStr else jaq_list - - @since(1.4) - def corr(self, col1, col2, method=None): - """ - Calculates the correlation of two columns of a DataFrame as a double value. - Currently only supports the Pearson Correlation Coefficient. - :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other. - - :param col1: The name of the first column - :param col2: The name of the second column - :param method: The correlation method. Currently only supports "pearson" - """ - if not isinstance(col1, basestring): - raise ValueError("col1 should be a string.") - if not isinstance(col2, basestring): - raise ValueError("col2 should be a string.") - if not method: - method = "pearson" - if not method == "pearson": - raise ValueError("Currently only the calculation of the Pearson Correlation " + - "coefficient is supported.") - return self._jdf.stat().corr(col1, col2, method) - - @since(1.4) - def cov(self, col1, col2): - """ - Calculate the sample covariance for the given columns, specified by their names, as a - double value. :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases. - - :param col1: The name of the first column - :param col2: The name of the second column - """ - if not isinstance(col1, basestring): - raise ValueError("col1 should be a string.") - if not isinstance(col2, basestring): - raise ValueError("col2 should be a string.") - return self._jdf.stat().cov(col1, col2) - - @since(1.4) - def crosstab(self, col1, col2): - """ - Computes a pair-wise frequency table of the given columns. Also known as a contingency - table. The number of distinct values for each column should be less than 1e4. At most 1e6 - non-zero pair frequencies will be returned. - The first column of each row will be the distinct values of `col1` and the column names - will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. - Pairs that have no occurrences will have zero as their counts. - :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases. - - :param col1: The name of the first column. Distinct items will make the first item of - each row. - :param col2: The name of the second column. Distinct items will make the column names - of the DataFrame. 
- """ - if not isinstance(col1, basestring): - raise ValueError("col1 should be a string.") - if not isinstance(col2, basestring): - raise ValueError("col2 should be a string.") - return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx) - - @since(1.4) - def freqItems(self, cols, support=None): - """ - Finding frequent items for columns, possibly with false positives. Using the - frequent element count algorithm described in - "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou". - :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases. - - .. note:: This function is meant for exploratory data analysis, as we make no - guarantee about the backward compatibility of the schema of the resulting DataFrame. - - :param cols: Names of the columns to calculate frequent items for as a list or tuple of - strings. - :param support: The frequency with which to consider an item 'frequent'. Default is 1%. - The support must be greater than 1e-4. - """ - if isinstance(cols, tuple): - cols = list(cols) - if not isinstance(cols, list): - raise ValueError("cols must be a list or tuple of column names as strings.") - if not support: - support = 0.01 - return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx) - - @ignore_unicode_prefix - @since(1.3) - def withColumn(self, colName, col): - """ - Returns a new :class:`DataFrame` by adding a column or replacing the - existing column that has the same name. - - The column expression must be an expression over this DataFrame; attempting to add - a column from some other dataframe will raise an error. - - :param colName: string, name of the new column. - :param col: a :class:`Column` expression for the new column. - - >>> df.withColumn('age2', df.age + 2).collect() - [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] - - """ - assert isinstance(col, Column), "col should be Column" - return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx) - - @ignore_unicode_prefix - @since(1.3) - def withColumnRenamed(self, existing, new): - """Returns a new :class:`DataFrame` by renaming an existing column. - This is a no-op if schema doesn't contain the given column name. - - :param existing: string, name of the existing column to rename. - :param new: string, new name of the column. - - >>> df.withColumnRenamed('age', 'age2').collect() - [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')] - """ - return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx) - - @since(1.4) - @ignore_unicode_prefix - def drop(self, *cols): - """Returns a new :class:`DataFrame` that drops the specified column. - This is a no-op if schema doesn't contain the given column name(s). - - :param cols: a string name of the column to drop, or a - :class:`Column` to drop, or a list of string name of the columns to drop. 
- - >>> df.drop('age').collect() - [Row(name=u'Alice'), Row(name=u'Bob')] - - >>> df.drop(df.age).collect() - [Row(name=u'Alice'), Row(name=u'Bob')] - - >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect() - [Row(age=5, height=85, name=u'Bob')] - - >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect() - [Row(age=5, name=u'Bob', height=85)] - - >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect() - [Row(name=u'Bob')] - """ - if len(cols) == 1: - col = cols[0] - if isinstance(col, basestring): - jdf = self._jdf.drop(col) - elif isinstance(col, Column): - jdf = self._jdf.drop(col._jc) - else: - raise TypeError("col should be a string or a Column") - else: - for col in cols: - if not isinstance(col, basestring): - raise TypeError("each col in the param list should be a string") - jdf = self._jdf.drop(self._jseq(cols)) - - return DataFrame(jdf, self.sql_ctx) - - @ignore_unicode_prefix - def toDF(self, *cols): - """Returns a new class:`DataFrame` that with new specified column names - - :param cols: list of new column names (string) - - >>> df.toDF('f1', 'f2').collect() - [Row(f1=2, f2=u'Alice'), Row(f1=5, f2=u'Bob')] - """ - jdf = self._jdf.toDF(self._jseq(cols)) - return DataFrame(jdf, self.sql_ctx) - - @since(1.3) - def toPandas(self): - """ - Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. - - This is only available if Pandas is installed and available. - - .. note:: This method should only be used if the resulting Pandas's DataFrame is expected - to be small, as all the data is loaded into the driver's memory. - - .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental. - - >>> df.toPandas() # doctest: +SKIP - age name - 0 2 Alice - 1 5 Bob - """ - from pyspark.sql.utils import require_minimum_pandas_version - require_minimum_pandas_version() - - import pandas as pd - - if self.sql_ctx._conf.pandasRespectSessionTimeZone(): - timezone = self.sql_ctx._conf.sessionLocalTimeZone() - else: - timezone = None - - if self.sql_ctx._conf.arrowEnabled(): - use_arrow = True - try: - from pyspark.sql.types import to_arrow_schema - from pyspark.sql.utils import require_minimum_pyarrow_version - - require_minimum_pyarrow_version() - to_arrow_schema(self.schema) - except Exception as e: - - if self.sql_ctx._conf.arrowFallbackEnabled(): - msg = ( - "toPandas attempted Arrow optimization because " - "'spark.sql.execution.arrow.enabled' is set to true; however, " - "failed by the reason below:\n %s\n" - "Attempting non-optimization as " - "'spark.sql.execution.arrow.fallback.enabled' is set to " - "true." % _exception_message(e)) - warnings.warn(msg) - use_arrow = False - else: - msg = ( - "toPandas attempted Arrow optimization because " - "'spark.sql.execution.arrow.enabled' is set to true, but has reached " - "the error below and will not continue because automatic fallback " - "with 'spark.sql.execution.arrow.fallback.enabled' has been set to " - "false.\n %s" % _exception_message(e)) - warnings.warn(msg) - raise - - # Try to use Arrow optimization when the schema is supported and the required version - # of PyArrow is found, if 'spark.sql.execution.arrow.enabled' is enabled. 
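The Arrow switch checked above is driven by two session configs; a minimal sketch of opting in from user code (assuming `spark` and `df` as in the doctests, with pandas and pyarrow installed):

# Sketch only: enable the Arrow-backed conversion path described above.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")  # fall back on unsupported schemas
pdf = df.toPandas()  # pandas.DataFrame collected to the driver; keep the result small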
- if use_arrow: - try: - from pyspark.sql.types import _check_dataframe_convert_date, \ - _check_dataframe_localize_timestamps - import pyarrow - batches = self._collectAsArrow() - if len(batches) > 0: - table = pyarrow.Table.from_batches(batches) - pdf = table.to_pandas() - pdf = _check_dataframe_convert_date(pdf, self.schema) - return _check_dataframe_localize_timestamps(pdf, timezone) - else: - return pd.DataFrame.from_records([], columns=self.columns) - except Exception as e: - # We might have to allow fallback here as well but multiple Spark jobs can - # be executed. So, simply fail in this case for now. - msg = ( - "toPandas attempted Arrow optimization because " - "'spark.sql.execution.arrow.enabled' is set to true, but has reached " - "the error below and can not continue. Note that " - "'spark.sql.execution.arrow.fallback.enabled' does not have an effect " - "on failures in the middle of computation.\n %s" % _exception_message(e)) - warnings.warn(msg) - raise - - # Below is toPandas without Arrow optimization. - pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) - - dtype = {} - for field in self.schema: - pandas_type = _to_corrected_pandas_type(field.dataType) - # SPARK-21766: if an integer field is nullable and has null values, it can be - # inferred by pandas as float column. Once we convert the column with NaN back - # to integer type e.g., np.int16, we will hit exception. So we use the inferred - # float type, not the corrected type from the schema in this case. - if pandas_type is not None and \ - not(isinstance(field.dataType, IntegralType) and field.nullable and - pdf[field.name].isnull().any()): - dtype[field.name] = pandas_type - - for f, t in dtype.items(): - pdf[f] = pdf[f].astype(t, copy=False) - - if timezone is None: - return pdf - else: - from pyspark.sql.types import _check_series_convert_timestamps_local_tz - for field in self.schema: - # TODO: handle nested timestamps, such as ArrayType(TimestampType())? - if isinstance(field.dataType, TimestampType): - pdf[field.name] = \ - _check_series_convert_timestamps_local_tz(pdf[field.name], timezone) - return pdf - - def _collectAsArrow(self): - """ - Returns all records as a list of ArrowRecordBatches, pyarrow must be installed - and available on driver and worker Python environments. - - .. note:: Experimental. - """ - with SCCallSiteSync(self._sc) as css: - sock_info = self._jdf.collectAsArrowToPython() - return list(_load_from_socket(sock_info, ArrowStreamSerializer())) - - ########################################################################################## - # Pandas compatibility - ########################################################################################## - - groupby = copy_func( - groupBy, - sinceversion=1.4, - doc=":func:`groupby` is an alias for :func:`groupBy`.") - - drop_duplicates = copy_func( - dropDuplicates, - sinceversion=1.4, - doc=":func:`drop_duplicates` is an alias for :func:`dropDuplicates`.") - - where = copy_func( - filter, - sinceversion=1.3, - doc=":func:`where` is an alias for :func:`filter`.") - - -def _to_scala_map(sc, jm): - """ - Convert a dict into a JVM Map. - """ - return sc._jvm.PythonUtils.toScalaMap(jm) - - -def _to_corrected_pandas_type(dt): - """ - When converting Spark SQL records to Pandas DataFrame, the inferred data type may be wrong. - This method gets the corrected data type for Pandas if that type may be inferred uncorrectly. 
- """ - import numpy as np - if type(dt) == ByteType: - return np.int8 - elif type(dt) == ShortType: - return np.int16 - elif type(dt) == IntegerType: - return np.int32 - elif type(dt) == FloatType: - return np.float32 - else: - return None - - -class DataFrameNaFunctions(object): - """Functionality for working with missing data in :class:`DataFrame`. - - .. versionadded:: 1.4 - """ - - def __init__(self, df): - self.df = df - - def drop(self, how='any', thresh=None, subset=None): - return self.df.dropna(how=how, thresh=thresh, subset=subset) - - drop.__doc__ = DataFrame.dropna.__doc__ - - def fill(self, value, subset=None): - return self.df.fillna(value=value, subset=subset) - - fill.__doc__ = DataFrame.fillna.__doc__ - - def replace(self, to_replace, value=_NoValue, subset=None): - return self.df.replace(to_replace, value, subset) - - replace.__doc__ = DataFrame.replace.__doc__ - - -class DataFrameStatFunctions(object): - """Functionality for statistic functions with :class:`DataFrame`. - - .. versionadded:: 1.4 - """ - - def __init__(self, df): - self.df = df - - def approxQuantile(self, col, probabilities, relativeError): - return self.df.approxQuantile(col, probabilities, relativeError) - - approxQuantile.__doc__ = DataFrame.approxQuantile.__doc__ - - def corr(self, col1, col2, method=None): - return self.df.corr(col1, col2, method) - - corr.__doc__ = DataFrame.corr.__doc__ - - def cov(self, col1, col2): - return self.df.cov(col1, col2) - - cov.__doc__ = DataFrame.cov.__doc__ - - def crosstab(self, col1, col2): - return self.df.crosstab(col1, col2) - - crosstab.__doc__ = DataFrame.crosstab.__doc__ - - def freqItems(self, cols, support=None): - return self.df.freqItems(cols, support) - - freqItems.__doc__ = DataFrame.freqItems.__doc__ - - def sampleBy(self, col, fractions, seed=None): - return self.df.sampleBy(col, fractions, seed) - - sampleBy.__doc__ = DataFrame.sampleBy.__doc__ - - -def _test(): - import doctest - from pyspark.context import SparkContext - from pyspark.sql import Row, SQLContext, SparkSession - import pyspark.sql.dataframe - from pyspark.sql.functions import from_unixtime - globs = pyspark.sql.dataframe.__dict__.copy() - sc = SparkContext('local[4]', 'PythonTest') - globs['sc'] = sc - globs['sqlContext'] = SQLContext(sc) - globs['spark'] = SparkSession(sc) - globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\ - .toDF(StructType([StructField('age', IntegerType()), - StructField('name', StringType())])) - globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() - globs['df3'] = sc.parallelize([Row(name='Alice', age=2), - Row(name='Bob', age=5)]).toDF() - globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80), - Row(name='Bob', age=5, height=None), - Row(name='Tom', age=None, height=None), - Row(name=None, age=None, height=None)]).toDF() - globs['df5'] = sc.parallelize([Row(name='Alice', spy=False, age=10), - Row(name='Bob', spy=None, age=5), - Row(name='Mallory', spy=True, age=None)]).toDF() - globs['sdf'] = sc.parallelize([Row(name='Tom', time=1479441846), - Row(name='Bob', time=1479442946)]).toDF() - - (failure_count, test_count) = doctest.testmod( - pyspark.sql.dataframe, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) - globs['sc'].stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/functions.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/functions.py 
deleted file mode 100644 index e1d6ea3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/functions.py +++ /dev/null @@ -1,2954 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -A collections of builtin functions -""" -import sys -import functools -import warnings - -if sys.version < "3": - from itertools import imap as map - -if sys.version >= '3': - basestring = str - -from pyspark import since, SparkContext -from pyspark.rdd import ignore_unicode_prefix, PythonEvalType -from pyspark.sql.column import Column, _to_java_column, _to_seq, _create_column_from_literal -from pyspark.sql.dataframe import DataFrame -from pyspark.sql.types import StringType, DataType -# Keep UserDefinedFunction import for backwards compatible import; moved in SPARK-22409 -from pyspark.sql.udf import UserDefinedFunction, _create_udf - - -def _create_function(name, doc=""): - """ Create a function for aggregator by name""" - def _(col): - sc = SparkContext._active_spark_context - jc = getattr(sc._jvm.functions, name)(col._jc if isinstance(col, Column) else col) - return Column(jc) - _.__name__ = name - _.__doc__ = doc - return _ - - -def _wrap_deprecated_function(func, message): - """ Wrap the deprecated function to print out deprecation warnings""" - def _(col): - warnings.warn(message, DeprecationWarning) - return func(col) - return functools.wraps(func)(_) - - -def _create_binary_mathfunction(name, doc=""): - """ Create a binary mathfunction by name""" - def _(col1, col2): - sc = SparkContext._active_spark_context - # users might write ints for simplicity. This would throw an error on the JVM side. - jc = getattr(sc._jvm.functions, name)(col1._jc if isinstance(col1, Column) else float(col1), - col2._jc if isinstance(col2, Column) else float(col2)) - return Column(jc) - _.__name__ = name - _.__doc__ = doc - return _ - - -def _create_window_function(name, doc=''): - """ Create a window function by name """ - def _(): - sc = SparkContext._active_spark_context - jc = getattr(sc._jvm.functions, name)() - return Column(jc) - _.__name__ = name - _.__doc__ = 'Window function: ' + doc - return _ - -_lit_doc = """ - Creates a :class:`Column` of literal value. 
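A simplified, JVM-free sketch of the factory pattern used by these helpers (illustrative only; `_make_function` stands in for `_create_function` without the py4j call):

def _make_function(name, doc=""):
    def _(col):
        # stand-in for `getattr(sc._jvm.functions, name)(...)` in the real helper
        return "functions.%s(%r)" % (name, col)
    _.__name__ = name
    _.__doc__ = doc
    return _

# Each (name, docstring) entry becomes a module-level function, as in the loops further below.
for _n, _d in {"upper": "Converts a string expression to upper case."}.items():
    globals()[_n] = _make_function(_n, _d)

upper("name")   # -> "functions.upper('name')"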
- - >>> df.select(lit(5).alias('height')).withColumn('spark_user', lit(True)).take(1) - [Row(height=5, spark_user=True)] - """ -_functions = { - 'lit': _lit_doc, - 'col': 'Returns a :class:`Column` based on the given column name.', - 'column': 'Returns a :class:`Column` based on the given column name.', - 'asc': 'Returns a sort expression based on the ascending order of the given column name.', - 'desc': 'Returns a sort expression based on the descending order of the given column name.', - - 'upper': 'Converts a string expression to upper case.', - 'lower': 'Converts a string expression to upper case.', - 'sqrt': 'Computes the square root of the specified float value.', - 'abs': 'Computes the absolute value.', - - 'max': 'Aggregate function: returns the maximum value of the expression in a group.', - 'min': 'Aggregate function: returns the minimum value of the expression in a group.', - 'count': 'Aggregate function: returns the number of items in a group.', - 'sum': 'Aggregate function: returns the sum of all values in the expression.', - 'avg': 'Aggregate function: returns the average of the values in a group.', - 'mean': 'Aggregate function: returns the average of the values in a group.', - 'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.', -} - -_functions_1_4 = { - # unary math functions - 'acos': ':return: inverse cosine of `col`, as if computed by `java.lang.Math.acos()`', - 'asin': ':return: inverse sine of `col`, as if computed by `java.lang.Math.asin()`', - 'atan': ':return: inverse tangent of `col`, as if computed by `java.lang.Math.atan()`', - 'cbrt': 'Computes the cube-root of the given value.', - 'ceil': 'Computes the ceiling of the given value.', - 'cos': """:param col: angle in radians - :return: cosine of the angle, as if computed by `java.lang.Math.cos()`.""", - 'cosh': """:param col: hyperbolic angle - :return: hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh()`""", - 'exp': 'Computes the exponential of the given value.', - 'expm1': 'Computes the exponential of the given value minus one.', - 'floor': 'Computes the floor of the given value.', - 'log': 'Computes the natural logarithm of the given value.', - 'log10': 'Computes the logarithm of the given value in Base 10.', - 'log1p': 'Computes the natural logarithm of the given value plus one.', - 'rint': 'Returns the double value that is closest in value to the argument and' + - ' is equal to a mathematical integer.', - 'signum': 'Computes the signum of the given value.', - 'sin': """:param col: angle in radians - :return: sine of the angle, as if computed by `java.lang.Math.sin()`""", - 'sinh': """:param col: hyperbolic angle - :return: hyperbolic sine of the given value, - as if computed by `java.lang.Math.sinh()`""", - 'tan': """:param col: angle in radians - :return: tangent of the given value, as if computed by `java.lang.Math.tan()`""", - 'tanh': """:param col: hyperbolic angle - :return: hyperbolic tangent of the given value, - as if computed by `java.lang.Math.tanh()`""", - 'toDegrees': '.. note:: Deprecated in 2.1, use :func:`degrees` instead.', - 'toRadians': '.. 
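A quick usage sketch for the generated column helpers above (`lit`, `col`, `desc`), assuming the `df` doctest fixture:

from pyspark.sql.functions import col, desc, lit

# Sketch only: df is the (age, name) doctest fixture.
df.select(col("name"), (df.age + lit(1)).alias("age_plus_one")).orderBy(desc("age")).collect()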
note:: Deprecated in 2.1, use :func:`radians` instead.', - 'bitwiseNOT': 'Computes bitwise not.', -} - -_functions_2_4 = { - 'asc_nulls_first': 'Returns a sort expression based on the ascending order of the given' + - ' column name, and null values return before non-null values.', - 'asc_nulls_last': 'Returns a sort expression based on the ascending order of the given' + - ' column name, and null values appear after non-null values.', - 'desc_nulls_first': 'Returns a sort expression based on the descending order of the given' + - ' column name, and null values appear before non-null values.', - 'desc_nulls_last': 'Returns a sort expression based on the descending order of the given' + - ' column name, and null values appear after non-null values', -} - -_collect_list_doc = """ - Aggregate function: returns a list of objects with duplicates. - - .. note:: The function is non-deterministic because the order of collected results depends - on order of rows which may be non-deterministic after a shuffle. - - >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) - >>> df2.agg(collect_list('age')).collect() - [Row(collect_list(age)=[2, 5, 5])] - """ -_collect_set_doc = """ - Aggregate function: returns a set of objects with duplicate elements eliminated. - - .. note:: The function is non-deterministic because the order of collected results depends - on order of rows which may be non-deterministic after a shuffle. - - >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) - >>> df2.agg(collect_set('age')).collect() - [Row(collect_set(age)=[5, 2])] - """ -_functions_1_6 = { - # unary math functions - 'stddev': 'Aggregate function: returns the unbiased sample standard deviation of' + - ' the expression in a group.', - 'stddev_samp': 'Aggregate function: returns the unbiased sample standard deviation of' + - ' the expression in a group.', - 'stddev_pop': 'Aggregate function: returns population standard deviation of' + - ' the expression in a group.', - 'variance': 'Aggregate function: returns the population variance of the values in a group.', - 'var_samp': 'Aggregate function: returns the unbiased variance of the values in a group.', - 'var_pop': 'Aggregate function: returns the population variance of the values in a group.', - 'skewness': 'Aggregate function: returns the skewness of the values in a group.', - 'kurtosis': 'Aggregate function: returns the kurtosis of the values in a group.', - 'collect_list': _collect_list_doc, - 'collect_set': _collect_set_doc -} - -_functions_2_1 = { - # unary math functions - 'degrees': """ - Converts an angle measured in radians to an approximately equivalent angle - measured in degrees. - :param col: angle in radians - :return: angle in degrees, as if computed by `java.lang.Math.toDegrees()` - """, - 'radians': """ - Converts an angle measured in degrees to an approximately equivalent angle - measured in radians. 
- :param col: angle in degrees - :return: angle in radians, as if computed by `java.lang.Math.toRadians()` - """, -} - -# math functions that take two arguments as input -_binary_mathfunctions = { - 'atan2': """ - :param col1: coordinate on y-axis - :param col2: coordinate on x-axis - :return: the `theta` component of the point - (`r`, `theta`) - in polar coordinates that corresponds to the point - (`x`, `y`) in Cartesian coordinates, - as if computed by `java.lang.Math.atan2()` - """, - 'hypot': 'Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.', - 'pow': 'Returns the value of the first argument raised to the power of the second argument.', -} - -_window_functions = { - 'row_number': - """returns a sequential number starting at 1 within a window partition.""", - 'dense_rank': - """returns the rank of rows within a window partition, without any gaps. - - The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking - sequence when there are ties. That is, if you were ranking a competition using dense_rank - and had three people tie for second place, you would say that all three were in second - place and that the next person came in third. Rank would give me sequential numbers, making - the person that came in third place (after the ties) would register as coming in fifth. - - This is equivalent to the DENSE_RANK function in SQL.""", - 'rank': - """returns the rank of rows within a window partition. - - The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking - sequence when there are ties. That is, if you were ranking a competition using dense_rank - and had three people tie for second place, you would say that all three were in second - place and that the next person came in third. Rank would give me sequential numbers, making - the person that came in third place (after the ties) would register as coming in fifth. - - This is equivalent to the RANK function in SQL.""", - 'cume_dist': - """returns the cumulative distribution of values within a window partition, - i.e. the fraction of rows that are below the current row.""", - 'percent_rank': - """returns the relative rank (i.e. percentile) of rows within a window partition.""", -} - -# Wraps deprecated functions (keys) with the messages (values). -_functions_deprecated = { - 'toDegrees': 'Deprecated in 2.1, use degrees instead.', - 'toRadians': 'Deprecated in 2.1, use radians instead.', -} - -for _name, _doc in _functions.items(): - globals()[_name] = since(1.3)(_create_function(_name, _doc)) -for _name, _doc in _functions_1_4.items(): - globals()[_name] = since(1.4)(_create_function(_name, _doc)) -for _name, _doc in _binary_mathfunctions.items(): - globals()[_name] = since(1.4)(_create_binary_mathfunction(_name, _doc)) -for _name, _doc in _window_functions.items(): - globals()[_name] = since(1.6)(_create_window_function(_name, _doc)) -for _name, _doc in _functions_1_6.items(): - globals()[_name] = since(1.6)(_create_function(_name, _doc)) -for _name, _doc in _functions_2_1.items(): - globals()[_name] = since(2.1)(_create_function(_name, _doc)) -for _name, _message in _functions_deprecated.items(): - globals()[_name] = _wrap_deprecated_function(globals()[_name], _message) -for _name, _doc in _functions_2_4.items(): - globals()[_name] = since(2.4)(_create_function(_name, _doc)) -del _name, _doc - - -@since(1.3) -def approxCountDistinct(col, rsd=None): - """ - .. note:: Deprecated in 2.1, use :func:`approx_count_distinct` instead. 
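The window functions registered above are used through a window specification; a minimal sketch (assuming the `df` doctest fixture; the single unpartitioned window is for illustration only):

from pyspark.sql import Window
from pyspark.sql.functions import dense_rank, rank, row_number

# Sketch only: one global window ordered by age.
w = Window.partitionBy().orderBy("age")
df.select("name", "age",
          rank().over(w).alias("rank"),
          dense_rank().over(w).alias("dense_rank"),
          row_number().over(w).alias("row_number")).show()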
- """ - warnings.warn("Deprecated in 2.1, use approx_count_distinct instead.", DeprecationWarning) - return approx_count_distinct(col, rsd) - - -@since(2.1) -def approx_count_distinct(col, rsd=None): - """Aggregate function: returns a new :class:`Column` for approximate distinct count of - column `col`. - - :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more - efficient to use :func:`countDistinct` - - >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect() - [Row(distinct_ages=2)] - """ - sc = SparkContext._active_spark_context - if rsd is None: - jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col)) - else: - jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col), rsd) - return Column(jc) - - -@since(1.6) -def broadcast(df): - """Marks a DataFrame as small enough for use in broadcast joins.""" - - sc = SparkContext._active_spark_context - return DataFrame(sc._jvm.functions.broadcast(df._jdf), df.sql_ctx) - - -@since(1.4) -def coalesce(*cols): - """Returns the first column that is not null. - - >>> cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b")) - >>> cDf.show() - +----+----+ - | a| b| - +----+----+ - |null|null| - | 1|null| - |null| 2| - +----+----+ - - >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show() - +--------------+ - |coalesce(a, b)| - +--------------+ - | null| - | 1| - | 2| - +--------------+ - - >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show() - +----+----+----------------+ - | a| b|coalesce(a, 0.0)| - +----+----+----------------+ - |null|null| 0.0| - | 1|null| 1.0| - |null| 2| 0.0| - +----+----+----------------+ - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.coalesce(_to_seq(sc, cols, _to_java_column)) - return Column(jc) - - -@since(1.6) -def corr(col1, col2): - """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` - and ``col2``. - - >>> a = range(20) - >>> b = [2 * x for x in range(20)] - >>> df = spark.createDataFrame(zip(a, b), ["a", "b"]) - >>> df.agg(corr("a", "b").alias('c')).collect() - [Row(c=1.0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.corr(_to_java_column(col1), _to_java_column(col2))) - - -@since(2.0) -def covar_pop(col1, col2): - """Returns a new :class:`Column` for the population covariance of ``col1`` and ``col2``. - - >>> a = [1] * 10 - >>> b = [1] * 10 - >>> df = spark.createDataFrame(zip(a, b), ["a", "b"]) - >>> df.agg(covar_pop("a", "b").alias('c')).collect() - [Row(c=0.0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.covar_pop(_to_java_column(col1), _to_java_column(col2))) - - -@since(2.0) -def covar_samp(col1, col2): - """Returns a new :class:`Column` for the sample covariance of ``col1`` and ``col2``. - - >>> a = [1] * 10 - >>> b = [1] * 10 - >>> df = spark.createDataFrame(zip(a, b), ["a", "b"]) - >>> df.agg(covar_samp("a", "b").alias('c')).collect() - [Row(c=0.0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.covar_samp(_to_java_column(col1), _to_java_column(col2))) - - -@since(1.3) -def countDistinct(col, *cols): - """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``. 
- - >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect() - [Row(c=2)] - - >>> df.agg(countDistinct("age", "name").alias('c')).collect() - [Row(c=2)] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column)) - return Column(jc) - - -@since(1.3) -def first(col, ignorenulls=False): - """Aggregate function: returns the first value in a group. - - The function by default returns the first values it sees. It will return the first non-null - value it sees when ignoreNulls is set to true. If all values are null, then null is returned. - - .. note:: The function is non-deterministic because its results depends on order of rows which - may be non-deterministic after a shuffle. - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.first(_to_java_column(col), ignorenulls) - return Column(jc) - - -@since(2.0) -def grouping(col): - """ - Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated - or not, returns 1 for aggregated or 0 for not aggregated in the result set. - - >>> df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show() - +-----+--------------+--------+ - | name|grouping(name)|sum(age)| - +-----+--------------+--------+ - | null| 1| 7| - |Alice| 0| 2| - | Bob| 0| 5| - +-----+--------------+--------+ - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.grouping(_to_java_column(col)) - return Column(jc) - - -@since(2.0) -def grouping_id(*cols): - """ - Aggregate function: returns the level of grouping, equals to - - (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) - - .. note:: The list of columns should match with grouping columns exactly, or empty (means all - the grouping columns). - - >>> df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show() - +-----+-------------+--------+ - | name|grouping_id()|sum(age)| - +-----+-------------+--------+ - | null| 1| 7| - |Alice| 0| 2| - | Bob| 0| 5| - +-----+-------------+--------+ - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.grouping_id(_to_seq(sc, cols, _to_java_column)) - return Column(jc) - - -@since(1.6) -def input_file_name(): - """Creates a string column for the file name of the current Spark task. - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.input_file_name()) - - -@since(1.6) -def isnan(col): - """An expression that returns true iff the column is NaN. - - >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b")) - >>> df.select(isnan("a").alias("r1"), isnan(df.a).alias("r2")).collect() - [Row(r1=False, r2=False), Row(r1=True, r2=True)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.isnan(_to_java_column(col))) - - -@since(1.6) -def isnull(col): - """An expression that returns true iff the column is null. - - >>> df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b")) - >>> df.select(isnull("a").alias("r1"), isnull(df.a).alias("r2")).collect() - [Row(r1=False, r2=False), Row(r1=True, r2=True)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.isnull(_to_java_column(col))) - - -@since(1.3) -def last(col, ignorenulls=False): - """Aggregate function: returns the last value in a group. - - The function by default returns the last values it sees. It will return the last non-null - value it sees when ignoreNulls is set to true. 
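A brief sketch of `isnull` and the `ignorenulls` flag on `first`/`last`, reusing the `df5` doctest fixture (no exact outputs implied, since first/last are order-dependent as noted):

from pyspark.sql.functions import first, isnull, last

# Sketch only: df5 is the (age, name, spy) doctest fixture with one null spy value.
df5.select("name", isnull("spy").alias("spy_missing")).collect()
df5.agg(first("spy", ignorenulls=True), last("spy", ignorenulls=True)).collect()  # skip the null spy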
If all values are null, then null is returned. - - .. note:: The function is non-deterministic because its results depends on order of rows - which may be non-deterministic after a shuffle. - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.last(_to_java_column(col), ignorenulls) - return Column(jc) - - -@since(1.6) -def monotonically_increasing_id(): - """A column that generates monotonically increasing 64-bit integers. - - The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. - The current implementation puts the partition ID in the upper 31 bits, and the record number - within each partition in the lower 33 bits. The assumption is that the data frame has - less than 1 billion partitions, and each partition has less than 8 billion records. - - .. note:: The function is non-deterministic because its result depends on partition IDs. - - As an example, consider a :class:`DataFrame` with two partitions, each with 3 records. - This expression would return the following IDs: - 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. - - >>> df0 = sc.parallelize(range(2), 2).mapPartitions(lambda x: [(1,), (2,), (3,)]).toDF(['col1']) - >>> df0.select(monotonically_increasing_id().alias('id')).collect() - [Row(id=0), Row(id=1), Row(id=2), Row(id=8589934592), Row(id=8589934593), Row(id=8589934594)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.monotonically_increasing_id()) - - -@since(1.6) -def nanvl(col1, col2): - """Returns col1 if it is not NaN, or col2 if col1 is NaN. - - Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`). - - >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b")) - >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect() - [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.nanvl(_to_java_column(col1), _to_java_column(col2))) - - -@ignore_unicode_prefix -@since(1.4) -def rand(seed=None): - """Generates a random column with independent and identically distributed (i.i.d.) samples - from U[0.0, 1.0]. - - .. note:: The function is non-deterministic in general case. - - >>> df.withColumn('rand', rand(seed=42) * 3).collect() - [Row(age=2, name=u'Alice', rand=1.1568609015300986), - Row(age=5, name=u'Bob', rand=1.403379671529166)] - """ - sc = SparkContext._active_spark_context - if seed is not None: - jc = sc._jvm.functions.rand(seed) - else: - jc = sc._jvm.functions.rand() - return Column(jc) - - -@ignore_unicode_prefix -@since(1.4) -def randn(seed=None): - """Generates a column with independent and identically distributed (i.i.d.) samples from - the standard normal distribution. - - .. note:: The function is non-deterministic in general case. - - >>> df.withColumn('randn', randn(seed=42)).collect() - [Row(age=2, name=u'Alice', randn=-0.7556247885860078), - Row(age=5, name=u'Bob', randn=-0.0861619008451133)] - """ - sc = SparkContext._active_spark_context - if seed is not None: - jc = sc._jvm.functions.randn(seed) - else: - jc = sc._jvm.functions.randn() - return Column(jc) - - -@since(1.5) -def round(col, scale=0): - """ - Round the given value to `scale` decimal places using HALF_UP rounding mode if `scale` >= 0 - or at integral part when `scale` < 0. 
- - >>> spark.createDataFrame([(2.5,)], ['a']).select(round('a', 0).alias('r')).collect() - [Row(r=3.0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.round(_to_java_column(col), scale)) - - -@since(2.0) -def bround(col, scale=0): - """ - Round the given value to `scale` decimal places using HALF_EVEN rounding mode if `scale` >= 0 - or at integral part when `scale` < 0. - - >>> spark.createDataFrame([(2.5,)], ['a']).select(bround('a', 0).alias('r')).collect() - [Row(r=2.0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.bround(_to_java_column(col), scale)) - - -@since(1.5) -def shiftLeft(col, numBits): - """Shift the given value numBits left. - - >>> spark.createDataFrame([(21,)], ['a']).select(shiftLeft('a', 1).alias('r')).collect() - [Row(r=42)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.shiftLeft(_to_java_column(col), numBits)) - - -@since(1.5) -def shiftRight(col, numBits): - """(Signed) shift the given value numBits right. - - >>> spark.createDataFrame([(42,)], ['a']).select(shiftRight('a', 1).alias('r')).collect() - [Row(r=21)] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.shiftRight(_to_java_column(col), numBits) - return Column(jc) - - -@since(1.5) -def shiftRightUnsigned(col, numBits): - """Unsigned shift the given value numBits right. - - >>> df = spark.createDataFrame([(-42,)], ['a']) - >>> df.select(shiftRightUnsigned('a', 1).alias('r')).collect() - [Row(r=9223372036854775787)] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.shiftRightUnsigned(_to_java_column(col), numBits) - return Column(jc) - - -@since(1.6) -def spark_partition_id(): - """A column for partition ID. - - .. note:: This is indeterministic because it depends on data partitioning and task scheduling. - - >>> df.repartition(1).select(spark_partition_id().alias("pid")).collect() - [Row(pid=0), Row(pid=0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.spark_partition_id()) - - -@since(1.5) -def expr(str): - """Parses the expression string into the column that it represents - - >>> df.select(expr("length(name)")).collect() - [Row(length(name)=5), Row(length(name)=3)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.expr(str)) - - -@ignore_unicode_prefix -@since(1.4) -def struct(*cols): - """Creates a new struct column. - - :param cols: list of column names (string) or list of :class:`Column` expressions - - >>> df.select(struct('age', 'name').alias("struct")).collect() - [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))] - >>> df.select(struct([df.age, df.name]).alias("struct")).collect() - [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))] - """ - sc = SparkContext._active_spark_context - if len(cols) == 1 and isinstance(cols[0], (list, set)): - cols = cols[0] - jc = sc._jvm.functions.struct(_to_seq(sc, cols, _to_java_column)) - return Column(jc) - - -@since(1.5) -def greatest(*cols): - """ - Returns the greatest value of the list of column names, skipping null values. - This function takes at least 2 parameters. It will return null iff all parameters are null. 
- - >>> df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c']) - >>> df.select(greatest(df.a, df.b, df.c).alias("greatest")).collect() - [Row(greatest=4)] - """ - if len(cols) < 2: - raise ValueError("greatest should take at least two columns") - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.greatest(_to_seq(sc, cols, _to_java_column))) - - -@since(1.5) -def least(*cols): - """ - Returns the least value of the list of column names, skipping null values. - This function takes at least 2 parameters. It will return null iff all parameters are null. - - >>> df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c']) - >>> df.select(least(df.a, df.b, df.c).alias("least")).collect() - [Row(least=1)] - """ - if len(cols) < 2: - raise ValueError("least should take at least two columns") - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.least(_to_seq(sc, cols, _to_java_column))) - - -@since(1.4) -def when(condition, value): - """Evaluates a list of conditions and returns one of multiple possible result expressions. - If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions. - - :param condition: a boolean :class:`Column` expression. - :param value: a literal value, or a :class:`Column` expression. - - >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect() - [Row(age=3), Row(age=4)] - - >>> df.select(when(df.age == 2, df.age + 1).alias("age")).collect() - [Row(age=3), Row(age=None)] - """ - sc = SparkContext._active_spark_context - if not isinstance(condition, Column): - raise TypeError("condition should be a Column") - v = value._jc if isinstance(value, Column) else value - jc = sc._jvm.functions.when(condition._jc, v) - return Column(jc) - - -@since(1.5) -def log(arg1, arg2=None): - """Returns the first argument-based logarithm of the second argument. - - If there is only one argument, then this takes the natural logarithm of the argument. - - >>> df.select(log(10.0, df.age).alias('ten')).rdd.map(lambda l: str(l.ten)[:7]).collect() - ['0.30102', '0.69897'] - - >>> df.select(log(df.age).alias('e')).rdd.map(lambda l: str(l.e)[:7]).collect() - ['0.69314', '1.60943'] - """ - sc = SparkContext._active_spark_context - if arg2 is None: - jc = sc._jvm.functions.log(_to_java_column(arg1)) - else: - jc = sc._jvm.functions.log(arg1, _to_java_column(arg2)) - return Column(jc) - - -@since(1.5) -def log2(col): - """Returns the base-2 logarithm of the argument. - - >>> spark.createDataFrame([(4,)], ['a']).select(log2('a').alias('log2')).collect() - [Row(log2=2.0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.log2(_to_java_column(col))) - - -@since(1.5) -@ignore_unicode_prefix -def conv(col, fromBase, toBase): - """ - Convert a number in a string column from one base to another. - - >>> df = spark.createDataFrame([("010101",)], ['n']) - >>> df.select(conv(df.n, 2, 16).alias('hex')).collect() - [Row(hex=u'15')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.conv(_to_java_column(col), fromBase, toBase)) - - -@since(1.5) -def factorial(col): - """ - Computes the factorial of the given value. 
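`when` above is chainable on the resulting :class:`Column`; a short illustrative sketch using the `df` doctest fixture:

from pyspark.sql.functions import when

# Sketch only: chained conditions are evaluated in order; otherwise() handles the rest.
df.select(df.name,
          when(df.age < 3, "toddler")
          .when(df.age < 13, "child")
          .otherwise("adult").alias("bracket")).show()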
- - >>> df = spark.createDataFrame([(5,)], ['n']) - >>> df.select(factorial(df.n).alias('f')).collect() - [Row(f=120)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.factorial(_to_java_column(col))) - - -# --------------- Window functions ------------------------ - -@since(1.4) -def lag(col, count=1, default=None): - """ - Window function: returns the value that is `offset` rows before the current row, and - `defaultValue` if there is less than `offset` rows before the current row. For example, - an `offset` of one will return the previous row at any given point in the window partition. - - This is equivalent to the LAG function in SQL. - - :param col: name of column or expression - :param count: number of row to extend - :param default: default value - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.lag(_to_java_column(col), count, default)) - - -@since(1.4) -def lead(col, count=1, default=None): - """ - Window function: returns the value that is `offset` rows after the current row, and - `defaultValue` if there is less than `offset` rows after the current row. For example, - an `offset` of one will return the next row at any given point in the window partition. - - This is equivalent to the LEAD function in SQL. - - :param col: name of column or expression - :param count: number of row to extend - :param default: default value - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.lead(_to_java_column(col), count, default)) - - -@since(1.4) -def ntile(n): - """ - Window function: returns the ntile group id (from 1 to `n` inclusive) - in an ordered window partition. For example, if `n` is 4, the first - quarter of the rows will get value 1, the second quarter will get 2, - the third quarter will get 3, and the last quarter will get 4. - - This is equivalent to the NTILE function in SQL. - - :param n: an integer - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.ntile(int(n))) - - -# ---------------------- Date/Timestamp functions ------------------------------ - -@since(1.5) -def current_date(): - """ - Returns the current date as a :class:`DateType` column. - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.current_date()) - - -def current_timestamp(): - """ - Returns the current timestamp as a :class:`TimestampType` column. - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.current_timestamp()) - - -@ignore_unicode_prefix -@since(1.5) -def date_format(date, format): - """ - Converts a date/timestamp/string to a value of string in the format specified by the date - format given by the second argument. - - A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All - pattern letters of the Java class `java.text.SimpleDateFormat` can be used. - - .. note:: Use when ever possible specialized functions like `year`. These benefit from a - specialized implementation. - - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect() - [Row(date=u'04/08/2015')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.date_format(_to_java_column(date), format)) - - -@since(1.5) -def year(col): - """ - Extract the year of a given date as integer. 
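`lag`, `lead` and `ntile` above are window functions and need a window specification; a minimal sketch reusing the `sdf` doctest fixture (name, time):

from pyspark.sql import Window
from pyspark.sql.functions import lag, lead, ntile

# Sketch only: per-name window ordered by time.
w = Window.partitionBy("name").orderBy("time")
sdf.select("name", "time",
           lag("time", 1).over(w).alias("prev_time"),
           lead("time", 1).over(w).alias("next_time"),
           ntile(2).over(w).alias("half")).show()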
- - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(year('dt').alias('year')).collect() - [Row(year=2015)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.year(_to_java_column(col))) - - -@since(1.5) -def quarter(col): - """ - Extract the quarter of a given date as integer. - - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(quarter('dt').alias('quarter')).collect() - [Row(quarter=2)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.quarter(_to_java_column(col))) - - -@since(1.5) -def month(col): - """ - Extract the month of a given date as integer. - - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(month('dt').alias('month')).collect() - [Row(month=4)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.month(_to_java_column(col))) - - -@since(2.3) -def dayofweek(col): - """ - Extract the day of the week of a given date as integer. - - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(dayofweek('dt').alias('day')).collect() - [Row(day=4)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.dayofweek(_to_java_column(col))) - - -@since(1.5) -def dayofmonth(col): - """ - Extract the day of the month of a given date as integer. - - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(dayofmonth('dt').alias('day')).collect() - [Row(day=8)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.dayofmonth(_to_java_column(col))) - - -@since(1.5) -def dayofyear(col): - """ - Extract the day of the year of a given date as integer. - - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(dayofyear('dt').alias('day')).collect() - [Row(day=98)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.dayofyear(_to_java_column(col))) - - -@since(1.5) -def hour(col): - """ - Extract the hours of a given date as integer. - - >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts']) - >>> df.select(hour('ts').alias('hour')).collect() - [Row(hour=13)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.hour(_to_java_column(col))) - - -@since(1.5) -def minute(col): - """ - Extract the minutes of a given date as integer. - - >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts']) - >>> df.select(minute('ts').alias('minute')).collect() - [Row(minute=8)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.minute(_to_java_column(col))) - - -@since(1.5) -def second(col): - """ - Extract the seconds of a given date as integer. - - >>> df = spark.createDataFrame([('2015-04-08 13:08:15',)], ['ts']) - >>> df.select(second('ts').alias('second')).collect() - [Row(second=15)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.second(_to_java_column(col))) - - -@since(1.5) -def weekofyear(col): - """ - Extract the week number of a given date as integer. 
- - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(weekofyear(df.dt).alias('week')).collect() - [Row(week=15)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.weekofyear(_to_java_column(col))) - - -@since(1.5) -def date_add(start, days): - """ - Returns the date that is `days` days after `start` - - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(date_add(df.dt, 1).alias('next_date')).collect() - [Row(next_date=datetime.date(2015, 4, 9))] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.date_add(_to_java_column(start), days)) - - -@since(1.5) -def date_sub(start, days): - """ - Returns the date that is `days` days before `start` - - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(date_sub(df.dt, 1).alias('prev_date')).collect() - [Row(prev_date=datetime.date(2015, 4, 7))] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.date_sub(_to_java_column(start), days)) - - -@since(1.5) -def datediff(end, start): - """ - Returns the number of days from `start` to `end`. - - >>> df = spark.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2']) - >>> df.select(datediff(df.d2, df.d1).alias('diff')).collect() - [Row(diff=32)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.datediff(_to_java_column(end), _to_java_column(start))) - - -@since(1.5) -def add_months(start, months): - """ - Returns the date that is `months` months after `start` - - >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> df.select(add_months(df.dt, 1).alias('next_month')).collect() - [Row(next_month=datetime.date(2015, 5, 8))] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.add_months(_to_java_column(start), months)) - - -@since(1.5) -def months_between(date1, date2, roundOff=True): - """ - Returns number of months between dates date1 and date2. - If date1 is later than date2, then the result is positive. - If date1 and date2 are on the same day of month, or both are the last day of month, - returns an integer (time of day will be ignored). - The result is rounded off to 8 digits unless `roundOff` is set to `False`. - - >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2']) - >>> df.select(months_between(df.date1, df.date2).alias('months')).collect() - [Row(months=3.94959677)] - >>> df.select(months_between(df.date1, df.date2, False).alias('months')).collect() - [Row(months=3.9495967741935485)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.months_between( - _to_java_column(date1), _to_java_column(date2), roundOff)) - - -@since(2.2) -def to_date(col, format=None): - """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or - :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType` - using the optionally specified format. Specify formats according to - `SimpleDateFormats `_. - By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format - is omitted (equivalent to ``col.cast("date")``). 
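A small sketch combining the date-arithmetic helpers above (`date_add`, `date_sub`, `add_months`, `datediff`, `months_between`); illustrative only, not part of the original module, and assuming an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([('2015-04-08',)], ['dt'])
df.select(
    F.date_add('dt', 30).alias('plus_30_days'),
    F.date_sub('dt', 7).alias('minus_7_days'),
    F.add_months('dt', 3).alias('plus_3_months'),
    F.datediff(F.current_date(), 'dt').alias('age_days'),
    F.months_between(F.current_date(), 'dt').alias('age_months'),
).show()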
- - >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) - >>> df.select(to_date(df.t).alias('date')).collect() - [Row(date=datetime.date(1997, 2, 28))] - - >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) - >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect() - [Row(date=datetime.date(1997, 2, 28))] - """ - sc = SparkContext._active_spark_context - if format is None: - jc = sc._jvm.functions.to_date(_to_java_column(col)) - else: - jc = sc._jvm.functions.to_date(_to_java_column(col), format) - return Column(jc) - - -@since(2.2) -def to_timestamp(col, format=None): - """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or - :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType` - using the optionally specified format. Specify formats according to - `SimpleDateFormats `_. - By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format - is omitted (equivalent to ``col.cast("timestamp")``). - - >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) - >>> df.select(to_timestamp(df.t).alias('dt')).collect() - [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))] - - >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) - >>> df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).collect() - [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))] - """ - sc = SparkContext._active_spark_context - if format is None: - jc = sc._jvm.functions.to_timestamp(_to_java_column(col)) - else: - jc = sc._jvm.functions.to_timestamp(_to_java_column(col), format) - return Column(jc) - - -@since(1.5) -def trunc(date, format): - """ - Returns date truncated to the unit specified by the format. - - :param format: 'year', 'yyyy', 'yy' or 'month', 'mon', 'mm' - - >>> df = spark.createDataFrame([('1997-02-28',)], ['d']) - >>> df.select(trunc(df.d, 'year').alias('year')).collect() - [Row(year=datetime.date(1997, 1, 1))] - >>> df.select(trunc(df.d, 'mon').alias('month')).collect() - [Row(month=datetime.date(1997, 2, 1))] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.trunc(_to_java_column(date), format)) - - -@since(2.3) -def date_trunc(format, timestamp): - """ - Returns timestamp truncated to the unit specified by the format. - - :param format: 'year', 'yyyy', 'yy', 'month', 'mon', 'mm', - 'day', 'dd', 'hour', 'minute', 'second', 'week', 'quarter' - - >>> df = spark.createDataFrame([('1997-02-28 05:02:11',)], ['t']) - >>> df.select(date_trunc('year', df.t).alias('year')).collect() - [Row(year=datetime.datetime(1997, 1, 1, 0, 0))] - >>> df.select(date_trunc('mon', df.t).alias('month')).collect() - [Row(month=datetime.datetime(1997, 2, 1, 0, 0))] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.date_trunc(format, _to_java_column(timestamp))) - - -@since(1.5) -def next_day(date, dayOfWeek): - """ - Returns the first date which is later than the value of the date column. - - Day of the week parameter is case insensitive, and accepts: - "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun". - - >>> df = spark.createDataFrame([('2015-07-27',)], ['d']) - >>> df.select(next_day(df.d, 'Sun').alias('date')).collect() - [Row(date=datetime.date(2015, 8, 2))] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.next_day(_to_java_column(date), dayOfWeek)) - - -@since(1.5) -def last_day(date): - """ - Returns the last day of the month which the given date belongs to. 
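When the input string does not match the default pattern, the explicit-format variants of `to_date`/`to_timestamp` shown above are the usual route, often followed by truncation; a sketch, not part of the original module, assuming an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([('28/02/1997 10:30',)], ['raw'])
ts = F.to_timestamp('raw', 'dd/MM/yyyy HH:mm')
df.select(
    ts.alias('ts'),
    F.to_date('raw', 'dd/MM/yyyy HH:mm').alias('d'),
    F.date_trunc('hour', ts).alias('hour_start'),
    F.next_day(F.to_date('raw', 'dd/MM/yyyy HH:mm'), 'Mon').alias('next_monday'),
).show(truncate=False)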
- - >>> df = spark.createDataFrame([('1997-02-10',)], ['d']) - >>> df.select(last_day(df.d).alias('date')).collect() - [Row(date=datetime.date(1997, 2, 28))] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.last_day(_to_java_column(date))) - - -@ignore_unicode_prefix -@since(1.5) -def from_unixtime(timestamp, format="yyyy-MM-dd HH:mm:ss"): - """ - Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string - representing the timestamp of that moment in the current system time zone in the given - format. - - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") - >>> time_df = spark.createDataFrame([(1428476400,)], ['unix_time']) - >>> time_df.select(from_unixtime('unix_time').alias('ts')).collect() - [Row(ts=u'2015-04-08 00:00:00')] - >>> spark.conf.unset("spark.sql.session.timeZone") - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.from_unixtime(_to_java_column(timestamp), format)) - - -@since(1.5) -def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'): - """ - Convert time string with given pattern ('yyyy-MM-dd HH:mm:ss', by default) - to Unix time stamp (in seconds), using the default timezone and the default - locale, return null if fail. - - if `timestamp` is None, then it returns current timestamp. - - >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") - >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt']) - >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect() - [Row(unix_time=1428476400)] - >>> spark.conf.unset("spark.sql.session.timeZone") - """ - sc = SparkContext._active_spark_context - if timestamp is None: - return Column(sc._jvm.functions.unix_timestamp()) - return Column(sc._jvm.functions.unix_timestamp(_to_java_column(timestamp), format)) - - -@since(1.5) -def from_utc_timestamp(timestamp, tz): - """ - This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function - takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in UTC, and - renders that timestamp as a timestamp in the given time zone. - - However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not - timezone-agnostic. So in Spark this function just shift the timestamp value from UTC timezone to - the given timezone. - - This function may return confusing result if the input is a string with timezone, e.g. - '2018-03-13T06:18:23+00:00'. The reason is that, Spark firstly cast the string to timestamp - according to the timezone in the string, and finally display the result by converting the - timestamp to string according to the session local timezone. - - :param timestamp: the column that contains timestamps - :param tz: a string that has the ID of timezone, e.g. "GMT", "America/Los_Angeles", etc - - .. versionchanged:: 2.4 - `tz` can take a :class:`Column` containing timezone ID strings. 
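A round-trip sketch for the epoch helpers above (`from_unixtime`, `unix_timestamp`); results depend on `spark.sql.session.timeZone`, as the doctests note. Not part of the original module; assumes an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([(1428476400,)], ['epoch'])
df.select(
    F.from_unixtime('epoch').alias('as_string'),
    F.from_unixtime('epoch', 'yyyy/MM/dd').alias('formatted'),
    F.unix_timestamp(F.from_unixtime('epoch')).alias('back_to_epoch'),
).show(truncate=False)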
- - >>> df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz']) - >>> df.select(from_utc_timestamp(df.ts, "PST").alias('local_time')).collect() - [Row(local_time=datetime.datetime(1997, 2, 28, 2, 30))] - >>> df.select(from_utc_timestamp(df.ts, df.tz).alias('local_time')).collect() - [Row(local_time=datetime.datetime(1997, 2, 28, 19, 30))] - """ - sc = SparkContext._active_spark_context - if isinstance(tz, Column): - tz = _to_java_column(tz) - return Column(sc._jvm.functions.from_utc_timestamp(_to_java_column(timestamp), tz)) - - -@since(1.5) -def to_utc_timestamp(timestamp, tz): - """ - This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function - takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in the given - timezone, and renders that timestamp as a timestamp in UTC. - - However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not - timezone-agnostic. So in Spark this function just shift the timestamp value from the given - timezone to UTC timezone. - - This function may return confusing result if the input is a string with timezone, e.g. - '2018-03-13T06:18:23+00:00'. The reason is that, Spark firstly cast the string to timestamp - according to the timezone in the string, and finally display the result by converting the - timestamp to string according to the session local timezone. - - :param timestamp: the column that contains timestamps - :param tz: a string that has the ID of timezone, e.g. "GMT", "America/Los_Angeles", etc - - .. versionchanged:: 2.4 - `tz` can take a :class:`Column` containing timezone ID strings. - - >>> df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz']) - >>> df.select(to_utc_timestamp(df.ts, "PST").alias('utc_time')).collect() - [Row(utc_time=datetime.datetime(1997, 2, 28, 18, 30))] - >>> df.select(to_utc_timestamp(df.ts, df.tz).alias('utc_time')).collect() - [Row(utc_time=datetime.datetime(1997, 2, 28, 1, 30))] - """ - sc = SparkContext._active_spark_context - if isinstance(tz, Column): - tz = _to_java_column(tz) - return Column(sc._jvm.functions.to_utc_timestamp(_to_java_column(timestamp), tz)) - - -@since(2.0) -@ignore_unicode_prefix -def window(timeColumn, windowDuration, slideDuration=None, startTime=None): - """Bucketize rows into one or more time windows given a timestamp specifying column. Window - starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window - [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in - the order of months are not supported. - - The time column must be of :class:`pyspark.sql.types.TimestampType`. - - Durations are provided as strings, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid - interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'. - If the ``slideDuration`` is not provided, the windows will be tumbling windows. - - The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start - window intervals. For example, in order to have hourly tumbling windows that start 15 minutes - past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`. - - The output column will be a struct called 'window' by default with the nested columns 'start' - and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`. 
- - >>> df = spark.createDataFrame([("2016-03-11 09:00:07", 1)]).toDF("date", "val") - >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum")) - >>> w.select(w.window.start.cast("string").alias("start"), - ... w.window.end.cast("string").alias("end"), "sum").collect() - [Row(start=u'2016-03-11 09:00:05', end=u'2016-03-11 09:00:10', sum=1)] - """ - def check_string_field(field, fieldName): - if not field or type(field) is not str: - raise TypeError("%s should be provided as a string" % fieldName) - - sc = SparkContext._active_spark_context - time_col = _to_java_column(timeColumn) - check_string_field(windowDuration, "windowDuration") - if slideDuration and startTime: - check_string_field(slideDuration, "slideDuration") - check_string_field(startTime, "startTime") - res = sc._jvm.functions.window(time_col, windowDuration, slideDuration, startTime) - elif slideDuration: - check_string_field(slideDuration, "slideDuration") - res = sc._jvm.functions.window(time_col, windowDuration, slideDuration) - elif startTime: - check_string_field(startTime, "startTime") - res = sc._jvm.functions.window(time_col, windowDuration, windowDuration, startTime) - else: - res = sc._jvm.functions.window(time_col, windowDuration) - return Column(res) - - -# ---------------------------- misc functions ---------------------------------- - -@since(1.5) -@ignore_unicode_prefix -def crc32(col): - """ - Calculates the cyclic redundancy check value (CRC32) of a binary column and - returns the value as a bigint. - - >>> spark.createDataFrame([('ABC',)], ['a']).select(crc32('a').alias('crc32')).collect() - [Row(crc32=2743272264)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.crc32(_to_java_column(col))) - - -@ignore_unicode_prefix -@since(1.5) -def md5(col): - """Calculates the MD5 digest and returns the value as a 32 character hex string. - - >>> spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect() - [Row(hash=u'902fbdd2b1df0c4f70b4a5d23525e932')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.md5(_to_java_column(col)) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def sha1(col): - """Returns the hex string result of SHA-1. - - >>> spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect() - [Row(hash=u'3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.sha1(_to_java_column(col)) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def sha2(col, numBits): - """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, - and SHA-512). The numBits indicates the desired bit length of the result, which must have a - value of 224, 256, 384, 512, or 0 (which is equivalent to 256). - - >>> digests = df.select(sha2(df.name, 256).alias('s')).collect() - >>> digests[0] - Row(s=u'3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043') - >>> digests[1] - Row(s=u'cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961') - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.sha2(_to_java_column(col), numBits) - return Column(jc) - - -@since(2.0) -def hash(*cols): - """Calculates the hash code of given columns, and returns the result as an int column. 
- - >>> spark.createDataFrame([('ABC',)], ['a']).select(hash('a').alias('hash')).collect() - [Row(hash=-757602832)] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.hash(_to_seq(sc, cols, _to_java_column)) - return Column(jc) - - -# ---------------------- String/Binary functions ------------------------------ - -_string_functions = { - 'ascii': 'Computes the numeric value of the first character of the string column.', - 'base64': 'Computes the BASE64 encoding of a binary column and returns it as a string column.', - 'unbase64': 'Decodes a BASE64 encoded string column and returns it as a binary column.', - 'initcap': 'Returns a new string column by converting the first letter of each word to ' + - 'uppercase. Words are delimited by whitespace.', - 'lower': 'Converts a string column to lower case.', - 'upper': 'Converts a string column to upper case.', - 'ltrim': 'Trim the spaces from left end for the specified string value.', - 'rtrim': 'Trim the spaces from right end for the specified string value.', - 'trim': 'Trim the spaces from both ends for the specified string column.', -} - - -for _name, _doc in _string_functions.items(): - globals()[_name] = since(1.5)(_create_function(_name, _doc)) -del _name, _doc - - -@since(1.5) -@ignore_unicode_prefix -def concat_ws(sep, *cols): - """ - Concatenates multiple input string columns together into a single string column, - using the given separator. - - >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) - >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect() - [Row(s=u'abcd-123')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.concat_ws(sep, _to_seq(sc, cols, _to_java_column))) - - -@since(1.5) -def decode(col, charset): - """ - Computes the first argument into a string from a binary using the provided character set - (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.decode(_to_java_column(col), charset)) - - -@since(1.5) -def encode(col, charset): - """ - Computes the first argument into a binary from a string using the provided character set - (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.encode(_to_java_column(col), charset)) - - -@ignore_unicode_prefix -@since(1.5) -def format_number(col, d): - """ - Formats the number X to a format like '#,--#,--#.--', rounded to d decimal places - with HALF_EVEN round mode, and returns the result as a string. - - :param col: the column name of the numeric value to be formatted - :param d: the N decimal places - - >>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect() - [Row(v=u'5.0000')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.format_number(_to_java_column(col), d)) - - -@ignore_unicode_prefix -@since(1.5) -def format_string(format, *cols): - """ - Formats the arguments in printf-style and returns the result as a string column. 
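The one-line string helpers generated above (`trim`, `upper`, `base64`, ...) carry no doctests of their own, so a combined sketch may help; not part of the original module, and it assumes an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([('  Spark SQL  ', 'extra')], ['s', 't'])
df.select(
    F.trim('s').alias('trimmed'),                         # 'Spark SQL'
    F.upper(F.trim('s')).alias('shout'),                  # 'SPARK SQL'
    F.concat_ws('-', F.trim('s'), 't').alias('joined'),   # 'Spark SQL-extra'
    F.base64(F.encode('s', 'UTF-8')).alias('b64'),        # base64 of the UTF-8 bytes
    F.format_number(F.lit(1234.5678), 2).alias('pretty'), # '1,234.57'
).show(truncate=False)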
- - :param col: the column name of the numeric value to be formatted - :param d: the N decimal places - - >>> df = spark.createDataFrame([(5, "hello")], ['a', 'b']) - >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect() - [Row(v=u'5 hello')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.format_string(format, _to_seq(sc, cols, _to_java_column))) - - -@since(1.5) -def instr(str, substr): - """ - Locate the position of the first occurrence of substr column in the given string. - Returns null if either of the arguments are null. - - .. note:: The position is not zero based, but 1 based index. Returns 0 if substr - could not be found in str. - - >>> df = spark.createDataFrame([('abcd',)], ['s',]) - >>> df.select(instr(df.s, 'b').alias('s')).collect() - [Row(s=2)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.instr(_to_java_column(str), substr)) - - -@since(1.5) -@ignore_unicode_prefix -def substring(str, pos, len): - """ - Substring starts at `pos` and is of length `len` when str is String type or - returns the slice of byte array that starts at `pos` in byte and is of length `len` - when str is Binary type. - - .. note:: The position is not zero based, but 1 based index. - - >>> df = spark.createDataFrame([('abcd',)], ['s',]) - >>> df.select(substring(df.s, 1, 2).alias('s')).collect() - [Row(s=u'ab')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.substring(_to_java_column(str), pos, len)) - - -@since(1.5) -@ignore_unicode_prefix -def substring_index(str, delim, count): - """ - Returns the substring from string str before count occurrences of the delimiter delim. - If count is positive, everything the left of the final delimiter (counting from left) is - returned. If count is negative, every to the right of the final delimiter (counting from the - right) is returned. substring_index performs a case-sensitive match when searching for delim. - - >>> df = spark.createDataFrame([('a.b.c.d',)], ['s']) - >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect() - [Row(s=u'a.b')] - >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect() - [Row(s=u'b.c.d')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.substring_index(_to_java_column(str), delim, count)) - - -@ignore_unicode_prefix -@since(1.5) -def levenshtein(left, right): - """Computes the Levenshtein distance of the two given strings. - - >>> df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r']) - >>> df0.select(levenshtein('l', 'r').alias('d')).collect() - [Row(d=3)] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.levenshtein(_to_java_column(left), _to_java_column(right)) - return Column(jc) - - -@since(1.5) -def locate(substr, str, pos=1): - """ - Locate the position of the first occurrence of substr in a string column, after position pos. - - .. note:: The position is not zero based, but 1 based index. Returns 0 if substr - could not be found in str. 
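A sketch of the 1-based position semantics called out above, using `instr`, `substring` and `substring_index`; not part of the original module, and it assumes an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([('user@example.com',)], ['email'])
df.select(
    F.instr('email', '@').alias('at_pos'),                # 5 -- 1-based, 0 if absent
    F.substring('email', 1, 4).alias('local_part'),       # 'user'
    F.substring_index('email', '@', -1).alias('domain'),  # 'example.com'
    F.levenshtein(F.lit('kitten'), F.lit('sitting')).alias('dist'),  # 3
).show(truncate=False)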
- - :param substr: a string - :param str: a Column of :class:`pyspark.sql.types.StringType` - :param pos: start position (zero based) - - >>> df = spark.createDataFrame([('abcd',)], ['s',]) - >>> df.select(locate('b', df.s, 1).alias('s')).collect() - [Row(s=2)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.locate(substr, _to_java_column(str), pos)) - - -@since(1.5) -@ignore_unicode_prefix -def lpad(col, len, pad): - """ - Left-pad the string column to width `len` with `pad`. - - >>> df = spark.createDataFrame([('abcd',)], ['s',]) - >>> df.select(lpad(df.s, 6, '#').alias('s')).collect() - [Row(s=u'##abcd')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.lpad(_to_java_column(col), len, pad)) - - -@since(1.5) -@ignore_unicode_prefix -def rpad(col, len, pad): - """ - Right-pad the string column to width `len` with `pad`. - - >>> df = spark.createDataFrame([('abcd',)], ['s',]) - >>> df.select(rpad(df.s, 6, '#').alias('s')).collect() - [Row(s=u'abcd##')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.rpad(_to_java_column(col), len, pad)) - - -@since(1.5) -@ignore_unicode_prefix -def repeat(col, n): - """ - Repeats a string column n times, and returns it as a new string column. - - >>> df = spark.createDataFrame([('ab',)], ['s',]) - >>> df.select(repeat(df.s, 3).alias('s')).collect() - [Row(s=u'ababab')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.repeat(_to_java_column(col), n)) - - -@since(1.5) -@ignore_unicode_prefix -def split(str, pattern): - """ - Splits str around pattern (pattern is a regular expression). - - .. note:: pattern is a string represent the regular expression. - - >>> df = spark.createDataFrame([('ab12cd',)], ['s',]) - >>> df.select(split(df.s, '[0-9]+').alias('s')).collect() - [Row(s=[u'ab', u'cd'])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.split(_to_java_column(str), pattern)) - - -@ignore_unicode_prefix -@since(1.5) -def regexp_extract(str, pattern, idx): - r"""Extract a specific group matched by a Java regex, from the specified string column. - If the regex did not match, or the specified group did not match, an empty string is returned. - - >>> df = spark.createDataFrame([('100-200',)], ['str']) - >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect() - [Row(d=u'100')] - >>> df = spark.createDataFrame([('foo',)], ['str']) - >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect() - [Row(d=u'')] - >>> df = spark.createDataFrame([('aaaac',)], ['str']) - >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect() - [Row(d=u'')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def regexp_replace(str, pattern, replacement): - r"""Replace all substrings of the specified string value that match regexp with rep. - - >>> df = spark.createDataFrame([('100-200',)], ['str']) - >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect() - [Row(d=u'-----')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def initcap(col): - """Translate the first letter of each word to upper case in the sentence. 
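A combined sketch for the regex and padding helpers above; illustrative only, not part of the original module, assuming an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([('order-2015-04-08-0042',)], ['ref'])
df.select(
    F.regexp_extract('ref', r'(\d{4})-(\d{2})-(\d{2})', 1).alias('year'),  # '2015'
    F.regexp_replace('ref', r'\d', '#').alias('masked'),
    F.split('ref', '-').alias('parts'),
    F.lpad('ref', 25, '.').alias('padded'),
).show(truncate=False)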
- - >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect() - [Row(v=u'Ab Cd')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.initcap(_to_java_column(col))) - - -@since(1.5) -@ignore_unicode_prefix -def soundex(col): - """ - Returns the SoundEx encoding for a string - - >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name']) - >>> df.select(soundex(df.name).alias("soundex")).collect() - [Row(soundex=u'P362'), Row(soundex=u'U612')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.soundex(_to_java_column(col))) - - -@ignore_unicode_prefix -@since(1.5) -def bin(col): - """Returns the string representation of the binary value of the given column. - - >>> df.select(bin(df.age).alias('c')).collect() - [Row(c=u'10'), Row(c=u'101')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.bin(_to_java_column(col)) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def hex(col): - """Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`, - :class:`pyspark.sql.types.BinaryType`, :class:`pyspark.sql.types.IntegerType` or - :class:`pyspark.sql.types.LongType`. - - >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect() - [Row(hex(a)=u'414243', hex(b)=u'3')] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.hex(_to_java_column(col)) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.5) -def unhex(col): - """Inverse of hex. Interprets each pair of characters as a hexadecimal number - and converts to the byte representation of number. - - >>> spark.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect() - [Row(unhex(a)=bytearray(b'ABC'))] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.unhex(_to_java_column(col))) - - -@ignore_unicode_prefix -@since(1.5) -def length(col): - """Computes the character length of string data or number of bytes of binary data. - The length of character data includes the trailing spaces. The length of binary data - includes binary zeros. - - >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect() - [Row(length=4)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.length(_to_java_column(col))) - - -@ignore_unicode_prefix -@since(1.5) -def translate(srcCol, matching, replace): - """A function translate any character in the `srcCol` by a character in `matching`. - The characters in `replace` is corresponding to the characters in `matching`. - The translate will happen when any character in the string matching with the character - in the `matching`. - - >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\ - ... .alias('r')).collect() - [Row(r=u'1a2s3ae')] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.translate(_to_java_column(srcCol), matching, replace)) - - -# ---------------------- Collection functions ------------------------------ - -@ignore_unicode_prefix -@since(2.0) -def create_map(*cols): - """Creates a new map column. - - :param cols: list of column names (string) or list of :class:`Column` expressions that are - grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...). 
- - >>> df.select(create_map('name', 'age').alias("map")).collect() - [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})] - >>> df.select(create_map([df.name, df.age]).alias("map")).collect() - [Row(map={u'Alice': 2}), Row(map={u'Bob': 5})] - """ - sc = SparkContext._active_spark_context - if len(cols) == 1 and isinstance(cols[0], (list, set)): - cols = cols[0] - jc = sc._jvm.functions.map(_to_seq(sc, cols, _to_java_column)) - return Column(jc) - - -@since(2.4) -def map_from_arrays(col1, col2): - """Creates a new map from two arrays. - - :param col1: name of column containing a set of keys. All elements should not be null - :param col2: name of column containing a set of values - - >>> df = spark.createDataFrame([([2, 5], ['a', 'b'])], ['k', 'v']) - >>> df.select(map_from_arrays(df.k, df.v).alias("map")).show() - +----------------+ - | map| - +----------------+ - |[2 -> a, 5 -> b]| - +----------------+ - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.map_from_arrays(_to_java_column(col1), _to_java_column(col2))) - - -@since(1.4) -def array(*cols): - """Creates a new array column. - - :param cols: list of column names (string) or list of :class:`Column` expressions that have - the same data type. - - >>> df.select(array('age', 'age').alias("arr")).collect() - [Row(arr=[2, 2]), Row(arr=[5, 5])] - >>> df.select(array([df.age, df.age]).alias("arr")).collect() - [Row(arr=[2, 2]), Row(arr=[5, 5])] - """ - sc = SparkContext._active_spark_context - if len(cols) == 1 and isinstance(cols[0], (list, set)): - cols = cols[0] - jc = sc._jvm.functions.array(_to_seq(sc, cols, _to_java_column)) - return Column(jc) - - -@since(1.5) -def array_contains(col, value): - """ - Collection function: returns null if the array is null, true if the array contains the - given value, and false otherwise. - - :param col: name of column containing array - :param value: value to check for in array - - >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data']) - >>> df.select(array_contains(df.data, "a")).collect() - [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_contains(_to_java_column(col), value)) - - -@since(2.4) -def arrays_overlap(a1, a2): - """ - Collection function: returns true if the arrays contain any common non-null element; if not, - returns null if both the arrays are non-empty and any of them contains a null element; returns - false otherwise. - - >>> df = spark.createDataFrame([(["a", "b"], ["b", "c"]), (["a"], ["b", "c"])], ['x', 'y']) - >>> df.select(arrays_overlap(df.x, df.y).alias("overlap")).collect() - [Row(overlap=True), Row(overlap=False)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.arrays_overlap(_to_java_column(a1), _to_java_column(a2))) - - -@since(2.4) -def slice(x, start, length): - """ - Collection function: returns an array containing all the elements in `x` from index `start` - (or starting from the end if `start` is negative) with the specified `length`. - >>> df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x']) - >>> df.select(slice(df.x, 2, 2).alias("sliced")).collect() - [Row(sliced=[2, 3]), Row(sliced=[5])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.slice(_to_java_column(x), start, length)) - - -@ignore_unicode_prefix -@since(2.4) -def array_join(col, delimiter, null_replacement=None): - """ - Concatenates the elements of `column` using the `delimiter`. 
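A sketch tying together the constructors above (`array`, `create_map`) with `array_contains` and `slice`; not part of the original module, and it assumes an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([('Alice', 2), ('Bob', 5)], ['name', 'age'])
df.select(
    F.array('age', 'age').alias('ages'),
    F.create_map('name', 'age').alias('m'),
    F.array_contains(F.array('age', 'age'), 5).alias('has_5'),
    F.slice(F.array(F.lit(1), F.lit(2), F.lit(3)), 2, 2).alias('tail2'),  # [2, 3]
).show()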
Null values are replaced with - `null_replacement` if set, otherwise they are ignored. - - >>> df = spark.createDataFrame([(["a", "b", "c"],), (["a", None],)], ['data']) - >>> df.select(array_join(df.data, ",").alias("joined")).collect() - [Row(joined=u'a,b,c'), Row(joined=u'a')] - >>> df.select(array_join(df.data, ",", "NULL").alias("joined")).collect() - [Row(joined=u'a,b,c'), Row(joined=u'a,NULL')] - """ - sc = SparkContext._active_spark_context - if null_replacement is None: - return Column(sc._jvm.functions.array_join(_to_java_column(col), delimiter)) - else: - return Column(sc._jvm.functions.array_join( - _to_java_column(col), delimiter, null_replacement)) - - -@since(1.5) -@ignore_unicode_prefix -def concat(*cols): - """ - Concatenates multiple input columns together into a single column. - The function works with strings, binary and compatible array columns. - - >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) - >>> df.select(concat(df.s, df.d).alias('s')).collect() - [Row(s=u'abcd123')] - - >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c']) - >>> df.select(concat(df.a, df.b, df.c).alias("arr")).collect() - [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.concat(_to_seq(sc, cols, _to_java_column))) - - -@since(2.4) -def array_position(col, value): - """ - Collection function: Locates the position of the first occurrence of the given value - in the given array. Returns null if either of the arguments are null. - - .. note:: The position is not zero based, but 1 based index. Returns 0 if the given - value could not be found in the array. - - >>> df = spark.createDataFrame([(["c", "b", "a"],), ([],)], ['data']) - >>> df.select(array_position(df.data, "a")).collect() - [Row(array_position(data, a)=3), Row(array_position(data, a)=0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_position(_to_java_column(col), value)) - - -@ignore_unicode_prefix -@since(2.4) -def element_at(col, extraction): - """ - Collection function: Returns element of array at given index in extraction if col is array. - Returns value for the given key in extraction if col is map. - - :param col: name of column containing array or map - :param extraction: index to check for in array or key to check for in map - - .. note:: The position is not zero based, but 1 based index. - - >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data']) - >>> df.select(element_at(df.data, 1)).collect() - [Row(element_at(data, 1)=u'a'), Row(element_at(data, 1)=None)] - - >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},), ({},)], ['data']) - >>> df.select(element_at(df.data, "a")).collect() - [Row(element_at(data, a)=1.0), Row(element_at(data, a)=None)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.element_at(_to_java_column(col), extraction)) - - -@since(2.4) -def array_remove(col, element): - """ - Collection function: Remove all elements that equal to element from the given array. 
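A sketch of 1-based array access and map lookup with the functions above; not part of the original module, and it assumes an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([(['a', 'b', 'c'], {'x': 1.0})], ['arr', 'm'])
df.select(
    F.element_at('arr', 2).alias('second'),       # 'b' -- 1-based index
    F.element_at('m', 'x').alias('x_val'),        # 1.0 -- lookup by key
    F.array_position('arr', 'c').alias('pos_c'),  # 3
    F.array_join('arr', '/').alias('joined'),     # 'a/b/c'
    F.concat('arr', F.array(F.lit('d'))).alias('extended'),
).show()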
- - :param col: name of column containing array - :param element: element to be removed from the array - - >>> df = spark.createDataFrame([([1, 2, 3, 1, 1],), ([],)], ['data']) - >>> df.select(array_remove(df.data, 1)).collect() - [Row(array_remove(data, 1)=[2, 3]), Row(array_remove(data, 1)=[])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_remove(_to_java_column(col), element)) - - -@since(2.4) -def array_distinct(col): - """ - Collection function: removes duplicate values from the array. - :param col: name of column or expression - - >>> df = spark.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], ['data']) - >>> df.select(array_distinct(df.data)).collect() - [Row(array_distinct(data)=[1, 2, 3]), Row(array_distinct(data)=[4, 5])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_distinct(_to_java_column(col))) - - -@ignore_unicode_prefix -@since(2.4) -def array_intersect(col1, col2): - """ - Collection function: returns an array of the elements in the intersection of col1 and col2, - without duplicates. - - :param col1: name of column containing array - :param col2: name of column containing array - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) - >>> df.select(array_intersect(df.c1, df.c2)).collect() - [Row(array_intersect(c1, c2)=[u'a', u'c'])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_intersect(_to_java_column(col1), _to_java_column(col2))) - - -@ignore_unicode_prefix -@since(2.4) -def array_union(col1, col2): - """ - Collection function: returns an array of the elements in the union of col1 and col2, - without duplicates. - - :param col1: name of column containing array - :param col2: name of column containing array - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) - >>> df.select(array_union(df.c1, df.c2)).collect() - [Row(array_union(c1, c2)=[u'b', u'a', u'c', u'd', u'f'])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_union(_to_java_column(col1), _to_java_column(col2))) - - -@ignore_unicode_prefix -@since(2.4) -def array_except(col1, col2): - """ - Collection function: returns an array of the elements in col1 but not in col2, - without duplicates. - - :param col1: name of column containing array - :param col2: name of column containing array - - >>> from pyspark.sql import Row - >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) - >>> df.select(array_except(df.c1, df.c2)).collect() - [Row(array_except(c1, c2)=[u'b'])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_except(_to_java_column(col1), _to_java_column(col2))) - - -@since(1.4) -def explode(col): - """Returns a new row for each element in the given array or map. 
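The four set-style array operations above compose naturally; a sketch, not part of the original module, assuming an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([(['b', 'a', 'c', 'a'], ['c', 'd', 'a', 'f'])], ['c1', 'c2'])
df.select(
    F.array_distinct('c1').alias('distinct_c1'),   # ['b', 'a', 'c']
    F.array_intersect('c1', 'c2').alias('both'),   # ['a', 'c']
    F.array_union('c1', 'c2').alias('either'),     # ['b', 'a', 'c', 'd', 'f']
    F.array_except('c1', 'c2').alias('only_c1'),   # ['b']
).show(truncate=False)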
- - >>> from pyspark.sql import Row - >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})]) - >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect() - [Row(anInt=1), Row(anInt=2), Row(anInt=3)] - - >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show() - +---+-----+ - |key|value| - +---+-----+ - | a| b| - +---+-----+ - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.explode(_to_java_column(col)) - return Column(jc) - - -@since(2.1) -def posexplode(col): - """Returns a new row for each element with position in the given array or map. - - >>> from pyspark.sql import Row - >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})]) - >>> eDF.select(posexplode(eDF.intlist)).collect() - [Row(pos=0, col=1), Row(pos=1, col=2), Row(pos=2, col=3)] - - >>> eDF.select(posexplode(eDF.mapfield)).show() - +---+---+-----+ - |pos|key|value| - +---+---+-----+ - | 0| a| b| - +---+---+-----+ - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.posexplode(_to_java_column(col)) - return Column(jc) - - -@since(2.3) -def explode_outer(col): - """Returns a new row for each element in the given array or map. - Unlike explode, if the array/map is null or empty then null is produced. - - >>> df = spark.createDataFrame( - ... [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)], - ... ("id", "an_array", "a_map") - ... ) - >>> df.select("id", "an_array", explode_outer("a_map")).show() - +---+----------+----+-----+ - | id| an_array| key|value| - +---+----------+----+-----+ - | 1|[foo, bar]| x| 1.0| - | 2| []|null| null| - | 3| null|null| null| - +---+----------+----+-----+ - - >>> df.select("id", "a_map", explode_outer("an_array")).show() - +---+----------+----+ - | id| a_map| col| - +---+----------+----+ - | 1|[x -> 1.0]| foo| - | 1|[x -> 1.0]| bar| - | 2| []|null| - | 3| null|null| - +---+----------+----+ - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.explode_outer(_to_java_column(col)) - return Column(jc) - - -@since(2.3) -def posexplode_outer(col): - """Returns a new row for each element with position in the given array or map. - Unlike posexplode, if the array/map is null or empty then the row (null, null) is produced. - - >>> df = spark.createDataFrame( - ... [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)], - ... ("id", "an_array", "a_map") - ... ) - >>> df.select("id", "an_array", posexplode_outer("a_map")).show() - +---+----------+----+----+-----+ - | id| an_array| pos| key|value| - +---+----------+----+----+-----+ - | 1|[foo, bar]| 0| x| 1.0| - | 2| []|null|null| null| - | 3| null|null|null| null| - +---+----------+----+----+-----+ - >>> df.select("id", "a_map", posexplode_outer("an_array")).show() - +---+----------+----+----+ - | id| a_map| pos| col| - +---+----------+----+----+ - | 1|[x -> 1.0]| 0| foo| - | 1|[x -> 1.0]| 1| bar| - | 2| []|null|null| - | 3| null|null|null| - +---+----------+----+----+ - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.posexplode_outer(_to_java_column(col)) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.6) -def get_json_object(col, path): - """ - Extracts json object from a json string based on json path specified, and returns json string - of the extracted json object. It will return null if the input json string is invalid. 
- - :param col: string column in json format - :param path: path to the json object to extract - - >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] - >>> df = spark.createDataFrame(data, ("key", "jstring")) - >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\ - ... get_json_object(df.jstring, '$.f2').alias("c1") ).collect() - [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.get_json_object(_to_java_column(col), path) - return Column(jc) - - -@ignore_unicode_prefix -@since(1.6) -def json_tuple(col, *fields): - """Creates a new row for a json column according to the given field names. - - :param col: string column in json format - :param fields: list of fields to extract - - >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] - >>> df = spark.createDataFrame(data, ("key", "jstring")) - >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect() - [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)] - """ - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.json_tuple(_to_java_column(col), _to_seq(sc, fields)) - return Column(jc) - - -@ignore_unicode_prefix -@since(2.1) -def from_json(col, schema, options={}): - """ - Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType` - as keys type, :class:`StructType` or :class:`ArrayType` with - the specified schema. Returns `null`, in the case of an unparseable string. - - :param col: string column in json format - :param schema: a StructType or ArrayType of StructType to use when parsing the json column. - :param options: options to control parsing. accepts the same options as the json datasource - - .. note:: Since Spark 2.3, the DDL-formatted string or a JSON format string is also - supported for ``schema``. - - >>> from pyspark.sql.types import * - >>> data = [(1, '''{"a": 1}''')] - >>> schema = StructType([StructField("a", IntegerType())]) - >>> df = spark.createDataFrame(data, ("key", "value")) - >>> df.select(from_json(df.value, schema).alias("json")).collect() - [Row(json=Row(a=1))] - >>> df.select(from_json(df.value, "a INT").alias("json")).collect() - [Row(json=Row(a=1))] - >>> df.select(from_json(df.value, "MAP").alias("json")).collect() - [Row(json={u'a': 1})] - >>> data = [(1, '''[{"a": 1}]''')] - >>> schema = ArrayType(StructType([StructField("a", IntegerType())])) - >>> df = spark.createDataFrame(data, ("key", "value")) - >>> df.select(from_json(df.value, schema).alias("json")).collect() - [Row(json=[Row(a=1)])] - >>> schema = schema_of_json(lit('''{"a": 0}''')) - >>> df.select(from_json(df.value, schema).alias("json")).collect() - [Row(json=Row(a=1))] - >>> data = [(1, '''[1, 2, 3]''')] - >>> schema = ArrayType(IntegerType()) - >>> df = spark.createDataFrame(data, ("key", "value")) - >>> df.select(from_json(df.value, schema).alias("json")).collect() - [Row(json=[1, 2, 3])] - """ - - sc = SparkContext._active_spark_context - if isinstance(schema, DataType): - schema = schema.json() - elif isinstance(schema, Column): - schema = _to_java_column(schema) - jc = sc._jvm.functions.from_json(_to_java_column(col), schema, options) - return Column(jc) - - -@ignore_unicode_prefix -@since(2.1) -def to_json(col, options={}): - """ - Converts a column containing a :class:`StructType`, :class:`ArrayType` or a :class:`MapType` - into a JSON string. 
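A sketch of parsing a JSON string column with an explicit `StructType`, or alternatively a DDL string, then projecting the nested fields; not part of the original module, and it assumes an active SparkSession bound to `spark`.

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
df = spark.createDataFrame([('{"name": "Alice", "age": 2}',)], ['js'])
df.select(F.from_json('js', schema).alias('obj')).select('obj.name', 'obj.age').show()
# A DDL-formatted string works as well (Spark 2.3+), per the note above:
df.select(F.from_json('js', 'name STRING, age INT').alias('obj')).show(truncate=False)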
Throws an exception, in the case of an unsupported type. - - :param col: name of column containing a struct, an array or a map. - :param options: options to control converting. accepts the same options as the JSON datasource - - >>> from pyspark.sql import Row - >>> from pyspark.sql.types import * - >>> data = [(1, Row(name='Alice', age=2))] - >>> df = spark.createDataFrame(data, ("key", "value")) - >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'{"age":2,"name":"Alice"}')] - >>> data = [(1, [Row(name='Alice', age=2), Row(name='Bob', age=3)])] - >>> df = spark.createDataFrame(data, ("key", "value")) - >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')] - >>> data = [(1, {"name": "Alice"})] - >>> df = spark.createDataFrame(data, ("key", "value")) - >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'{"name":"Alice"}')] - >>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])] - >>> df = spark.createDataFrame(data, ("key", "value")) - >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'[{"name":"Alice"},{"name":"Bob"}]')] - >>> data = [(1, ["Alice", "Bob"])] - >>> df = spark.createDataFrame(data, ("key", "value")) - >>> df.select(to_json(df.value).alias("json")).collect() - [Row(json=u'["Alice","Bob"]')] - """ - - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.to_json(_to_java_column(col), options) - return Column(jc) - - -@ignore_unicode_prefix -@since(2.4) -def schema_of_json(json): - """ - Parses a JSON string and infers its schema in DDL format. - - :param json: a JSON string or a string literal containing a JSON string. - - >>> df = spark.range(1) - >>> df.select(schema_of_json('{"a": 0}').alias("json")).collect() - [Row(json=u'struct')] - """ - if isinstance(json, basestring): - col = _create_column_from_literal(json) - elif isinstance(json, Column): - col = _to_java_column(json) - else: - raise TypeError("schema argument should be a column or string") - - sc = SparkContext._active_spark_context - jc = sc._jvm.functions.schema_of_json(col) - return Column(jc) - - -@since(1.5) -def size(col): - """ - Collection function: returns the length of the array or map stored in the column. - - :param col: name of column or expression - - >>> df = spark.createDataFrame([([1, 2, 3],),([1],),([],)], ['data']) - >>> df.select(size(df.data)).collect() - [Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.size(_to_java_column(col))) - - -@since(2.4) -def array_min(col): - """ - Collection function: returns the minimum value of the array. - - :param col: name of column or expression - - >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data']) - >>> df.select(array_min(df.data).alias('min')).collect() - [Row(min=1), Row(min=-1)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_min(_to_java_column(col))) - - -@since(2.4) -def array_max(col): - """ - Collection function: returns the maximum value of the array. 
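One way to use `schema_of_json` is to infer a schema from a sample document and feed it straight back into `from_json`; a hedged sketch, not part of the original module, assuming an active SparkSession bound to `spark` (the sample literal stands in for whatever a real pipeline would sample).

from pyspark.sql import functions as F

sample = '{"a": 1, "b": [1, 2, 3]}'
df = spark.createDataFrame([(sample,)], ['js'])
inferred = F.schema_of_json(F.lit(sample))  # Column holding a DDL schema string
parsed = df.select(F.from_json('js', inferred).alias('obj'))
parsed.select('obj.a', F.size('obj.b').alias('b_len')).show()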
- - :param col: name of column or expression - - >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data']) - >>> df.select(array_max(df.data).alias('max')).collect() - [Row(max=3), Row(max=10)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_max(_to_java_column(col))) - - -@since(1.5) -def sort_array(col, asc=True): - """ - Collection function: sorts the input array in ascending or descending order according - to the natural ordering of the array elements. Null elements will be placed at the beginning - of the returned array in ascending order or at the end of the returned array in descending - order. - - :param col: name of column or expression - - >>> df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data']) - >>> df.select(sort_array(df.data).alias('r')).collect() - [Row(r=[None, 1, 2, 3]), Row(r=[1]), Row(r=[])] - >>> df.select(sort_array(df.data, asc=False).alias('r')).collect() - [Row(r=[3, 2, 1, None]), Row(r=[1]), Row(r=[])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.sort_array(_to_java_column(col), asc)) - - -@since(2.4) -def array_sort(col): - """ - Collection function: sorts the input array in ascending order. The elements of the input array - must be orderable. Null elements will be placed at the end of the returned array. - - :param col: name of column or expression - - >>> df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data']) - >>> df.select(array_sort(df.data).alias('r')).collect() - [Row(r=[1, 2, 3, None]), Row(r=[1]), Row(r=[])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_sort(_to_java_column(col))) - - -@since(2.4) -def shuffle(col): - """ - Collection function: Generates a random permutation of the given array. - - .. note:: The function is non-deterministic. - - :param col: name of column or expression - - >>> df = spark.createDataFrame([([1, 20, 3, 5],), ([1, 20, None, 3],)], ['data']) - >>> df.select(shuffle(df.data).alias('s')).collect() # doctest: +SKIP - [Row(s=[3, 1, 5, 20]), Row(s=[20, None, 3, 1])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.shuffle(_to_java_column(col))) - - -@since(1.5) -@ignore_unicode_prefix -def reverse(col): - """ - Collection function: returns a reversed string or an array with reverse order of elements. - - :param col: name of column or expression - - >>> df = spark.createDataFrame([('Spark SQL',)], ['data']) - >>> df.select(reverse(df.data).alias('s')).collect() - [Row(s=u'LQS krapS')] - >>> df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data']) - >>> df.select(reverse(df.data).alias('r')).collect() - [Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.reverse(_to_java_column(col))) - - -@since(2.4) -def flatten(col): - """ - Collection function: creates a single array from an array of arrays. - If a structure of nested arrays is deeper than two levels, - only one level of nesting is removed. 
- - :param col: name of column or expression - - >>> df = spark.createDataFrame([([[1, 2, 3], [4, 5], [6]],), ([None, [4, 5]],)], ['data']) - >>> df.select(flatten(df.data).alias('r')).collect() - [Row(r=[1, 2, 3, 4, 5, 6]), Row(r=None)] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.flatten(_to_java_column(col))) - - -@since(2.3) -def map_keys(col): - """ - Collection function: Returns an unordered array containing the keys of the map. - - :param col: name of column or expression - - >>> from pyspark.sql.functions import map_keys - >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data") - >>> df.select(map_keys("data").alias("keys")).show() - +------+ - | keys| - +------+ - |[1, 2]| - +------+ - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.map_keys(_to_java_column(col))) - - -@since(2.3) -def map_values(col): - """ - Collection function: Returns an unordered array containing the values of the map. - - :param col: name of column or expression - - >>> from pyspark.sql.functions import map_values - >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data") - >>> df.select(map_values("data").alias("values")).show() - +------+ - |values| - +------+ - |[a, b]| - +------+ - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.map_values(_to_java_column(col))) - - -@since(2.4) -def map_from_entries(col): - """ - Collection function: Returns a map created from the given array of entries. - - :param col: name of column or expression - - >>> from pyspark.sql.functions import map_from_entries - >>> df = spark.sql("SELECT array(struct(1, 'a'), struct(2, 'b')) as data") - >>> df.select(map_from_entries("data").alias("map")).show() - +----------------+ - | map| - +----------------+ - |[1 -> a, 2 -> b]| - +----------------+ - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.map_from_entries(_to_java_column(col))) - - -@ignore_unicode_prefix -@since(2.4) -def array_repeat(col, count): - """ - Collection function: creates an array containing a column repeated count times. - - >>> df = spark.createDataFrame([('ab',)], ['data']) - >>> df.select(array_repeat(df.data, 3).alias('r')).collect() - [Row(r=[u'ab', u'ab', u'ab'])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.array_repeat(_to_java_column(col), count)) - - -@since(2.4) -def arrays_zip(*cols): - """ - Collection function: Returns a merged array of structs in which the N-th struct contains all - N-th values of input arrays. - - :param cols: columns of arrays to be merged. - - >>> from pyspark.sql.functions import arrays_zip - >>> df = spark.createDataFrame([(([1, 2, 3], [2, 3, 4]))], ['vals1', 'vals2']) - >>> df.select(arrays_zip(df.vals1, df.vals2).alias('zipped')).collect() - [Row(zipped=[Row(vals1=1, vals2=2), Row(vals1=2, vals2=3), Row(vals1=3, vals2=4)])] - """ - sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.arrays_zip(_to_seq(sc, cols, _to_java_column))) - - -@since(2.4) -def map_concat(*cols): - """Returns the union of all the given maps. 
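A common pattern with the helpers above is zipping parallel arrays and exploding the result into one row per pair; a sketch, not part of the original module, assuming an active SparkSession bound to `spark`.

from pyspark.sql import functions as F

df = spark.createDataFrame([([1, 2, 3], ['a', 'b', 'c'])], ['ids', 'names'])
zipped = df.select(F.explode(F.arrays_zip('ids', 'names')).alias('pair'))
zipped.select('pair.ids', 'pair.names').show()
# The same pair of arrays can also become a map keyed by id:
df.select(F.map_from_arrays('ids', 'names').alias('m')).show(truncate=False)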
- - :param cols: list of column names (string) or list of :class:`Column` expressions - - >>> from pyspark.sql.functions import map_concat - >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c', 1, 'd') as map2") - >>> df.select(map_concat("map1", "map2").alias("map3")).show(truncate=False) - +--------------------------------+ - |map3 | - +--------------------------------+ - |[1 -> a, 2 -> b, 3 -> c, 1 -> d]| - +--------------------------------+ - """ - sc = SparkContext._active_spark_context - if len(cols) == 1 and isinstance(cols[0], (list, set)): - cols = cols[0] - jc = sc._jvm.functions.map_concat(_to_seq(sc, cols, _to_java_column)) - return Column(jc) - - -@since(2.4) -def sequence(start, stop, step=None): - """ - Generate a sequence of integers from `start` to `stop`, incrementing by `step`. - If `step` is not set, incrementing by 1 if `start` is less than or equal to `stop`, - otherwise -1. - - >>> df1 = spark.createDataFrame([(-2, 2)], ('C1', 'C2')) - >>> df1.select(sequence('C1', 'C2').alias('r')).collect() - [Row(r=[-2, -1, 0, 1, 2])] - >>> df2 = spark.createDataFrame([(4, -4, -2)], ('C1', 'C2', 'C3')) - >>> df2.select(sequence('C1', 'C2', 'C3').alias('r')).collect() - [Row(r=[4, 2, 0, -2, -4])] - """ - sc = SparkContext._active_spark_context - if step is None: - return Column(sc._jvm.functions.sequence(_to_java_column(start), _to_java_column(stop))) - else: - return Column(sc._jvm.functions.sequence( - _to_java_column(start), _to_java_column(stop), _to_java_column(step))) - - -# ---------------------------- User Defined Function ---------------------------------- - -class PandasUDFType(object): - """Pandas UDF Types. See :meth:`pyspark.sql.functions.pandas_udf`. - """ - SCALAR = PythonEvalType.SQL_SCALAR_PANDAS_UDF - - GROUPED_MAP = PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF - - GROUPED_AGG = PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF - - -@since(1.3) -def udf(f=None, returnType=StringType()): - """Creates a user defined function (UDF). - - .. note:: The user-defined functions are considered deterministic by default. Due to - optimization, duplicate invocations may be eliminated or the function may even be invoked - more times than it is present in the query. If your function is not deterministic, call - `asNondeterministic` on the user defined function. E.g.: - - >>> from pyspark.sql.types import IntegerType - >>> import random - >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic() - - .. note:: The user-defined functions do not support conditional expressions or short circuiting - in boolean expressions and it ends up with being executed all internally. If the functions - can fail on special rows, the workaround is to incorporate the condition into the functions. - - .. note:: The user-defined functions do not take keyword arguments on the calling side. - - :param f: python function if used as a standalone function - :param returnType: the return type of the user-defined function. The value can be either a - :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. - - >>> from pyspark.sql.types import IntegerType - >>> slen = udf(lambda s: len(s), IntegerType()) - >>> @udf - ... def to_upper(s): - ... if s is not None: - ... return s.upper() - ... - >>> @udf(returnType=IntegerType()) - ... def add_one(x): - ... if x is not None: - ... return x + 1 - ... 
- >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) - >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show() - +----------+--------------+------------+ - |slen(name)|to_upper(name)|add_one(age)| - +----------+--------------+------------+ - | 8| JOHN DOE| 22| - +----------+--------------+------------+ - """ - # decorator @udf, @udf(), @udf(dataType()) - if f is None or isinstance(f, (str, DataType)): - # If DataType has been passed as a positional argument - # for decorator use it as a returnType - return_type = f or returnType - return functools.partial(_create_udf, returnType=return_type, - evalType=PythonEvalType.SQL_BATCHED_UDF) - else: - return _create_udf(f=f, returnType=returnType, - evalType=PythonEvalType.SQL_BATCHED_UDF) - - -@since(2.3) -def pandas_udf(f=None, returnType=None, functionType=None): - """ - Creates a vectorized user defined function (UDF). - - :param f: user-defined function. A python function if used as a standalone function - :param returnType: the return type of the user-defined function. The value can be either a - :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. - :param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`. - Default: SCALAR. - - .. note:: Experimental - - The function type of the UDF can be one of the following: - - 1. SCALAR - - A scalar UDF defines a transformation: One or more `pandas.Series` -> A `pandas.Series`. - The length of the returned `pandas.Series` must be of the same as the input `pandas.Series`. - - :class:`MapType`, :class:`StructType` are currently not supported as output types. - - Scalar UDFs are used with :meth:`pyspark.sql.DataFrame.withColumn` and - :meth:`pyspark.sql.DataFrame.select`. - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> from pyspark.sql.types import IntegerType, StringType - >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) # doctest: +SKIP - >>> @pandas_udf(StringType()) # doctest: +SKIP - ... def to_upper(s): - ... return s.str.upper() - ... - >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP - ... def add_one(x): - ... return x + 1 - ... - >>> df = spark.createDataFrame([(1, "John Doe", 21)], - ... ("id", "name", "age")) # doctest: +SKIP - >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ - ... .show() # doctest: +SKIP - +----------+--------------+------------+ - |slen(name)|to_upper(name)|add_one(age)| - +----------+--------------+------------+ - | 8| JOHN DOE| 22| - +----------+--------------+------------+ - - .. note:: The length of `pandas.Series` within a scalar UDF is not that of the whole input - column, but is the length of an internal batch used for each call to the function. - Therefore, this can be used, for example, to ensure the length of each returned - `pandas.Series`, and can not be used as the column length. - - 2. GROUPED_MAP - - A grouped map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame` - The returnType should be a :class:`StructType` describing the schema of the returned - `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match - the field names in the defined returnType schema if specified as strings, or match the - field data types by position if not strings, e.g. integer indices. - The length of the returned `pandas.DataFrame` can be arbitrary. - - Grouped map UDFs are used with :meth:`pyspark.sql.GroupedData.apply`. 
- - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> df = spark.createDataFrame( - ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) # doctest: +SKIP - >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) # doctest: +SKIP - ... def normalize(pdf): - ... v = pdf.v - ... return pdf.assign(v=(v - v.mean()) / v.std()) - >>> df.groupby("id").apply(normalize).show() # doctest: +SKIP - +---+-------------------+ - | id| v| - +---+-------------------+ - | 1|-0.7071067811865475| - | 1| 0.7071067811865475| - | 2|-0.8320502943378437| - | 2|-0.2773500981126146| - | 2| 1.1094003924504583| - +---+-------------------+ - - Alternatively, the user can define a function that takes two arguments. - In this case, the grouping key(s) will be passed as the first argument and the data will - be passed as the second argument. The grouping key(s) will be passed as a tuple of numpy - data types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in - as a `pandas.DataFrame` containing all columns from the original Spark DataFrame. - This is useful when the user does not want to hardcode grouping key(s) in the function. - - >>> import pandas as pd # doctest: +SKIP - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> df = spark.createDataFrame( - ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) # doctest: +SKIP - >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) # doctest: +SKIP - ... def mean_udf(key, pdf): - ... # key is a tuple of one numpy.int64, which is the value - ... # of 'id' for the current group - ... return pd.DataFrame([key + (pdf.v.mean(),)]) - >>> df.groupby('id').apply(mean_udf).show() # doctest: +SKIP - +---+---+ - | id| v| - +---+---+ - | 1|1.5| - | 2|6.0| - +---+---+ - >>> @pandas_udf( - ... "id long, `ceil(v / 2)` long, v double", - ... PandasUDFType.GROUPED_MAP) # doctest: +SKIP - >>> def sum_udf(key, pdf): - ... # key is a tuple of two numpy.int64s, which is the values - ... # of 'id' and 'ceil(df.v / 2)' for the current group - ... return pd.DataFrame([key + (pdf.v.sum(),)]) - >>> df.groupby(df.id, ceil(df.v / 2)).apply(sum_udf).show() # doctest: +SKIP - +---+-----------+----+ - | id|ceil(v / 2)| v| - +---+-----------+----+ - | 2| 5|10.0| - | 1| 1| 3.0| - | 2| 3| 5.0| - | 2| 2| 3.0| - +---+-----------+----+ - - .. note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is - recommended to explicitly index the columns by name to ensure the positions are correct, - or alternatively use an `OrderedDict`. - For example, `pd.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])` or - `pd.DataFrame(OrderedDict([('id', ids), ('a', data)]))`. - - .. seealso:: :meth:`pyspark.sql.GroupedData.apply` - - 3. GROUPED_AGG - - A grouped aggregate UDF defines a transformation: One or more `pandas.Series` -> A scalar - The `returnType` should be a primitive data type, e.g., :class:`DoubleType`. - The returned scalar can be either a python primitive type, e.g., `int` or `float` - or a numpy data type, e.g., `numpy.int64` or `numpy.float64`. - - :class:`MapType` and :class:`StructType` are currently not supported as output types. - - Group aggregate UDFs are used with :meth:`pyspark.sql.GroupedData.agg` and - :class:`pyspark.sql.Window` - - This example shows using grouped aggregated UDFs with groupby: - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> df = spark.createDataFrame( - ... 
[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) - >>> @pandas_udf("double", PandasUDFType.GROUPED_AGG) # doctest: +SKIP - ... def mean_udf(v): - ... return v.mean() - >>> df.groupby("id").agg(mean_udf(df['v'])).show() # doctest: +SKIP - +---+-----------+ - | id|mean_udf(v)| - +---+-----------+ - | 1| 1.5| - | 2| 6.0| - +---+-----------+ - - This example shows using grouped aggregated UDFs as window functions. Note that only - unbounded window frame is supported at the moment: - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> from pyspark.sql import Window - >>> df = spark.createDataFrame( - ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) - >>> @pandas_udf("double", PandasUDFType.GROUPED_AGG) # doctest: +SKIP - ... def mean_udf(v): - ... return v.mean() - >>> w = Window \\ - ... .partitionBy('id') \\ - ... .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) - >>> df.withColumn('mean_v', mean_udf(df['v']).over(w)).show() # doctest: +SKIP - +---+----+------+ - | id| v|mean_v| - +---+----+------+ - | 1| 1.0| 1.5| - | 1| 2.0| 1.5| - | 2| 3.0| 6.0| - | 2| 5.0| 6.0| - | 2|10.0| 6.0| - +---+----+------+ - - .. seealso:: :meth:`pyspark.sql.GroupedData.agg` and :class:`pyspark.sql.Window` - - .. note:: The user-defined functions are considered deterministic by default. Due to - optimization, duplicate invocations may be eliminated or the function may even be invoked - more times than it is present in the query. If your function is not deterministic, call - `asNondeterministic` on the user defined function. E.g.: - - >>> @pandas_udf('double', PandasUDFType.SCALAR) # doctest: +SKIP - ... def random(v): - ... import numpy as np - ... import pandas as pd - ... return pd.Series(np.random.randn(len(v))) - >>> random = random.asNondeterministic() # doctest: +SKIP - - .. note:: The user-defined functions do not support conditional expressions or short circuiting - in boolean expressions and it ends up with being executed all internally. If the functions - can fail on special rows, the workaround is to incorporate the condition into the functions. - - .. note:: The user-defined functions do not take keyword arguments on the calling side.
- """ - # decorator @pandas_udf(returnType, functionType) - is_decorator = f is None or isinstance(f, (str, DataType)) - - if is_decorator: - # If DataType has been passed as a positional argument - # for decorator use it as a returnType - return_type = f or returnType - - if functionType is not None: - # @pandas_udf(dataType, functionType=functionType) - # @pandas_udf(returnType=dataType, functionType=functionType) - eval_type = functionType - elif returnType is not None and isinstance(returnType, int): - # @pandas_udf(dataType, functionType) - eval_type = returnType - else: - # @pandas_udf(dataType) or @pandas_udf(returnType=dataType) - eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF - else: - return_type = returnType - - if functionType is not None: - eval_type = functionType - else: - eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF - - if return_type is None: - raise ValueError("Invalid returnType: returnType can not be None") - - if eval_type not in [PythonEvalType.SQL_SCALAR_PANDAS_UDF, - PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, - PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]: - raise ValueError("Invalid functionType: " - "functionType must be one the values from PandasUDFType") - - if is_decorator: - return functools.partial(_create_udf, returnType=return_type, evalType=eval_type) - else: - return _create_udf(f=f, returnType=return_type, evalType=eval_type) - - -blacklist = ['map', 'since', 'ignore_unicode_prefix'] -__all__ = [k for k, v in globals().items() - if not k.startswith('_') and k[0].islower() and callable(v) and k not in blacklist] -__all__ += ["PandasUDFType"] -__all__.sort() - - -def _test(): - import doctest - from pyspark.sql import Row, SparkSession - import pyspark.sql.functions - globs = pyspark.sql.functions.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("sql.functions tests")\ - .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc - globs['spark'] = spark - globs['df'] = spark.createDataFrame([Row(name='Alice', age=2), Row(name='Bob', age=5)]) - (failure_count, test_count) = doctest.testmod( - pyspark.sql.functions, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/group.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/group.py deleted file mode 100644 index cc1da8e..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/group.py +++ /dev/null @@ -1,317 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import sys - -from pyspark import since -from pyspark.rdd import ignore_unicode_prefix, PythonEvalType -from pyspark.sql.column import Column, _to_seq -from pyspark.sql.dataframe import DataFrame -from pyspark.sql.types import * - -__all__ = ["GroupedData"] - - -def dfapi(f): - def _api(self): - name = f.__name__ - jdf = getattr(self._jgd, name)() - return DataFrame(jdf, self.sql_ctx) - _api.__name__ = f.__name__ - _api.__doc__ = f.__doc__ - return _api - - -def df_varargs_api(f): - def _api(self, *cols): - name = f.__name__ - jdf = getattr(self._jgd, name)(_to_seq(self.sql_ctx._sc, cols)) - return DataFrame(jdf, self.sql_ctx) - _api.__name__ = f.__name__ - _api.__doc__ = f.__doc__ - return _api - - -class GroupedData(object): - """ - A set of methods for aggregations on a :class:`DataFrame`, - created by :func:`DataFrame.groupBy`. - - .. note:: Experimental - - .. versionadded:: 1.3 - """ - - def __init__(self, jgd, df): - self._jgd = jgd - self._df = df - self.sql_ctx = df.sql_ctx - - @ignore_unicode_prefix - @since(1.3) - def agg(self, *exprs): - """Compute aggregates and returns the result as a :class:`DataFrame`. - - The available aggregate functions can be: - - 1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count` - - 2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf` - - .. note:: There is no partial aggregation with group aggregate UDFs, i.e., - a full shuffle is required. Also, all the data of a group will be loaded into - memory, so the user should be aware of the potential OOM risk if data is skewed - and certain groups are too large to fit in memory. - - .. seealso:: :func:`pyspark.sql.functions.pandas_udf` - - If ``exprs`` is a single :class:`dict` mapping from string to string, then the key - is the column to perform aggregation on, and the value is the aggregate function. - - Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions. - - .. note:: Built-in aggregation functions and group aggregate pandas UDFs cannot be mixed - in a single call to this function. - - :param exprs: a dict mapping from column name (string) to aggregate functions (string), - or a list of :class:`Column`. - - >>> gdf = df.groupBy(df.name) - >>> sorted(gdf.agg({"*": "count"}).collect()) - [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)] - - >>> from pyspark.sql import functions as F - >>> sorted(gdf.agg(F.min(df.age)).collect()) - [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)] - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> @pandas_udf('int', PandasUDFType.GROUPED_AGG) # doctest: +SKIP - ... def min_udf(v): - ... return v.min() - >>> sorted(gdf.agg(min_udf(df.age)).collect()) # doctest: +SKIP - [Row(name=u'Alice', min_udf(age)=2), Row(name=u'Bob', min_udf(age)=5)] - """ - assert exprs, "exprs should not be empty" - if len(exprs) == 1 and isinstance(exprs[0], dict): - jdf = self._jgd.agg(exprs[0]) - else: - # Columns - assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column" - jdf = self._jgd.agg(exprs[0]._jc, - _to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]])) - return DataFrame(jdf, self.sql_ctx) - - @dfapi - @since(1.3) - def count(self): - """Counts the number of records for each group. - - >>> sorted(df.groupBy(df.age).count().collect()) - [Row(age=2, count=1), Row(age=5, count=1)] - """ - - @df_varargs_api - @since(1.3) - def mean(self, *cols): - """Computes average values for each numeric columns for each group. 
- - :func:`mean` is an alias for :func:`avg`. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df.groupBy().mean('age').collect() - [Row(avg(age)=3.5)] - >>> df3.groupBy().mean('age', 'height').collect() - [Row(avg(age)=3.5, avg(height)=82.5)] - """ - - @df_varargs_api - @since(1.3) - def avg(self, *cols): - """Computes average values for each numeric columns for each group. - - :func:`mean` is an alias for :func:`avg`. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df.groupBy().avg('age').collect() - [Row(avg(age)=3.5)] - >>> df3.groupBy().avg('age', 'height').collect() - [Row(avg(age)=3.5, avg(height)=82.5)] - """ - - @df_varargs_api - @since(1.3) - def max(self, *cols): - """Computes the max value for each numeric columns for each group. - - >>> df.groupBy().max('age').collect() - [Row(max(age)=5)] - >>> df3.groupBy().max('age', 'height').collect() - [Row(max(age)=5, max(height)=85)] - """ - - @df_varargs_api - @since(1.3) - def min(self, *cols): - """Computes the min value for each numeric column for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df.groupBy().min('age').collect() - [Row(min(age)=2)] - >>> df3.groupBy().min('age', 'height').collect() - [Row(min(age)=2, min(height)=80)] - """ - - @df_varargs_api - @since(1.3) - def sum(self, *cols): - """Compute the sum for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df.groupBy().sum('age').collect() - [Row(sum(age)=7)] - >>> df3.groupBy().sum('age', 'height').collect() - [Row(sum(age)=7, sum(height)=165)] - """ - - @since(1.6) - def pivot(self, pivot_col, values=None): - """ - Pivots a column of the current :class:`DataFrame` and perform the specified aggregation. - There are two versions of pivot function: one that requires the caller to specify the list - of distinct values to pivot on, and one that does not. The latter is more concise but less - efficient, because Spark needs to first compute the list of distinct values internally. - - :param pivot_col: Name of the column to pivot. - :param values: List of values that will be translated to columns in the output DataFrame. - - # Compute the sum of earnings for each year by course with each course as a separate column - - >>> df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect() - [Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)] - - # Or without specifying column values (less efficient) - - >>> df4.groupBy("year").pivot("course").sum("earnings").collect() - [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)] - >>> df5.groupBy("sales.year").pivot("sales.course").sum("sales.earnings").collect() - [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)] - """ - if values is None: - jgd = self._jgd.pivot(pivot_col) - else: - jgd = self._jgd.pivot(pivot_col, values) - return GroupedData(jgd, self._df) - - @since(2.3) - def apply(self, udf): - """ - Maps each group of the current :class:`DataFrame` using a pandas udf and returns the result - as a `DataFrame`. - - The user-defined function should take a `pandas.DataFrame` and return another - `pandas.DataFrame`. For each group, all columns are passed together as a `pandas.DataFrame` - to the user-function and the returned `pandas.DataFrame` are combined as a - :class:`DataFrame`. 
- - The returned `pandas.DataFrame` can be of arbitrary length and its schema must match the - returnType of the pandas udf. - - .. note:: This function requires a full shuffle. all the data of a group will be loaded - into memory, so the user should be aware of the potential OOM risk if data is skewed - and certain groups are too large to fit in memory. - - .. note:: Experimental - - :param udf: a grouped map user-defined function returned by - :func:`pyspark.sql.functions.pandas_udf`. - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> df = spark.createDataFrame( - ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) - >>> @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP) # doctest: +SKIP - ... def normalize(pdf): - ... v = pdf.v - ... return pdf.assign(v=(v - v.mean()) / v.std()) - >>> df.groupby("id").apply(normalize).show() # doctest: +SKIP - +---+-------------------+ - | id| v| - +---+-------------------+ - | 1|-0.7071067811865475| - | 1| 0.7071067811865475| - | 2|-0.8320502943378437| - | 2|-0.2773500981126146| - | 2| 1.1094003924504583| - +---+-------------------+ - - .. seealso:: :meth:`pyspark.sql.functions.pandas_udf` - - """ - # Columns are special because hasattr always return True - if isinstance(udf, Column) or not hasattr(udf, 'func') \ - or udf.evalType != PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: - raise ValueError("Invalid udf: the udf argument must be a pandas_udf of type " - "GROUPED_MAP.") - df = self._df - udf_column = udf(*[df[col] for col in df.columns]) - jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr()) - return DataFrame(jdf, self.sql_ctx) - - -def _test(): - import doctest - from pyspark.sql import Row, SparkSession - import pyspark.sql.group - globs = pyspark.sql.group.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("sql.group tests")\ - .getOrCreate() - sc = spark.sparkContext - globs['sc'] = sc - globs['spark'] = spark - globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ - .toDF(StructType([StructField('age', IntegerType()), - StructField('name', StringType())])) - globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80), - Row(name='Bob', age=5, height=85)]).toDF() - globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000), - Row(course="Java", year=2012, earnings=20000), - Row(course="dotNET", year=2012, earnings=5000), - Row(course="dotNET", year=2013, earnings=48000), - Row(course="Java", year=2013, earnings=30000)]).toDF() - globs['df5'] = sc.parallelize([ - Row(training="expert", sales=Row(course="dotNET", year=2012, earnings=10000)), - Row(training="junior", sales=Row(course="Java", year=2012, earnings=20000)), - Row(training="expert", sales=Row(course="dotNET", year=2012, earnings=5000)), - Row(training="junior", sales=Row(course="dotNET", year=2013, earnings=48000)), - Row(training="expert", sales=Row(course="Java", year=2013, earnings=30000))]).toDF() - - (failure_count, test_count) = doctest.testmod( - pyspark.sql.group, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/readwriter.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/readwriter.py deleted file mode 100644 index c25426c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/readwriter.py +++ /dev/null @@ -1,1017 +0,0 @@ -# 
-# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys - -if sys.version >= '3': - basestring = unicode = str - -from py4j.java_gateway import JavaClass - -from pyspark import RDD, since -from pyspark.rdd import ignore_unicode_prefix -from pyspark.sql.column import _to_seq -from pyspark.sql.types import * -from pyspark.sql import utils - -__all__ = ["DataFrameReader", "DataFrameWriter"] - - -def to_str(value): - """ - A wrapper over str(), but converts bool values to lower case strings. - If None is given, just returns None, instead of converting it to string "None". - """ - if isinstance(value, bool): - return str(value).lower() - elif value is None: - return value - else: - return str(value) - - -class OptionUtils(object): - - def _set_opts(self, schema=None, **options): - """ - Set named options (filter out those the value is None) - """ - if schema is not None: - self.schema(schema) - for k, v in options.items(): - if v is not None: - self.option(k, v) - - -class DataFrameReader(OptionUtils): - """ - Interface used to load a :class:`DataFrame` from external storage systems - (e.g. file systems, key-value stores, etc). Use :func:`spark.read` - to access this. - - .. versionadded:: 1.4 - """ - - def __init__(self, spark): - self._jreader = spark._ssql_ctx.read() - self._spark = spark - - def _df(self, jdf): - from pyspark.sql.dataframe import DataFrame - return DataFrame(jdf, self._spark) - - @since(1.4) - def format(self, source): - """Specifies the input data source format. - - :param source: string, name of the data source, e.g. 'json', 'parquet'. - - >>> df = spark.read.format('json').load('python/test_support/sql/people.json') - >>> df.dtypes - [('age', 'bigint'), ('name', 'string')] - - """ - self._jreader = self._jreader.format(source) - return self - - @since(1.4) - def schema(self, schema): - """Specifies the input schema. - - Some data sources (e.g. JSON) can infer the input schema automatically from data. - By specifying the schema here, the underlying data source can skip the schema - inference step, and thus speed up data loading. - - :param schema: a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string - (For example ``col0 INT, col1 DOUBLE``). - - >>> s = spark.read.schema("col0 INT, col1 DOUBLE") - """ - from pyspark.sql import SparkSession - spark = SparkSession.builder.getOrCreate() - if isinstance(schema, StructType): - jschema = spark._jsparkSession.parseDataType(schema.json()) - self._jreader = self._jreader.schema(jschema) - elif isinstance(schema, basestring): - self._jreader = self._jreader.schema(schema) - else: - raise TypeError("schema should be StructType or string") - return self - - @since(1.5) - def option(self, key, value): - """Adds an input option for the underlying data source. 
- - You can set the following option(s) for reading files: - * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps - in the JSON/CSV datasources or partition values. - If it isn't set, it uses the default value, session local timezone. - """ - self._jreader = self._jreader.option(key, to_str(value)) - return self - - @since(1.4) - def options(self, **options): - """Adds input options for the underlying data source. - - You can set the following option(s) for reading files: - * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps - in the JSON/CSV datasources or partition values. - If it isn't set, it uses the default value, session local timezone. - """ - for k in options: - self._jreader = self._jreader.option(k, to_str(options[k])) - return self - - @since(1.4) - def load(self, path=None, format=None, schema=None, **options): - """Loads data from a data source and returns it as a :class`DataFrame`. - - :param path: optional string or a list of string for file-system backed data sources. - :param format: optional string for format of the data source. Default to 'parquet'. - :param schema: optional :class:`pyspark.sql.types.StructType` for the input schema - or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). - :param options: all other string options - - >>> df = spark.read.format("parquet").load('python/test_support/sql/parquet_partitioned', - ... opt1=True, opt2=1, opt3='str') - >>> df.dtypes - [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] - - >>> df = spark.read.format('json').load(['python/test_support/sql/people.json', - ... 'python/test_support/sql/people1.json']) - >>> df.dtypes - [('age', 'bigint'), ('aka', 'string'), ('name', 'string')] - """ - if format is not None: - self.format(format) - if schema is not None: - self.schema(schema) - self.options(**options) - if isinstance(path, basestring): - return self._df(self._jreader.load(path)) - elif path is not None: - if type(path) != list: - path = [path] - return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path))) - else: - return self._df(self._jreader.load()) - - @since(1.4) - def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, - allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None, - allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, - mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None, - multiLine=None, allowUnquotedControlChars=None, lineSep=None, samplingRatio=None, - dropFieldIfAllNull=None, encoding=None): - """ - Loads JSON files and returns the results as a :class:`DataFrame`. - - `JSON Lines `_ (newline-delimited JSON) is supported by default. - For JSON (one record per file), set the ``multiLine`` parameter to ``true``. - - If the ``schema`` parameter is not specified, this function goes - through the input once to determine the input schema. - - :param path: string represents path to the JSON dataset, or a list of paths, - or RDD of Strings storing JSON objects. - :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema or - a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). - :param primitivesAsString: infers all primitive values as a string type. If None is set, - it uses the default value, ``false``. - :param prefersDecimal: infers all floating-point values as a decimal type. If the values - do not fit in decimal, then it infers them as doubles. 
If None is - set, it uses the default value, ``false``. - :param allowComments: ignores Java/C++ style comment in JSON records. If None is set, - it uses the default value, ``false``. - :param allowUnquotedFieldNames: allows unquoted JSON field names. If None is set, - it uses the default value, ``false``. - :param allowSingleQuotes: allows single quotes in addition to double quotes. If None is - set, it uses the default value, ``true``. - :param allowNumericLeadingZero: allows leading zeros in numbers (e.g. 00012). If None is - set, it uses the default value, ``false``. - :param allowBackslashEscapingAnyCharacter: allows accepting quoting of all character - using backslash quoting mechanism. If None is - set, it uses the default value, ``false``. - :param mode: allows a mode for dealing with corrupt records during parsing. If None is - set, it uses the default value, ``PERMISSIVE``. - - * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \ - into a field configured by ``columnNameOfCorruptRecord``, and sets other \ - fields to ``null``. To keep corrupt records, an user can set a string type \ - field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \ - schema does not have the field, it drops corrupt records during parsing. \ - When inferring a schema, it implicitly adds a ``columnNameOfCorruptRecord`` \ - field in an output schema. - * ``DROPMALFORMED`` : ignores the whole corrupted records. - * ``FAILFAST`` : throws an exception when it meets corrupted records. - - :param columnNameOfCorruptRecord: allows renaming the new field having malformed string - created by ``PERMISSIVE`` mode. This overrides - ``spark.sql.columnNameOfCorruptRecord``. If None is set, - it uses the value specified in - ``spark.sql.columnNameOfCorruptRecord``. - :param dateFormat: sets the string that indicates a date format. Custom date formats - follow the formats at ``java.text.SimpleDateFormat``. This - applies to date type. If None is set, it uses the - default value, ``yyyy-MM-dd``. - :param timestampFormat: sets the string that indicates a timestamp format. Custom date - formats follow the formats at ``java.text.SimpleDateFormat``. - This applies to timestamp type. If None is set, it uses the - default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``. - :param multiLine: parse one record, which may span multiple lines, per file. If None is - set, it uses the default value, ``false``. - :param allowUnquotedControlChars: allows JSON Strings to contain unquoted control - characters (ASCII characters with value less than 32, - including tab and line feed characters) or not. - :param encoding: allows to forcibly set one of standard basic or extended encoding for - the JSON files. For example UTF-16BE, UTF-32LE. If None is set, - the encoding of input JSON will be detected automatically - when the multiLine option is set to ``true``. - :param lineSep: defines the line separator that should be used for parsing. If None is - set, it covers all ``\\r``, ``\\r\\n`` and ``\\n``. - :param samplingRatio: defines fraction of input JSON objects used for schema inferring. - If None is set, it uses the default value, ``1.0``. - :param dropFieldIfAllNull: whether to ignore column of all null values or empty - array/struct during schema inference. If None is set, it - uses the default value, ``false``. 
- - >>> df1 = spark.read.json('python/test_support/sql/people.json') - >>> df1.dtypes - [('age', 'bigint'), ('name', 'string')] - >>> rdd = sc.textFile('python/test_support/sql/people.json') - >>> df2 = spark.read.json(rdd) - >>> df2.dtypes - [('age', 'bigint'), ('name', 'string')] - - """ - self._set_opts( - schema=schema, primitivesAsString=primitivesAsString, prefersDecimal=prefersDecimal, - allowComments=allowComments, allowUnquotedFieldNames=allowUnquotedFieldNames, - allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero, - allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter, - mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat, - timestampFormat=timestampFormat, multiLine=multiLine, - allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep, - samplingRatio=samplingRatio, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding) - if isinstance(path, basestring): - path = [path] - if type(path) == list: - return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path))) - elif isinstance(path, RDD): - def func(iterator): - for x in iterator: - if not isinstance(x, basestring): - x = unicode(x) - if isinstance(x, unicode): - x = x.encode("utf-8") - yield x - keyed = path.mapPartitions(func) - keyed._bypass_serializer = True - jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString()) - return self._df(self._jreader.json(jrdd)) - else: - raise TypeError("path can be only string, list or RDD") - - @since(1.4) - def table(self, tableName): - """Returns the specified table as a :class:`DataFrame`. - - :param tableName: string, name of the table. - - >>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned') - >>> df.createOrReplaceTempView('tmpTable') - >>> spark.read.table('tmpTable').dtypes - [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] - """ - return self._df(self._jreader.table(tableName)) - - @since(1.4) - def parquet(self, *paths): - """Loads Parquet files, returning the result as a :class:`DataFrame`. - - You can set the following Parquet-specific option(s) for reading Parquet files: - * ``mergeSchema``: sets whether we should merge schemas collected from all \ - Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \ - The default value is specified in ``spark.sql.parquet.mergeSchema``. - - >>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned') - >>> df.dtypes - [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] - """ - return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths))) - - @ignore_unicode_prefix - @since(1.6) - def text(self, paths, wholetext=False, lineSep=None): - """ - Loads text files and returns a :class:`DataFrame` whose schema starts with a - string column named "value", and followed by partitioned columns if there - are any. - - By default, each line in the text file is a new row in the resulting DataFrame. - - :param paths: string, or list of strings, for input path(s). - :param wholetext: if true, read each file from input path(s) as a single row. - :param lineSep: defines the line separator that should be used for parsing. If None is - set, it covers all ``\\r``, ``\\r\\n`` and ``\\n``. 
- - >>> df = spark.read.text('python/test_support/sql/text-test.txt') - >>> df.collect() - [Row(value=u'hello'), Row(value=u'this')] - >>> df = spark.read.text('python/test_support/sql/text-test.txt', wholetext=True) - >>> df.collect() - [Row(value=u'hello\\nthis')] - """ - self._set_opts(wholetext=wholetext, lineSep=lineSep) - if isinstance(paths, basestring): - paths = [paths] - return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(paths))) - - @since(2.0) - def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, - comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, - ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, - negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, - maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, - columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, - samplingRatio=None, enforceSchema=None, emptyValue=None): - r"""Loads a CSV file and returns the result as a :class:`DataFrame`. - - This function will go through the input once to determine the input schema if - ``inferSchema`` is enabled. To avoid going through the entire data once, disable - ``inferSchema`` option or specify the schema explicitly using ``schema``. - - :param path: string, or list of strings, for input path(s), - or RDD of Strings storing CSV rows. - :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema - or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). - :param sep: sets a single character as a separator for each field and value. - If None is set, it uses the default value, ``,``. - :param encoding: decodes the CSV files by the given encoding type. If None is set, - it uses the default value, ``UTF-8``. - :param quote: sets a single character used for escaping quoted values where the - separator can be part of the value. If None is set, it uses the default - value, ``"``. If you would like to turn off quotations, you need to set an - empty string. - :param escape: sets a single character used for escaping quotes inside an already - quoted value. If None is set, it uses the default value, ``\``. - :param comment: sets a single character used for skipping lines beginning with this - character. By default (None), it is disabled. - :param header: uses the first line as names of columns. If None is set, it uses the - default value, ``false``. - :param inferSchema: infers the input schema automatically from data. It requires one extra - pass over the data. If None is set, it uses the default value, ``false``. - :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be - forcibly applied to datasource files, and headers in CSV files will be - ignored. If the option is set to ``false``, the schema will be - validated against all headers in CSV files or the first header in RDD - if the ``header`` option is set to ``true``. Field names in the schema - and column names in CSV headers are checked by their positions - taking into account ``spark.sql.caseSensitive``. If None is set, - ``true`` is used by default. Though the default value is ``true``, - it is recommended to disable the ``enforceSchema`` option - to avoid incorrect results. - :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from - values being read should be skipped. If None is set, it - uses the default value, ``false``. 
- :param ignoreTrailingWhiteSpace: A flag indicating whether or not trailing whitespaces from - values being read should be skipped. If None is set, it - uses the default value, ``false``. - :param nullValue: sets the string representation of a null value. If None is set, it uses - the default value, empty string. Since 2.0.1, this ``nullValue`` param - applies to all supported types including the string type. - :param nanValue: sets the string representation of a non-number value. If None is set, it - uses the default value, ``NaN``. - :param positiveInf: sets the string representation of a positive infinity value. If None - is set, it uses the default value, ``Inf``. - :param negativeInf: sets the string representation of a negative infinity value. If None - is set, it uses the default value, ``Inf``. - :param dateFormat: sets the string that indicates a date format. Custom date formats - follow the formats at ``java.text.SimpleDateFormat``. This - applies to date type. If None is set, it uses the - default value, ``yyyy-MM-dd``. - :param timestampFormat: sets the string that indicates a timestamp format. Custom date - formats follow the formats at ``java.text.SimpleDateFormat``. - This applies to timestamp type. If None is set, it uses the - default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``. - :param maxColumns: defines a hard limit of how many columns a record can have. If None is - set, it uses the default value, ``20480``. - :param maxCharsPerColumn: defines the maximum number of characters allowed for any given - value being read. If None is set, it uses the default value, - ``-1`` meaning unlimited length. - :param maxMalformedLogPerPartition: this parameter is no longer used since Spark 2.2.0. - If specified, it is ignored. - :param mode: allows a mode for dealing with corrupt records during parsing. If None is - set, it uses the default value, ``PERMISSIVE``. - - * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \ - into a field configured by ``columnNameOfCorruptRecord``, and sets other \ - fields to ``null``. To keep corrupt records, an user can set a string type \ - field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \ - schema does not have the field, it drops corrupt records during parsing. \ - A record with less/more tokens than schema is not a corrupted record to CSV. \ - When it meets a record having fewer tokens than the length of the schema, \ - sets ``null`` to extra fields. When the record has more tokens than the \ - length of the schema, it drops extra tokens. - * ``DROPMALFORMED`` : ignores the whole corrupted records. - * ``FAILFAST`` : throws an exception when it meets corrupted records. - - :param columnNameOfCorruptRecord: allows renaming the new field having malformed string - created by ``PERMISSIVE`` mode. This overrides - ``spark.sql.columnNameOfCorruptRecord``. If None is set, - it uses the value specified in - ``spark.sql.columnNameOfCorruptRecord``. - :param multiLine: parse records, which may span multiple lines. If None is - set, it uses the default value, ``false``. - :param charToEscapeQuoteEscaping: sets a single character used for escaping the escape for - the quote character. If None is set, the default value is - escape character when escape and quote characters are - different, ``\0`` otherwise. - :param samplingRatio: defines fraction of rows used for schema inferring. - If None is set, it uses the default value, ``1.0``. - :param emptyValue: sets the string representation of an empty value. 
If None is set, it uses - the default value, empty string. - - >>> df = spark.read.csv('python/test_support/sql/ages.csv') - >>> df.dtypes - [('_c0', 'string'), ('_c1', 'string')] - >>> rdd = sc.textFile('python/test_support/sql/ages.csv') - >>> df2 = spark.read.csv(rdd) - >>> df2.dtypes - [('_c0', 'string'), ('_c1', 'string')] - """ - self._set_opts( - schema=schema, sep=sep, encoding=encoding, quote=quote, escape=escape, comment=comment, - header=header, inferSchema=inferSchema, ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace, - ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, nullValue=nullValue, - nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf, - dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns, - maxCharsPerColumn=maxCharsPerColumn, - maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, - columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, - charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, samplingRatio=samplingRatio, - enforceSchema=enforceSchema, emptyValue=emptyValue) - if isinstance(path, basestring): - path = [path] - if type(path) == list: - return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path))) - elif isinstance(path, RDD): - def func(iterator): - for x in iterator: - if not isinstance(x, basestring): - x = unicode(x) - if isinstance(x, unicode): - x = x.encode("utf-8") - yield x - keyed = path.mapPartitions(func) - keyed._bypass_serializer = True - jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString()) - # see SPARK-22112 - # There aren't any jvm api for creating a dataframe from rdd storing csv. - # We can do it through creating a jvm dataset firstly and using the jvm api - # for creating a dataframe from dataset storing csv. - jdataset = self._spark._ssql_ctx.createDataset( - jrdd.rdd(), - self._spark._jvm.Encoders.STRING()) - return self._df(self._jreader.csv(jdataset)) - else: - raise TypeError("path can be only string, list or RDD") - - @since(1.5) - def orc(self, path): - """Loads ORC files, returning the result as a :class:`DataFrame`. - - >>> df = spark.read.orc('python/test_support/sql/orc_partitioned') - >>> df.dtypes - [('a', 'bigint'), ('b', 'int'), ('c', 'int')] - """ - if isinstance(path, basestring): - path = [path] - return self._df(self._jreader.orc(_to_seq(self._spark._sc, path))) - - @since(1.4) - def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None, - predicates=None, properties=None): - """ - Construct a :class:`DataFrame` representing the database table named ``table`` - accessible via JDBC URL ``url`` and connection ``properties``. - - Partitions of the table will be retrieved in parallel if either ``column`` or - ``predicates`` is specified. ``lowerBound`, ``upperBound`` and ``numPartitions`` - is needed when ``column`` is specified. - - If both ``column`` and ``predicates`` are specified, ``column`` will be used. - - .. note:: Don't create too many partitions in parallel on a large cluster; - otherwise Spark might crash your external database systems. 
- - :param url: a JDBC URL of the form ``jdbc:subprotocol:subname`` - :param table: the name of the table - :param column: the name of an integer column that will be used for partitioning; - if this parameter is specified, then ``numPartitions``, ``lowerBound`` - (inclusive), and ``upperBound`` (exclusive) will form partition strides - for generated WHERE clause expressions used to split the column - ``column`` evenly - :param lowerBound: the minimum value of ``column`` used to decide partition stride - :param upperBound: the maximum value of ``column`` used to decide partition stride - :param numPartitions: the number of partitions - :param predicates: a list of expressions suitable for inclusion in WHERE clauses; - each one defines one partition of the :class:`DataFrame` - :param properties: a dictionary of JDBC database connection arguments. Normally at - least properties "user" and "password" with their corresponding values. - For example { 'user' : 'SYSTEM', 'password' : 'mypassword' } - :return: a DataFrame - """ - if properties is None: - properties = dict() - jprop = JavaClass("java.util.Properties", self._spark._sc._gateway._gateway_client)() - for k in properties: - jprop.setProperty(k, properties[k]) - if column is not None: - assert lowerBound is not None, "lowerBound can not be None when ``column`` is specified" - assert upperBound is not None, "upperBound can not be None when ``column`` is specified" - assert numPartitions is not None, \ - "numPartitions can not be None when ``column`` is specified" - return self._df(self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound), - int(numPartitions), jprop)) - if predicates is not None: - gateway = self._spark._sc._gateway - jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates) - return self._df(self._jreader.jdbc(url, table, jpredicates, jprop)) - return self._df(self._jreader.jdbc(url, table, jprop)) - - -class DataFrameWriter(OptionUtils): - """ - Interface used to write a :class:`DataFrame` to external storage systems - (e.g. file systems, key-value stores, etc). Use :func:`DataFrame.write` - to access this. - - .. versionadded:: 1.4 - """ - def __init__(self, df): - self._df = df - self._spark = df.sql_ctx - self._jwrite = df._jdf.write() - - def _sq(self, jsq): - from pyspark.sql.streaming import StreamingQuery - return StreamingQuery(jsq) - - @since(1.4) - def mode(self, saveMode): - """Specifies the behavior when data or table already exists. - - Options include: - - * `append`: Append contents of this :class:`DataFrame` to existing data. - * `overwrite`: Overwrite existing data. - * `error` or `errorifexists`: Throw an exception if data already exists. - * `ignore`: Silently ignore this operation if data already exists. - - >>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data')) - """ - # At the JVM side, the default value of mode is already set to "error". - # So, if the given saveMode is None, we will not call JVM-side's mode method. - if saveMode is not None: - self._jwrite = self._jwrite.mode(saveMode) - return self - - @since(1.4) - def format(self, source): - """Specifies the underlying output data source. - - :param source: string, name of the data source, e.g. 'json', 'parquet'. - - >>> df.write.format('json').save(os.path.join(tempfile.mkdtemp(), 'data')) - """ - self._jwrite = self._jwrite.format(source) - return self - - @since(1.5) - def option(self, key, value): - """Adds an output option for the underlying data source. 
- - You can set the following option(s) for writing files: - * ``timeZone``: sets the string that indicates a timezone to be used to format - timestamps in the JSON/CSV datasources or partition values. - If it isn't set, it uses the default value, session local timezone. - """ - self._jwrite = self._jwrite.option(key, to_str(value)) - return self - - @since(1.4) - def options(self, **options): - """Adds output options for the underlying data source. - - You can set the following option(s) for writing files: - * ``timeZone``: sets the string that indicates a timezone to be used to format - timestamps in the JSON/CSV datasources or partition values. - If it isn't set, it uses the default value, session local timezone. - """ - for k in options: - self._jwrite = self._jwrite.option(k, to_str(options[k])) - return self - - @since(1.4) - def partitionBy(self, *cols): - """Partitions the output by the given columns on the file system. - - If specified, the output is laid out on the file system similar - to Hive's partitioning scheme. - - :param cols: name of columns - - >>> df.write.partitionBy('year', 'month').parquet(os.path.join(tempfile.mkdtemp(), 'data')) - """ - if len(cols) == 1 and isinstance(cols[0], (list, tuple)): - cols = cols[0] - self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols)) - return self - - @since(2.3) - def bucketBy(self, numBuckets, col, *cols): - """Buckets the output by the given columns.If specified, - the output is laid out on the file system similar to Hive's bucketing scheme. - - :param numBuckets: the number of buckets to save - :param col: a name of a column, or a list of names. - :param cols: additional names (optional). If `col` is a list it should be empty. - - .. note:: Applicable for file-based data sources in combination with - :py:meth:`DataFrameWriter.saveAsTable`. - - >>> (df.write.format('parquet') # doctest: +SKIP - ... .bucketBy(100, 'year', 'month') - ... .mode("overwrite") - ... .saveAsTable('bucketed_table')) - """ - if not isinstance(numBuckets, int): - raise TypeError("numBuckets should be an int, got {0}.".format(type(numBuckets))) - - if isinstance(col, (list, tuple)): - if cols: - raise ValueError("col is a {0} but cols are not empty".format(type(col))) - - col, cols = col[0], col[1:] - - if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)): - raise TypeError("all names should be `str`") - - self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols)) - return self - - @since(2.3) - def sortBy(self, col, *cols): - """Sorts the output in each bucket by the given columns on the file system. - - :param col: a name of a column, or a list of names. - :param cols: additional names (optional). If `col` is a list it should be empty. - - >>> (df.write.format('parquet') # doctest: +SKIP - ... .bucketBy(100, 'year', 'month') - ... .sortBy('day') - ... .mode("overwrite") - ... .saveAsTable('sorted_bucketed_table')) - """ - if isinstance(col, (list, tuple)): - if cols: - raise ValueError("col is a {0} but cols are not empty".format(type(col))) - - col, cols = col[0], col[1:] - - if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)): - raise TypeError("all names should be `str`") - - self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols)) - return self - - @since(1.4) - def save(self, path=None, format=None, mode=None, partitionBy=None, **options): - """Saves the contents of the :class:`DataFrame` to a data source. 
- - The data source is specified by the ``format`` and a set of ``options``. - If ``format`` is not specified, the default data source configured by - ``spark.sql.sources.default`` will be used. - - :param path: the path in a Hadoop supported file system - :param format: the format used to save - :param mode: specifies the behavior of the save operation when data already exists. - - * ``append``: Append contents of this :class:`DataFrame` to existing data. - * ``overwrite``: Overwrite existing data. - * ``ignore``: Silently ignore this operation if data already exists. - * ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ - exists. - :param partitionBy: names of partitioning columns - :param options: all other string options - - >>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data')) - """ - self.mode(mode).options(**options) - if partitionBy is not None: - self.partitionBy(partitionBy) - if format is not None: - self.format(format) - if path is None: - self._jwrite.save() - else: - self._jwrite.save(path) - - @since(1.4) - def insertInto(self, tableName, overwrite=False): - """Inserts the content of the :class:`DataFrame` to the specified table. - - It requires that the schema of the class:`DataFrame` is the same as the - schema of the table. - - Optionally overwriting any existing data. - """ - self._jwrite.mode("overwrite" if overwrite else "append").insertInto(tableName) - - @since(1.4) - def saveAsTable(self, name, format=None, mode=None, partitionBy=None, **options): - """Saves the content of the :class:`DataFrame` as the specified table. - - In the case the table already exists, behavior of this function depends on the - save mode, specified by the `mode` function (default to throwing an exception). - When `mode` is `Overwrite`, the schema of the :class:`DataFrame` does not need to be - the same as that of the existing table. - - * `append`: Append contents of this :class:`DataFrame` to existing data. - * `overwrite`: Overwrite existing data. - * `error` or `errorifexists`: Throw an exception if data already exists. - * `ignore`: Silently ignore this operation if data already exists. - - :param name: the table name - :param format: the format used to save - :param mode: one of `append`, `overwrite`, `error`, `errorifexists`, `ignore` \ - (default: error) - :param partitionBy: names of partitioning columns - :param options: all other string options - """ - self.mode(mode).options(**options) - if partitionBy is not None: - self.partitionBy(partitionBy) - if format is not None: - self.format(format) - self._jwrite.saveAsTable(name) - - @since(1.4) - def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None, - lineSep=None, encoding=None): - """Saves the content of the :class:`DataFrame` in JSON format - (`JSON Lines text format or newline-delimited JSON `_) at the - specified path. - - :param path: the path in any Hadoop supported file system - :param mode: specifies the behavior of the save operation when data already exists. - - * ``append``: Append contents of this :class:`DataFrame` to existing data. - * ``overwrite``: Overwrite existing data. - * ``ignore``: Silently ignore this operation if data already exists. - * ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ - exists. - :param compression: compression codec to use when saving to file. This can be one of the - known case-insensitive shorten names (none, bzip2, gzip, lz4, - snappy and deflate). 
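To contrast saveAsTable with insertInto as documented above, a hedged sketch (the table name is hypothetical):

# saveAsTable creates the table (or replaces its data, per the save mode)
df.write.mode("overwrite").saveAsTable("events")

# insertInto requires the target table to already exist with a matching schema
df.write.insertInto("events", overwrite=False)   # appends rows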
- :param dateFormat: sets the string that indicates a date format. Custom date formats - follow the formats at ``java.text.SimpleDateFormat``. This - applies to date type. If None is set, it uses the - default value, ``yyyy-MM-dd``. - :param timestampFormat: sets the string that indicates a timestamp format. Custom date - formats follow the formats at ``java.text.SimpleDateFormat``. - This applies to timestamp type. If None is set, it uses the - default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``. - :param encoding: specifies encoding (charset) of saved json files. If None is set, - the default UTF-8 charset will be used. - :param lineSep: defines the line separator that should be used for writing. If None is - set, it uses the default value, ``\\n``. - - >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data')) - """ - self.mode(mode) - self._set_opts( - compression=compression, dateFormat=dateFormat, timestampFormat=timestampFormat, - lineSep=lineSep, encoding=encoding) - self._jwrite.json(path) - - @since(1.4) - def parquet(self, path, mode=None, partitionBy=None, compression=None): - """Saves the content of the :class:`DataFrame` in Parquet format at the specified path. - - :param path: the path in any Hadoop supported file system - :param mode: specifies the behavior of the save operation when data already exists. - - * ``append``: Append contents of this :class:`DataFrame` to existing data. - * ``overwrite``: Overwrite existing data. - * ``ignore``: Silently ignore this operation if data already exists. - * ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ - exists. - :param partitionBy: names of partitioning columns - :param compression: compression codec to use when saving to file. This can be one of the - known case-insensitive shorten names (none, uncompressed, snappy, gzip, - lzo, brotli, lz4, and zstd). This will override - ``spark.sql.parquet.compression.codec``. If None is set, it uses the - value specified in ``spark.sql.parquet.compression.codec``. - - >>> df.write.parquet(os.path.join(tempfile.mkdtemp(), 'data')) - """ - self.mode(mode) - if partitionBy is not None: - self.partitionBy(partitionBy) - self._set_opts(compression=compression) - self._jwrite.parquet(path) - - @since(1.6) - def text(self, path, compression=None, lineSep=None): - """Saves the content of the DataFrame in a text file at the specified path. - - :param path: the path in any Hadoop supported file system - :param compression: compression codec to use when saving to file. This can be one of the - known case-insensitive shorten names (none, bzip2, gzip, lz4, - snappy and deflate). - :param lineSep: defines the line separator that should be used for writing. If None is - set, it uses the default value, ``\\n``. - - The DataFrame must have only one column that is of string type. - Each row becomes a new line in the output file. - """ - self._set_opts(compression=compression, lineSep=lineSep) - self._jwrite.text(path) - - @since(2.0) - def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None, - header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None, - timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, - charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None): - r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path. 
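A minimal sketch of a partitioned Parquet write as described above; the path and column names are illustrative:

(df.write
    .mode("append")
    .partitionBy("year", "month")     # Hive-style partition directories
    .parquet("/tmp/events_parquet",   # hypothetical output path
             compression="gzip"))     # overrides spark.sql.parquet.compression.codec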
- - :param path: the path in any Hadoop supported file system - :param mode: specifies the behavior of the save operation when data already exists. - - * ``append``: Append contents of this :class:`DataFrame` to existing data. - * ``overwrite``: Overwrite existing data. - * ``ignore``: Silently ignore this operation if data already exists. - * ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ - exists. - - :param compression: compression codec to use when saving to file. This can be one of the - known case-insensitive shorten names (none, bzip2, gzip, lz4, - snappy and deflate). - :param sep: sets a single character as a separator for each field and value. If None is - set, it uses the default value, ``,``. - :param quote: sets a single character used for escaping quoted values where the - separator can be part of the value. If None is set, it uses the default - value, ``"``. If an empty string is set, it uses ``u0000`` (null character). - :param escape: sets a single character used for escaping quotes inside an already - quoted value. If None is set, it uses the default value, ``\`` - :param escapeQuotes: a flag indicating whether values containing quotes should always - be enclosed in quotes. If None is set, it uses the default value - ``true``, escaping all values containing a quote character. - :param quoteAll: a flag indicating whether all values should always be enclosed in - quotes. If None is set, it uses the default value ``false``, - only escaping values containing a quote character. - :param header: writes the names of columns as the first line. If None is set, it uses - the default value, ``false``. - :param nullValue: sets the string representation of a null value. If None is set, it uses - the default value, empty string. - :param dateFormat: sets the string that indicates a date format. Custom date formats - follow the formats at ``java.text.SimpleDateFormat``. This - applies to date type. If None is set, it uses the - default value, ``yyyy-MM-dd``. - :param timestampFormat: sets the string that indicates a timestamp format. Custom date - formats follow the formats at ``java.text.SimpleDateFormat``. - This applies to timestamp type. If None is set, it uses the - default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``. - :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from - values being written should be skipped. If None is set, it - uses the default value, ``true``. - :param ignoreTrailingWhiteSpace: a flag indicating whether or not trailing whitespaces from - values being written should be skipped. If None is set, it - uses the default value, ``true``. - :param charToEscapeQuoteEscaping: sets a single character used for escaping the escape for - the quote character. If None is set, the default value is - escape character when escape and quote characters are - different, ``\0`` otherwise.. - :param encoding: sets the encoding (charset) of saved csv files. If None is set, - the default UTF-8 charset will be used. - :param emptyValue: sets the string representation of an empty value. If None is set, it uses - the default value, ``""``. 
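A hedged sketch exercising a few of the CSV writer options listed above (the output directory is hypothetical):

df.write.csv(
    "/tmp/events_csv",        # hypothetical output directory
    mode="overwrite",
    sep=";",
    header=True,              # write column names as the first line
    nullValue="NA",
    compression="gzip",
    dateFormat="yyyy-MM-dd")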
- - >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data')) - """ - self.mode(mode) - self._set_opts(compression=compression, sep=sep, quote=quote, escape=escape, header=header, - nullValue=nullValue, escapeQuotes=escapeQuotes, quoteAll=quoteAll, - dateFormat=dateFormat, timestampFormat=timestampFormat, - ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace, - ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, - charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, - encoding=encoding, emptyValue=emptyValue) - self._jwrite.csv(path) - - @since(1.5) - def orc(self, path, mode=None, partitionBy=None, compression=None): - """Saves the content of the :class:`DataFrame` in ORC format at the specified path. - - :param path: the path in any Hadoop supported file system - :param mode: specifies the behavior of the save operation when data already exists. - - * ``append``: Append contents of this :class:`DataFrame` to existing data. - * ``overwrite``: Overwrite existing data. - * ``ignore``: Silently ignore this operation if data already exists. - * ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ - exists. - :param partitionBy: names of partitioning columns - :param compression: compression codec to use when saving to file. This can be one of the - known case-insensitive shorten names (none, snappy, zlib, and lzo). - This will override ``orc.compress`` and - ``spark.sql.orc.compression.codec``. If None is set, it uses the value - specified in ``spark.sql.orc.compression.codec``. - - >>> orc_df = spark.read.orc('python/test_support/sql/orc_partitioned') - >>> orc_df.write.orc(os.path.join(tempfile.mkdtemp(), 'data')) - """ - self.mode(mode) - if partitionBy is not None: - self.partitionBy(partitionBy) - self._set_opts(compression=compression) - self._jwrite.orc(path) - - @since(1.4) - def jdbc(self, url, table, mode=None, properties=None): - """Saves the content of the :class:`DataFrame` to an external database table via JDBC. - - .. note:: Don't create too many partitions in parallel on a large cluster; - otherwise Spark might crash your external database systems. - - :param url: a JDBC URL of the form ``jdbc:subprotocol:subname`` - :param table: Name of the table in the external database. - :param mode: specifies the behavior of the save operation when data already exists. - - * ``append``: Append contents of this :class:`DataFrame` to existing data. - * ``overwrite``: Overwrite existing data. - * ``ignore``: Silently ignore this operation if data already exists. - * ``error`` or ``errorifexists`` (default case): Throw an exception if data already \ - exists. - :param properties: a dictionary of JDBC database connection arguments. Normally at - least properties "user" and "password" with their corresponding values. 
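And a sketch of the JDBC write described above; the URL, table name and credentials are placeholders, and the 'driver' entry is an assumed extra connection property:

df.write.jdbc(
    url="jdbc:postgresql://dbhost:5432/analytics",   # placeholder JDBC URL
    table="public.events",                           # placeholder target table
    mode="append",
    properties={"user": "SYSTEM",
                "password": "mypassword",
                "driver": "org.postgresql.Driver"})  # assumed driver class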
- For example { 'user' : 'SYSTEM', 'password' : 'mypassword' } - """ - if properties is None: - properties = dict() - jprop = JavaClass("java.util.Properties", self._spark._sc._gateway._gateway_client)() - for k in properties: - jprop.setProperty(k, properties[k]) - self.mode(mode)._jwrite.jdbc(url, table, jprop) - - -def _test(): - import doctest - import os - import tempfile - import py4j - from pyspark.context import SparkContext - from pyspark.sql import SparkSession, Row - import pyspark.sql.readwriter - - os.chdir(os.environ["SPARK_HOME"]) - - globs = pyspark.sql.readwriter.__dict__.copy() - sc = SparkContext('local[4]', 'PythonTest') - try: - spark = SparkSession.builder.getOrCreate() - except py4j.protocol.Py4JError: - spark = SparkSession(sc) - - globs['tempfile'] = tempfile - globs['os'] = os - globs['sc'] = sc - globs['spark'] = spark - globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned') - (failure_count, test_count) = doctest.testmod( - pyspark.sql.readwriter, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) - sc.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/session.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/session.py deleted file mode 100644 index 51a38eb..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/session.py +++ /dev/null @@ -1,871 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function -import sys -import warnings -from functools import reduce -from threading import RLock - -if sys.version >= '3': - basestring = unicode = str - xrange = range -else: - from itertools import izip as zip, imap as map - -from pyspark import since -from pyspark.rdd import RDD, ignore_unicode_prefix -from pyspark.sql.conf import RuntimeConfig -from pyspark.sql.dataframe import DataFrame -from pyspark.sql.readwriter import DataFrameReader -from pyspark.sql.streaming import DataStreamReader -from pyspark.sql.types import Row, DataType, StringType, StructType, TimestampType, \ - _make_type_verifier, _infer_schema, _has_nulltype, _merge_type, _create_converter, \ - _parse_datatype_string -from pyspark.sql.utils import install_exception_handler - -__all__ = ["SparkSession"] - - -def _monkey_patch_RDD(sparkSession): - def toDF(self, schema=None, sampleRatio=None): - """ - Converts current :class:`RDD` into a :class:`DataFrame` - - This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)`` - - :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns - :param samplingRatio: the sample ratio of rows used for inferring - :return: a DataFrame - - >>> rdd.toDF().collect() - [Row(name=u'Alice', age=1)] - """ - return sparkSession.createDataFrame(self, schema, sampleRatio) - - RDD.toDF = toDF - - -class SparkSession(object): - """The entry point to programming Spark with the Dataset and DataFrame API. - - A SparkSession can be used create :class:`DataFrame`, register :class:`DataFrame` as - tables, execute SQL over tables, cache tables, and read parquet files. - To create a SparkSession, use the following builder pattern: - - >>> spark = SparkSession.builder \\ - ... .master("local") \\ - ... .appName("Word Count") \\ - ... .config("spark.some.config.option", "some-value") \\ - ... .getOrCreate() - - .. autoattribute:: builder - :annotation: - """ - - class Builder(object): - """Builder for :class:`SparkSession`. - """ - - _lock = RLock() - _options = {} - - @since(2.0) - def config(self, key=None, value=None, conf=None): - """Sets a config option. Options set using this method are automatically propagated to - both :class:`SparkConf` and :class:`SparkSession`'s own configuration. - - For an existing SparkConf, use `conf` parameter. - - >>> from pyspark.conf import SparkConf - >>> SparkSession.builder.config(conf=SparkConf()) - >> SparkSession.builder.config("spark.some.config.option", "some-value") - >> s1 = SparkSession.builder.config("k1", "v1").getOrCreate() - >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1" - True - - In case an existing SparkSession is returned, the config options specified - in this builder will be applied to the existing SparkSession. - - >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate() - >>> s1.conf.get("k1") == s2.conf.get("k1") - True - >>> s1.conf.get("k2") == s2.conf.get("k2") - True - """ - with self._lock: - from pyspark.context import SparkContext - from pyspark.conf import SparkConf - session = SparkSession._instantiatedSession - if session is None or session._sc._jsc is None: - sparkConf = SparkConf() - for key, value in self._options.items(): - sparkConf.set(key, value) - sc = SparkContext.getOrCreate(sparkConf) - # This SparkContext may be an existing one. - for key, value in self._options.items(): - # we need to propagate the confs - # before we create the SparkSession. 
Otherwise, confs like - # warehouse path and metastore url will not be set correctly ( - # these confs cannot be changed once the SparkSession is created). - sc._conf.set(key, value) - session = SparkSession(sc) - for key, value in self._options.items(): - session._jsparkSession.sessionState().conf().setConfString(key, value) - for key, value in self._options.items(): - session.sparkContext._conf.set(key, value) - return session - - builder = Builder() - """A class attribute having a :class:`Builder` to construct :class:`SparkSession` instances""" - - _instantiatedSession = None - - @ignore_unicode_prefix - def __init__(self, sparkContext, jsparkSession=None): - """Creates a new SparkSession. - - >>> from datetime import datetime - >>> spark = SparkSession(sc) - >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1, - ... b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), - ... time=datetime(2014, 8, 1, 14, 1, 5))]) - >>> df = allTypes.toDF() - >>> df.createOrReplaceTempView("allTypes") - >>> spark.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a ' - ... 'from allTypes where b and i > 0').collect() - [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \ - dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)] - >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect() - [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] - """ - from pyspark.sql.context import SQLContext - self._sc = sparkContext - self._jsc = self._sc._jsc - self._jvm = self._sc._jvm - if jsparkSession is None: - if self._jvm.SparkSession.getDefaultSession().isDefined() \ - and not self._jvm.SparkSession.getDefaultSession().get() \ - .sparkContext().isStopped(): - jsparkSession = self._jvm.SparkSession.getDefaultSession().get() - else: - jsparkSession = self._jvm.SparkSession(self._jsc.sc()) - self._jsparkSession = jsparkSession - self._jwrapped = self._jsparkSession.sqlContext() - self._wrapped = SQLContext(self._sc, self, self._jwrapped) - _monkey_patch_RDD(self) - install_exception_handler() - # If we had an instantiated SparkSession attached with a SparkContext - # which is stopped now, we need to renew the instantiated SparkSession. - # Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate. - if SparkSession._instantiatedSession is None \ - or SparkSession._instantiatedSession._sc._jsc is None: - SparkSession._instantiatedSession = self - self._jvm.SparkSession.setDefaultSession(self._jsparkSession) - - def _repr_html_(self): - return """ -
-            <div>
-                <p><b>SparkSession - {catalogImplementation}</b></p>
-                {sc_HTML}
-            </div>
      - """.format( - catalogImplementation=self.conf.get("spark.sql.catalogImplementation"), - sc_HTML=self.sparkContext._repr_html_() - ) - - @since(2.0) - def newSession(self): - """ - Returns a new SparkSession as new session, that has separate SQLConf, - registered temporary views and UDFs, but shared SparkContext and - table cache. - """ - return self.__class__(self._sc, self._jsparkSession.newSession()) - - @property - @since(2.0) - def sparkContext(self): - """Returns the underlying :class:`SparkContext`.""" - return self._sc - - @property - @since(2.0) - def version(self): - """The version of Spark on which this application is running.""" - return self._jsparkSession.version() - - @property - @since(2.0) - def conf(self): - """Runtime configuration interface for Spark. - - This is the interface through which the user can get and set all Spark and Hadoop - configurations that are relevant to Spark SQL. When getting the value of a config, - this defaults to the value set in the underlying :class:`SparkContext`, if any. - """ - if not hasattr(self, "_conf"): - self._conf = RuntimeConfig(self._jsparkSession.conf()) - return self._conf - - @property - @since(2.0) - def catalog(self): - """Interface through which the user may create, drop, alter or query underlying - databases, tables, functions etc. - - :return: :class:`Catalog` - """ - from pyspark.sql.catalog import Catalog - if not hasattr(self, "_catalog"): - self._catalog = Catalog(self) - return self._catalog - - @property - @since(2.0) - def udf(self): - """Returns a :class:`UDFRegistration` for UDF registration. - - :return: :class:`UDFRegistration` - """ - from pyspark.sql.udf import UDFRegistration - return UDFRegistration(self) - - @since(2.0) - def range(self, start, end=None, step=1, numPartitions=None): - """ - Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named - ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with - step value ``step``. - - :param start: the start value - :param end: the end value (exclusive) - :param step: the incremental step (default: 1) - :param numPartitions: the number of partitions of the DataFrame - :return: :class:`DataFrame` - - >>> spark.range(1, 7, 2).collect() - [Row(id=1), Row(id=3), Row(id=5)] - - If only one argument is specified, it will be used as the end value. - - >>> spark.range(3).collect() - [Row(id=0), Row(id=1), Row(id=2)] - """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - - if end is None: - jdf = self._jsparkSession.range(0, int(start), int(step), int(numPartitions)) - else: - jdf = self._jsparkSession.range(int(start), int(end), int(step), int(numPartitions)) - - return DataFrame(jdf, self._wrapped) - - def _inferSchemaFromList(self, data, names=None): - """ - Infer schema from list of Row or tuple. - - :param data: list of Row or tuple - :param names: list of column names - :return: :class:`pyspark.sql.types.StructType` - """ - if not data: - raise ValueError("can not infer schema from empty dataset") - first = data[0] - if type(first) is dict: - warnings.warn("inferring schema from dict is deprecated," - "please use pyspark.sql.Row instead") - schema = reduce(_merge_type, (_infer_schema(row, names) for row in data)) - if _has_nulltype(schema): - raise ValueError("Some of types cannot be determined after inferring") - return schema - - def _inferSchema(self, rdd, samplingRatio=None, names=None): - """ - Infer schema from an RDD of Row or tuple. 
- - :param rdd: an RDD of Row or tuple - :param samplingRatio: sampling ratio, or no sampling (default) - :return: :class:`pyspark.sql.types.StructType` - """ - first = rdd.first() - if not first: - raise ValueError("The first row in RDD is empty, " - "can not infer schema") - if type(first) is dict: - warnings.warn("Using RDD of dict to inferSchema is deprecated. " - "Use pyspark.sql.Row instead") - - if samplingRatio is None: - schema = _infer_schema(first, names=names) - if _has_nulltype(schema): - for row in rdd.take(100)[1:]: - schema = _merge_type(schema, _infer_schema(row, names=names)) - if not _has_nulltype(schema): - break - else: - raise ValueError("Some of types cannot be determined by the " - "first 100 rows, please try again with sampling") - else: - if samplingRatio < 0.99: - rdd = rdd.sample(False, float(samplingRatio)) - schema = rdd.map(lambda row: _infer_schema(row, names)).reduce(_merge_type) - return schema - - def _createFromRDD(self, rdd, schema, samplingRatio): - """ - Create an RDD for DataFrame from an existing RDD, returns the RDD and schema. - """ - if schema is None or isinstance(schema, (list, tuple)): - struct = self._inferSchema(rdd, samplingRatio, names=schema) - converter = _create_converter(struct) - rdd = rdd.map(converter) - if isinstance(schema, (list, tuple)): - for i, name in enumerate(schema): - struct.fields[i].name = name - struct.names[i] = name - schema = struct - - elif not isinstance(schema, StructType): - raise TypeError("schema should be StructType or list or None, but got: %s" % schema) - - # convert python objects to sql data - rdd = rdd.map(schema.toInternal) - return rdd, schema - - def _createFromLocal(self, data, schema): - """ - Create an RDD for DataFrame from a list or pandas.DataFrame, returns - the RDD and schema. - """ - # make sure data could consumed multiple times - if not isinstance(data, list): - data = list(data) - - if schema is None or isinstance(schema, (list, tuple)): - struct = self._inferSchemaFromList(data, names=schema) - converter = _create_converter(struct) - data = map(converter, data) - if isinstance(schema, (list, tuple)): - for i, name in enumerate(schema): - struct.fields[i].name = name - struct.names[i] = name - schema = struct - - elif not isinstance(schema, StructType): - raise TypeError("schema should be StructType or list or None, but got: %s" % schema) - - # convert python objects to sql data - data = [schema.toInternal(row) for row in data] - return self._sc.parallelize(data), schema - - def _get_numpy_record_dtype(self, rec): - """ - Used when converting a pandas.DataFrame to Spark using to_records(), this will correct - the dtypes of fields in a record so they can be properly loaded into Spark. 
- :param rec: a numpy record to check field dtypes - :return corrected dtype for a numpy.record or None if no correction needed - """ - import numpy as np - cur_dtypes = rec.dtype - col_names = cur_dtypes.names - record_type_list = [] - has_rec_fix = False - for i in xrange(len(cur_dtypes)): - curr_type = cur_dtypes[i] - # If type is a datetime64 timestamp, convert to microseconds - # NOTE: if dtype is datetime[ns] then np.record.tolist() will output values as longs, - # conversion from [us] or lower will lead to py datetime objects, see SPARK-22417 - if curr_type == np.dtype('datetime64[ns]'): - curr_type = 'datetime64[us]' - has_rec_fix = True - record_type_list.append((str(col_names[i]), curr_type)) - return np.dtype(record_type_list) if has_rec_fix else None - - def _convert_from_pandas(self, pdf, schema, timezone): - """ - Convert a pandas.DataFrame to list of records that can be used to make a DataFrame - :return list of records - """ - if timezone is not None: - from pyspark.sql.types import _check_series_convert_timestamps_tz_local - copied = False - if isinstance(schema, StructType): - for field in schema: - # TODO: handle nested timestamps, such as ArrayType(TimestampType())? - if isinstance(field.dataType, TimestampType): - s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone) - if s is not pdf[field.name]: - if not copied: - # Copy once if the series is modified to prevent the original - # Pandas DataFrame from being updated - pdf = pdf.copy() - copied = True - pdf[field.name] = s - else: - for column, series in pdf.iteritems(): - s = _check_series_convert_timestamps_tz_local(series, timezone) - if s is not series: - if not copied: - # Copy once if the series is modified to prevent the original - # Pandas DataFrame from being updated - pdf = pdf.copy() - copied = True - pdf[column] = s - - # Convert pandas.DataFrame to list of numpy records - np_records = pdf.to_records(index=False) - - # Check if any columns need to be fixed for Spark to infer properly - if len(np_records) > 0: - record_dtype = self._get_numpy_record_dtype(np_records[0]) - if record_dtype is not None: - return [r.astype(record_dtype).tolist() for r in np_records] - - # Convert list of numpy records to python lists - return [r.tolist() for r in np_records] - - def _create_from_pandas_with_arrow(self, pdf, schema, timezone): - """ - Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting - to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the - data types will be used to coerce the data in Pandas to Arrow conversion. 
- """ - from pyspark.serializers import ArrowStreamSerializer, _create_batch - from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType - from pyspark.sql.utils import require_minimum_pandas_version, \ - require_minimum_pyarrow_version - - require_minimum_pandas_version() - require_minimum_pyarrow_version() - - from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype - - # Determine arrow types to coerce data when creating batches - if isinstance(schema, StructType): - arrow_types = [to_arrow_type(f.dataType) for f in schema.fields] - elif isinstance(schema, DataType): - raise ValueError("Single data type %s is not supported with Arrow" % str(schema)) - else: - # Any timestamps must be coerced to be compatible with Spark - arrow_types = [to_arrow_type(TimestampType()) - if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None - for t in pdf.dtypes] - - # Slice the DataFrame to be batched - step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up - pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step)) - - # Create Arrow record batches - batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)], - timezone) - for pdf_slice in pdf_slices] - - # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing) - if isinstance(schema, (list, tuple)): - struct = from_arrow_schema(batches[0].schema) - for i, name in enumerate(schema): - struct.fields[i].name = name - struct.names[i] = name - schema = struct - - jsqlContext = self._wrapped._jsqlContext - - def reader_func(temp_filename): - return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename) - - def create_RDD_server(): - return self._jvm.ArrowRDDServer(jsqlContext) - - # Create Spark DataFrame from Arrow stream file, using one batch per partition - jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func, - create_RDD_server) - jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext) - df = DataFrame(jdf, self._wrapped) - df._schema = schema - return df - - @staticmethod - def _create_shell_session(): - """ - Initialize a SparkSession for a pyspark shell session. This is called from shell.py - to make error handling simpler without needing to declare local variables in that - script, which would expose those to users. - """ - import py4j - from pyspark.conf import SparkConf - from pyspark.context import SparkContext - try: - # Try to access HiveConf, it will raise exception if Hive is not added - conf = SparkConf() - if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive': - SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf() - return SparkSession.builder\ - .enableHiveSupport()\ - .getOrCreate() - else: - return SparkSession.builder.getOrCreate() - except (py4j.protocol.Py4JError, TypeError): - if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive': - warnings.warn("Fall back to non-hive support because failing to access HiveConf, " - "please make sure you build spark with hive") - - return SparkSession.builder.getOrCreate() - - @since(2.0) - @ignore_unicode_prefix - def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): - """ - Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. - - When ``schema`` is a list of column names, the type of each column - will be inferred from ``data``. 
- - When ``schema`` is ``None``, it will try to infer the schema (column names and types) - from ``data``, which should be an RDD of :class:`Row`, - or :class:`namedtuple`, or :class:`dict`. - - When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match - the real data, or an exception will be thrown at runtime. If the given schema is not - :class:`pyspark.sql.types.StructType`, it will be wrapped into a - :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", - each record will also be wrapped into a tuple, which can be converted to row later. - - If schema inference is needed, ``samplingRatio`` is used to determined the ratio of - rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. - - :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, - etc.), or :class:`list`, or :class:`pandas.DataFrame`. - :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of - column names, default is ``None``. The data type string format equals to - :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can - omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use - ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use - ``int`` as a short name for ``IntegerType``. - :param samplingRatio: the sample ratio of rows used for inferring - :param verifySchema: verify data types of every row against schema. - :return: :class:`DataFrame` - - .. versionchanged:: 2.1 - Added verifySchema. - - .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental. - - >>> l = [('Alice', 1)] - >>> spark.createDataFrame(l).collect() - [Row(_1=u'Alice', _2=1)] - >>> spark.createDataFrame(l, ['name', 'age']).collect() - [Row(name=u'Alice', age=1)] - - >>> d = [{'name': 'Alice', 'age': 1}] - >>> spark.createDataFrame(d).collect() - [Row(age=1, name=u'Alice')] - - >>> rdd = sc.parallelize(l) - >>> spark.createDataFrame(rdd).collect() - [Row(_1=u'Alice', _2=1)] - >>> df = spark.createDataFrame(rdd, ['name', 'age']) - >>> df.collect() - [Row(name=u'Alice', age=1)] - - >>> from pyspark.sql import Row - >>> Person = Row('name', 'age') - >>> person = rdd.map(lambda r: Person(*r)) - >>> df2 = spark.createDataFrame(person) - >>> df2.collect() - [Row(name=u'Alice', age=1)] - - >>> from pyspark.sql.types import * - >>> schema = StructType([ - ... StructField("name", StringType(), True), - ... StructField("age", IntegerType(), True)]) - >>> df3 = spark.createDataFrame(rdd, schema) - >>> df3.collect() - [Row(name=u'Alice', age=1)] - - >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP - [Row(name=u'Alice', age=1)] - >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP - [Row(0=1, 1=2)] - - >>> spark.createDataFrame(rdd, "a: string, b: int").collect() - [Row(a=u'Alice', b=1)] - >>> rdd = rdd.map(lambda row: row[1]) - >>> spark.createDataFrame(rdd, "int").collect() - [Row(value=1)] - >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - Py4JJavaError: ... 
- """ - if isinstance(data, DataFrame): - raise TypeError("data is already a DataFrame") - - if isinstance(schema, basestring): - schema = _parse_datatype_string(schema) - elif isinstance(schema, (list, tuple)): - # Must re-encode any unicode strings to be consistent with StructField names - schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] - - try: - import pandas - has_pandas = True - except Exception: - has_pandas = False - if has_pandas and isinstance(data, pandas.DataFrame): - from pyspark.sql.utils import require_minimum_pandas_version - require_minimum_pandas_version() - - if self._wrapped._conf.pandasRespectSessionTimeZone(): - timezone = self._wrapped._conf.sessionLocalTimeZone() - else: - timezone = None - - # If no schema supplied by user then get the names of columns only - if schema is None: - schema = [str(x) if not isinstance(x, basestring) else - (x.encode('utf-8') if not isinstance(x, str) else x) - for x in data.columns] - - if self._wrapped._conf.arrowEnabled() and len(data) > 0: - try: - return self._create_from_pandas_with_arrow(data, schema, timezone) - except Exception as e: - from pyspark.util import _exception_message - - if self._wrapped._conf.arrowFallbackEnabled(): - msg = ( - "createDataFrame attempted Arrow optimization because " - "'spark.sql.execution.arrow.enabled' is set to true; however, " - "failed by the reason below:\n %s\n" - "Attempting non-optimization as " - "'spark.sql.execution.arrow.fallback.enabled' is set to " - "true." % _exception_message(e)) - warnings.warn(msg) - else: - msg = ( - "createDataFrame attempted Arrow optimization because " - "'spark.sql.execution.arrow.enabled' is set to true, but has reached " - "the error below and will not continue because automatic fallback " - "with 'spark.sql.execution.arrow.fallback.enabled' has been set to " - "false.\n %s" % _exception_message(e)) - warnings.warn(msg) - raise - data = self._convert_from_pandas(data, schema, timezone) - - if isinstance(schema, StructType): - verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True - - def prepare(obj): - verify_func(obj) - return obj - elif isinstance(schema, DataType): - dataType = schema - schema = StructType().add("value", schema) - - verify_func = _make_type_verifier( - dataType, name="field value") if verifySchema else lambda _: True - - def prepare(obj): - verify_func(obj) - return obj, - else: - prepare = lambda obj: obj - - if isinstance(data, RDD): - rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) - else: - rdd, schema = self._createFromLocal(map(prepare, data), schema) - jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) - jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) - df = DataFrame(jdf, self._wrapped) - df._schema = schema - return df - - @ignore_unicode_prefix - @since(2.0) - def sql(self, sqlQuery): - """Returns a :class:`DataFrame` representing the result of the given query. - - :return: :class:`DataFrame` - - >>> df.createOrReplaceTempView("table1") - >>> df2 = spark.sql("SELECT field1 AS f1, field2 as f2 from table1") - >>> df2.collect() - [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] - """ - return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped) - - @since(2.0) - def table(self, tableName): - """Returns the specified table as a :class:`DataFrame`. 
- - :return: :class:`DataFrame` - - >>> df.createOrReplaceTempView("table1") - >>> df2 = spark.table("table1") - >>> sorted(df.collect()) == sorted(df2.collect()) - True - """ - return DataFrame(self._jsparkSession.table(tableName), self._wrapped) - - @property - @since(2.0) - def read(self): - """ - Returns a :class:`DataFrameReader` that can be used to read data - in as a :class:`DataFrame`. - - :return: :class:`DataFrameReader` - """ - return DataFrameReader(self._wrapped) - - @property - @since(2.0) - def readStream(self): - """ - Returns a :class:`DataStreamReader` that can be used to read data streams - as a streaming :class:`DataFrame`. - - .. note:: Evolving. - - :return: :class:`DataStreamReader` - """ - return DataStreamReader(self._wrapped) - - @property - @since(2.0) - def streams(self): - """Returns a :class:`StreamingQueryManager` that allows managing all the - :class:`StreamingQuery` StreamingQueries active on `this` context. - - .. note:: Evolving. - - :return: :class:`StreamingQueryManager` - """ - from pyspark.sql.streaming import StreamingQueryManager - return StreamingQueryManager(self._jsparkSession.streams()) - - @since(2.0) - def stop(self): - """Stop the underlying :class:`SparkContext`. - """ - self._sc.stop() - # We should clean the default session up. See SPARK-23228. - self._jvm.SparkSession.clearDefaultSession() - SparkSession._instantiatedSession = None - - @since(2.0) - def __enter__(self): - """ - Enable 'with SparkSession.builder.(...).getOrCreate() as session: app' syntax. - """ - return self - - @since(2.0) - def __exit__(self, exc_type, exc_val, exc_tb): - """ - Enable 'with SparkSession.builder.(...).getOrCreate() as session: app' syntax. - - Specifically stop the SparkSession on exit of the with block. - """ - self.stop() - - -def _test(): - import os - import doctest - from pyspark.context import SparkContext - from pyspark.sql import Row - import pyspark.sql.session - - os.chdir(os.environ["SPARK_HOME"]) - - globs = pyspark.sql.session.__dict__.copy() - sc = SparkContext('local[4]', 'PythonTest') - globs['sc'] = sc - globs['spark'] = SparkSession(sc) - globs['rdd'] = rdd = sc.parallelize( - [Row(field1=1, field2="row1"), - Row(field1=2, field2="row2"), - Row(field1=3, field2="row3")]) - globs['df'] = rdd.toDF() - (failure_count, test_count) = doctest.testmod( - pyspark.sql.session, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) - globs['sc'].stop() - if failure_count: - sys.exit(-1) - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/streaming.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/streaming.py deleted file mode 100644 index b18453b..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/streaming.py +++ /dev/null @@ -1,1145 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import json - -if sys.version >= '3': - basestring = str - -from py4j.java_gateway import java_import - -from pyspark import since, keyword_only -from pyspark.rdd import ignore_unicode_prefix -from pyspark.sql.column import _to_seq -from pyspark.sql.readwriter import OptionUtils, to_str -from pyspark.sql.types import * -from pyspark.sql.utils import ForeachBatchFunction, StreamingQueryException - -__all__ = ["StreamingQuery", "StreamingQueryManager", "DataStreamReader", "DataStreamWriter"] - - -class StreamingQuery(object): - """ - A handle to a query that is executing continuously in the background as new data arrives. - All these methods are thread-safe. - - .. note:: Evolving - - .. versionadded:: 2.0 - """ - - def __init__(self, jsq): - self._jsq = jsq - - @property - @since(2.0) - def id(self): - """Returns the unique id of this query that persists across restarts from checkpoint data. - That is, this id is generated when a query is started for the first time, and - will be the same every time it is restarted from checkpoint data. - There can only be one query with the same id active in a Spark cluster. - Also see, `runId`. - """ - return self._jsq.id().toString() - - @property - @since(2.1) - def runId(self): - """Returns the unique id of this query that does not persist across restarts. That is, every - query that is started (or restarted from checkpoint) will have a different runId. - """ - return self._jsq.runId().toString() - - @property - @since(2.0) - def name(self): - """Returns the user-specified name of the query, or null if not specified. - This name can be specified in the `org.apache.spark.sql.streaming.DataStreamWriter` - as `dataframe.writeStream.queryName("query").start()`. - This name, if set, must be unique across all active queries. - """ - return self._jsq.name() - - @property - @since(2.0) - def isActive(self): - """Whether this streaming query is currently active or not. - """ - return self._jsq.isActive() - - @since(2.0) - def awaitTermination(self, timeout=None): - """Waits for the termination of `this` query, either by :func:`query.stop()` or by an - exception. If the query has terminated with an exception, then the exception will be thrown. - If `timeout` is set, it returns whether the query has terminated or not within the - `timeout` seconds. - - If the query has terminated, then all subsequent calls to this method will either return - immediately (if the query was terminated by :func:`stop()`), or throw the exception - immediately (if the query has terminated with exception). - - throws :class:`StreamingQueryException`, if `this` query has terminated with an exception - """ - if timeout is not None: - if not isinstance(timeout, (int, float)) or timeout < 0: - raise ValueError("timeout must be a positive integer or float. Got %s" % timeout) - return self._jsq.awaitTermination(int(timeout * 1000)) - else: - return self._jsq.awaitTermination() - - @property - @since(2.1) - def status(self): - """ - Returns the current status of the query. - """ - return json.loads(self._jsq.status().json()) - - @property - @since(2.1) - def recentProgress(self): - """Returns an array of the most recent [[StreamingQueryProgress]] updates for this query. - The number of progress updates retained for each stream is configured by Spark session - configuration `spark.sql.streaming.numRecentProgressUpdates`. 
- """ - return [json.loads(p.json()) for p in self._jsq.recentProgress()] - - @property - @since(2.1) - def lastProgress(self): - """ - Returns the most recent :class:`StreamingQueryProgress` update of this streaming query or - None if there were no progress updates - :return: a map - """ - lastProgress = self._jsq.lastProgress() - if lastProgress: - return json.loads(lastProgress.json()) - else: - return None - - @since(2.0) - def processAllAvailable(self): - """Blocks until all available data in the source has been processed and committed to the - sink. This method is intended for testing. - - .. note:: In the case of continually arriving data, this method may block forever. - Additionally, this method is only guaranteed to block until data that has been - synchronously appended data to a stream source prior to invocation. - (i.e. `getOffset` must immediately reflect the addition). - """ - return self._jsq.processAllAvailable() - - @since(2.0) - def stop(self): - """Stop this streaming query. - """ - self._jsq.stop() - - @since(2.1) - def explain(self, extended=False): - """Prints the (logical and physical) plans to the console for debugging purpose. - - :param extended: boolean, default ``False``. If ``False``, prints only the physical plan. - - >>> sq = sdf.writeStream.format('memory').queryName('query_explain').start() - >>> sq.processAllAvailable() # Wait a bit to generate the runtime plans. - >>> sq.explain() - == Physical Plan == - ... - >>> sq.explain(True) - == Parsed Logical Plan == - ... - == Analyzed Logical Plan == - ... - == Optimized Logical Plan == - ... - == Physical Plan == - ... - >>> sq.stop() - """ - # Cannot call `_jsq.explain(...)` because it will print in the JVM process. - # We should print it in the Python process. - print(self._jsq.explainInternal(extended)) - - @since(2.1) - def exception(self): - """ - :return: the StreamingQueryException if the query was terminated by an exception, or None. - """ - if self._jsq.exception().isDefined(): - je = self._jsq.exception().get() - msg = je.toString().split(': ', 1)[1] # Drop the Java StreamingQueryException type info - stackTrace = '\n\t at '.join(map(lambda x: x.toString(), je.getStackTrace())) - return StreamingQueryException(msg, stackTrace) - else: - return None - - -class StreamingQueryManager(object): - """A class to manage all the :class:`StreamingQuery` StreamingQueries active. - - .. note:: Evolving - - .. versionadded:: 2.0 - """ - - def __init__(self, jsqm): - self._jsqm = jsqm - - @property - @ignore_unicode_prefix - @since(2.0) - def active(self): - """Returns a list of active queries associated with this SQLContext - - >>> sq = sdf.writeStream.format('memory').queryName('this_query').start() - >>> sqm = spark.streams - >>> # get the list of active streaming queries - >>> [q.name for q in sqm.active] - [u'this_query'] - >>> sq.stop() - """ - return [StreamingQuery(jsq) for jsq in self._jsqm.active()] - - @ignore_unicode_prefix - @since(2.0) - def get(self, id): - """Returns an active query from this SQLContext or throws exception if an active query - with this name doesn't exist. 
- - >>> sq = sdf.writeStream.format('memory').queryName('this_query').start() - >>> sq.name - u'this_query' - >>> sq = spark.streams.get(sq.id) - >>> sq.isActive - True - >>> sq = sqlContext.streams.get(sq.id) - >>> sq.isActive - True - >>> sq.stop() - """ - return StreamingQuery(self._jsqm.get(id)) - - @since(2.0) - def awaitAnyTermination(self, timeout=None): - """Wait until any of the queries on the associated SQLContext has terminated since the - creation of the context, or since :func:`resetTerminated()` was called. If any query was - terminated with an exception, then the exception will be thrown. - If `timeout` is set, it returns whether the query has terminated or not within the - `timeout` seconds. - - If a query has terminated, then subsequent calls to :func:`awaitAnyTermination()` will - either return immediately (if the query was terminated by :func:`query.stop()`), - or throw the exception immediately (if the query was terminated with exception). Use - :func:`resetTerminated()` to clear past terminations and wait for new terminations. - - In the case where multiple queries have terminated since :func:`resetTermination()` - was called, if any query has terminated with exception, then :func:`awaitAnyTermination()` - will throw any of the exception. For correctly documenting exceptions across multiple - queries, users need to stop all of them after any of them terminates with exception, and - then check the `query.exception()` for each query. - - throws :class:`StreamingQueryException`, if `this` query has terminated with an exception - """ - if timeout is not None: - if not isinstance(timeout, (int, float)) or timeout < 0: - raise ValueError("timeout must be a positive integer or float. Got %s" % timeout) - return self._jsqm.awaitAnyTermination(int(timeout * 1000)) - else: - return self._jsqm.awaitAnyTermination() - - @since(2.0) - def resetTerminated(self): - """Forget about past terminated queries so that :func:`awaitAnyTermination()` can be used - again to wait for new terminations. - - >>> spark.streams.resetTerminated() - """ - self._jsqm.resetTerminated() - - -class DataStreamReader(OptionUtils): - """ - Interface used to load a streaming :class:`DataFrame` from external storage systems - (e.g. file systems, key-value stores, etc). Use :func:`spark.readStream` - to access this. - - .. note:: Evolving. - - .. versionadded:: 2.0 - """ - - def __init__(self, spark): - self._jreader = spark._ssql_ctx.readStream() - self._spark = spark - - def _df(self, jdf): - from pyspark.sql.dataframe import DataFrame - return DataFrame(jdf, self._spark) - - @since(2.0) - def format(self, source): - """Specifies the input data source format. - - .. note:: Evolving. - - :param source: string, name of the data source, e.g. 'json', 'parquet'. - - >>> s = spark.readStream.format("text") - """ - self._jreader = self._jreader.format(source) - return self - - @since(2.0) - def schema(self, schema): - """Specifies the input schema. - - Some data sources (e.g. JSON) can infer the input schema automatically from data. - By specifying the schema here, the underlying data source can skip the schema - inference step, and thus speed up data loading. - - .. note:: Evolving. - - :param schema: a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string - (For example ``col0 INT, col1 DOUBLE``). 
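A short sketch of monitoring and managing streaming queries through spark.streams, assuming one or more queries have been started:

for q in spark.streams.active:           # handles for all active StreamingQueries
    print(q.name, q.isActive, q.status)  # status is a dict parsed from JSON
    if q.lastProgress is not None:
        print(q.lastProgress["batchId"])  # assumed field of the progress dict

spark.streams.resetTerminated()                # forget past terminated queries
spark.streams.awaitAnyTermination(timeout=10)  # wait up to 10 seconds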
- - >>> s = spark.readStream.schema(sdf_schema) - >>> s = spark.readStream.schema("col0 INT, col1 DOUBLE") - """ - from pyspark.sql import SparkSession - spark = SparkSession.builder.getOrCreate() - if isinstance(schema, StructType): - jschema = spark._jsparkSession.parseDataType(schema.json()) - self._jreader = self._jreader.schema(jschema) - elif isinstance(schema, basestring): - self._jreader = self._jreader.schema(schema) - else: - raise TypeError("schema should be StructType or string") - return self - - @since(2.0) - def option(self, key, value): - """Adds an input option for the underlying data source. - - You can set the following option(s) for reading files: - * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps - in the JSON/CSV datasources or partition values. - If it isn't set, it uses the default value, session local timezone. - - .. note:: Evolving. - - >>> s = spark.readStream.option("x", 1) - """ - self._jreader = self._jreader.option(key, to_str(value)) - return self - - @since(2.0) - def options(self, **options): - """Adds input options for the underlying data source. - - You can set the following option(s) for reading files: - * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps - in the JSON/CSV datasources or partition values. - If it isn't set, it uses the default value, session local timezone. - - .. note:: Evolving. - - >>> s = spark.readStream.options(x="1", y=2) - """ - for k in options: - self._jreader = self._jreader.option(k, to_str(options[k])) - return self - - @since(2.0) - def load(self, path=None, format=None, schema=None, **options): - """Loads a data stream from a data source and returns it as a :class`DataFrame`. - - .. note:: Evolving. - - :param path: optional string for file-system backed data sources. - :param format: optional string for format of the data source. Default to 'parquet'. - :param schema: optional :class:`pyspark.sql.types.StructType` for the input schema - or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). - :param options: all other string options - - >>> json_sdf = spark.readStream.format("json") \\ - ... .schema(sdf_schema) \\ - ... .load(tempfile.mkdtemp()) - >>> json_sdf.isStreaming - True - >>> json_sdf.schema == sdf_schema - True - """ - if format is not None: - self.format(format) - if schema is not None: - self.schema(schema) - self.options(**options) - if path is not None: - if type(path) != str or len(path.strip()) == 0: - raise ValueError("If the path is provided for stream, it needs to be a " + - "non-empty string. List of paths are not supported.") - return self._df(self._jreader.load(path)) - else: - return self._df(self._jreader.load()) - - @since(2.0) - def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, - allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None, - allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, - mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None, - multiLine=None, allowUnquotedControlChars=None, lineSep=None): - """ - Loads a JSON file stream and returns the results as a :class:`DataFrame`. - - `JSON Lines `_ (newline-delimited JSON) is supported by default. - For JSON (one record per file), set the ``multiLine`` parameter to ``true``. - - If the ``schema`` parameter is not specified, this function goes - through the input once to determine the input schema. - - .. note:: Evolving. 
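A minimal sketch of building a streaming reader with the methods above; the schema, option and input directory are illustrative (maxFilesPerTrigger is a file-source option, assumed here rather than listed above):

stream_df = (spark.readStream
             .format("json")
             .schema("id INT, payload STRING")  # DDL-formatted schema string
             .option("maxFilesPerTrigger", 1)   # throttle ingestion to one file per batch
             .load("/tmp/incoming_json"))       # hypothetical input directory
print(stream_df.isStreaming)                    # True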
- - :param path: string represents path to the JSON dataset, - or RDD of Strings storing JSON objects. - :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema - or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). - :param primitivesAsString: infers all primitive values as a string type. If None is set, - it uses the default value, ``false``. - :param prefersDecimal: infers all floating-point values as a decimal type. If the values - do not fit in decimal, then it infers them as doubles. If None is - set, it uses the default value, ``false``. - :param allowComments: ignores Java/C++ style comment in JSON records. If None is set, - it uses the default value, ``false``. - :param allowUnquotedFieldNames: allows unquoted JSON field names. If None is set, - it uses the default value, ``false``. - :param allowSingleQuotes: allows single quotes in addition to double quotes. If None is - set, it uses the default value, ``true``. - :param allowNumericLeadingZero: allows leading zeros in numbers (e.g. 00012). If None is - set, it uses the default value, ``false``. - :param allowBackslashEscapingAnyCharacter: allows accepting quoting of all character - using backslash quoting mechanism. If None is - set, it uses the default value, ``false``. - :param mode: allows a mode for dealing with corrupt records during parsing. If None is - set, it uses the default value, ``PERMISSIVE``. - - * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \ - into a field configured by ``columnNameOfCorruptRecord``, and sets other \ - fields to ``null``. To keep corrupt records, an user can set a string type \ - field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \ - schema does not have the field, it drops corrupt records during parsing. \ - When inferring a schema, it implicitly adds a ``columnNameOfCorruptRecord`` \ - field in an output schema. - * ``DROPMALFORMED`` : ignores the whole corrupted records. - * ``FAILFAST`` : throws an exception when it meets corrupted records. - - :param columnNameOfCorruptRecord: allows renaming the new field having malformed string - created by ``PERMISSIVE`` mode. This overrides - ``spark.sql.columnNameOfCorruptRecord``. If None is set, - it uses the value specified in - ``spark.sql.columnNameOfCorruptRecord``. - :param dateFormat: sets the string that indicates a date format. Custom date formats - follow the formats at ``java.text.SimpleDateFormat``. This - applies to date type. If None is set, it uses the - default value, ``yyyy-MM-dd``. - :param timestampFormat: sets the string that indicates a timestamp format. Custom date - formats follow the formats at ``java.text.SimpleDateFormat``. - This applies to timestamp type. If None is set, it uses the - default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``. - :param multiLine: parse one record, which may span multiple lines, per file. If None is - set, it uses the default value, ``false``. - :param allowUnquotedControlChars: allows JSON Strings to contain unquoted control - characters (ASCII characters with value less than 32, - including tab and line feed characters) or not. - :param lineSep: defines the line separator that should be used for parsing. If None is - set, it covers all ``\\r``, ``\\r\\n`` and ``\\n``. 
- - >>> json_sdf = spark.readStream.json(tempfile.mkdtemp(), schema = sdf_schema) - >>> json_sdf.isStreaming - True - >>> json_sdf.schema == sdf_schema - True - """ - self._set_opts( - schema=schema, primitivesAsString=primitivesAsString, prefersDecimal=prefersDecimal, - allowComments=allowComments, allowUnquotedFieldNames=allowUnquotedFieldNames, - allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero, - allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter, - mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat, - timestampFormat=timestampFormat, multiLine=multiLine, - allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep) - if isinstance(path, basestring): - return self._df(self._jreader.json(path)) - else: - raise TypeError("path can be only a single string") - - @since(2.3) - def orc(self, path): - """Loads a ORC file stream, returning the result as a :class:`DataFrame`. - - .. note:: Evolving. - - >>> orc_sdf = spark.readStream.schema(sdf_schema).orc(tempfile.mkdtemp()) - >>> orc_sdf.isStreaming - True - >>> orc_sdf.schema == sdf_schema - True - """ - if isinstance(path, basestring): - return self._df(self._jreader.orc(path)) - else: - raise TypeError("path can be only a single string") - - @since(2.0) - def parquet(self, path): - """Loads a Parquet file stream, returning the result as a :class:`DataFrame`. - - You can set the following Parquet-specific option(s) for reading Parquet files: - * ``mergeSchema``: sets whether we should merge schemas collected from all \ - Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \ - The default value is specified in ``spark.sql.parquet.mergeSchema``. - - .. note:: Evolving. - - >>> parquet_sdf = spark.readStream.schema(sdf_schema).parquet(tempfile.mkdtemp()) - >>> parquet_sdf.isStreaming - True - >>> parquet_sdf.schema == sdf_schema - True - """ - if isinstance(path, basestring): - return self._df(self._jreader.parquet(path)) - else: - raise TypeError("path can be only a single string") - - @ignore_unicode_prefix - @since(2.0) - def text(self, path, wholetext=False, lineSep=None): - """ - Loads a text file stream and returns a :class:`DataFrame` whose schema starts with a - string column named "value", and followed by partitioned columns if there - are any. - - By default, each line in the text file is a new row in the resulting DataFrame. - - .. note:: Evolving. - - :param paths: string, or list of strings, for input path(s). - :param wholetext: if true, read each file from input path(s) as a single row. - :param lineSep: defines the line separator that should be used for parsing. If None is - set, it covers all ``\\r``, ``\\r\\n`` and ``\\n``. 
- - >>> text_sdf = spark.readStream.text(tempfile.mkdtemp()) - >>> text_sdf.isStreaming - True - >>> "value" in str(text_sdf.schema) - True - """ - self._set_opts(wholetext=wholetext, lineSep=lineSep) - if isinstance(path, basestring): - return self._df(self._jreader.text(path)) - else: - raise TypeError("path can be only a single string") - - @since(2.0) - def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, - comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, - ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, - negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, - maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, - columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, - enforceSchema=None, emptyValue=None): - r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`. - - This function will go through the input once to determine the input schema if - ``inferSchema`` is enabled. To avoid going through the entire data once, disable - ``inferSchema`` option or specify the schema explicitly using ``schema``. - - .. note:: Evolving. - - :param path: string, or list of strings, for input path(s). - :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema - or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``). - :param sep: sets a single character as a separator for each field and value. - If None is set, it uses the default value, ``,``. - :param encoding: decodes the CSV files by the given encoding type. If None is set, - it uses the default value, ``UTF-8``. - :param quote: sets a single character used for escaping quoted values where the - separator can be part of the value. If None is set, it uses the default - value, ``"``. If you would like to turn off quotations, you need to set an - empty string. - :param escape: sets a single character used for escaping quotes inside an already - quoted value. If None is set, it uses the default value, ``\``. - :param comment: sets a single character used for skipping lines beginning with this - character. By default (None), it is disabled. - :param header: uses the first line as names of columns. If None is set, it uses the - default value, ``false``. - :param inferSchema: infers the input schema automatically from data. It requires one extra - pass over the data. If None is set, it uses the default value, ``false``. - :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be - forcibly applied to datasource files, and headers in CSV files will be - ignored. If the option is set to ``false``, the schema will be - validated against all headers in CSV files or the first header in RDD - if the ``header`` option is set to ``true``. Field names in the schema - and column names in CSV headers are checked by their positions - taking into account ``spark.sql.caseSensitive``. If None is set, - ``true`` is used by default. Though the default value is ``true``, - it is recommended to disable the ``enforceSchema`` option - to avoid incorrect results. - :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from - values being read should be skipped. If None is set, it - uses the default value, ``false``. - :param ignoreTrailingWhiteSpace: a flag indicating whether or not trailing whitespaces from - values being read should be skipped. If None is set, it - uses the default value, ``false``. 
- :param nullValue: sets the string representation of a null value. If None is set, it uses - the default value, empty string. Since 2.0.1, this ``nullValue`` param - applies to all supported types including the string type. - :param nanValue: sets the string representation of a non-number value. If None is set, it - uses the default value, ``NaN``. - :param positiveInf: sets the string representation of a positive infinity value. If None - is set, it uses the default value, ``Inf``. - :param negativeInf: sets the string representation of a negative infinity value. If None - is set, it uses the default value, ``Inf``. - :param dateFormat: sets the string that indicates a date format. Custom date formats - follow the formats at ``java.text.SimpleDateFormat``. This - applies to date type. If None is set, it uses the - default value, ``yyyy-MM-dd``. - :param timestampFormat: sets the string that indicates a timestamp format. Custom date - formats follow the formats at ``java.text.SimpleDateFormat``. - This applies to timestamp type. If None is set, it uses the - default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``. - :param maxColumns: defines a hard limit of how many columns a record can have. If None is - set, it uses the default value, ``20480``. - :param maxCharsPerColumn: defines the maximum number of characters allowed for any given - value being read. If None is set, it uses the default value, - ``-1`` meaning unlimited length. - :param maxMalformedLogPerPartition: this parameter is no longer used since Spark 2.2.0. - If specified, it is ignored. - :param mode: allows a mode for dealing with corrupt records during parsing. If None is - set, it uses the default value, ``PERMISSIVE``. - - * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \ - into a field configured by ``columnNameOfCorruptRecord``, and sets other \ - fields to ``null``. To keep corrupt records, an user can set a string type \ - field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \ - schema does not have the field, it drops corrupt records during parsing. \ - A record with less/more tokens than schema is not a corrupted record to CSV. \ - When it meets a record having fewer tokens than the length of the schema, \ - sets ``null`` to extra fields. When the record has more tokens than the \ - length of the schema, it drops extra tokens. - * ``DROPMALFORMED`` : ignores the whole corrupted records. - * ``FAILFAST`` : throws an exception when it meets corrupted records. - - :param columnNameOfCorruptRecord: allows renaming the new field having malformed string - created by ``PERMISSIVE`` mode. This overrides - ``spark.sql.columnNameOfCorruptRecord``. If None is set, - it uses the value specified in - ``spark.sql.columnNameOfCorruptRecord``. - :param multiLine: parse one record, which may span multiple lines. If None is - set, it uses the default value, ``false``. - :param charToEscapeQuoteEscaping: sets a single character used for escaping the escape for - the quote character. If None is set, the default value is - escape character when escape and quote characters are - different, ``\0`` otherwise.. - :param emptyValue: sets the string representation of an empty value. If None is set, it uses - the default value, empty string. 
- - >>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema) - >>> csv_sdf.isStreaming - True - >>> csv_sdf.schema == sdf_schema - True - """ - self._set_opts( - schema=schema, sep=sep, encoding=encoding, quote=quote, escape=escape, comment=comment, - header=header, inferSchema=inferSchema, ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace, - ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, nullValue=nullValue, - nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf, - dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns, - maxCharsPerColumn=maxCharsPerColumn, - maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, - columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, - charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema, - emptyValue=emptyValue) - if isinstance(path, basestring): - return self._df(self._jreader.csv(path)) - else: - raise TypeError("path can be only a single string") - - -class DataStreamWriter(object): - """ - Interface used to write a streaming :class:`DataFrame` to external storage systems - (e.g. file systems, key-value stores, etc). Use :func:`DataFrame.writeStream` - to access this. - - .. note:: Evolving. - - .. versionadded:: 2.0 - """ - - def __init__(self, df): - self._df = df - self._spark = df.sql_ctx - self._jwrite = df._jdf.writeStream() - - def _sq(self, jsq): - from pyspark.sql.streaming import StreamingQuery - return StreamingQuery(jsq) - - @since(2.0) - def outputMode(self, outputMode): - """Specifies how data of a streaming DataFrame/Dataset is written to a streaming sink. - - Options include: - - * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to - the sink - * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the sink - every time there are some updates - * `update`: Only the rows that were updated in the streaming DataFrame/Dataset will be - written to the sink every time there are some updates. If the query doesn't contain - aggregations, it will be equivalent to `append` mode. - - .. note:: Evolving. - - >>> writer = sdf.writeStream.outputMode('append') - """ - if not outputMode or type(outputMode) != str or len(outputMode.strip()) == 0: - raise ValueError('The output mode must be a non-empty string. Got: %s' % outputMode) - self._jwrite = self._jwrite.outputMode(outputMode) - return self - - @since(2.0) - def format(self, source): - """Specifies the underlying output data source. - - .. note:: Evolving. - - :param source: string, name of the data source, which for now can be 'parquet'. - - >>> writer = sdf.writeStream.format('json') - """ - self._jwrite = self._jwrite.format(source) - return self - - @since(2.0) - def option(self, key, value): - """Adds an output option for the underlying data source. - - You can set the following option(s) for writing files: - * ``timeZone``: sets the string that indicates a timezone to be used to format - timestamps in the JSON/CSV datasources or partition values. - If it isn't set, it uses the default value, session local timezone. - - .. note:: Evolving. - """ - self._jwrite = self._jwrite.option(key, to_str(value)) - return self - - @since(2.0) - def options(self, **options): - """Adds output options for the underlying data source. - - You can set the following option(s) for writing files: - * ``timeZone``: sets the string that indicates a timezone to be used to format - timestamps in the JSON/CSV datasources or partition values.
- If it isn't set, it uses the default value, session local timezone. - - .. note:: Evolving. - """ - for k in options: - self._jwrite = self._jwrite.option(k, to_str(options[k])) - return self - - @since(2.0) - def partitionBy(self, *cols): - """Partitions the output by the given columns on the file system. - - If specified, the output is laid out on the file system similar - to Hive's partitioning scheme. - - .. note:: Evolving. - - :param cols: name of columns - - """ - if len(cols) == 1 and isinstance(cols[0], (list, tuple)): - cols = cols[0] - self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols)) - return self - - @since(2.0) - def queryName(self, queryName): - """Specifies the name of the :class:`StreamingQuery` that can be started with - :func:`start`. This name must be unique among all the currently active queries - in the associated SparkSession. - - .. note:: Evolving. - - :param queryName: unique name for the query - - >>> writer = sdf.writeStream.queryName('streaming_query') - """ - if not queryName or type(queryName) != str or len(queryName.strip()) == 0: - raise ValueError('The queryName must be a non-empty string. Got: %s' % queryName) - self._jwrite = self._jwrite.queryName(queryName) - return self - - @keyword_only - @since(2.0) - def trigger(self, processingTime=None, once=None, continuous=None): - """Set the trigger for the stream query. If this is not set it will run the query as fast - as possible, which is equivalent to setting the trigger to ``processingTime='0 seconds'``. - - .. note:: Evolving. - - :param processingTime: a processing time interval as a string, e.g. '5 seconds', '1 minute'. - Set a trigger that runs a query periodically based on the processing - time. Only one trigger can be set. - :param once: if set to True, set a trigger that processes only one batch of data in a - streaming query then terminates the query. Only one trigger can be set. - - >>> # trigger the query for execution every 5 seconds - >>> writer = sdf.writeStream.trigger(processingTime='5 seconds') - >>> # trigger the query for just once batch of data - >>> writer = sdf.writeStream.trigger(once=True) - >>> # trigger the query for execution every 5 seconds - >>> writer = sdf.writeStream.trigger(continuous='5 seconds') - """ - params = [processingTime, once, continuous] - - if params.count(None) == 3: - raise ValueError('No trigger provided') - elif params.count(None) < 2: - raise ValueError('Multiple triggers not allowed.') - - jTrigger = None - if processingTime is not None: - if type(processingTime) != str or len(processingTime.strip()) == 0: - raise ValueError('Value for processingTime must be a non empty string. Got: %s' % - processingTime) - interval = processingTime.strip() - jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.ProcessingTime( - interval) - - elif once is not None: - if once is not True: - raise ValueError('Value for once must be True. Got: %s' % once) - jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Once() - - else: - if type(continuous) != str or len(continuous.strip()) == 0: - raise ValueError('Value for continuous must be a non empty string. Got: %s' % - continuous) - interval = continuous.strip() - jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Continuous( - interval) - - self._jwrite = self._jwrite.trigger(jTrigger) - return self - - @since(2.4) - def foreach(self, f): - """ - Sets the output of the streaming query to be processed using the provided writer ``f``. 
- This is often used to write the output of a streaming query to arbitrary storage systems. - The processing logic can be specified in two ways. - - #. A **function** that takes a row as input. - This is a simple way to express your processing logic. Note that this does - not allow you to deduplicate generated data when failures cause reprocessing of - some input data. That would require you to specify the processing logic in the next - way. - - #. An **object** with a ``process`` method and optional ``open`` and ``close`` methods. - The object can have the following methods. - - * ``open(partition_id, epoch_id)``: *Optional* method that initializes the processing - (for example, open a connection, start a transaction, etc). Additionally, you can - use the `partition_id` and `epoch_id` to deduplicate regenerated data - (discussed later). - - * ``process(row)``: *Non-optional* method that processes each :class:`Row`. - - * ``close(error)``: *Optional* method that finalizes and cleans up (for example, - close connection, commit transaction, etc.) after all rows have been processed. - - The object will be used by Spark in the following way. - - * A single copy of this object is responsible of all the data generated by a - single task in a query. In other words, one instance is responsible for - processing one partition of the data generated in a distributed manner. - - * This object must be serializable because each task will get a fresh - serialized-deserialized copy of the provided object. Hence, it is strongly - recommended that any initialization for writing data (e.g. opening a - connection or starting a transaction) is done after the `open(...)` - method has been called, which signifies that the task is ready to generate data. - - * The lifecycle of the methods are as follows. - - For each partition with ``partition_id``: - - ... For each batch/epoch of streaming data with ``epoch_id``: - - ....... Method ``open(partitionId, epochId)`` is called. - - ....... If ``open(...)`` returns true, for each row in the partition and - batch/epoch, method ``process(row)`` is called. - - ....... Method ``close(errorOrNull)`` is called with error (if any) seen while - processing rows. - - Important points to note: - - * The `partitionId` and `epochId` can be used to deduplicate generated data when - failures cause reprocessing of some input data. This depends on the execution - mode of the query. If the streaming query is being executed in the micro-batch - mode, then every partition represented by a unique tuple (partition_id, epoch_id) - is guaranteed to have the same data. Hence, (partition_id, epoch_id) can be used - to deduplicate and/or transactionally commit data and achieve exactly-once - guarantees. However, if the streaming query is being executed in the continuous - mode, then this guarantee does not hold and therefore should not be used for - deduplication. - - * The ``close()`` method (if exists) will be called if `open()` method exists and - returns successfully (irrespective of the return value), except if the Python - crashes in the middle. - - .. note:: Evolving. - - >>> # Print every row using a function - >>> def print_row(row): - ... print(row) - ... - >>> writer = sdf.writeStream.foreach(print_row) - >>> # Print every row using a object with process() method - >>> class RowPrinter: - ... def open(self, partition_id, epoch_id): - ... print("Opened %d, %d" % (partition_id, epoch_id)) - ... return True - ... def process(self, row): - ... print(row) - ... 
def close(self, error): - ... print("Closed with error: %s" % str(error)) - ... - >>> writer = sdf.writeStream.foreach(RowPrinter()) - """ - - from pyspark.rdd import _wrap_function - from pyspark.serializers import PickleSerializer, AutoBatchedSerializer - from pyspark.taskcontext import TaskContext - - if callable(f): - # The provided object is a callable function that is supposed to be called on each row. - # Construct a function that takes an iterator and calls the provided function on each - # row. - def func_without_process(_, iterator): - for x in iterator: - f(x) - return iter([]) - - func = func_without_process - - else: - # The provided object is not a callable function. Then it is expected to have a - # 'process(row)' method, and optional 'open(partition_id, epoch_id)' and - # 'close(error)' methods. - - if not hasattr(f, 'process'): - raise Exception("Provided object does not have a 'process' method") - - if not callable(getattr(f, 'process')): - raise Exception("Attribute 'process' in provided object is not callable") - - def doesMethodExist(method_name): - exists = hasattr(f, method_name) - if exists and not callable(getattr(f, method_name)): - raise Exception( - "Attribute '%s' in provided object is not callable" % method_name) - return exists - - open_exists = doesMethodExist('open') - close_exists = doesMethodExist('close') - - def func_with_open_process_close(partition_id, iterator): - epoch_id = TaskContext.get().getLocalProperty('streaming.sql.batchId') - if epoch_id: - epoch_id = int(epoch_id) - else: - raise Exception("Could not get batch id from TaskContext") - - # Check if the data should be processed - should_process = True - if open_exists: - should_process = f.open(partition_id, epoch_id) - - error = None - - try: - if should_process: - for x in iterator: - f.process(x) - except Exception as ex: - error = ex - finally: - if close_exists: - f.close(error) - if error: - raise error - - return iter([]) - - func = func_with_open_process_close - - serializer = AutoBatchedSerializer(PickleSerializer()) - wrapped_func = _wrap_function(self._spark._sc, func, serializer, serializer) - jForeachWriter = \ - self._spark._sc._jvm.org.apache.spark.sql.execution.python.PythonForeachWriter( - wrapped_func, self._df._jdf.schema()) - self._jwrite.foreach(jForeachWriter) - return self - - @since(2.4) - def foreachBatch(self, func): - """ - Sets the output of the streaming query to be processed using the provided - function. This is supported only in the micro-batch execution modes (that is, when the - trigger is not continuous). In every micro-batch, the provided function will be called - with (i) the output rows as a DataFrame and (ii) the batch identifier. - The batchId can be used to deduplicate and transactionally write the output - (that is, the provided Dataset) to external systems. The output DataFrame is guaranteed - to be exactly the same for the same batchId (assuming all operations are deterministic in the - query). - - .. note:: Evolving. - - >>> def func(batch_df, batch_id): - ... batch_df.collect() - ...
- >>> writer = sdf.writeStream.foreachBatch(func) - """ - - from pyspark.java_gateway import ensure_callback_server_started - gw = self._spark._sc._gateway - java_import(gw.jvm, "org.apache.spark.sql.execution.streaming.sources.*") - - wrapped_func = ForeachBatchFunction(self._spark, func) - gw.jvm.PythonForeachBatchHelper.callForeachBatch(self._jwrite, wrapped_func) - ensure_callback_server_started(gw) - return self - - @ignore_unicode_prefix - @since(2.0) - def start(self, path=None, format=None, outputMode=None, partitionBy=None, queryName=None, - **options): - """Streams the contents of the :class:`DataFrame` to a data source. - - The data source is specified by the ``format`` and a set of ``options``. - If ``format`` is not specified, the default data source configured by - ``spark.sql.sources.default`` will be used. - - .. note:: Evolving. - - :param path: the path in a Hadoop supported file system - :param format: the format used to save - :param outputMode: specifies how data of a streaming DataFrame/Dataset is written to a - streaming sink. - - * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to the - sink - * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the sink - every time there are some updates - * `update`: Only the rows that were updated in the streaming DataFrame/Dataset will be - written to the sink every time there are some updates. If the query doesn't contain - aggregations, it will be equivalent to `append` mode. - :param partitionBy: names of partitioning columns - :param queryName: unique name for the query - :param options: All other string options. You may want to provide a `checkpointLocation` - for most streams, however it is not required for a `memory` stream. - - >>> sq = sdf.writeStream.format('memory').queryName('this_query').start() - >>> sq.isActive - True - >>> sq.name - u'this_query' - >>> sq.stop() - >>> sq.isActive - False - >>> sq = sdf.writeStream.trigger(processingTime='5 seconds').start( - ...
queryName='that_query', outputMode="append", format='memory') - >>> sq.name - u'that_query' - >>> sq.isActive - True - >>> sq.stop() - """ - self.options(**options) - if outputMode is not None: - self.outputMode(outputMode) - if partitionBy is not None: - self.partitionBy(partitionBy) - if format is not None: - self.format(format) - if queryName is not None: - self.queryName(queryName) - if path is None: - return self._sq(self._jwrite.start()) - else: - return self._sq(self._jwrite.start(path)) - - -def _test(): - import doctest - import os - import tempfile - from pyspark.sql import Row, SparkSession, SQLContext - import pyspark.sql.streaming - - os.chdir(os.environ["SPARK_HOME"]) - - globs = pyspark.sql.streaming.__dict__.copy() - try: - spark = SparkSession.builder.getOrCreate() - except py4j.protocol.Py4JError: - spark = SparkSession(sc) - - globs['tempfile'] = tempfile - globs['os'] = os - globs['spark'] = spark - globs['sqlContext'] = SQLContext.getOrCreate(spark.sparkContext) - globs['sdf'] = \ - spark.readStream.format('text').load('python/test_support/sql/streaming') - globs['sdf_schema'] = StructType([StructField("data", StringType(), False)]) - globs['df'] = \ - globs['spark'].readStream.format('text').load('python/test_support/sql/streaming') - - (failure_count, test_count) = doctest.testmod( - pyspark.sql.streaming, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) - globs['spark'].stop() - - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/tests.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/tests.py deleted file mode 100644 index 2e6d015..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/tests.py +++ /dev/null @@ -1,6687 +0,0 @@ -# -*- encoding: utf-8 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Unit tests for pyspark.sql; additional tests are implemented as doctests in -individual modules. 
-""" -import os -import sys -import subprocess -import pydoc -import shutil -import tempfile -import threading -import pickle -import functools -import time -import datetime -import array -import ctypes -import warnings -import py4j -from contextlib import contextmanager - -try: - import xmlrunner -except ImportError: - xmlrunner = None - -if sys.version_info[:2] <= (2, 6): - try: - import unittest2 as unittest - except ImportError: - sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') - sys.exit(1) -else: - import unittest - -from pyspark.util import _exception_message - -_pandas_requirement_message = None -try: - from pyspark.sql.utils import require_minimum_pandas_version - require_minimum_pandas_version() -except ImportError as e: - # If Pandas version requirement is not satisfied, skip related tests. - _pandas_requirement_message = _exception_message(e) - -_pyarrow_requirement_message = None -try: - from pyspark.sql.utils import require_minimum_pyarrow_version - require_minimum_pyarrow_version() -except ImportError as e: - # If Arrow version requirement is not satisfied, skip related tests. - _pyarrow_requirement_message = _exception_message(e) - -_test_not_compiled_message = None -try: - from pyspark.sql.utils import require_test_compiled - require_test_compiled() -except Exception as e: - _test_not_compiled_message = _exception_message(e) - -_have_pandas = _pandas_requirement_message is None -_have_pyarrow = _pyarrow_requirement_message is None -_test_compiled = _test_not_compiled_message is None - -from pyspark import SparkContext -from pyspark.sql import SparkSession, SQLContext, HiveContext, Column, Row -from pyspark.sql.types import * -from pyspark.sql.types import UserDefinedType, _infer_type, _make_type_verifier -from pyspark.sql.types import _array_signed_int_typecode_ctype_mappings, _array_type_mappings -from pyspark.sql.types import _array_unsigned_int_typecode_ctype_mappings -from pyspark.sql.types import _merge_type -from pyspark.tests import QuietTest, ReusedPySparkTestCase, PySparkTestCase, SparkSubmitTests -from pyspark.sql.functions import UserDefinedFunction, sha2, lit -from pyspark.sql.window import Window -from pyspark.sql.utils import AnalysisException, ParseException, IllegalArgumentException - - -class UTCOffsetTimezone(datetime.tzinfo): - """ - Specifies timezone in UTC offset - """ - - def __init__(self, offset=0): - self.ZERO = datetime.timedelta(hours=offset) - - def utcoffset(self, dt): - return self.ZERO - - def dst(self, dt): - return self.ZERO - - -class ExamplePointUDT(UserDefinedType): - """ - User-defined type (UDT) for ExamplePoint. - """ - - @classmethod - def sqlType(self): - return ArrayType(DoubleType(), False) - - @classmethod - def module(cls): - return 'pyspark.sql.tests' - - @classmethod - def scalaUDT(cls): - return 'org.apache.spark.sql.test.ExamplePointUDT' - - def serialize(self, obj): - return [obj.x, obj.y] - - def deserialize(self, datum): - return ExamplePoint(datum[0], datum[1]) - - -class ExamplePoint: - """ - An example class to demonstrate UDT in Scala, Java, and Python. 
- """ - - __UDT__ = ExamplePointUDT() - - def __init__(self, x, y): - self.x = x - self.y = y - - def __repr__(self): - return "ExamplePoint(%s,%s)" % (self.x, self.y) - - def __str__(self): - return "(%s,%s)" % (self.x, self.y) - - def __eq__(self, other): - return isinstance(other, self.__class__) and \ - other.x == self.x and other.y == self.y - - -class PythonOnlyUDT(UserDefinedType): - """ - User-defined type (UDT) for ExamplePoint. - """ - - @classmethod - def sqlType(self): - return ArrayType(DoubleType(), False) - - @classmethod - def module(cls): - return '__main__' - - def serialize(self, obj): - return [obj.x, obj.y] - - def deserialize(self, datum): - return PythonOnlyPoint(datum[0], datum[1]) - - @staticmethod - def foo(): - pass - - @property - def props(self): - return {} - - -class PythonOnlyPoint(ExamplePoint): - """ - An example class to demonstrate UDT in only Python - """ - __UDT__ = PythonOnlyUDT() - - -class MyObject(object): - def __init__(self, key, value): - self.key = key - self.value = value - - -class SQLTestUtils(object): - """ - This util assumes the instance of this to have 'spark' attribute, having a spark session. - It is usually used with 'ReusedSQLTestCase' class but can be used if you feel sure the - the implementation of this class has 'spark' attribute. - """ - - @contextmanager - def sql_conf(self, pairs): - """ - A convenient context manager to test some configuration specific logic. This sets - `value` to the configuration `key` and then restores it back when it exits. - """ - assert isinstance(pairs, dict), "pairs should be a dictionary." - assert hasattr(self, "spark"), "it should have 'spark' attribute, having a spark session." - - keys = pairs.keys() - new_values = pairs.values() - old_values = [self.spark.conf.get(key, None) for key in keys] - for key, new_value in zip(keys, new_values): - self.spark.conf.set(key, new_value) - try: - yield - finally: - for key, old_value in zip(keys, old_values): - if old_value is None: - self.spark.conf.unset(key) - else: - self.spark.conf.set(key, old_value) - - -class ReusedSQLTestCase(ReusedPySparkTestCase, SQLTestUtils): - @classmethod - def setUpClass(cls): - super(ReusedSQLTestCase, cls).setUpClass() - cls.spark = SparkSession(cls.sc) - - @classmethod - def tearDownClass(cls): - super(ReusedSQLTestCase, cls).tearDownClass() - cls.spark.stop() - - def assertPandasEqual(self, expected, result): - msg = ("DataFrames are not equal: " + - "\n\nExpected:\n%s\n%s" % (expected, expected.dtypes) + - "\n\nResult:\n%s\n%s" % (result, result.dtypes)) - self.assertTrue(expected.equals(result), msg=msg) - - -class DataTypeTests(unittest.TestCase): - # regression test for SPARK-6055 - def test_data_type_eq(self): - lt = LongType() - lt2 = pickle.loads(pickle.dumps(LongType())) - self.assertEqual(lt, lt2) - - # regression test for SPARK-7978 - def test_decimal_type(self): - t1 = DecimalType() - t2 = DecimalType(10, 2) - self.assertTrue(t2 is not t1) - self.assertNotEqual(t1, t2) - t3 = DecimalType(8) - self.assertNotEqual(t2, t3) - - # regression test for SPARK-10392 - def test_datetype_equal_zero(self): - dt = DateType() - self.assertEqual(dt.fromInternal(0), datetime.date(1970, 1, 1)) - - # regression test for SPARK-17035 - def test_timestamp_microsecond(self): - tst = TimestampType() - self.assertEqual(tst.toInternal(datetime.datetime.max) % 1000000, 999999) - - def test_empty_row(self): - row = Row() - self.assertEqual(len(row), 0) - - def test_struct_field_type_name(self): - struct_field = StructField("a", 
IntegerType()) - self.assertRaises(TypeError, struct_field.typeName) - - def test_invalid_create_row(self): - row_class = Row("c1", "c2") - self.assertRaises(ValueError, lambda: row_class(1, 2, 3)) - - -class SQLTests(ReusedSQLTestCase): - - @classmethod - def setUpClass(cls): - ReusedSQLTestCase.setUpClass() - cls.tempdir = tempfile.NamedTemporaryFile(delete=False) - os.unlink(cls.tempdir.name) - cls.testData = [Row(key=i, value=str(i)) for i in range(100)] - cls.df = cls.spark.createDataFrame(cls.testData) - - @classmethod - def tearDownClass(cls): - ReusedSQLTestCase.tearDownClass() - shutil.rmtree(cls.tempdir.name, ignore_errors=True) - - def test_sqlcontext_reuses_sparksession(self): - sqlContext1 = SQLContext(self.sc) - sqlContext2 = SQLContext(self.sc) - self.assertTrue(sqlContext1.sparkSession is sqlContext2.sparkSession) - - def tearDown(self): - super(SQLTests, self).tearDown() - - # tear down test_bucketed_write state - self.spark.sql("DROP TABLE IF EXISTS pyspark_bucket") - - def test_row_should_be_read_only(self): - row = Row(a=1, b=2) - self.assertEqual(1, row.a) - - def foo(): - row.a = 3 - self.assertRaises(Exception, foo) - - row2 = self.spark.range(10).first() - self.assertEqual(0, row2.id) - - def foo2(): - row2.id = 2 - self.assertRaises(Exception, foo2) - - def test_range(self): - self.assertEqual(self.spark.range(1, 1).count(), 0) - self.assertEqual(self.spark.range(1, 0, -1).count(), 1) - self.assertEqual(self.spark.range(0, 1 << 40, 1 << 39).count(), 2) - self.assertEqual(self.spark.range(-2).count(), 0) - self.assertEqual(self.spark.range(3).count(), 3) - - def test_duplicated_column_names(self): - df = self.spark.createDataFrame([(1, 2)], ["c", "c"]) - row = df.select('*').first() - self.assertEqual(1, row[0]) - self.assertEqual(2, row[1]) - self.assertEqual("Row(c=1, c=2)", str(row)) - # Cannot access columns - self.assertRaises(AnalysisException, lambda: df.select(df[0]).first()) - self.assertRaises(AnalysisException, lambda: df.select(df.c).first()) - self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first()) - - def test_column_name_encoding(self): - """Ensure that created columns has `str` type consistently.""" - columns = self.spark.createDataFrame([('Alice', 1)], ['name', u'age']).columns - self.assertEqual(columns, ['name', 'age']) - self.assertTrue(isinstance(columns[0], str)) - self.assertTrue(isinstance(columns[1], str)) - - def test_explode(self): - from pyspark.sql.functions import explode, explode_outer, posexplode_outer - d = [ - Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"}), - Row(a=1, intlist=[], mapfield={}), - Row(a=1, intlist=None, mapfield=None), - ] - rdd = self.sc.parallelize(d) - data = self.spark.createDataFrame(rdd) - - result = data.select(explode(data.intlist).alias("a")).select("a").collect() - self.assertEqual(result[0][0], 1) - self.assertEqual(result[1][0], 2) - self.assertEqual(result[2][0], 3) - - result = data.select(explode(data.mapfield).alias("a", "b")).select("a", "b").collect() - self.assertEqual(result[0][0], "a") - self.assertEqual(result[0][1], "b") - - result = [tuple(x) for x in data.select(posexplode_outer("intlist")).collect()] - self.assertEqual(result, [(0, 1), (1, 2), (2, 3), (None, None), (None, None)]) - - result = [tuple(x) for x in data.select(posexplode_outer("mapfield")).collect()] - self.assertEqual(result, [(0, 'a', 'b'), (None, None, None), (None, None, None)]) - - result = [x[0] for x in data.select(explode_outer("intlist")).collect()] - self.assertEqual(result, [1, 2, 3, None, None]) - - 
result = [tuple(x) for x in data.select(explode_outer("mapfield")).collect()] - self.assertEqual(result, [('a', 'b'), (None, None), (None, None)]) - - def test_and_in_expression(self): - self.assertEqual(4, self.df.filter((self.df.key <= 10) & (self.df.value <= "2")).count()) - self.assertRaises(ValueError, lambda: (self.df.key <= 10) and (self.df.value <= "2")) - self.assertEqual(14, self.df.filter((self.df.key <= 3) | (self.df.value < "2")).count()) - self.assertRaises(ValueError, lambda: self.df.key <= 3 or self.df.value < "2") - self.assertEqual(99, self.df.filter(~(self.df.key == 1)).count()) - self.assertRaises(ValueError, lambda: not self.df.key == 1) - - def test_udf_with_callable(self): - d = [Row(number=i, squared=i**2) for i in range(10)] - rdd = self.sc.parallelize(d) - data = self.spark.createDataFrame(rdd) - - class PlusFour: - def __call__(self, col): - if col is not None: - return col + 4 - - call = PlusFour() - pudf = UserDefinedFunction(call, LongType()) - res = data.select(pudf(data['number']).alias('plus_four')) - self.assertEqual(res.agg({'plus_four': 'sum'}).collect()[0][0], 85) - - def test_udf_with_partial_function(self): - d = [Row(number=i, squared=i**2) for i in range(10)] - rdd = self.sc.parallelize(d) - data = self.spark.createDataFrame(rdd) - - def some_func(col, param): - if col is not None: - return col + param - - pfunc = functools.partial(some_func, param=4) - pudf = UserDefinedFunction(pfunc, LongType()) - res = data.select(pudf(data['number']).alias('plus_four')) - self.assertEqual(res.agg({'plus_four': 'sum'}).collect()[0][0], 85) - - def test_udf(self): - self.spark.catalog.registerFunction("twoArgs", lambda x, y: len(x) + y, IntegerType()) - [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() - self.assertEqual(row[0], 5) - - # This is to check if a deprecated 'SQLContext.registerFunction' can call its alias. 
- sqlContext = self.spark._wrapped - sqlContext.registerFunction("oneArg", lambda x: len(x), IntegerType()) - [row] = sqlContext.sql("SELECT oneArg('test')").collect() - self.assertEqual(row[0], 4) - - def test_udf2(self): - self.spark.catalog.registerFunction("strlen", lambda string: len(string), IntegerType()) - self.spark.createDataFrame(self.sc.parallelize([Row(a="test")]))\ - .createOrReplaceTempView("test") - [res] = self.spark.sql("SELECT strlen(a) FROM test WHERE strlen(a) > 1").collect() - self.assertEqual(4, res[0]) - - def test_udf3(self): - two_args = self.spark.catalog.registerFunction( - "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y)) - self.assertEqual(two_args.deterministic, True) - [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() - self.assertEqual(row[0], u'5') - - def test_udf_registration_return_type_none(self): - two_args = self.spark.catalog.registerFunction( - "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y, "integer"), None) - self.assertEqual(two_args.deterministic, True) - [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() - self.assertEqual(row[0], 5) - - def test_udf_registration_return_type_not_none(self): - with QuietTest(self.sc): - with self.assertRaisesRegexp(TypeError, "Invalid returnType"): - self.spark.catalog.registerFunction( - "f", UserDefinedFunction(lambda x, y: len(x) + y, StringType()), StringType()) - - def test_nondeterministic_udf(self): - # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations - from pyspark.sql.functions import udf - import random - udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() - self.assertEqual(udf_random_col.deterministic, False) - df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND')) - udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) - [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect() - self.assertEqual(row[0] + 10, row[1]) - - def test_nondeterministic_udf2(self): - import random - from pyspark.sql.functions import udf - random_udf = udf(lambda: random.randint(6, 6), IntegerType()).asNondeterministic() - self.assertEqual(random_udf.deterministic, False) - random_udf1 = self.spark.catalog.registerFunction("randInt", random_udf) - self.assertEqual(random_udf1.deterministic, False) - [row] = self.spark.sql("SELECT randInt()").collect() - self.assertEqual(row[0], 6) - [row] = self.spark.range(1).select(random_udf1()).collect() - self.assertEqual(row[0], 6) - [row] = self.spark.range(1).select(random_udf()).collect() - self.assertEqual(row[0], 6) - # render_doc() reproduces the help() exception without printing output - pydoc.render_doc(udf(lambda: random.randint(6, 6), IntegerType())) - pydoc.render_doc(random_udf) - pydoc.render_doc(random_udf1) - pydoc.render_doc(udf(lambda x: x).asNondeterministic) - - def test_nondeterministic_udf3(self): - # regression test for SPARK-23233 - from pyspark.sql.functions import udf - f = udf(lambda x: x) - # Here we cache the JVM UDF instance. - self.spark.range(1).select(f("id")) - # This should reset the cache to set the deterministic status correctly. - f = f.asNondeterministic() - # Check the deterministic status of udf. 
- df = self.spark.range(1).select(f("id")) - deterministic = df._jdf.logicalPlan().projectList().head().deterministic() - self.assertFalse(deterministic) - - def test_nondeterministic_udf_in_aggregate(self): - from pyspark.sql.functions import udf, sum - import random - udf_random_col = udf(lambda: int(100 * random.random()), 'int').asNondeterministic() - df = self.spark.range(10) - - with QuietTest(self.sc): - with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): - df.groupby('id').agg(sum(udf_random_col())).collect() - with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): - df.agg(sum(udf_random_col())).collect() - - def test_chained_udf(self): - self.spark.catalog.registerFunction("double", lambda x: x + x, IntegerType()) - [row] = self.spark.sql("SELECT double(1)").collect() - self.assertEqual(row[0], 2) - [row] = self.spark.sql("SELECT double(double(1))").collect() - self.assertEqual(row[0], 4) - [row] = self.spark.sql("SELECT double(double(1) + 1)").collect() - self.assertEqual(row[0], 6) - - def test_single_udf_with_repeated_argument(self): - # regression test for SPARK-20685 - self.spark.catalog.registerFunction("add", lambda x, y: x + y, IntegerType()) - row = self.spark.sql("SELECT add(1, 1)").first() - self.assertEqual(tuple(row), (2, )) - - def test_multiple_udfs(self): - self.spark.catalog.registerFunction("double", lambda x: x * 2, IntegerType()) - [row] = self.spark.sql("SELECT double(1), double(2)").collect() - self.assertEqual(tuple(row), (2, 4)) - [row] = self.spark.sql("SELECT double(double(1)), double(double(2) + 2)").collect() - self.assertEqual(tuple(row), (4, 12)) - self.spark.catalog.registerFunction("add", lambda x, y: x + y, IntegerType()) - [row] = self.spark.sql("SELECT double(add(1, 2)), add(double(2), 1)").collect() - self.assertEqual(tuple(row), (6, 5)) - - def test_udf_in_filter_on_top_of_outer_join(self): - from pyspark.sql.functions import udf - left = self.spark.createDataFrame([Row(a=1)]) - right = self.spark.createDataFrame([Row(a=1)]) - df = left.join(right, on='a', how='left_outer') - df = df.withColumn('b', udf(lambda x: 'x')(df.a)) - self.assertEqual(df.filter('b = "x"').collect(), [Row(a=1, b='x')]) - - def test_udf_in_filter_on_top_of_join(self): - # regression test for SPARK-18589 - from pyspark.sql.functions import udf - left = self.spark.createDataFrame([Row(a=1)]) - right = self.spark.createDataFrame([Row(b=1)]) - f = udf(lambda a, b: a == b, BooleanType()) - df = left.crossJoin(right).filter(f("a", "b")) - self.assertEqual(df.collect(), [Row(a=1, b=1)]) - - def test_udf_in_join_condition(self): - # regression test for SPARK-25314 - from pyspark.sql.functions import udf - left = self.spark.createDataFrame([Row(a=1)]) - right = self.spark.createDataFrame([Row(b=1)]) - f = udf(lambda a, b: a == b, BooleanType()) - df = left.join(right, f("a", "b")) - with self.assertRaisesRegexp(AnalysisException, 'Detected implicit cartesian product'): - df.collect() - with self.sql_conf({"spark.sql.crossJoin.enabled": True}): - self.assertEqual(df.collect(), [Row(a=1, b=1)]) - - def test_udf_in_left_outer_join_condition(self): - # regression test for SPARK-26147 - from pyspark.sql.functions import udf, col - left = self.spark.createDataFrame([Row(a=1)]) - right = self.spark.createDataFrame([Row(b=1)]) - f = udf(lambda a: str(a), StringType()) - # The join condition can't be pushed down, as it refers to attributes from both sides. - # The Python UDF only refer to attributes from one side, so it's evaluable. 
- df = left.join(right, f("a") == col("b").cast("string"), how="left_outer") - with self.sql_conf({"spark.sql.crossJoin.enabled": True}): - self.assertEqual(df.collect(), [Row(a=1, b=1)]) - - def test_udf_and_common_filter_in_join_condition(self): - # regression test for SPARK-25314 - # test the complex scenario with both udf and common filter - from pyspark.sql.functions import udf - left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)]) - right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)]) - f = udf(lambda a, b: a == b, BooleanType()) - df = left.join(right, [f("a", "b"), left.a1 == right.b1]) - # do not need spark.sql.crossJoin.enabled=true for udf is not the only join condition. - self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)]) - - def test_udf_not_supported_in_join_condition(self): - # regression test for SPARK-25314 - # test python udf is not supported in join type except inner join. - from pyspark.sql.functions import udf - left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)]) - right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)]) - f = udf(lambda a, b: a == b, BooleanType()) - - def runWithJoinType(join_type, type_string): - with self.assertRaisesRegexp( - AnalysisException, - 'Using PythonUDF.*%s is not supported.' % type_string): - left.join(right, [f("a", "b"), left.a1 == right.b1], join_type).collect() - runWithJoinType("full", "FullOuter") - runWithJoinType("left", "LeftOuter") - runWithJoinType("right", "RightOuter") - runWithJoinType("leftanti", "LeftAnti") - runWithJoinType("leftsemi", "LeftSemi") - - def test_udf_without_arguments(self): - self.spark.catalog.registerFunction("foo", lambda: "bar") - [row] = self.spark.sql("SELECT foo()").collect() - self.assertEqual(row[0], "bar") - - def test_udf_with_array_type(self): - d = [Row(l=list(range(3)), d={"key": list(range(5))})] - rdd = self.sc.parallelize(d) - self.spark.createDataFrame(rdd).createOrReplaceTempView("test") - self.spark.catalog.registerFunction("copylist", lambda l: list(l), ArrayType(IntegerType())) - self.spark.catalog.registerFunction("maplen", lambda d: len(d), IntegerType()) - [(l1, l2)] = self.spark.sql("select copylist(l), maplen(d) from test").collect() - self.assertEqual(list(range(3)), l1) - self.assertEqual(1, l2) - - def test_broadcast_in_udf(self): - bar = {"a": "aa", "b": "bb", "c": "abc"} - foo = self.sc.broadcast(bar) - self.spark.catalog.registerFunction("MYUDF", lambda x: foo.value[x] if x else '') - [res] = self.spark.sql("SELECT MYUDF('c')").collect() - self.assertEqual("abc", res[0]) - [res] = self.spark.sql("SELECT MYUDF('')").collect() - self.assertEqual("", res[0]) - - def test_udf_with_filter_function(self): - df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) - from pyspark.sql.functions import udf, col - from pyspark.sql.types import BooleanType - - my_filter = udf(lambda a: a < 2, BooleanType()) - sel = df.select(col("key"), col("value")).filter((my_filter(col("key"))) & (df.value < "2")) - self.assertEqual(sel.collect(), [Row(key=1, value='1')]) - - def test_udf_with_aggregate_function(self): - df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) - from pyspark.sql.functions import udf, col, sum - from pyspark.sql.types import BooleanType - - my_filter = udf(lambda a: a == 1, BooleanType()) - sel = df.select(col("key")).distinct().filter(my_filter(col("key"))) - 
self.assertEqual(sel.collect(), [Row(key=1)]) - - my_copy = udf(lambda x: x, IntegerType()) - my_add = udf(lambda a, b: int(a + b), IntegerType()) - my_strlen = udf(lambda x: len(x), IntegerType()) - sel = df.groupBy(my_copy(col("key")).alias("k"))\ - .agg(sum(my_strlen(col("value"))).alias("s"))\ - .select(my_add(col("k"), col("s")).alias("t")) - self.assertEqual(sel.collect(), [Row(t=4), Row(t=3)]) - - def test_udf_in_generate(self): - from pyspark.sql.functions import udf, explode - df = self.spark.range(5) - f = udf(lambda x: list(range(x)), ArrayType(LongType())) - row = df.select(explode(f(*df))).groupBy().sum().first() - self.assertEqual(row[0], 10) - - df = self.spark.range(3) - res = df.select("id", explode(f(df.id))).collect() - self.assertEqual(res[0][0], 1) - self.assertEqual(res[0][1], 0) - self.assertEqual(res[1][0], 2) - self.assertEqual(res[1][1], 0) - self.assertEqual(res[2][0], 2) - self.assertEqual(res[2][1], 1) - - range_udf = udf(lambda value: list(range(value - 1, value + 1)), ArrayType(IntegerType())) - res = df.select("id", explode(range_udf(df.id))).collect() - self.assertEqual(res[0][0], 0) - self.assertEqual(res[0][1], -1) - self.assertEqual(res[1][0], 0) - self.assertEqual(res[1][1], 0) - self.assertEqual(res[2][0], 1) - self.assertEqual(res[2][1], 0) - self.assertEqual(res[3][0], 1) - self.assertEqual(res[3][1], 1) - - def test_udf_with_order_by_and_limit(self): - from pyspark.sql.functions import udf - my_copy = udf(lambda x: x, IntegerType()) - df = self.spark.range(10).orderBy("id") - res = df.select(df.id, my_copy(df.id).alias("copy")).limit(1) - res.explain(True) - self.assertEqual(res.collect(), [Row(id=0, copy=0)]) - - def test_udf_registration_returns_udf(self): - df = self.spark.range(10) - add_three = self.spark.udf.register("add_three", lambda x: x + 3, IntegerType()) - - self.assertListEqual( - df.selectExpr("add_three(id) AS plus_three").collect(), - df.select(add_three("id").alias("plus_three")).collect() - ) - - # This is to check if a 'SQLContext.udf' can call its alias. - sqlContext = self.spark._wrapped - add_four = sqlContext.udf.register("add_four", lambda x: x + 4, IntegerType()) - - self.assertListEqual( - df.selectExpr("add_four(id) AS plus_four").collect(), - df.select(add_four("id").alias("plus_four")).collect() - ) - - def test_non_existed_udf(self): - spark = self.spark - self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf", - lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf")) - - # This is to check if a deprecated 'SQLContext.registerJavaFunction' can call its alias. 
- sqlContext = spark._wrapped - self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf", - lambda: sqlContext.registerJavaFunction("udf1", "non_existed_udf")) - - def test_non_existed_udaf(self): - spark = self.spark - self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udaf", - lambda: spark.udf.registerJavaUDAF("udaf1", "non_existed_udaf")) - - def test_linesep_text(self): - df = self.spark.read.text("python/test_support/sql/ages_newlines.csv", lineSep=",") - expected = [Row(value=u'Joe'), Row(value=u'20'), Row(value=u'"Hi'), - Row(value=u'\nI am Jeo"\nTom'), Row(value=u'30'), - Row(value=u'"My name is Tom"\nHyukjin'), Row(value=u'25'), - Row(value=u'"I am Hyukjin\n\nI love Spark!"\n')] - self.assertEqual(df.collect(), expected) - - tpath = tempfile.mkdtemp() - shutil.rmtree(tpath) - try: - df.write.text(tpath, lineSep="!") - expected = [Row(value=u'Joe!20!"Hi!'), Row(value=u'I am Jeo"'), - Row(value=u'Tom!30!"My name is Tom"'), - Row(value=u'Hyukjin!25!"I am Hyukjin'), - Row(value=u''), Row(value=u'I love Spark!"'), - Row(value=u'!')] - readback = self.spark.read.text(tpath) - self.assertEqual(readback.collect(), expected) - finally: - shutil.rmtree(tpath) - - def test_multiline_json(self): - people1 = self.spark.read.json("python/test_support/sql/people.json") - people_array = self.spark.read.json("python/test_support/sql/people_array.json", - multiLine=True) - self.assertEqual(people1.collect(), people_array.collect()) - - def test_encoding_json(self): - people_array = self.spark.read\ - .json("python/test_support/sql/people_array_utf16le.json", - multiLine=True, encoding="UTF-16LE") - expected = [Row(age=30, name=u'Andy'), Row(age=19, name=u'Justin')] - self.assertEqual(people_array.collect(), expected) - - def test_linesep_json(self): - df = self.spark.read.json("python/test_support/sql/people.json", lineSep=",") - expected = [Row(_corrupt_record=None, name=u'Michael'), - Row(_corrupt_record=u' "age":30}\n{"name":"Justin"', name=None), - Row(_corrupt_record=u' "age":19}\n', name=None)] - self.assertEqual(df.collect(), expected) - - tpath = tempfile.mkdtemp() - shutil.rmtree(tpath) - try: - df = self.spark.read.json("python/test_support/sql/people.json") - df.write.json(tpath, lineSep="!!") - readback = self.spark.read.json(tpath, lineSep="!!") - self.assertEqual(readback.collect(), df.collect()) - finally: - shutil.rmtree(tpath) - - def test_multiline_csv(self): - ages_newlines = self.spark.read.csv( - "python/test_support/sql/ages_newlines.csv", multiLine=True) - expected = [Row(_c0=u'Joe', _c1=u'20', _c2=u'Hi,\nI am Jeo'), - Row(_c0=u'Tom', _c1=u'30', _c2=u'My name is Tom'), - Row(_c0=u'Hyukjin', _c1=u'25', _c2=u'I am Hyukjin\n\nI love Spark!')] - self.assertEqual(ages_newlines.collect(), expected) - - def test_ignorewhitespace_csv(self): - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - self.spark.createDataFrame([[" a", "b ", " c "]]).write.csv( - tmpPath, - ignoreLeadingWhiteSpace=False, - ignoreTrailingWhiteSpace=False) - - expected = [Row(value=u' a,b , c ')] - readback = self.spark.read.text(tmpPath) - self.assertEqual(readback.collect(), expected) - shutil.rmtree(tmpPath) - - def test_read_multiple_orc_file(self): - df = self.spark.read.orc(["python/test_support/sql/orc_partitioned/b=0/c=0", - "python/test_support/sql/orc_partitioned/b=1/c=1"]) - self.assertEqual(2, df.count()) - - def test_udf_with_input_file_name(self): - from pyspark.sql.functions import udf, input_file_name - sourceFile = udf(lambda path: path, 
StringType()) - filePath = "python/test_support/sql/people1.json" - row = self.spark.read.json(filePath).select(sourceFile(input_file_name())).first() - self.assertTrue(row[0].find("people1.json") != -1) - - def test_udf_with_input_file_name_for_hadooprdd(self): - from pyspark.sql.functions import udf, input_file_name - - def filename(path): - return path - - sameText = udf(filename, StringType()) - - rdd = self.sc.textFile('python/test_support/sql/people.json') - df = self.spark.read.json(rdd).select(input_file_name().alias('file')) - row = df.select(sameText(df['file'])).first() - self.assertTrue(row[0].find("people.json") != -1) - - rdd2 = self.sc.newAPIHadoopFile( - 'python/test_support/sql/people.json', - 'org.apache.hadoop.mapreduce.lib.input.TextInputFormat', - 'org.apache.hadoop.io.LongWritable', - 'org.apache.hadoop.io.Text') - - df2 = self.spark.read.json(rdd2).select(input_file_name().alias('file')) - row2 = df2.select(sameText(df2['file'])).first() - self.assertTrue(row2[0].find("people.json") != -1) - - def test_udf_defers_judf_initialization(self): - # This is separate of UDFInitializationTests - # to avoid context initialization - # when udf is called - - from pyspark.sql.functions import UserDefinedFunction - - f = UserDefinedFunction(lambda x: x, StringType()) - - self.assertIsNone( - f._judf_placeholder, - "judf should not be initialized before the first call." - ) - - self.assertIsInstance(f("foo"), Column, "UDF call should return a Column.") - - self.assertIsNotNone( - f._judf_placeholder, - "judf should be initialized after UDF has been called." - ) - - def test_udf_with_string_return_type(self): - from pyspark.sql.functions import UserDefinedFunction - - add_one = UserDefinedFunction(lambda x: x + 1, "integer") - make_pair = UserDefinedFunction(lambda x: (-x, x), "struct") - make_array = UserDefinedFunction( - lambda x: [float(x) for x in range(x, x + 3)], "array") - - expected = (2, Row(x=-1, y=1), [1.0, 2.0, 3.0]) - actual = (self.spark.range(1, 2).toDF("x") - .select(add_one("x"), make_pair("x"), make_array("x")) - .first()) - - self.assertTupleEqual(expected, actual) - - def test_udf_shouldnt_accept_noncallable_object(self): - from pyspark.sql.functions import UserDefinedFunction - - non_callable = None - self.assertRaises(TypeError, UserDefinedFunction, non_callable, StringType()) - - def test_udf_with_decorator(self): - from pyspark.sql.functions import lit, udf - from pyspark.sql.types import IntegerType, DoubleType - - @udf(IntegerType()) - def add_one(x): - if x is not None: - return x + 1 - - @udf(returnType=DoubleType()) - def add_two(x): - if x is not None: - return float(x + 2) - - @udf - def to_upper(x): - if x is not None: - return x.upper() - - @udf() - def to_lower(x): - if x is not None: - return x.lower() - - @udf - def substr(x, start, end): - if x is not None: - return x[start:end] - - @udf("long") - def trunc(x): - return int(x) - - @udf(returnType="double") - def as_double(x): - return float(x) - - df = ( - self.spark - .createDataFrame( - [(1, "Foo", "foobar", 3.0)], ("one", "Foo", "foobar", "float")) - .select( - add_one("one"), add_two("one"), - to_upper("Foo"), to_lower("Foo"), - substr("foobar", lit(0), lit(3)), - trunc("float"), as_double("one"))) - - self.assertListEqual( - [tpe for _, tpe in df.dtypes], - ["int", "double", "string", "string", "string", "bigint", "double"] - ) - - self.assertListEqual( - list(df.first()), - [2, 3.0, "FOO", "foo", "foo", 3, 1.0] - ) - - def test_udf_wrapper(self): - from pyspark.sql.functions import udf - 
from pyspark.sql.types import IntegerType - - def f(x): - """Identity""" - return x - - return_type = IntegerType() - f_ = udf(f, return_type) - - self.assertTrue(f.__doc__ in f_.__doc__) - self.assertEqual(f, f_.func) - self.assertEqual(return_type, f_.returnType) - - class F(object): - """Identity""" - def __call__(self, x): - return x - - f = F() - return_type = IntegerType() - f_ = udf(f, return_type) - - self.assertTrue(f.__doc__ in f_.__doc__) - self.assertEqual(f, f_.func) - self.assertEqual(return_type, f_.returnType) - - f = functools.partial(f, x=1) - return_type = IntegerType() - f_ = udf(f, return_type) - - self.assertTrue(f.__doc__ in f_.__doc__) - self.assertEqual(f, f_.func) - self.assertEqual(return_type, f_.returnType) - - def test_validate_column_types(self): - from pyspark.sql.functions import udf, to_json - from pyspark.sql.column import _to_java_column - - self.assertTrue("Column" in _to_java_column("a").getClass().toString()) - self.assertTrue("Column" in _to_java_column(u"a").getClass().toString()) - self.assertTrue("Column" in _to_java_column(self.spark.range(1).id).getClass().toString()) - - self.assertRaisesRegexp( - TypeError, - "Invalid argument, not a string or column", - lambda: _to_java_column(1)) - - class A(): - pass - - self.assertRaises(TypeError, lambda: _to_java_column(A())) - self.assertRaises(TypeError, lambda: _to_java_column([])) - - self.assertRaisesRegexp( - TypeError, - "Invalid argument, not a string or column", - lambda: udf(lambda x: x)(None)) - self.assertRaises(TypeError, lambda: to_json(1)) - - def test_basic_functions(self): - rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}']) - df = self.spark.read.json(rdd) - df.count() - df.collect() - df.schema - - # cache and checkpoint - self.assertFalse(df.is_cached) - df.persist() - df.unpersist(True) - df.cache() - self.assertTrue(df.is_cached) - self.assertEqual(2, df.count()) - - df.createOrReplaceTempView("temp") - df = self.spark.sql("select foo from temp") - df.count() - df.collect() - - def test_apply_schema_to_row(self): - df = self.spark.read.json(self.sc.parallelize(["""{"a":2}"""])) - df2 = self.spark.createDataFrame(df.rdd.map(lambda x: x), df.schema) - self.assertEqual(df.collect(), df2.collect()) - - rdd = self.sc.parallelize(range(10)).map(lambda x: Row(a=x)) - df3 = self.spark.createDataFrame(rdd, df.schema) - self.assertEqual(10, df3.count()) - - def test_infer_schema_to_local(self): - input = [{"a": 1}, {"b": "coffee"}] - rdd = self.sc.parallelize(input) - df = self.spark.createDataFrame(input) - df2 = self.spark.createDataFrame(rdd, samplingRatio=1.0) - self.assertEqual(df.schema, df2.schema) - - rdd = self.sc.parallelize(range(10)).map(lambda x: Row(a=x, b=None)) - df3 = self.spark.createDataFrame(rdd, df.schema) - self.assertEqual(10, df3.count()) - - def test_apply_schema_to_dict_and_rows(self): - schema = StructType().add("b", StringType()).add("a", IntegerType()) - input = [{"a": 1}, {"b": "coffee"}] - rdd = self.sc.parallelize(input) - for verify in [False, True]: - df = self.spark.createDataFrame(input, schema, verifySchema=verify) - df2 = self.spark.createDataFrame(rdd, schema, verifySchema=verify) - self.assertEqual(df.schema, df2.schema) - - rdd = self.sc.parallelize(range(10)).map(lambda x: Row(a=x, b=None)) - df3 = self.spark.createDataFrame(rdd, schema, verifySchema=verify) - self.assertEqual(10, df3.count()) - input = [Row(a=x, b=str(x)) for x in range(10)] - df4 = self.spark.createDataFrame(input, schema, verifySchema=verify) - self.assertEqual(10, 
df4.count()) - - def test_create_dataframe_schema_mismatch(self): - input = [Row(a=1)] - rdd = self.sc.parallelize(range(3)).map(lambda i: Row(a=i)) - schema = StructType([StructField("a", IntegerType()), StructField("b", StringType())]) - df = self.spark.createDataFrame(rdd, schema) - self.assertRaises(Exception, lambda: df.show()) - - def test_serialize_nested_array_and_map(self): - d = [Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})] - rdd = self.sc.parallelize(d) - df = self.spark.createDataFrame(rdd) - row = df.head() - self.assertEqual(1, len(row.l)) - self.assertEqual(1, row.l[0].a) - self.assertEqual("2", row.d["key"].d) - - l = df.rdd.map(lambda x: x.l).first() - self.assertEqual(1, len(l)) - self.assertEqual('s', l[0].b) - - d = df.rdd.map(lambda x: x.d).first() - self.assertEqual(1, len(d)) - self.assertEqual(1.0, d["key"].c) - - row = df.rdd.map(lambda x: x.d["key"]).first() - self.assertEqual(1.0, row.c) - self.assertEqual("2", row.d) - - def test_infer_schema(self): - d = [Row(l=[], d={}, s=None), - Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}, s="")] - rdd = self.sc.parallelize(d) - df = self.spark.createDataFrame(rdd) - self.assertEqual([], df.rdd.map(lambda r: r.l).first()) - self.assertEqual([None, ""], df.rdd.map(lambda r: r.s).collect()) - df.createOrReplaceTempView("test") - result = self.spark.sql("SELECT l[0].a from test where d['key'].d = '2'") - self.assertEqual(1, result.head()[0]) - - df2 = self.spark.createDataFrame(rdd, samplingRatio=1.0) - self.assertEqual(df.schema, df2.schema) - self.assertEqual({}, df2.rdd.map(lambda r: r.d).first()) - self.assertEqual([None, ""], df2.rdd.map(lambda r: r.s).collect()) - df2.createOrReplaceTempView("test2") - result = self.spark.sql("SELECT l[0].a from test2 where d['key'].d = '2'") - self.assertEqual(1, result.head()[0]) - - def test_infer_schema_not_enough_names(self): - df = self.spark.createDataFrame([["a", "b"]], ["col1"]) - self.assertEqual(df.columns, ['col1', '_2']) - - def test_infer_schema_fails(self): - with self.assertRaisesRegexp(TypeError, 'field a'): - self.spark.createDataFrame(self.spark.sparkContext.parallelize([[1, 1], ["x", 1]]), - schema=["a", "b"], samplingRatio=0.99) - - def test_infer_nested_schema(self): - NestedRow = Row("f1", "f2") - nestedRdd1 = self.sc.parallelize([NestedRow([1, 2], {"row1": 1.0}), - NestedRow([2, 3], {"row2": 2.0})]) - df = self.spark.createDataFrame(nestedRdd1) - self.assertEqual(Row(f1=[1, 2], f2={u'row1': 1.0}), df.collect()[0]) - - nestedRdd2 = self.sc.parallelize([NestedRow([[1, 2], [2, 3]], [1, 2]), - NestedRow([[2, 3], [3, 4]], [2, 3])]) - df = self.spark.createDataFrame(nestedRdd2) - self.assertEqual(Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), df.collect()[0]) - - from collections import namedtuple - CustomRow = namedtuple('CustomRow', 'field1 field2') - rdd = self.sc.parallelize([CustomRow(field1=1, field2="row1"), - CustomRow(field1=2, field2="row2"), - CustomRow(field1=3, field2="row3")]) - df = self.spark.createDataFrame(rdd) - self.assertEqual(Row(field1=1, field2=u'row1'), df.first()) - - def test_create_dataframe_from_dict_respects_schema(self): - df = self.spark.createDataFrame([{'a': 1}], ["b"]) - self.assertEqual(df.columns, ['b']) - - def test_create_dataframe_from_objects(self): - data = [MyObject(1, "1"), MyObject(2, "2")] - df = self.spark.createDataFrame(data) - self.assertEqual(df.dtypes, [("key", "bigint"), ("value", "string")]) - self.assertEqual(df.first(), Row(key=1, value="1")) - - def test_select_null_literal(self): - df = 
self.spark.sql("select null as col") - self.assertEqual(Row(col=None), df.first()) - - def test_apply_schema(self): - from datetime import date, datetime - rdd = self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0, - date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1), - {"a": 1}, (2,), [1, 2, 3], None)]) - schema = StructType([ - StructField("byte1", ByteType(), False), - StructField("byte2", ByteType(), False), - StructField("short1", ShortType(), False), - StructField("short2", ShortType(), False), - StructField("int1", IntegerType(), False), - StructField("float1", FloatType(), False), - StructField("date1", DateType(), False), - StructField("time1", TimestampType(), False), - StructField("map1", MapType(StringType(), IntegerType(), False), False), - StructField("struct1", StructType([StructField("b", ShortType(), False)]), False), - StructField("list1", ArrayType(ByteType(), False), False), - StructField("null1", DoubleType(), True)]) - df = self.spark.createDataFrame(rdd, schema) - results = df.rdd.map(lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int1, x.float1, - x.date1, x.time1, x.map1["a"], x.struct1.b, x.list1, x.null1)) - r = (127, -128, -32768, 32767, 2147483647, 1.0, date(2010, 1, 1), - datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None) - self.assertEqual(r, results.first()) - - df.createOrReplaceTempView("table2") - r = self.spark.sql("SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " + - "short1 + 1 AS short1, short2 - 1 AS short2, int1 - 1 AS int1, " + - "float1 + 1.5 as float1 FROM table2").first() - - self.assertEqual((126, -127, -32767, 32766, 2147483646, 2.5), tuple(r)) - - def test_struct_in_map(self): - d = [Row(m={Row(i=1): Row(s="")})] - df = self.sc.parallelize(d).toDF() - k, v = list(df.head().m.items())[0] - self.assertEqual(1, k.i) - self.assertEqual("", v.s) - - def test_convert_row_to_dict(self): - row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}) - self.assertEqual(1, row.asDict()['l'][0].a) - df = self.sc.parallelize([row]).toDF() - df.createOrReplaceTempView("test") - row = self.spark.sql("select l, d from test").head() - self.assertEqual(1, row.asDict()["l"][0].a) - self.assertEqual(1.0, row.asDict()['d']['key'].c) - - def test_udt(self): - from pyspark.sql.types import _parse_datatype_json_string, _infer_type, _make_type_verifier - from pyspark.sql.tests import ExamplePointUDT, ExamplePoint - - def check_datatype(datatype): - pickled = pickle.loads(pickle.dumps(datatype)) - assert datatype == pickled - scala_datatype = self.spark._jsparkSession.parseDataType(datatype.json()) - python_datatype = _parse_datatype_json_string(scala_datatype.json()) - assert datatype == python_datatype - - check_datatype(ExamplePointUDT()) - structtype_with_udt = StructType([StructField("label", DoubleType(), False), - StructField("point", ExamplePointUDT(), False)]) - check_datatype(structtype_with_udt) - p = ExamplePoint(1.0, 2.0) - self.assertEqual(_infer_type(p), ExamplePointUDT()) - _make_type_verifier(ExamplePointUDT())(ExamplePoint(1.0, 2.0)) - self.assertRaises(ValueError, lambda: _make_type_verifier(ExamplePointUDT())([1.0, 2.0])) - - check_datatype(PythonOnlyUDT()) - structtype_with_udt = StructType([StructField("label", DoubleType(), False), - StructField("point", PythonOnlyUDT(), False)]) - check_datatype(structtype_with_udt) - p = PythonOnlyPoint(1.0, 2.0) - self.assertEqual(_infer_type(p), PythonOnlyUDT()) - _make_type_verifier(PythonOnlyUDT())(PythonOnlyPoint(1.0, 2.0)) - self.assertRaises( - ValueError, - lambda: 
_make_type_verifier(PythonOnlyUDT())([1.0, 2.0])) - - def test_simple_udt_in_df(self): - schema = StructType().add("key", LongType()).add("val", PythonOnlyUDT()) - df = self.spark.createDataFrame( - [(i % 3, PythonOnlyPoint(float(i), float(i))) for i in range(10)], - schema=schema) - df.collect() - - def test_nested_udt_in_df(self): - schema = StructType().add("key", LongType()).add("val", ArrayType(PythonOnlyUDT())) - df = self.spark.createDataFrame( - [(i % 3, [PythonOnlyPoint(float(i), float(i))]) for i in range(10)], - schema=schema) - df.collect() - - schema = StructType().add("key", LongType()).add("val", - MapType(LongType(), PythonOnlyUDT())) - df = self.spark.createDataFrame( - [(i % 3, {i % 3: PythonOnlyPoint(float(i + 1), float(i + 1))}) for i in range(10)], - schema=schema) - df.collect() - - def test_complex_nested_udt_in_df(self): - from pyspark.sql.functions import udf - - schema = StructType().add("key", LongType()).add("val", PythonOnlyUDT()) - df = self.spark.createDataFrame( - [(i % 3, PythonOnlyPoint(float(i), float(i))) for i in range(10)], - schema=schema) - df.collect() - - gd = df.groupby("key").agg({"val": "collect_list"}) - gd.collect() - udf = udf(lambda k, v: [(k, v[0])], ArrayType(df.schema)) - gd.select(udf(*gd)).collect() - - def test_udt_with_none(self): - df = self.spark.range(0, 10, 1, 1) - - def myudf(x): - if x > 0: - return PythonOnlyPoint(float(x), float(x)) - - self.spark.catalog.registerFunction("udf", myudf, PythonOnlyUDT()) - rows = [r[0] for r in df.selectExpr("udf(id)").take(2)] - self.assertEqual(rows, [None, PythonOnlyPoint(1, 1)]) - - def test_nonparam_udf_with_aggregate(self): - import pyspark.sql.functions as f - - df = self.spark.createDataFrame([(1, 2), (1, 2)]) - f_udf = f.udf(lambda: "const_str") - rows = df.distinct().withColumn("a", f_udf()).collect() - self.assertEqual(rows, [Row(_1=1, _2=2, a=u'const_str')]) - - def test_infer_schema_with_udt(self): - from pyspark.sql.tests import ExamplePoint, ExamplePointUDT - row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) - df = self.spark.createDataFrame([row]) - schema = df.schema - field = [f for f in schema.fields if f.name == "point"][0] - self.assertEqual(type(field.dataType), ExamplePointUDT) - df.createOrReplaceTempView("labeled_point") - point = self.spark.sql("SELECT point FROM labeled_point").head().point - self.assertEqual(point, ExamplePoint(1.0, 2.0)) - - row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0)) - df = self.spark.createDataFrame([row]) - schema = df.schema - field = [f for f in schema.fields if f.name == "point"][0] - self.assertEqual(type(field.dataType), PythonOnlyUDT) - df.createOrReplaceTempView("labeled_point") - point = self.spark.sql("SELECT point FROM labeled_point").head().point - self.assertEqual(point, PythonOnlyPoint(1.0, 2.0)) - - def test_apply_schema_with_udt(self): - from pyspark.sql.tests import ExamplePoint, ExamplePointUDT - row = (1.0, ExamplePoint(1.0, 2.0)) - schema = StructType([StructField("label", DoubleType(), False), - StructField("point", ExamplePointUDT(), False)]) - df = self.spark.createDataFrame([row], schema) - point = df.head().point - self.assertEqual(point, ExamplePoint(1.0, 2.0)) - - row = (1.0, PythonOnlyPoint(1.0, 2.0)) - schema = StructType([StructField("label", DoubleType(), False), - StructField("point", PythonOnlyUDT(), False)]) - df = self.spark.createDataFrame([row], schema) - point = df.head().point - self.assertEqual(point, PythonOnlyPoint(1.0, 2.0)) - - def test_udf_with_udt(self): - from pyspark.sql.tests import 
ExamplePoint, ExamplePointUDT - row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) - df = self.spark.createDataFrame([row]) - self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first()) - udf = UserDefinedFunction(lambda p: p.y, DoubleType()) - self.assertEqual(2.0, df.select(udf(df.point)).first()[0]) - udf2 = UserDefinedFunction(lambda p: ExamplePoint(p.x + 1, p.y + 1), ExamplePointUDT()) - self.assertEqual(ExamplePoint(2.0, 3.0), df.select(udf2(df.point)).first()[0]) - - row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0)) - df = self.spark.createDataFrame([row]) - self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first()) - udf = UserDefinedFunction(lambda p: p.y, DoubleType()) - self.assertEqual(2.0, df.select(udf(df.point)).first()[0]) - udf2 = UserDefinedFunction(lambda p: PythonOnlyPoint(p.x + 1, p.y + 1), PythonOnlyUDT()) - self.assertEqual(PythonOnlyPoint(2.0, 3.0), df.select(udf2(df.point)).first()[0]) - - def test_parquet_with_udt(self): - from pyspark.sql.tests import ExamplePoint, ExamplePointUDT - row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) - df0 = self.spark.createDataFrame([row]) - output_dir = os.path.join(self.tempdir.name, "labeled_point") - df0.write.parquet(output_dir) - df1 = self.spark.read.parquet(output_dir) - point = df1.head().point - self.assertEqual(point, ExamplePoint(1.0, 2.0)) - - row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0)) - df0 = self.spark.createDataFrame([row]) - df0.write.parquet(output_dir, mode='overwrite') - df1 = self.spark.read.parquet(output_dir) - point = df1.head().point - self.assertEqual(point, PythonOnlyPoint(1.0, 2.0)) - - def test_union_with_udt(self): - from pyspark.sql.tests import ExamplePoint, ExamplePointUDT - row1 = (1.0, ExamplePoint(1.0, 2.0)) - row2 = (2.0, ExamplePoint(3.0, 4.0)) - schema = StructType([StructField("label", DoubleType(), False), - StructField("point", ExamplePointUDT(), False)]) - df1 = self.spark.createDataFrame([row1], schema) - df2 = self.spark.createDataFrame([row2], schema) - - result = df1.union(df2).orderBy("label").collect() - self.assertEqual( - result, - [ - Row(label=1.0, point=ExamplePoint(1.0, 2.0)), - Row(label=2.0, point=ExamplePoint(3.0, 4.0)) - ] - ) - - def test_cast_to_string_with_udt(self): - from pyspark.sql.tests import ExamplePointUDT, ExamplePoint - from pyspark.sql.functions import col - row = (ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0)) - schema = StructType([StructField("point", ExamplePointUDT(), False), - StructField("pypoint", PythonOnlyUDT(), False)]) - df = self.spark.createDataFrame([row], schema) - - result = df.select(col('point').cast('string'), col('pypoint').cast('string')).head() - self.assertEqual(result, Row(point=u'(1.0, 2.0)', pypoint=u'[3.0, 4.0]')) - - def test_column_operators(self): - ci = self.df.key - cs = self.df.value - c = ci == cs - self.assertTrue(isinstance((- ci - 1 - 2) % 3 * 2.5 / 3.5, Column)) - rcc = (1 + ci), (1 - ci), (1 * ci), (1 / ci), (1 % ci), (1 ** ci), (ci ** 1) - self.assertTrue(all(isinstance(c, Column) for c in rcc)) - cb = [ci == 5, ci != 0, ci > 3, ci < 4, ci >= 0, ci <= 7] - self.assertTrue(all(isinstance(c, Column) for c in cb)) - cbool = (ci & ci), (ci | ci), (~ci) - self.assertTrue(all(isinstance(c, Column) for c in cbool)) - css = cs.contains('a'), cs.like('a'), cs.rlike('a'), cs.asc(), cs.desc(),\ - cs.startswith('a'), cs.endswith('a'), ci.eqNullSafe(cs) - self.assertTrue(all(isinstance(c, Column) for c in css)) - self.assertTrue(isinstance(ci.cast(LongType()), Column)) - 
self.assertRaisesRegexp(ValueError, - "Cannot apply 'in' operator against a column", - lambda: 1 in cs) - - def test_column_getitem(self): - from pyspark.sql.functions import col - - self.assertIsInstance(col("foo")[1:3], Column) - self.assertIsInstance(col("foo")[0], Column) - self.assertIsInstance(col("foo")["bar"], Column) - self.assertRaises(ValueError, lambda: col("foo")[0:10:2]) - - def test_column_select(self): - df = self.df - self.assertEqual(self.testData, df.select("*").collect()) - self.assertEqual(self.testData, df.select(df.key, df.value).collect()) - self.assertEqual([Row(value='1')], df.where(df.key == 1).select(df.value).collect()) - - def test_freqItems(self): - vals = [Row(a=1, b=-2.0) if i % 2 == 0 else Row(a=i, b=i * 1.0) for i in range(100)] - df = self.sc.parallelize(vals).toDF() - items = df.stat.freqItems(("a", "b"), 0.4).collect()[0] - self.assertTrue(1 in items[0]) - self.assertTrue(-2.0 in items[1]) - - def test_aggregator(self): - df = self.df - g = df.groupBy() - self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0])) - self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect()) - - from pyspark.sql import functions - self.assertEqual((0, u'99'), - tuple(g.agg(functions.first(df.key), functions.last(df.value)).first())) - self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0]) - self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0]) - - def test_first_last_ignorenulls(self): - from pyspark.sql import functions - df = self.spark.range(0, 100) - df2 = df.select(functions.when(df.id % 3 == 0, None).otherwise(df.id).alias("id")) - df3 = df2.select(functions.first(df2.id, False).alias('a'), - functions.first(df2.id, True).alias('b'), - functions.last(df2.id, False).alias('c'), - functions.last(df2.id, True).alias('d')) - self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect()) - - def test_approxQuantile(self): - df = self.sc.parallelize([Row(a=i, b=i+10) for i in range(10)]).toDF() - for f in ["a", u"a"]: - aq = df.stat.approxQuantile(f, [0.1, 0.5, 0.9], 0.1) - self.assertTrue(isinstance(aq, list)) - self.assertEqual(len(aq), 3) - self.assertTrue(all(isinstance(q, float) for q in aq)) - aqs = df.stat.approxQuantile(["a", u"b"], [0.1, 0.5, 0.9], 0.1) - self.assertTrue(isinstance(aqs, list)) - self.assertEqual(len(aqs), 2) - self.assertTrue(isinstance(aqs[0], list)) - self.assertEqual(len(aqs[0]), 3) - self.assertTrue(all(isinstance(q, float) for q in aqs[0])) - self.assertTrue(isinstance(aqs[1], list)) - self.assertEqual(len(aqs[1]), 3) - self.assertTrue(all(isinstance(q, float) for q in aqs[1])) - aqt = df.stat.approxQuantile((u"a", "b"), [0.1, 0.5, 0.9], 0.1) - self.assertTrue(isinstance(aqt, list)) - self.assertEqual(len(aqt), 2) - self.assertTrue(isinstance(aqt[0], list)) - self.assertEqual(len(aqt[0]), 3) - self.assertTrue(all(isinstance(q, float) for q in aqt[0])) - self.assertTrue(isinstance(aqt[1], list)) - self.assertEqual(len(aqt[1]), 3) - self.assertTrue(all(isinstance(q, float) for q in aqt[1])) - self.assertRaises(ValueError, lambda: df.stat.approxQuantile(123, [0.1, 0.9], 0.1)) - self.assertRaises(ValueError, lambda: df.stat.approxQuantile(("a", 123), [0.1, 0.9], 0.1)) - self.assertRaises(ValueError, lambda: df.stat.approxQuantile(["a", 123], [0.1, 0.9], 0.1)) - - def test_corr(self): - import math - df = self.sc.parallelize([Row(a=i, b=math.sqrt(i)) for i in range(10)]).toDF() - corr = df.stat.corr(u"a", "b") - self.assertTrue(abs(corr - 0.95734012) < 1e-6) - 
- def test_sampleby(self): - df = self.sc.parallelize([Row(a=i, b=(i % 3)) for i in range(10)]).toDF() - sampled = df.stat.sampleBy(u"b", fractions={0: 0.5, 1: 0.5}, seed=0) - self.assertTrue(sampled.count() == 3) - - def test_cov(self): - df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF() - cov = df.stat.cov(u"a", "b") - self.assertTrue(abs(cov - 55.0 / 3) < 1e-6) - - def test_crosstab(self): - df = self.sc.parallelize([Row(a=i % 3, b=i % 2) for i in range(1, 7)]).toDF() - ct = df.stat.crosstab(u"a", "b").collect() - ct = sorted(ct, key=lambda x: x[0]) - for i, row in enumerate(ct): - self.assertEqual(row[0], str(i)) - self.assertTrue(row[1], 1) - self.assertTrue(row[2], 1) - - def test_math_functions(self): - df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF() - from pyspark.sql import functions - import math - - def get_values(l): - return [j[0] for j in l] - - def assert_close(a, b): - c = get_values(b) - diff = [abs(v - c[k]) < 1e-6 for k, v in enumerate(a)] - return sum(diff) == len(a) - assert_close([math.cos(i) for i in range(10)], - df.select(functions.cos(df.a)).collect()) - assert_close([math.cos(i) for i in range(10)], - df.select(functions.cos("a")).collect()) - assert_close([math.sin(i) for i in range(10)], - df.select(functions.sin(df.a)).collect()) - assert_close([math.sin(i) for i in range(10)], - df.select(functions.sin(df['a'])).collect()) - assert_close([math.pow(i, 2 * i) for i in range(10)], - df.select(functions.pow(df.a, df.b)).collect()) - assert_close([math.pow(i, 2) for i in range(10)], - df.select(functions.pow(df.a, 2)).collect()) - assert_close([math.pow(i, 2) for i in range(10)], - df.select(functions.pow(df.a, 2.0)).collect()) - assert_close([math.hypot(i, 2 * i) for i in range(10)], - df.select(functions.hypot(df.a, df.b)).collect()) - - def test_rand_functions(self): - df = self.df - from pyspark.sql import functions - rnd = df.select('key', functions.rand()).collect() - for row in rnd: - assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1] - rndn = df.select('key', functions.randn(5)).collect() - for row in rndn: - assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1] - - # If the specified seed is 0, we should use it. 
- # https://issues.apache.org/jira/browse/SPARK-9691 - rnd1 = df.select('key', functions.rand(0)).collect() - rnd2 = df.select('key', functions.rand(0)).collect() - self.assertEqual(sorted(rnd1), sorted(rnd2)) - - rndn1 = df.select('key', functions.randn(0)).collect() - rndn2 = df.select('key', functions.randn(0)).collect() - self.assertEqual(sorted(rndn1), sorted(rndn2)) - - def test_string_functions(self): - from pyspark.sql.functions import col, lit - df = self.spark.createDataFrame([['nick']], schema=['name']) - self.assertRaisesRegexp( - TypeError, - "must be the same type", - lambda: df.select(col('name').substr(0, lit(1)))) - if sys.version_info.major == 2: - self.assertRaises( - TypeError, - lambda: df.select(col('name').substr(long(0), long(1)))) - - def test_array_contains_function(self): - from pyspark.sql.functions import array_contains - - df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data']) - actual = df.select(array_contains(df.data, "1").alias('b')).collect() - self.assertEqual([Row(b=True), Row(b=False)], actual) - - def test_between_function(self): - df = self.sc.parallelize([ - Row(a=1, b=2, c=3), - Row(a=2, b=1, c=3), - Row(a=4, b=1, c=4)]).toDF() - self.assertEqual([Row(a=2, b=1, c=3), Row(a=4, b=1, c=4)], - df.filter(df.a.between(df.b, df.c)).collect()) - - def test_struct_type(self): - struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) - struct2 = StructType([StructField("f1", StringType(), True), - StructField("f2", StringType(), True, None)]) - self.assertEqual(struct1.fieldNames(), struct2.names) - self.assertEqual(struct1, struct2) - - struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) - struct2 = StructType([StructField("f1", StringType(), True)]) - self.assertNotEqual(struct1.fieldNames(), struct2.names) - self.assertNotEqual(struct1, struct2) - - struct1 = (StructType().add(StructField("f1", StringType(), True)) - .add(StructField("f2", StringType(), True, None))) - struct2 = StructType([StructField("f1", StringType(), True), - StructField("f2", StringType(), True, None)]) - self.assertEqual(struct1.fieldNames(), struct2.names) - self.assertEqual(struct1, struct2) - - struct1 = (StructType().add(StructField("f1", StringType(), True)) - .add(StructField("f2", StringType(), True, None))) - struct2 = StructType([StructField("f1", StringType(), True)]) - self.assertNotEqual(struct1.fieldNames(), struct2.names) - self.assertNotEqual(struct1, struct2) - - # Catch exception raised during improper construction - self.assertRaises(ValueError, lambda: StructType().add("name")) - - struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) - for field in struct1: - self.assertIsInstance(field, StructField) - - struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) - self.assertEqual(len(struct1), 2) - - struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) - self.assertIs(struct1["f1"], struct1.fields[0]) - self.assertIs(struct1[0], struct1.fields[0]) - self.assertEqual(struct1[0:1], StructType(struct1.fields[0:1])) - self.assertRaises(KeyError, lambda: struct1["f9"]) - self.assertRaises(IndexError, lambda: struct1[9]) - self.assertRaises(TypeError, lambda: struct1[9.9]) - - def test_parse_datatype_string(self): - from pyspark.sql.types import _all_atomic_types, _parse_datatype_string - for k, t in _all_atomic_types.items(): - if t != NullType: - self.assertEqual(t(), 
_parse_datatype_string(k)) - self.assertEqual(IntegerType(), _parse_datatype_string("int")) - self.assertEqual(DecimalType(1, 1), _parse_datatype_string("decimal(1 ,1)")) - self.assertEqual(DecimalType(10, 1), _parse_datatype_string("decimal( 10,1 )")) - self.assertEqual(DecimalType(11, 1), _parse_datatype_string("decimal(11,1)")) - self.assertEqual( - ArrayType(IntegerType()), - _parse_datatype_string("array")) - self.assertEqual( - MapType(IntegerType(), DoubleType()), - _parse_datatype_string("map< int, double >")) - self.assertEqual( - StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]), - _parse_datatype_string("struct")) - self.assertEqual( - StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]), - _parse_datatype_string("a:int, c:double")) - self.assertEqual( - StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]), - _parse_datatype_string("a INT, c DOUBLE")) - - def test_metadata_null(self): - schema = StructType([StructField("f1", StringType(), True, None), - StructField("f2", StringType(), True, {'a': None})]) - rdd = self.sc.parallelize([["a", "b"], ["c", "d"]]) - self.spark.createDataFrame(rdd, schema) - - def test_save_and_load(self): - df = self.df - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - df.write.json(tmpPath) - actual = self.spark.read.json(tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - - schema = StructType([StructField("value", StringType(), True)]) - actual = self.spark.read.json(tmpPath, schema) - self.assertEqual(sorted(df.select("value").collect()), sorted(actual.collect())) - - df.write.json(tmpPath, "overwrite") - actual = self.spark.read.json(tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - - df.write.save(format="json", mode="overwrite", path=tmpPath, - noUse="this options will not be used in save.") - actual = self.spark.read.load(format="json", path=tmpPath, - noUse="this options will not be used in load.") - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - - defaultDataSourceName = self.spark.conf.get("spark.sql.sources.default", - "org.apache.spark.sql.parquet") - self.spark.sql("SET spark.sql.sources.default=org.apache.spark.sql.json") - actual = self.spark.read.load(path=tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - self.spark.sql("SET spark.sql.sources.default=" + defaultDataSourceName) - - csvpath = os.path.join(tempfile.mkdtemp(), 'data') - df.write.option('quote', None).format('csv').save(csvpath) - - shutil.rmtree(tmpPath) - - def test_save_and_load_builder(self): - df = self.df - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - df.write.json(tmpPath) - actual = self.spark.read.json(tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - - schema = StructType([StructField("value", StringType(), True)]) - actual = self.spark.read.json(tmpPath, schema) - self.assertEqual(sorted(df.select("value").collect()), sorted(actual.collect())) - - df.write.mode("overwrite").json(tmpPath) - actual = self.spark.read.json(tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - - df.write.mode("overwrite").options(noUse="this options will not be used in save.")\ - .option("noUse", "this option will not be used in save.")\ - .format("json").save(path=tmpPath) - actual =\ - self.spark.read.format("json")\ - .load(path=tmpPath, noUse="this options will not be used in load.") - self.assertEqual(sorted(df.collect()), 
sorted(actual.collect())) - - defaultDataSourceName = self.spark.conf.get("spark.sql.sources.default", - "org.apache.spark.sql.parquet") - self.spark.sql("SET spark.sql.sources.default=org.apache.spark.sql.json") - actual = self.spark.read.load(path=tmpPath) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - self.spark.sql("SET spark.sql.sources.default=" + defaultDataSourceName) - - shutil.rmtree(tmpPath) - - def test_stream_trigger(self): - df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') - - # Should take at least one arg - try: - df.writeStream.trigger() - except ValueError: - pass - - # Should not take multiple args - try: - df.writeStream.trigger(once=True, processingTime='5 seconds') - except ValueError: - pass - - # Should not take multiple args - try: - df.writeStream.trigger(processingTime='5 seconds', continuous='1 second') - except ValueError: - pass - - # Should take only keyword args - try: - df.writeStream.trigger('5 seconds') - self.fail("Should have thrown an exception") - except TypeError: - pass - - def test_stream_read_options(self): - schema = StructType([StructField("data", StringType(), False)]) - df = self.spark.readStream\ - .format('text')\ - .option('path', 'python/test_support/sql/streaming')\ - .schema(schema)\ - .load() - self.assertTrue(df.isStreaming) - self.assertEqual(df.schema.simpleString(), "struct") - - def test_stream_read_options_overwrite(self): - bad_schema = StructType([StructField("test", IntegerType(), False)]) - schema = StructType([StructField("data", StringType(), False)]) - df = self.spark.readStream.format('csv').option('path', 'python/test_support/sql/fake') \ - .schema(bad_schema)\ - .load(path='python/test_support/sql/streaming', schema=schema, format='text') - self.assertTrue(df.isStreaming) - self.assertEqual(df.schema.simpleString(), "struct") - - def test_stream_save_options(self): - df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') \ - .withColumn('id', lit(1)) - for q in self.spark._wrapped.streams.active: - q.stop() - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - self.assertTrue(df.isStreaming) - out = os.path.join(tmpPath, 'out') - chk = os.path.join(tmpPath, 'chk') - q = df.writeStream.option('checkpointLocation', chk).queryName('this_query') \ - .format('parquet').partitionBy('id').outputMode('append').option('path', out).start() - try: - self.assertEqual(q.name, 'this_query') - self.assertTrue(q.isActive) - q.processAllAvailable() - output_files = [] - for _, _, files in os.walk(out): - output_files.extend([f for f in files if not f.startswith('.')]) - self.assertTrue(len(output_files) > 0) - self.assertTrue(len(os.listdir(chk)) > 0) - finally: - q.stop() - shutil.rmtree(tmpPath) - - def test_stream_save_options_overwrite(self): - df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') - for q in self.spark._wrapped.streams.active: - q.stop() - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - self.assertTrue(df.isStreaming) - out = os.path.join(tmpPath, 'out') - chk = os.path.join(tmpPath, 'chk') - fake1 = os.path.join(tmpPath, 'fake1') - fake2 = os.path.join(tmpPath, 'fake2') - q = df.writeStream.option('checkpointLocation', fake1)\ - .format('memory').option('path', fake2) \ - .queryName('fake_query').outputMode('append') \ - .start(path=out, format='parquet', queryName='this_query', checkpointLocation=chk) - - try: - self.assertEqual(q.name, 'this_query') - self.assertTrue(q.isActive) - 
q.processAllAvailable() - output_files = [] - for _, _, files in os.walk(out): - output_files.extend([f for f in files if not f.startswith('.')]) - self.assertTrue(len(output_files) > 0) - self.assertTrue(len(os.listdir(chk)) > 0) - self.assertFalse(os.path.isdir(fake1)) # should not have been created - self.assertFalse(os.path.isdir(fake2)) # should not have been created - finally: - q.stop() - shutil.rmtree(tmpPath) - - def test_stream_status_and_progress(self): - df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') - for q in self.spark._wrapped.streams.active: - q.stop() - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - self.assertTrue(df.isStreaming) - out = os.path.join(tmpPath, 'out') - chk = os.path.join(tmpPath, 'chk') - - def func(x): - time.sleep(1) - return x - - from pyspark.sql.functions import col, udf - sleep_udf = udf(func) - - # Use "sleep_udf" to delay the progress update so that we can test `lastProgress` when there - # were no updates. - q = df.select(sleep_udf(col("value")).alias('value')).writeStream \ - .start(path=out, format='parquet', queryName='this_query', checkpointLocation=chk) - try: - # "lastProgress" will return None in most cases. However, as it may be flaky when - # Jenkins is very slow, we don't assert it. If there is something wrong, "lastProgress" - # may throw error with a high chance and make this test flaky, so we should still be - # able to detect broken codes. - q.lastProgress - - q.processAllAvailable() - lastProgress = q.lastProgress - recentProgress = q.recentProgress - status = q.status - self.assertEqual(lastProgress['name'], q.name) - self.assertEqual(lastProgress['id'], q.id) - self.assertTrue(any(p == lastProgress for p in recentProgress)) - self.assertTrue( - "message" in status and - "isDataAvailable" in status and - "isTriggerActive" in status) - finally: - q.stop() - shutil.rmtree(tmpPath) - - def test_stream_await_termination(self): - df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') - for q in self.spark._wrapped.streams.active: - q.stop() - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - self.assertTrue(df.isStreaming) - out = os.path.join(tmpPath, 'out') - chk = os.path.join(tmpPath, 'chk') - q = df.writeStream\ - .start(path=out, format='parquet', queryName='this_query', checkpointLocation=chk) - try: - self.assertTrue(q.isActive) - try: - q.awaitTermination("hello") - self.fail("Expected a value exception") - except ValueError: - pass - now = time.time() - # test should take at least 2 seconds - res = q.awaitTermination(2.6) - duration = time.time() - now - self.assertTrue(duration >= 2) - self.assertFalse(res) - finally: - q.stop() - shutil.rmtree(tmpPath) - - def test_stream_exception(self): - sdf = self.spark.readStream.format('text').load('python/test_support/sql/streaming') - sq = sdf.writeStream.format('memory').queryName('query_explain').start() - try: - sq.processAllAvailable() - self.assertEqual(sq.exception(), None) - finally: - sq.stop() - - from pyspark.sql.functions import col, udf - from pyspark.sql.utils import StreamingQueryException - bad_udf = udf(lambda x: 1 / 0) - sq = sdf.select(bad_udf(col("value")))\ - .writeStream\ - .format('memory')\ - .queryName('this_query')\ - .start() - try: - # Process some data to fail the query - sq.processAllAvailable() - self.fail("bad udf should fail the query") - except StreamingQueryException as e: - # This is expected - self.assertTrue("ZeroDivisionError" in e.desc) - finally: - sq.stop() - 
self.assertTrue(type(sq.exception()) is StreamingQueryException) - self.assertTrue("ZeroDivisionError" in sq.exception().desc) - - def test_query_manager_await_termination(self): - df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') - for q in self.spark._wrapped.streams.active: - q.stop() - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - self.assertTrue(df.isStreaming) - out = os.path.join(tmpPath, 'out') - chk = os.path.join(tmpPath, 'chk') - q = df.writeStream\ - .start(path=out, format='parquet', queryName='this_query', checkpointLocation=chk) - try: - self.assertTrue(q.isActive) - try: - self.spark._wrapped.streams.awaitAnyTermination("hello") - self.fail("Expected a value exception") - except ValueError: - pass - now = time.time() - # test should take at least 2 seconds - res = self.spark._wrapped.streams.awaitAnyTermination(2.6) - duration = time.time() - now - self.assertTrue(duration >= 2) - self.assertFalse(res) - finally: - q.stop() - shutil.rmtree(tmpPath) - - class ForeachWriterTester: - - def __init__(self, spark): - self.spark = spark - - def write_open_event(self, partitionId, epochId): - self._write_event( - self.open_events_dir, - {'partition': partitionId, 'epoch': epochId}) - - def write_process_event(self, row): - self._write_event(self.process_events_dir, {'value': 'text'}) - - def write_close_event(self, error): - self._write_event(self.close_events_dir, {'error': str(error)}) - - def write_input_file(self): - self._write_event(self.input_dir, "text") - - def open_events(self): - return self._read_events(self.open_events_dir, 'partition INT, epoch INT') - - def process_events(self): - return self._read_events(self.process_events_dir, 'value STRING') - - def close_events(self): - return self._read_events(self.close_events_dir, 'error STRING') - - def run_streaming_query_on_writer(self, writer, num_files): - self._reset() - try: - sdf = self.spark.readStream.format('text').load(self.input_dir) - sq = sdf.writeStream.foreach(writer).start() - for i in range(num_files): - self.write_input_file() - sq.processAllAvailable() - finally: - self.stop_all() - - def assert_invalid_writer(self, writer, msg=None): - self._reset() - try: - sdf = self.spark.readStream.format('text').load(self.input_dir) - sq = sdf.writeStream.foreach(writer).start() - self.write_input_file() - sq.processAllAvailable() - self.fail("invalid writer %s did not fail the query" % str(writer)) # not expected - except Exception as e: - if msg: - assert msg in str(e), "%s not in %s" % (msg, str(e)) - - finally: - self.stop_all() - - def stop_all(self): - for q in self.spark._wrapped.streams.active: - q.stop() - - def _reset(self): - self.input_dir = tempfile.mkdtemp() - self.open_events_dir = tempfile.mkdtemp() - self.process_events_dir = tempfile.mkdtemp() - self.close_events_dir = tempfile.mkdtemp() - - def _read_events(self, dir, json): - rows = self.spark.read.schema(json).json(dir).collect() - dicts = [row.asDict() for row in rows] - return dicts - - def _write_event(self, dir, event): - import uuid - with open(os.path.join(dir, str(uuid.uuid4())), 'w') as f: - f.write("%s\n" % str(event)) - - def __getstate__(self): - return (self.open_events_dir, self.process_events_dir, self.close_events_dir) - - def __setstate__(self, state): - self.open_events_dir, self.process_events_dir, self.close_events_dir = state - - def test_streaming_foreach_with_simple_function(self): - tester = self.ForeachWriterTester(self.spark) - - def foreach_func(row): - 
tester.write_process_event(row) - - tester.run_streaming_query_on_writer(foreach_func, 2) - self.assertEqual(len(tester.process_events()), 2) - - def test_streaming_foreach_with_basic_open_process_close(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def open(self, partitionId, epochId): - tester.write_open_event(partitionId, epochId) - return True - - def process(self, row): - tester.write_process_event(row) - - def close(self, error): - tester.write_close_event(error) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - - open_events = tester.open_events() - self.assertEqual(len(open_events), 2) - self.assertSetEqual(set([e['epoch'] for e in open_events]), {0, 1}) - - self.assertEqual(len(tester.process_events()), 2) - - close_events = tester.close_events() - self.assertEqual(len(close_events), 2) - self.assertSetEqual(set([e['error'] for e in close_events]), {'None'}) - - def test_streaming_foreach_with_open_returning_false(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def open(self, partition_id, epoch_id): - tester.write_open_event(partition_id, epoch_id) - return False - - def process(self, row): - tester.write_process_event(row) - - def close(self, error): - tester.write_close_event(error) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - - self.assertEqual(len(tester.open_events()), 2) - - self.assertEqual(len(tester.process_events()), 0) # no row was processed - - close_events = tester.close_events() - self.assertEqual(len(close_events), 2) - self.assertSetEqual(set([e['error'] for e in close_events]), {'None'}) - - def test_streaming_foreach_without_open_method(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def process(self, row): - tester.write_process_event(row) - - def close(self, error): - tester.write_close_event(error) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - self.assertEqual(len(tester.open_events()), 0) # no open events - self.assertEqual(len(tester.process_events()), 2) - self.assertEqual(len(tester.close_events()), 2) - - def test_streaming_foreach_without_close_method(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def open(self, partition_id, epoch_id): - tester.write_open_event(partition_id, epoch_id) - return True - - def process(self, row): - tester.write_process_event(row) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - self.assertEqual(len(tester.open_events()), 2) # no open events - self.assertEqual(len(tester.process_events()), 2) - self.assertEqual(len(tester.close_events()), 0) - - def test_streaming_foreach_without_open_and_close_methods(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def process(self, row): - tester.write_process_event(row) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - self.assertEqual(len(tester.open_events()), 0) # no open events - self.assertEqual(len(tester.process_events()), 2) - self.assertEqual(len(tester.close_events()), 0) - - def test_streaming_foreach_with_process_throwing_error(self): - from pyspark.sql.utils import StreamingQueryException - - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def process(self, row): - raise Exception("test error") - - def close(self, error): - tester.write_close_event(error) - - try: - tester.run_streaming_query_on_writer(ForeachWriter(), 1) - self.fail("bad writer did not fail the query") # this is not expected - except 
StreamingQueryException as e: - # TODO: Verify whether original error message is inside the exception - pass - - self.assertEqual(len(tester.process_events()), 0) # no row was processed - close_events = tester.close_events() - self.assertEqual(len(close_events), 1) - # TODO: Verify whether original error message is inside the exception - - def test_streaming_foreach_with_invalid_writers(self): - - tester = self.ForeachWriterTester(self.spark) - - def func_with_iterator_input(iter): - for x in iter: - print(x) - - tester.assert_invalid_writer(func_with_iterator_input) - - class WriterWithoutProcess: - def open(self, partition): - pass - - tester.assert_invalid_writer(WriterWithoutProcess(), "does not have a 'process'") - - class WriterWithNonCallableProcess(): - process = True - - tester.assert_invalid_writer(WriterWithNonCallableProcess(), - "'process' in provided object is not callable") - - class WriterWithNoParamProcess(): - def process(self): - pass - - tester.assert_invalid_writer(WriterWithNoParamProcess()) - - # Abstract class for tests below - class WithProcess(): - def process(self, row): - pass - - class WriterWithNonCallableOpen(WithProcess): - open = True - - tester.assert_invalid_writer(WriterWithNonCallableOpen(), - "'open' in provided object is not callable") - - class WriterWithNoParamOpen(WithProcess): - def open(self): - pass - - tester.assert_invalid_writer(WriterWithNoParamOpen()) - - class WriterWithNonCallableClose(WithProcess): - close = True - - tester.assert_invalid_writer(WriterWithNonCallableClose(), - "'close' in provided object is not callable") - - def test_streaming_foreachBatch(self): - q = None - collected = dict() - - def collectBatch(batch_df, batch_id): - collected[batch_id] = batch_df.collect() - - try: - df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') - q = df.writeStream.foreachBatch(collectBatch).start() - q.processAllAvailable() - self.assertTrue(0 in collected) - self.assertTrue(len(collected[0]), 2) - finally: - if q: - q.stop() - - def test_streaming_foreachBatch_propagates_python_errors(self): - from pyspark.sql.utils import StreamingQueryException - - q = None - - def collectBatch(df, id): - raise Exception("this should fail the query") - - try: - df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') - q = df.writeStream.foreachBatch(collectBatch).start() - q.processAllAvailable() - self.fail("Expected a failure") - except StreamingQueryException as e: - self.assertTrue("this should fail" in str(e)) - finally: - if q: - q.stop() - - def test_help_command(self): - # Regression test for SPARK-5464 - rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}']) - df = self.spark.read.json(rdd) - # render_doc() reproduces the help() exception without printing output - pydoc.render_doc(df) - pydoc.render_doc(df.foo) - pydoc.render_doc(df.take(1)) - - def test_access_column(self): - df = self.df - self.assertTrue(isinstance(df.key, Column)) - self.assertTrue(isinstance(df['key'], Column)) - self.assertTrue(isinstance(df[0], Column)) - self.assertRaises(IndexError, lambda: df[2]) - self.assertRaises(AnalysisException, lambda: df["bad_key"]) - self.assertRaises(TypeError, lambda: df[{}]) - - def test_column_name_with_non_ascii(self): - if sys.version >= '3': - columnName = "数量" - self.assertTrue(isinstance(columnName, str)) - else: - columnName = unicode("数量", "utf-8") - self.assertTrue(isinstance(columnName, unicode)) - schema = StructType([StructField(columnName, LongType(), True)]) - 
df = self.spark.createDataFrame([(1,)], schema) - self.assertEqual(schema, df.schema) - self.assertEqual("DataFrame[数量: bigint]", str(df)) - self.assertEqual([("数量", 'bigint')], df.dtypes) - self.assertEqual(1, df.select("数量").first()[0]) - self.assertEqual(1, df.select(df["数量"]).first()[0]) - - def test_access_nested_types(self): - df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF() - self.assertEqual(1, df.select(df.l[0]).first()[0]) - self.assertEqual(1, df.select(df.l.getItem(0)).first()[0]) - self.assertEqual(1, df.select(df.r.a).first()[0]) - self.assertEqual("b", df.select(df.r.getField("b")).first()[0]) - self.assertEqual("v", df.select(df.d["k"]).first()[0]) - self.assertEqual("v", df.select(df.d.getItem("k")).first()[0]) - - def test_field_accessor(self): - df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF() - self.assertEqual(1, df.select(df.l[0]).first()[0]) - self.assertEqual(1, df.select(df.r["a"]).first()[0]) - self.assertEqual(1, df.select(df["r.a"]).first()[0]) - self.assertEqual("b", df.select(df.r["b"]).first()[0]) - self.assertEqual("b", df.select(df["r.b"]).first()[0]) - self.assertEqual("v", df.select(df.d["k"]).first()[0]) - - def test_infer_long_type(self): - longrow = [Row(f1='a', f2=100000000000000)] - df = self.sc.parallelize(longrow).toDF() - self.assertEqual(df.schema.fields[1].dataType, LongType()) - - # this saving as Parquet caused issues as well. - output_dir = os.path.join(self.tempdir.name, "infer_long_type") - df.write.parquet(output_dir) - df1 = self.spark.read.parquet(output_dir) - self.assertEqual('a', df1.first().f1) - self.assertEqual(100000000000000, df1.first().f2) - - self.assertEqual(_infer_type(1), LongType()) - self.assertEqual(_infer_type(2**10), LongType()) - self.assertEqual(_infer_type(2**20), LongType()) - self.assertEqual(_infer_type(2**31 - 1), LongType()) - self.assertEqual(_infer_type(2**31), LongType()) - self.assertEqual(_infer_type(2**61), LongType()) - self.assertEqual(_infer_type(2**71), LongType()) - - def test_merge_type(self): - self.assertEqual(_merge_type(LongType(), NullType()), LongType()) - self.assertEqual(_merge_type(NullType(), LongType()), LongType()) - - self.assertEqual(_merge_type(LongType(), LongType()), LongType()) - - self.assertEqual(_merge_type( - ArrayType(LongType()), - ArrayType(LongType()) - ), ArrayType(LongType())) - with self.assertRaisesRegexp(TypeError, 'element in array'): - _merge_type(ArrayType(LongType()), ArrayType(DoubleType())) - - self.assertEqual(_merge_type( - MapType(StringType(), LongType()), - MapType(StringType(), LongType()) - ), MapType(StringType(), LongType())) - with self.assertRaisesRegexp(TypeError, 'key of map'): - _merge_type( - MapType(StringType(), LongType()), - MapType(DoubleType(), LongType())) - with self.assertRaisesRegexp(TypeError, 'value of map'): - _merge_type( - MapType(StringType(), LongType()), - MapType(StringType(), DoubleType())) - - self.assertEqual(_merge_type( - StructType([StructField("f1", LongType()), StructField("f2", StringType())]), - StructType([StructField("f1", LongType()), StructField("f2", StringType())]) - ), StructType([StructField("f1", LongType()), StructField("f2", StringType())])) - with self.assertRaisesRegexp(TypeError, 'field f1'): - _merge_type( - StructType([StructField("f1", LongType()), StructField("f2", StringType())]), - StructType([StructField("f1", DoubleType()), StructField("f2", StringType())])) - - self.assertEqual(_merge_type( - StructType([StructField("f1", 
StructType([StructField("f2", LongType())]))]), - StructType([StructField("f1", StructType([StructField("f2", LongType())]))]) - ), StructType([StructField("f1", StructType([StructField("f2", LongType())]))])) - with self.assertRaisesRegexp(TypeError, 'field f2 in field f1'): - _merge_type( - StructType([StructField("f1", StructType([StructField("f2", LongType())]))]), - StructType([StructField("f1", StructType([StructField("f2", StringType())]))])) - - self.assertEqual(_merge_type( - StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]), - StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]) - ), StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())])) - with self.assertRaisesRegexp(TypeError, 'element in array field f1'): - _merge_type( - StructType([ - StructField("f1", ArrayType(LongType())), - StructField("f2", StringType())]), - StructType([ - StructField("f1", ArrayType(DoubleType())), - StructField("f2", StringType())])) - - self.assertEqual(_merge_type( - StructType([ - StructField("f1", MapType(StringType(), LongType())), - StructField("f2", StringType())]), - StructType([ - StructField("f1", MapType(StringType(), LongType())), - StructField("f2", StringType())]) - ), StructType([ - StructField("f1", MapType(StringType(), LongType())), - StructField("f2", StringType())])) - with self.assertRaisesRegexp(TypeError, 'value of map field f1'): - _merge_type( - StructType([ - StructField("f1", MapType(StringType(), LongType())), - StructField("f2", StringType())]), - StructType([ - StructField("f1", MapType(StringType(), DoubleType())), - StructField("f2", StringType())])) - - self.assertEqual(_merge_type( - StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), - StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]) - ), StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))])) - with self.assertRaisesRegexp(TypeError, 'key of map element in array field f1'): - _merge_type( - StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), - StructType([StructField("f1", ArrayType(MapType(DoubleType(), LongType())))]) - ) - - def test_filter_with_datetime(self): - time = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000) - date = time.date() - row = Row(date=date, time=time) - df = self.spark.createDataFrame([row]) - self.assertEqual(1, df.filter(df.date == date).count()) - self.assertEqual(1, df.filter(df.time == time).count()) - self.assertEqual(0, df.filter(df.date > date).count()) - self.assertEqual(0, df.filter(df.time > time).count()) - - def test_filter_with_datetime_timezone(self): - dt1 = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000, tzinfo=UTCOffsetTimezone(0)) - dt2 = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000, tzinfo=UTCOffsetTimezone(1)) - row = Row(date=dt1) - df = self.spark.createDataFrame([row]) - self.assertEqual(0, df.filter(df.date == dt2).count()) - self.assertEqual(1, df.filter(df.date > dt2).count()) - self.assertEqual(0, df.filter(df.date < dt2).count()) - - def test_time_with_timezone(self): - day = datetime.date.today() - now = datetime.datetime.now() - ts = time.mktime(now.timetuple()) - # class in __main__ is not serializable - from pyspark.sql.tests import UTCOffsetTimezone - utc = UTCOffsetTimezone() - utcnow = datetime.datetime.utcfromtimestamp(ts) # without microseconds - # add microseconds to utcnow (keeping year,month,day,hour,minute,second) - utcnow = 
datetime.datetime(*(utcnow.timetuple()[:6] + (now.microsecond, utc))) - df = self.spark.createDataFrame([(day, now, utcnow)]) - day1, now1, utcnow1 = df.first() - self.assertEqual(day1, day) - self.assertEqual(now, now1) - self.assertEqual(now, utcnow1) - - # regression test for SPARK-19561 - def test_datetime_at_epoch(self): - epoch = datetime.datetime.fromtimestamp(0) - df = self.spark.createDataFrame([Row(date=epoch)]) - first = df.select('date', lit(epoch).alias('lit_date')).first() - self.assertEqual(first['date'], epoch) - self.assertEqual(first['lit_date'], epoch) - - def test_dayofweek(self): - from pyspark.sql.functions import dayofweek - dt = datetime.datetime(2017, 11, 6) - df = self.spark.createDataFrame([Row(date=dt)]) - row = df.select(dayofweek(df.date)).first() - self.assertEqual(row[0], 2) - - def test_decimal(self): - from decimal import Decimal - schema = StructType([StructField("decimal", DecimalType(10, 5))]) - df = self.spark.createDataFrame([(Decimal("3.14159"),)], schema) - row = df.select(df.decimal + 1).first() - self.assertEqual(row[0], Decimal("4.14159")) - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - df.write.parquet(tmpPath) - df2 = self.spark.read.parquet(tmpPath) - row = df2.first() - self.assertEqual(row[0], Decimal("3.14159")) - - def test_dropna(self): - schema = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("height", DoubleType(), True)]) - - # shouldn't drop a non-null row - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', 50, 80.1)], schema).dropna().count(), - 1) - - # dropping rows with a single null value - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', None, 80.1)], schema).dropna().count(), - 0) - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', None, 80.1)], schema).dropna(how='any').count(), - 0) - - # if how = 'all', only drop rows if all values are null - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', None, 80.1)], schema).dropna(how='all').count(), - 1) - self.assertEqual(self.spark.createDataFrame( - [(None, None, None)], schema).dropna(how='all').count(), - 0) - - # how and subset - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', 50, None)], schema).dropna(how='any', subset=['name', 'age']).count(), - 1) - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', None, None)], schema).dropna(how='any', subset=['name', 'age']).count(), - 0) - - # threshold - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', None, 80.1)], schema).dropna(thresh=2).count(), - 1) - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', None, None)], schema).dropna(thresh=2).count(), - 0) - - # threshold and subset - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', 50, None)], schema).dropna(thresh=2, subset=['name', 'age']).count(), - 1) - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', None, 180.9)], schema).dropna(thresh=2, subset=['name', 'age']).count(), - 0) - - # thresh should take precedence over how - self.assertEqual(self.spark.createDataFrame( - [(u'Alice', 50, None)], schema).dropna( - how='any', thresh=2, subset=['name', 'age']).count(), - 1) - - def test_fillna(self): - schema = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("height", DoubleType(), True), - StructField("spy", BooleanType(), True)]) - - # fillna shouldn't change non-null values - row = self.spark.createDataFrame([(u'Alice', 10, 80.1, True)], 
schema).fillna(50).first() - self.assertEqual(row.age, 10) - - # fillna with int - row = self.spark.createDataFrame([(u'Alice', None, None, None)], schema).fillna(50).first() - self.assertEqual(row.age, 50) - self.assertEqual(row.height, 50.0) - - # fillna with double - row = self.spark.createDataFrame( - [(u'Alice', None, None, None)], schema).fillna(50.1).first() - self.assertEqual(row.age, 50) - self.assertEqual(row.height, 50.1) - - # fillna with bool - row = self.spark.createDataFrame( - [(u'Alice', None, None, None)], schema).fillna(True).first() - self.assertEqual(row.age, None) - self.assertEqual(row.spy, True) - - # fillna with string - row = self.spark.createDataFrame([(None, None, None, None)], schema).fillna("hello").first() - self.assertEqual(row.name, u"hello") - self.assertEqual(row.age, None) - - # fillna with subset specified for numeric cols - row = self.spark.createDataFrame( - [(None, None, None, None)], schema).fillna(50, subset=['name', 'age']).first() - self.assertEqual(row.name, None) - self.assertEqual(row.age, 50) - self.assertEqual(row.height, None) - self.assertEqual(row.spy, None) - - # fillna with subset specified for string cols - row = self.spark.createDataFrame( - [(None, None, None, None)], schema).fillna("haha", subset=['name', 'age']).first() - self.assertEqual(row.name, "haha") - self.assertEqual(row.age, None) - self.assertEqual(row.height, None) - self.assertEqual(row.spy, None) - - # fillna with subset specified for bool cols - row = self.spark.createDataFrame( - [(None, None, None, None)], schema).fillna(True, subset=['name', 'spy']).first() - self.assertEqual(row.name, None) - self.assertEqual(row.age, None) - self.assertEqual(row.height, None) - self.assertEqual(row.spy, True) - - # fillna with dictionary for boolean types - row = self.spark.createDataFrame([Row(a=None), Row(a=True)]).fillna({"a": True}).first() - self.assertEqual(row.a, True) - - def test_bitwise_operations(self): - from pyspark.sql import functions - row = Row(a=170, b=75) - df = self.spark.createDataFrame([row]) - result = df.select(df.a.bitwiseAND(df.b)).collect()[0].asDict() - self.assertEqual(170 & 75, result['(a & b)']) - result = df.select(df.a.bitwiseOR(df.b)).collect()[0].asDict() - self.assertEqual(170 | 75, result['(a | b)']) - result = df.select(df.a.bitwiseXOR(df.b)).collect()[0].asDict() - self.assertEqual(170 ^ 75, result['(a ^ b)']) - result = df.select(functions.bitwiseNOT(df.b)).collect()[0].asDict() - self.assertEqual(~75, result['~b']) - - def test_expr(self): - from pyspark.sql import functions - row = Row(a="length string", b=75) - df = self.spark.createDataFrame([row]) - result = df.select(functions.expr("length(a)")).collect()[0].asDict() - self.assertEqual(13, result["length(a)"]) - - def test_repartitionByRange_dataframe(self): - schema = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("height", DoubleType(), True)]) - - df1 = self.spark.createDataFrame( - [(u'Bob', 27, 66.0), (u'Alice', 10, 10.0), (u'Bob', 10, 66.0)], schema) - df2 = self.spark.createDataFrame( - [(u'Alice', 10, 10.0), (u'Bob', 10, 66.0), (u'Bob', 27, 66.0)], schema) - - # test repartitionByRange(numPartitions, *cols) - df3 = df1.repartitionByRange(2, "name", "age") - self.assertEqual(df3.rdd.getNumPartitions(), 2) - self.assertEqual(df3.rdd.first(), df2.rdd.first()) - self.assertEqual(df3.rdd.take(3), df2.rdd.take(3)) - - # test repartitionByRange(numPartitions, *cols) - df4 = df1.repartitionByRange(3, "name", "age") - 
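For reference, a minimal standalone sketch of the DataFrame.repartitionByRange API exercised in the surrounding test (illustrative only; assumes an active SparkSession bound to `spark`, which is not part of the original diff):

# Range-partitions rows by the given sort columns so that rows with nearby
# key values land in the same partition.
df = spark.createDataFrame(
    [("Bob", 27, 66.0), ("Alice", 10, 10.0), ("Bob", 10, 66.0)],
    ["name", "age", "height"])
print(df.repartitionByRange(2, "name", "age").rdd.getNumPartitions())  # 2
df.repartitionByRange("name", "age")  # partition count taken from spark.sql.shuffle.partitions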
self.assertEqual(df4.rdd.getNumPartitions(), 3) - self.assertEqual(df4.rdd.first(), df2.rdd.first()) - self.assertEqual(df4.rdd.take(3), df2.rdd.take(3)) - - # test repartitionByRange(*cols) - df5 = df1.repartitionByRange("name", "age") - self.assertEqual(df5.rdd.first(), df2.rdd.first()) - self.assertEqual(df5.rdd.take(3), df2.rdd.take(3)) - - def test_replace(self): - schema = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("height", DoubleType(), True)]) - - # replace with int - row = self.spark.createDataFrame([(u'Alice', 10, 10.0)], schema).replace(10, 20).first() - self.assertEqual(row.age, 20) - self.assertEqual(row.height, 20.0) - - # replace with double - row = self.spark.createDataFrame( - [(u'Alice', 80, 80.0)], schema).replace(80.0, 82.1).first() - self.assertEqual(row.age, 82) - self.assertEqual(row.height, 82.1) - - # replace with string - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace(u'Alice', u'Ann').first() - self.assertEqual(row.name, u"Ann") - self.assertEqual(row.age, 10) - - # replace with subset specified by a string of a column name w/ actual change - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace(10, 20, subset='age').first() - self.assertEqual(row.age, 20) - - # replace with subset specified by a string of a column name w/o actual change - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace(10, 20, subset='height').first() - self.assertEqual(row.age, 10) - - # replace with subset specified with one column replaced, another column not in subset - # stays unchanged. - row = self.spark.createDataFrame( - [(u'Alice', 10, 10.0)], schema).replace(10, 20, subset=['name', 'age']).first() - self.assertEqual(row.name, u'Alice') - self.assertEqual(row.age, 20) - self.assertEqual(row.height, 10.0) - - # replace with subset specified but no column will be replaced - row = self.spark.createDataFrame( - [(u'Alice', 10, None)], schema).replace(10, 20, subset=['name', 'height']).first() - self.assertEqual(row.name, u'Alice') - self.assertEqual(row.age, 10) - self.assertEqual(row.height, None) - - # replace with lists - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace([u'Alice'], [u'Ann']).first() - self.assertTupleEqual(row, (u'Ann', 10, 80.1)) - - # replace with dict - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace({10: 11}).first() - self.assertTupleEqual(row, (u'Alice', 11, 80.1)) - - # test backward compatibility with dummy value - dummy_value = 1 - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace({'Alice': 'Bob'}, dummy_value).first() - self.assertTupleEqual(row, (u'Bob', 10, 80.1)) - - # test dict with mixed numerics - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace({10: -10, 80.1: 90.5}).first() - self.assertTupleEqual(row, (u'Alice', -10, 90.5)) - - # replace with tuples - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace((u'Alice', ), (u'Bob', )).first() - self.assertTupleEqual(row, (u'Bob', 10, 80.1)) - - # replace multiple columns - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.0)], schema).replace((10, 80.0), (20, 90)).first() - self.assertTupleEqual(row, (u'Alice', 20, 90.0)) - - # test for mixed numerics - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.0)], schema).replace((10, 80), (20, 90.5)).first() - self.assertTupleEqual(row, (u'Alice', 20, 90.5)) - - row = 
self.spark.createDataFrame( - [(u'Alice', 10, 80.0)], schema).replace({10: 20, 80: 90.5}).first() - self.assertTupleEqual(row, (u'Alice', 20, 90.5)) - - # replace with boolean - row = (self - .spark.createDataFrame([(u'Alice', 10, 80.0)], schema) - .selectExpr("name = 'Bob'", 'age <= 15') - .replace(False, True).first()) - self.assertTupleEqual(row, (True, True)) - - # replace string with None and then drop None rows - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.0)], schema).replace(u'Alice', None).dropna() - self.assertEqual(row.count(), 0) - - # replace with number and None - row = self.spark.createDataFrame( - [(u'Alice', 10, 80.0)], schema).replace([10, 80], [20, None]).first() - self.assertTupleEqual(row, (u'Alice', 20, None)) - - # should fail if subset is not list, tuple or None - with self.assertRaises(ValueError): - self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace({10: 11}, subset=1).first() - - # should fail if to_replace and value have different length - with self.assertRaises(ValueError): - self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace(["Alice", "Bob"], ["Eve"]).first() - - # should fail if when received unexpected type - with self.assertRaises(ValueError): - from datetime import datetime - self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace(datetime.now(), datetime.now()).first() - - # should fail if provided mixed type replacements - with self.assertRaises(ValueError): - self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace(["Alice", 10], ["Eve", 20]).first() - - with self.assertRaises(ValueError): - self.spark.createDataFrame( - [(u'Alice', 10, 80.1)], schema).replace({u"Alice": u"Bob", 10: 20}).first() - - with self.assertRaisesRegexp( - TypeError, - 'value argument is required when to_replace is not a dictionary.'): - self.spark.createDataFrame( - [(u'Alice', 10, 80.0)], schema).replace(["Alice", "Bob"]).first() - - def test_capture_analysis_exception(self): - self.assertRaises(AnalysisException, lambda: self.spark.sql("select abc")) - self.assertRaises(AnalysisException, lambda: self.df.selectExpr("a + b")) - - def test_capture_parse_exception(self): - self.assertRaises(ParseException, lambda: self.spark.sql("abc")) - - def test_capture_illegalargument_exception(self): - self.assertRaisesRegexp(IllegalArgumentException, "Setting negative mapred.reduce.tasks", - lambda: self.spark.sql("SET mapred.reduce.tasks=-1")) - df = self.spark.createDataFrame([(1, 2)], ["a", "b"]) - self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values", - lambda: df.select(sha2(df.a, 1024)).collect()) - try: - df.select(sha2(df.a, 1024)).collect() - except IllegalArgumentException as e: - self.assertRegexpMatches(e.desc, "1024 is not in the permitted values") - self.assertRegexpMatches(e.stackTrace, - "org.apache.spark.sql.functions") - - def test_with_column_with_existing_name(self): - keys = self.df.withColumn("key", self.df.key).select("key").collect() - self.assertEqual([r.key for r in keys], list(range(100))) - - # regression test for SPARK-10417 - def test_column_iterator(self): - - def foo(): - for x in self.df.key: - break - - self.assertRaises(TypeError, foo) - - # add test for SPARK-10577 (test broadcast join hint) - def test_functions_broadcast(self): - from pyspark.sql.functions import broadcast - - df1 = self.spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value")) - df2 = self.spark.createDataFrame([(1, "1"), (2, "2")], ("key", "value")) - - # equijoin - 
should be converted into broadcast join - plan1 = df1.join(broadcast(df2), "key")._jdf.queryExecution().executedPlan() - self.assertEqual(1, plan1.toString().count("BroadcastHashJoin")) - - # no join key -- should not be a broadcast join - plan2 = df1.crossJoin(broadcast(df2))._jdf.queryExecution().executedPlan() - self.assertEqual(0, plan2.toString().count("BroadcastHashJoin")) - - # planner should not crash without a join - broadcast(df1)._jdf.queryExecution().executedPlan() - - def test_generic_hints(self): - from pyspark.sql import DataFrame - - df1 = self.spark.range(10e10).toDF("id") - df2 = self.spark.range(10e10).toDF("id") - - self.assertIsInstance(df1.hint("broadcast"), DataFrame) - self.assertIsInstance(df1.hint("broadcast", []), DataFrame) - - # Dummy rules - self.assertIsInstance(df1.hint("broadcast", "foo", "bar"), DataFrame) - self.assertIsInstance(df1.hint("broadcast", ["foo", "bar"]), DataFrame) - - plan = df1.join(df2.hint("broadcast"), "id")._jdf.queryExecution().executedPlan() - self.assertEqual(1, plan.toString().count("BroadcastHashJoin")) - - def test_sample(self): - self.assertRaisesRegexp( - TypeError, - "should be a bool, float and number", - lambda: self.spark.range(1).sample()) - - self.assertRaises( - TypeError, - lambda: self.spark.range(1).sample("a")) - - self.assertRaises( - TypeError, - lambda: self.spark.range(1).sample(seed="abc")) - - self.assertRaises( - IllegalArgumentException, - lambda: self.spark.range(1).sample(-1.0)) - - def test_toDF_with_schema_string(self): - data = [Row(key=i, value=str(i)) for i in range(100)] - rdd = self.sc.parallelize(data, 5) - - df = rdd.toDF("key: int, value: string") - self.assertEqual(df.schema.simpleString(), "struct") - self.assertEqual(df.collect(), data) - - # different but compatible field types can be used. - df = rdd.toDF("key: string, value: string") - self.assertEqual(df.schema.simpleString(), "struct") - self.assertEqual(df.collect(), [Row(key=str(i), value=str(i)) for i in range(100)]) - - # field names can differ. - df = rdd.toDF(" a: int, b: string ") - self.assertEqual(df.schema.simpleString(), "struct") - self.assertEqual(df.collect(), data) - - # number of fields must match. - self.assertRaisesRegexp(Exception, "Length of object", - lambda: rdd.toDF("key: int").collect()) - - # field types mismatch will cause exception at runtime. - self.assertRaisesRegexp(Exception, "FloatType can not accept", - lambda: rdd.toDF("key: float, value: string").collect()) - - # flat schema values will be wrapped into row. - df = rdd.map(lambda row: row.key).toDF("int") - self.assertEqual(df.schema.simpleString(), "struct") - self.assertEqual(df.collect(), [Row(key=i) for i in range(100)]) - - # users can use DataType directly instead of data type string. 
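A minimal sketch of the RDD.toDF schema-string behaviour checked above (assumes an active SparkSession `spark`; data and names are illustrative):

from pyspark.sql import Row

rdd = spark.sparkContext.parallelize([Row(key=i, value=str(i)) for i in range(3)])
df = rdd.toDF("key: int, value: string")     # DDL-style schema string
print(df.schema.simpleString())              # struct<key:int,value:string>
flat = rdd.map(lambda r: r.key).toDF("int")  # flat values get wrapped into a row; the column is named "value"
print(flat.schema.simpleString())            # struct<value:int>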
- df = rdd.map(lambda row: row.key).toDF(IntegerType()) - self.assertEqual(df.schema.simpleString(), "struct") - self.assertEqual(df.collect(), [Row(key=i) for i in range(100)]) - - def test_join_without_on(self): - df1 = self.spark.range(1).toDF("a") - df2 = self.spark.range(1).toDF("b") - - with self.sql_conf({"spark.sql.crossJoin.enabled": False}): - self.assertRaises(AnalysisException, lambda: df1.join(df2, how="inner").collect()) - - with self.sql_conf({"spark.sql.crossJoin.enabled": True}): - actual = df1.join(df2, how="inner").collect() - expected = [Row(a=0, b=0)] - self.assertEqual(actual, expected) - - # Regression test for invalid join methods when on is None, Spark-14761 - def test_invalid_join_method(self): - df1 = self.spark.createDataFrame([("Alice", 5), ("Bob", 8)], ["name", "age"]) - df2 = self.spark.createDataFrame([("Alice", 80), ("Bob", 90)], ["name", "height"]) - self.assertRaises(IllegalArgumentException, lambda: df1.join(df2, how="invalid-join-type")) - - # Cartesian products require cross join syntax - def test_require_cross(self): - from pyspark.sql.functions import broadcast - - df1 = self.spark.createDataFrame([(1, "1")], ("key", "value")) - df2 = self.spark.createDataFrame([(1, "1")], ("key", "value")) - - # joins without conditions require cross join syntax - self.assertRaises(AnalysisException, lambda: df1.join(df2).collect()) - - # works with crossJoin - self.assertEqual(1, df1.crossJoin(df2).count()) - - def test_conf(self): - spark = self.spark - spark.conf.set("bogo", "sipeo") - self.assertEqual(spark.conf.get("bogo"), "sipeo") - spark.conf.set("bogo", "ta") - self.assertEqual(spark.conf.get("bogo"), "ta") - self.assertEqual(spark.conf.get("bogo", "not.read"), "ta") - self.assertEqual(spark.conf.get("not.set", "ta"), "ta") - self.assertRaisesRegexp(Exception, "not.set", lambda: spark.conf.get("not.set")) - spark.conf.unset("bogo") - self.assertEqual(spark.conf.get("bogo", "colombia"), "colombia") - - self.assertEqual(spark.conf.get("hyukjin", None), None) - - # This returns 'STATIC' because it's the default value of - # 'spark.sql.sources.partitionOverwriteMode', and `defaultValue` in - # `spark.conf.get` is unset. - self.assertEqual(spark.conf.get("spark.sql.sources.partitionOverwriteMode"), "STATIC") - - # This returns None because 'spark.sql.sources.partitionOverwriteMode' is unset, but - # `defaultValue` in `spark.conf.get` is set to None. 
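A short sketch of the RuntimeConfig lookup rules the comments above describe (assumes an active SparkSession `spark`):

spark.conf.set("bogo", "sipeo")
spark.conf.get("bogo")                                             # 'sipeo'
spark.conf.get("not.set", "fallback")                              # 'fallback' -- defaultValue wins for unset keys
spark.conf.get("spark.sql.sources.partitionOverwriteMode")         # 'STATIC', Spark's built-in default
spark.conf.get("spark.sql.sources.partitionOverwriteMode", None)   # None, because an explicit default is given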
- self.assertEqual(spark.conf.get("spark.sql.sources.partitionOverwriteMode", None), None) - - def test_current_database(self): - spark = self.spark - spark.catalog._reset() - self.assertEquals(spark.catalog.currentDatabase(), "default") - spark.sql("CREATE DATABASE some_db") - spark.catalog.setCurrentDatabase("some_db") - self.assertEquals(spark.catalog.currentDatabase(), "some_db") - self.assertRaisesRegexp( - AnalysisException, - "does_not_exist", - lambda: spark.catalog.setCurrentDatabase("does_not_exist")) - - def test_list_databases(self): - spark = self.spark - spark.catalog._reset() - databases = [db.name for db in spark.catalog.listDatabases()] - self.assertEquals(databases, ["default"]) - spark.sql("CREATE DATABASE some_db") - databases = [db.name for db in spark.catalog.listDatabases()] - self.assertEquals(sorted(databases), ["default", "some_db"]) - - def test_list_tables(self): - from pyspark.sql.catalog import Table - spark = self.spark - spark.catalog._reset() - spark.sql("CREATE DATABASE some_db") - self.assertEquals(spark.catalog.listTables(), []) - self.assertEquals(spark.catalog.listTables("some_db"), []) - spark.createDataFrame([(1, 1)]).createOrReplaceTempView("temp_tab") - spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING parquet") - spark.sql("CREATE TABLE some_db.tab2 (name STRING, age INT) USING parquet") - tables = sorted(spark.catalog.listTables(), key=lambda t: t.name) - tablesDefault = sorted(spark.catalog.listTables("default"), key=lambda t: t.name) - tablesSomeDb = sorted(spark.catalog.listTables("some_db"), key=lambda t: t.name) - self.assertEquals(tables, tablesDefault) - self.assertEquals(len(tables), 2) - self.assertEquals(len(tablesSomeDb), 2) - self.assertEquals(tables[0], Table( - name="tab1", - database="default", - description=None, - tableType="MANAGED", - isTemporary=False)) - self.assertEquals(tables[1], Table( - name="temp_tab", - database=None, - description=None, - tableType="TEMPORARY", - isTemporary=True)) - self.assertEquals(tablesSomeDb[0], Table( - name="tab2", - database="some_db", - description=None, - tableType="MANAGED", - isTemporary=False)) - self.assertEquals(tablesSomeDb[1], Table( - name="temp_tab", - database=None, - description=None, - tableType="TEMPORARY", - isTemporary=True)) - self.assertRaisesRegexp( - AnalysisException, - "does_not_exist", - lambda: spark.catalog.listTables("does_not_exist")) - - def test_list_functions(self): - from pyspark.sql.catalog import Function - spark = self.spark - spark.catalog._reset() - spark.sql("CREATE DATABASE some_db") - functions = dict((f.name, f) for f in spark.catalog.listFunctions()) - functionsDefault = dict((f.name, f) for f in spark.catalog.listFunctions("default")) - self.assertTrue(len(functions) > 200) - self.assertTrue("+" in functions) - self.assertTrue("like" in functions) - self.assertTrue("month" in functions) - self.assertTrue("to_date" in functions) - self.assertTrue("to_timestamp" in functions) - self.assertTrue("to_unix_timestamp" in functions) - self.assertTrue("current_database" in functions) - self.assertEquals(functions["+"], Function( - name="+", - description=None, - className="org.apache.spark.sql.catalyst.expressions.Add", - isTemporary=True)) - self.assertEquals(functions, functionsDefault) - spark.catalog.registerFunction("temp_func", lambda x: str(x)) - spark.sql("CREATE FUNCTION func1 AS 'org.apache.spark.data.bricks'") - spark.sql("CREATE FUNCTION some_db.func2 AS 'org.apache.spark.data.bricks'") - newFunctions = dict((f.name, f) for f in 
spark.catalog.listFunctions()) - newFunctionsSomeDb = dict((f.name, f) for f in spark.catalog.listFunctions("some_db")) - self.assertTrue(set(functions).issubset(set(newFunctions))) - self.assertTrue(set(functions).issubset(set(newFunctionsSomeDb))) - self.assertTrue("temp_func" in newFunctions) - self.assertTrue("func1" in newFunctions) - self.assertTrue("func2" not in newFunctions) - self.assertTrue("temp_func" in newFunctionsSomeDb) - self.assertTrue("func1" not in newFunctionsSomeDb) - self.assertTrue("func2" in newFunctionsSomeDb) - self.assertRaisesRegexp( - AnalysisException, - "does_not_exist", - lambda: spark.catalog.listFunctions("does_not_exist")) - - def test_list_columns(self): - from pyspark.sql.catalog import Column - spark = self.spark - spark.catalog._reset() - spark.sql("CREATE DATABASE some_db") - spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING parquet") - spark.sql("CREATE TABLE some_db.tab2 (nickname STRING, tolerance FLOAT) USING parquet") - columns = sorted(spark.catalog.listColumns("tab1"), key=lambda c: c.name) - columnsDefault = sorted(spark.catalog.listColumns("tab1", "default"), key=lambda c: c.name) - self.assertEquals(columns, columnsDefault) - self.assertEquals(len(columns), 2) - self.assertEquals(columns[0], Column( - name="age", - description=None, - dataType="int", - nullable=True, - isPartition=False, - isBucket=False)) - self.assertEquals(columns[1], Column( - name="name", - description=None, - dataType="string", - nullable=True, - isPartition=False, - isBucket=False)) - columns2 = sorted(spark.catalog.listColumns("tab2", "some_db"), key=lambda c: c.name) - self.assertEquals(len(columns2), 2) - self.assertEquals(columns2[0], Column( - name="nickname", - description=None, - dataType="string", - nullable=True, - isPartition=False, - isBucket=False)) - self.assertEquals(columns2[1], Column( - name="tolerance", - description=None, - dataType="float", - nullable=True, - isPartition=False, - isBucket=False)) - self.assertRaisesRegexp( - AnalysisException, - "tab2", - lambda: spark.catalog.listColumns("tab2")) - self.assertRaisesRegexp( - AnalysisException, - "does_not_exist", - lambda: spark.catalog.listColumns("does_not_exist")) - - def test_cache(self): - spark = self.spark - spark.createDataFrame([(2, 2), (3, 3)]).createOrReplaceTempView("tab1") - spark.createDataFrame([(2, 2), (3, 3)]).createOrReplaceTempView("tab2") - self.assertFalse(spark.catalog.isCached("tab1")) - self.assertFalse(spark.catalog.isCached("tab2")) - spark.catalog.cacheTable("tab1") - self.assertTrue(spark.catalog.isCached("tab1")) - self.assertFalse(spark.catalog.isCached("tab2")) - spark.catalog.cacheTable("tab2") - spark.catalog.uncacheTable("tab1") - self.assertFalse(spark.catalog.isCached("tab1")) - self.assertTrue(spark.catalog.isCached("tab2")) - spark.catalog.clearCache() - self.assertFalse(spark.catalog.isCached("tab1")) - self.assertFalse(spark.catalog.isCached("tab2")) - self.assertRaisesRegexp( - AnalysisException, - "does_not_exist", - lambda: spark.catalog.isCached("does_not_exist")) - self.assertRaisesRegexp( - AnalysisException, - "does_not_exist", - lambda: spark.catalog.cacheTable("does_not_exist")) - self.assertRaisesRegexp( - AnalysisException, - "does_not_exist", - lambda: spark.catalog.uncacheTable("does_not_exist")) - - def test_read_text_file_list(self): - df = self.spark.read.text(['python/test_support/sql/text-test.txt', - 'python/test_support/sql/text-test.txt']) - count = df.count() - self.assertEquals(count, 4) - - def 
test_BinaryType_serialization(self): - # Pyrolite version <= 4.9 could not serialize BinaryType with Python3 SPARK-17808 - # The empty bytearray is test for SPARK-21534. - schema = StructType([StructField('mybytes', BinaryType())]) - data = [[bytearray(b'here is my data')], - [bytearray(b'and here is some more')], - [bytearray(b'')]] - df = self.spark.createDataFrame(data, schema=schema) - df.collect() - - # test for SPARK-16542 - def test_array_types(self): - # This test need to make sure that the Scala type selected is at least - # as large as the python's types. This is necessary because python's - # array types depend on C implementation on the machine. Therefore there - # is no machine independent correspondence between python's array types - # and Scala types. - # See: https://docs.python.org/2/library/array.html - - def assertCollectSuccess(typecode, value): - row = Row(myarray=array.array(typecode, [value])) - df = self.spark.createDataFrame([row]) - self.assertEqual(df.first()["myarray"][0], value) - - # supported string types - # - # String types in python's array are "u" for Py_UNICODE and "c" for char. - # "u" will be removed in python 4, and "c" is not supported in python 3. - supported_string_types = [] - if sys.version_info[0] < 4: - supported_string_types += ['u'] - # test unicode - assertCollectSuccess('u', u'a') - if sys.version_info[0] < 3: - supported_string_types += ['c'] - # test string - assertCollectSuccess('c', 'a') - - # supported float and double - # - # Test max, min, and precision for float and double, assuming IEEE 754 - # floating-point format. - supported_fractional_types = ['f', 'd'] - assertCollectSuccess('f', ctypes.c_float(1e+38).value) - assertCollectSuccess('f', ctypes.c_float(1e-38).value) - assertCollectSuccess('f', ctypes.c_float(1.123456).value) - assertCollectSuccess('d', sys.float_info.max) - assertCollectSuccess('d', sys.float_info.min) - assertCollectSuccess('d', sys.float_info.epsilon) - - # supported signed int types - # - # The size of C types changes with implementation, we need to make sure - # that there is no overflow error on the platform running this test. - supported_signed_int_types = list( - set(_array_signed_int_typecode_ctype_mappings.keys()) - .intersection(set(_array_type_mappings.keys()))) - for t in supported_signed_int_types: - ctype = _array_signed_int_typecode_ctype_mappings[t] - max_val = 2 ** (ctypes.sizeof(ctype) * 8 - 1) - assertCollectSuccess(t, max_val - 1) - assertCollectSuccess(t, -max_val) - - # supported unsigned int types - # - # JVM does not have unsigned types. We need to be very careful to make - # sure that there is no overflow error. - supported_unsigned_int_types = list( - set(_array_unsigned_int_typecode_ctype_mappings.keys()) - .intersection(set(_array_type_mappings.keys()))) - for t in supported_unsigned_int_types: - ctype = _array_unsigned_int_typecode_ctype_mappings[t] - assertCollectSuccess(t, 2 ** (ctypes.sizeof(ctype) * 8) - 1) - - # all supported types - # - # Make sure the types tested above: - # 1. are all supported types - # 2. cover all supported types - supported_types = (supported_string_types + - supported_fractional_types + - supported_signed_int_types + - supported_unsigned_int_types) - self.assertEqual(set(supported_types), set(_array_type_mappings.keys())) - - # all unsupported types - # - # Keys in _array_type_mappings is a complete list of all supported types, - # and types not in _array_type_mappings are considered unsupported. - # `array.typecodes` are not supported in python 2. 
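A minimal sketch of the array-to-ArrayType mapping this test exercises (assumes an active SparkSession `spark`; the exact Catalyst element type depends on the platform's C type sizes):

import array
from pyspark.sql import Row

row = Row(myarray=array.array('i', [1, 2, 3]))   # signed int typecode
df = spark.createDataFrame([row])
print(df.first()["myarray"])                     # [1, 2, 3]
print(df.schema["myarray"].dataType)             # e.g. ArrayType(IntegerType,true)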
- if sys.version_info[0] < 3: - all_types = set(['c', 'b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'f', 'd']) - else: - all_types = set(array.typecodes) - unsupported_types = all_types - set(supported_types) - # test unsupported types - for t in unsupported_types: - with self.assertRaises(TypeError): - a = array.array(t) - self.spark.createDataFrame([Row(myarray=a)]).collect() - - def test_bucketed_write(self): - data = [ - (1, "foo", 3.0), (2, "foo", 5.0), - (3, "bar", -1.0), (4, "bar", 6.0), - ] - df = self.spark.createDataFrame(data, ["x", "y", "z"]) - - def count_bucketed_cols(names, table="pyspark_bucket"): - """Given a sequence of column names and a table name - query the catalog and return number o columns which are - used for bucketing - """ - cols = self.spark.catalog.listColumns(table) - num = len([c for c in cols if c.name in names and c.isBucket]) - return num - - # Test write with one bucketing column - df.write.bucketBy(3, "x").mode("overwrite").saveAsTable("pyspark_bucket") - self.assertEqual(count_bucketed_cols(["x"]), 1) - self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect())) - - # Test write two bucketing columns - df.write.bucketBy(3, "x", "y").mode("overwrite").saveAsTable("pyspark_bucket") - self.assertEqual(count_bucketed_cols(["x", "y"]), 2) - self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect())) - - # Test write with bucket and sort - df.write.bucketBy(2, "x").sortBy("z").mode("overwrite").saveAsTable("pyspark_bucket") - self.assertEqual(count_bucketed_cols(["x"]), 1) - self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect())) - - # Test write with a list of columns - df.write.bucketBy(3, ["x", "y"]).mode("overwrite").saveAsTable("pyspark_bucket") - self.assertEqual(count_bucketed_cols(["x", "y"]), 2) - self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect())) - - # Test write with bucket and sort with a list of columns - (df.write.bucketBy(2, "x") - .sortBy(["y", "z"]) - .mode("overwrite").saveAsTable("pyspark_bucket")) - self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect())) - - # Test write with bucket and sort with multiple columns - (df.write.bucketBy(2, "x") - .sortBy("y", "z") - .mode("overwrite").saveAsTable("pyspark_bucket")) - self.assertSetEqual(set(data), set(self.spark.table("pyspark_bucket").collect())) - - def _to_pandas(self): - from datetime import datetime, date - schema = StructType().add("a", IntegerType()).add("b", StringType())\ - .add("c", BooleanType()).add("d", FloatType())\ - .add("dt", DateType()).add("ts", TimestampType()) - data = [ - (1, "foo", True, 3.0, date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)), - (2, "foo", True, 5.0, None, None), - (3, "bar", False, -1.0, date(2012, 3, 3), datetime(2012, 3, 3, 3, 3, 3)), - (4, "bar", False, 6.0, date(2100, 4, 4), datetime(2100, 4, 4, 4, 4, 4)), - ] - df = self.spark.createDataFrame(data, schema) - return df.toPandas() - - @unittest.skipIf(not _have_pandas, _pandas_requirement_message) - def test_to_pandas(self): - import numpy as np - pdf = self._to_pandas() - types = pdf.dtypes - self.assertEquals(types[0], np.int32) - self.assertEquals(types[1], np.object) - self.assertEquals(types[2], np.bool) - self.assertEquals(types[3], np.float32) - self.assertEquals(types[4], np.object) # datetime.date - self.assertEquals(types[5], 'datetime64[ns]') - - @unittest.skipIf(_have_pandas, "Required Pandas was found.") - def test_to_pandas_required_pandas_not_found(self): - with 
QuietTest(self.sc): - with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'): - self._to_pandas() - - @unittest.skipIf(not _have_pandas, _pandas_requirement_message) - def test_to_pandas_avoid_astype(self): - import numpy as np - schema = StructType().add("a", IntegerType()).add("b", StringType())\ - .add("c", IntegerType()) - data = [(1, "foo", 16777220), (None, "bar", None)] - df = self.spark.createDataFrame(data, schema) - types = df.toPandas().dtypes - self.assertEquals(types[0], np.float64) # doesn't convert to np.int32 due to NaN value. - self.assertEquals(types[1], np.object) - self.assertEquals(types[2], np.float64) - - def test_create_dataframe_from_array_of_long(self): - import array - data = [Row(longarray=array.array('l', [-9223372036854775808, 0, 9223372036854775807]))] - df = self.spark.createDataFrame(data) - self.assertEqual(df.first(), Row(longarray=[-9223372036854775808, 0, 9223372036854775807])) - - @unittest.skipIf(not _have_pandas, _pandas_requirement_message) - def test_create_dataframe_from_pandas_with_timestamp(self): - import pandas as pd - from datetime import datetime - pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)], - "d": [pd.Timestamp.now().date()]}, columns=["d", "ts"]) - # test types are inferred correctly without specifying schema - df = self.spark.createDataFrame(pdf) - self.assertTrue(isinstance(df.schema['ts'].dataType, TimestampType)) - self.assertTrue(isinstance(df.schema['d'].dataType, DateType)) - # test with schema will accept pdf as input - df = self.spark.createDataFrame(pdf, schema="d date, ts timestamp") - self.assertTrue(isinstance(df.schema['ts'].dataType, TimestampType)) - self.assertTrue(isinstance(df.schema['d'].dataType, DateType)) - - @unittest.skipIf(_have_pandas, "Required Pandas was found.") - def test_create_dataframe_required_pandas_not_found(self): - with QuietTest(self.sc): - with self.assertRaisesRegexp( - ImportError, - "(Pandas >= .* must be installed|No module named '?pandas'?)"): - import pandas as pd - from datetime import datetime - pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)], - "d": [pd.Timestamp.now().date()]}) - self.spark.createDataFrame(pdf) - - # Regression test for SPARK-23360 - @unittest.skipIf(not _have_pandas, _pandas_requirement_message) - def test_create_dateframe_from_pandas_with_dst(self): - import pandas as pd - from datetime import datetime - - pdf = pd.DataFrame({'time': [datetime(2015, 10, 31, 22, 30)]}) - - df = self.spark.createDataFrame(pdf) - self.assertPandasEqual(pdf, df.toPandas()) - - orig_env_tz = os.environ.get('TZ', None) - try: - tz = 'America/Los_Angeles' - os.environ['TZ'] = tz - time.tzset() - with self.sql_conf({'spark.sql.session.timeZone': tz}): - df = self.spark.createDataFrame(pdf) - self.assertPandasEqual(pdf, df.toPandas()) - finally: - del os.environ['TZ'] - if orig_env_tz is not None: - os.environ['TZ'] = orig_env_tz - time.tzset() - - def test_sort_with_nulls_order(self): - from pyspark.sql import functions - - df = self.spark.createDataFrame( - [('Tom', 80), (None, 60), ('Alice', 50)], ["name", "height"]) - self.assertEquals( - df.select(df.name).orderBy(functions.asc_nulls_first('name')).collect(), - [Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')]) - self.assertEquals( - df.select(df.name).orderBy(functions.asc_nulls_last('name')).collect(), - [Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)]) - self.assertEquals( - df.select(df.name).orderBy(functions.desc_nulls_first('name')).collect(), - [Row(name=None), 
Row(name=u'Tom'), Row(name=u'Alice')]) - self.assertEquals( - df.select(df.name).orderBy(functions.desc_nulls_last('name')).collect(), - [Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)]) - - def test_json_sampling_ratio(self): - rdd = self.spark.sparkContext.range(0, 100, 1, 1) \ - .map(lambda x: '{"a":0.1}' if x == 1 else '{"a":%s}' % str(x)) - schema = self.spark.read.option('inferSchema', True) \ - .option('samplingRatio', 0.5) \ - .json(rdd).schema - self.assertEquals(schema, StructType([StructField("a", LongType(), True)])) - - def test_csv_sampling_ratio(self): - rdd = self.spark.sparkContext.range(0, 100, 1, 1) \ - .map(lambda x: '0.1' if x == 1 else str(x)) - schema = self.spark.read.option('inferSchema', True)\ - .csv(rdd, samplingRatio=0.5).schema - self.assertEquals(schema, StructType([StructField("_c0", IntegerType(), True)])) - - def test_checking_csv_header(self): - path = tempfile.mkdtemp() - shutil.rmtree(path) - try: - self.spark.createDataFrame([[1, 1000], [2000, 2]])\ - .toDF('f1', 'f2').write.option("header", "true").csv(path) - schema = StructType([ - StructField('f2', IntegerType(), nullable=True), - StructField('f1', IntegerType(), nullable=True)]) - df = self.spark.read.option('header', 'true').schema(schema)\ - .csv(path, enforceSchema=False) - self.assertRaisesRegexp( - Exception, - "CSV header does not conform to the schema", - lambda: df.collect()) - finally: - shutil.rmtree(path) - - def test_ignore_column_of_all_nulls(self): - path = tempfile.mkdtemp() - shutil.rmtree(path) - try: - df = self.spark.createDataFrame([["""{"a":null, "b":1, "c":3.0}"""], - ["""{"a":null, "b":null, "c":"string"}"""], - ["""{"a":null, "b":null, "c":null}"""]]) - df.write.text(path) - schema = StructType([ - StructField('b', LongType(), nullable=True), - StructField('c', StringType(), nullable=True)]) - readback = self.spark.read.json(path, dropFieldIfAllNull=True) - self.assertEquals(readback.schema, schema) - finally: - shutil.rmtree(path) - - # SPARK-24721 - @unittest.skipIf(not _test_compiled, _test_not_compiled_message) - def test_datasource_with_udf(self): - from pyspark.sql.functions import udf, lit, col - - path = tempfile.mkdtemp() - shutil.rmtree(path) - - try: - self.spark.range(1).write.mode("overwrite").format('csv').save(path) - filesource_df = self.spark.read.option('inferSchema', True).csv(path).toDF('i') - datasource_df = self.spark.read \ - .format("org.apache.spark.sql.sources.SimpleScanSource") \ - .option('from', 0).option('to', 1).load().toDF('i') - datasource_v2_df = self.spark.read \ - .format("org.apache.spark.sql.sources.v2.SimpleDataSourceV2") \ - .load().toDF('i', 'j') - - c1 = udf(lambda x: x + 1, 'int')(lit(1)) - c2 = udf(lambda x: x + 1, 'int')(col('i')) - - f1 = udf(lambda x: False, 'boolean')(lit(1)) - f2 = udf(lambda x: False, 'boolean')(col('i')) - - for df in [filesource_df, datasource_df, datasource_v2_df]: - result = df.withColumn('c', c1) - expected = df.withColumn('c', lit(2)) - self.assertEquals(expected.collect(), result.collect()) - - for df in [filesource_df, datasource_df, datasource_v2_df]: - result = df.withColumn('c', c2) - expected = df.withColumn('c', col('i') + 1) - self.assertEquals(expected.collect(), result.collect()) - - for df in [filesource_df, datasource_df, datasource_v2_df]: - for f in [f1, f2]: - result = df.filter(f) - self.assertEquals(0, result.count()) - finally: - shutil.rmtree(path) - - def test_repr_behaviors(self): - import re - pattern = re.compile(r'^ *\|', re.MULTILINE) - df = 
self.spark.createDataFrame([(1, "1"), (22222, "22222")], ("key", "value")) - - # test when eager evaluation is enabled and _repr_html_ will not be called - with self.sql_conf({"spark.sql.repl.eagerEval.enabled": True}): - expected1 = """+-----+-----+ - || key|value| - |+-----+-----+ - || 1| 1| - ||22222|22222| - |+-----+-----+ - |""" - self.assertEquals(re.sub(pattern, '', expected1), df.__repr__()) - with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): - expected2 = """+---+-----+ - ||key|value| - |+---+-----+ - || 1| 1| - ||222| 222| - |+---+-----+ - |""" - self.assertEquals(re.sub(pattern, '', expected2), df.__repr__()) - with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}): - expected3 = """+---+-----+ - ||key|value| - |+---+-----+ - || 1| 1| - |+---+-----+ - |only showing top 1 row - |""" - self.assertEquals(re.sub(pattern, '', expected3), df.__repr__()) - - # test when eager evaluation is enabled and _repr_html_ will be called - with self.sql_conf({"spark.sql.repl.eagerEval.enabled": True}): - expected1 = """ - | - | - | - |
      |<tr><th>key</th><th>value</th></tr>
      |<tr><td>1</td><td>1</td></tr>
      |<tr><td>22222</td><td>22222</td></tr>
      - |""" - self.assertEquals(re.sub(pattern, '', expected1), df._repr_html_()) - with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): - expected2 = """ - | - | - | - |
      |<tr><th>key</th><th>value</th></tr>
      |<tr><td>1</td><td>1</td></tr>
      |<tr><td>222</td><td>222</td></tr>
      - |""" - self.assertEquals(re.sub(pattern, '', expected2), df._repr_html_()) - with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}): - expected3 = """ - | - | - |
      |<tr><th>key</th><th>value</th></tr>
      |<tr><td>1</td><td>1</td></tr>
      - |only showing top 1 row - |""" - self.assertEquals(re.sub(pattern, '', expected3), df._repr_html_()) - - # test when eager evaluation is disabled and _repr_html_ will be called - with self.sql_conf({"spark.sql.repl.eagerEval.enabled": False}): - expected = "DataFrame[key: bigint, value: string]" - self.assertEquals(None, df._repr_html_()) - self.assertEquals(expected, df.__repr__()) - with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): - self.assertEquals(None, df._repr_html_()) - self.assertEquals(expected, df.__repr__()) - with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}): - self.assertEquals(None, df._repr_html_()) - self.assertEquals(expected, df.__repr__()) - - # SPARK-25591 - def test_same_accumulator_in_udfs(self): - from pyspark.sql.functions import udf - - data_schema = StructType([StructField("a", IntegerType(), True), - StructField("b", IntegerType(), True)]) - data = self.spark.createDataFrame([[1, 2]], schema=data_schema) - - test_accum = self.sc.accumulator(0) - - def first_udf(x): - test_accum.add(1) - return x - - def second_udf(x): - test_accum.add(100) - return x - - func_udf = udf(first_udf, IntegerType()) - func_udf2 = udf(second_udf, IntegerType()) - data = data.withColumn("out1", func_udf(data["a"])) - data = data.withColumn("out2", func_udf2(data["b"])) - data.collect() - self.assertEqual(test_accum.value, 101) - - -class HiveSparkSubmitTests(SparkSubmitTests): - - @classmethod - def setUpClass(cls): - # get a SparkContext to check for availability of Hive - sc = SparkContext('local[4]', cls.__name__) - cls.hive_available = True - try: - sc._jvm.org.apache.hadoop.hive.conf.HiveConf() - except py4j.protocol.Py4JError: - cls.hive_available = False - except TypeError: - cls.hive_available = False - finally: - # we don't need this SparkContext for the test - sc.stop() - - def setUp(self): - super(HiveSparkSubmitTests, self).setUp() - if not self.hive_available: - self.skipTest("Hive is not available.") - - def test_hivecontext(self): - # This test checks that HiveContext is using Hive metastore (SPARK-16224). - # It sets a metastore url and checks if there is a derby dir created by - # Hive metastore. If this derby dir exists, HiveContext is using - # Hive metastore. - metastore_path = os.path.join(tempfile.mkdtemp(), "spark16224_metastore_db") - metastore_URL = "jdbc:derby:;databaseName=" + metastore_path + ";create=true" - hive_site_dir = os.path.join(self.programDir, "conf") - hive_site_file = self.createTempFile("hive-site.xml", (""" - | - | - | javax.jdo.option.ConnectionURL - | %s - | - | - """ % metastore_URL).lstrip(), "conf") - script = self.createTempFile("test.py", """ - |import os - | - |from pyspark.conf import SparkConf - |from pyspark.context import SparkContext - |from pyspark.sql import HiveContext - | - |conf = SparkConf() - |sc = SparkContext(conf=conf) - |hive_context = HiveContext(sc) - |print(hive_context.sql("show databases").collect()) - """) - proc = subprocess.Popen( - self.sparkSubmit + ["--master", "local-cluster[1,1,1024]", - "--driver-class-path", hive_site_dir, script], - stdout=subprocess.PIPE) - out, err = proc.communicate() - self.assertEqual(0, proc.returncode) - self.assertIn("default", out.decode('utf-8')) - self.assertTrue(os.path.exists(metastore_path)) - - -class SQLTests2(ReusedSQLTestCase): - - # We can't include this test into SQLTests because we will stop class's SparkContext and cause - # other tests failed. 
- def test_sparksession_with_stopped_sparkcontext(self): - self.sc.stop() - sc = SparkContext('local[4]', self.sc.appName) - spark = SparkSession.builder.getOrCreate() - try: - df = spark.createDataFrame([(1, 2)], ["c", "c"]) - df.collect() - finally: - spark.stop() - sc.stop() - - -class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils): - # These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is - # static and immutable. This can't be set or unset, for example, via `spark.conf`. - - @classmethod - def setUpClass(cls): - import glob - from pyspark.find_spark_home import _find_spark_home - - SPARK_HOME = _find_spark_home() - filename_pattern = ( - "sql/core/target/scala-*/test-classes/org/apache/spark/sql/" - "TestQueryExecutionListener.class") - cls.has_listener = bool(glob.glob(os.path.join(SPARK_HOME, filename_pattern))) - - if cls.has_listener: - # Note that 'spark.sql.queryExecutionListeners' is a static immutable configuration. - cls.spark = SparkSession.builder \ - .master("local[4]") \ - .appName(cls.__name__) \ - .config( - "spark.sql.queryExecutionListeners", - "org.apache.spark.sql.TestQueryExecutionListener") \ - .getOrCreate() - - def setUp(self): - if not self.has_listener: - raise self.skipTest( - "'org.apache.spark.sql.TestQueryExecutionListener' is not " - "available. Will skip the related tests.") - - @classmethod - def tearDownClass(cls): - if hasattr(cls, "spark"): - cls.spark.stop() - - def tearDown(self): - self.spark._jvm.OnSuccessCall.clear() - - def test_query_execution_listener_on_collect(self): - self.assertFalse( - self.spark._jvm.OnSuccessCall.isCalled(), - "The callback from the query execution listener should not be called before 'collect'") - self.spark.sql("SELECT * FROM range(1)").collect() - self.assertTrue( - self.spark._jvm.OnSuccessCall.isCalled(), - "The callback from the query execution listener should be called after 'collect'") - - @unittest.skipIf( - not _have_pandas or not _have_pyarrow, - _pandas_requirement_message or _pyarrow_requirement_message) - def test_query_execution_listener_on_collect_with_arrow(self): - with self.sql_conf({"spark.sql.execution.arrow.enabled": True}): - self.assertFalse( - self.spark._jvm.OnSuccessCall.isCalled(), - "The callback from the query execution listener should not be " - "called before 'toPandas'") - self.spark.sql("SELECT * FROM range(1)").toPandas() - self.assertTrue( - self.spark._jvm.OnSuccessCall.isCalled(), - "The callback from the query execution listener should be called after 'toPandas'") - - -class SparkSessionTests(PySparkTestCase): - - # This test is separate because it's closely related with session's start and stop. - # See SPARK-23228. - def test_set_jvm_default_session(self): - spark = SparkSession.builder.getOrCreate() - try: - self.assertTrue(spark._jvm.SparkSession.getDefaultSession().isDefined()) - finally: - spark.stop() - self.assertTrue(spark._jvm.SparkSession.getDefaultSession().isEmpty()) - - def test_jvm_default_session_already_set(self): - # Here, we assume there is the default session already set in JVM. - jsession = self.sc._jvm.SparkSession(self.sc._jsc.sc()) - self.sc._jvm.SparkSession.setDefaultSession(jsession) - - spark = SparkSession.builder.getOrCreate() - try: - self.assertTrue(spark._jvm.SparkSession.getDefaultSession().isDefined()) - # The session should be the same with the exiting one. 
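A minimal sketch of the getOrCreate reuse behaviour being asserted here (illustrative; assumes no other session is already running):

from pyspark.sql import SparkSession

s1 = SparkSession.builder.master("local[1]").appName("demo").getOrCreate()
s2 = SparkSession.builder.getOrCreate()   # returns the existing default session
assert s1 is s2
s1.stop()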
- self.assertTrue(jsession.equals(spark._jvm.SparkSession.getDefaultSession().get())) - finally: - spark.stop() - - -class UDFInitializationTests(unittest.TestCase): - def tearDown(self): - if SparkSession._instantiatedSession is not None: - SparkSession._instantiatedSession.stop() - - if SparkContext._active_spark_context is not None: - SparkContext._active_spark_context.stop() - - def test_udf_init_shouldnt_initialize_context(self): - from pyspark.sql.functions import UserDefinedFunction - - UserDefinedFunction(lambda x: x, StringType()) - - self.assertIsNone( - SparkContext._active_spark_context, - "SparkContext shouldn't be initialized when UserDefinedFunction is created." - ) - self.assertIsNone( - SparkSession._instantiatedSession, - "SparkSession shouldn't be initialized when UserDefinedFunction is created." - ) - - -class HiveContextSQLTests(ReusedPySparkTestCase): - - @classmethod - def setUpClass(cls): - ReusedPySparkTestCase.setUpClass() - cls.tempdir = tempfile.NamedTemporaryFile(delete=False) - cls.hive_available = True - try: - cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf() - except py4j.protocol.Py4JError: - cls.hive_available = False - except TypeError: - cls.hive_available = False - os.unlink(cls.tempdir.name) - if cls.hive_available: - cls.spark = HiveContext._createForTesting(cls.sc) - cls.testData = [Row(key=i, value=str(i)) for i in range(100)] - cls.df = cls.sc.parallelize(cls.testData).toDF() - - def setUp(self): - if not self.hive_available: - self.skipTest("Hive is not available.") - - @classmethod - def tearDownClass(cls): - ReusedPySparkTestCase.tearDownClass() - shutil.rmtree(cls.tempdir.name, ignore_errors=True) - - def test_save_and_load_table(self): - df = self.df - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) - df.write.saveAsTable("savedJsonTable", "json", "append", path=tmpPath) - actual = self.spark.createExternalTable("externalJsonTable", tmpPath, "json") - self.assertEqual(sorted(df.collect()), - sorted(self.spark.sql("SELECT * FROM savedJsonTable").collect())) - self.assertEqual(sorted(df.collect()), - sorted(self.spark.sql("SELECT * FROM externalJsonTable").collect())) - self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - self.spark.sql("DROP TABLE externalJsonTable") - - df.write.saveAsTable("savedJsonTable", "json", "overwrite", path=tmpPath) - schema = StructType([StructField("value", StringType(), True)]) - actual = self.spark.createExternalTable("externalJsonTable", source="json", - schema=schema, path=tmpPath, - noUse="this options will not be used") - self.assertEqual(sorted(df.collect()), - sorted(self.spark.sql("SELECT * FROM savedJsonTable").collect())) - self.assertEqual(sorted(df.select("value").collect()), - sorted(self.spark.sql("SELECT * FROM externalJsonTable").collect())) - self.assertEqual(sorted(df.select("value").collect()), sorted(actual.collect())) - self.spark.sql("DROP TABLE savedJsonTable") - self.spark.sql("DROP TABLE externalJsonTable") - - defaultDataSourceName = self.spark.getConf("spark.sql.sources.default", - "org.apache.spark.sql.parquet") - self.spark.sql("SET spark.sql.sources.default=org.apache.spark.sql.json") - df.write.saveAsTable("savedJsonTable", path=tmpPath, mode="overwrite") - actual = self.spark.createExternalTable("externalJsonTable", path=tmpPath) - self.assertEqual(sorted(df.collect()), - sorted(self.spark.sql("SELECT * FROM savedJsonTable").collect())) - self.assertEqual(sorted(df.collect()), - sorted(self.spark.sql("SELECT * FROM externalJsonTable").collect())) - 
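A minimal sketch of the save/external-table round trip exercised in this test (assumes an active Hive-enabled SparkSession `spark` and a scratch directory `tmp_path`; table names are illustrative):

df = spark.createDataFrame([(1, "1"), (2, "2")], ["key", "value"])
df.write.saveAsTable("saved_json_tab", format="json", mode="overwrite", path=tmp_path)
# Registering the same files as an external table reads the identical rows back.
external = spark.catalog.createTable("external_json_tab", path=tmp_path, source="json")
assert sorted(df.collect()) == sorted(external.collect())
spark.sql("DROP TABLE saved_json_tab")
spark.sql("DROP TABLE external_json_tab")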
self.assertEqual(sorted(df.collect()), sorted(actual.collect())) - self.spark.sql("DROP TABLE savedJsonTable") - self.spark.sql("DROP TABLE externalJsonTable") - self.spark.sql("SET spark.sql.sources.default=" + defaultDataSourceName) - - shutil.rmtree(tmpPath) - - def test_window_functions(self): - df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) - w = Window.partitionBy("value").orderBy("key") - from pyspark.sql import functions as F - sel = df.select(df.value, df.key, - F.max("key").over(w.rowsBetween(0, 1)), - F.min("key").over(w.rowsBetween(0, 1)), - F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))), - F.row_number().over(w), - F.rank().over(w), - F.dense_rank().over(w), - F.ntile(2).over(w)) - rs = sorted(sel.collect()) - expected = [ - ("1", 1, 1, 1, 1, 1, 1, 1, 1), - ("2", 1, 1, 1, 3, 1, 1, 1, 1), - ("2", 1, 2, 1, 3, 2, 1, 1, 1), - ("2", 2, 2, 2, 3, 3, 3, 2, 2) - ] - for r, ex in zip(rs, expected): - self.assertEqual(tuple(r), ex[:len(r)]) - - def test_window_functions_without_partitionBy(self): - df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) - w = Window.orderBy("key", df.value) - from pyspark.sql import functions as F - sel = df.select(df.value, df.key, - F.max("key").over(w.rowsBetween(0, 1)), - F.min("key").over(w.rowsBetween(0, 1)), - F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))), - F.row_number().over(w), - F.rank().over(w), - F.dense_rank().over(w), - F.ntile(2).over(w)) - rs = sorted(sel.collect()) - expected = [ - ("1", 1, 1, 1, 4, 1, 1, 1, 1), - ("2", 1, 1, 1, 4, 2, 2, 2, 1), - ("2", 1, 2, 1, 4, 3, 2, 2, 2), - ("2", 2, 2, 2, 4, 4, 4, 3, 2) - ] - for r, ex in zip(rs, expected): - self.assertEqual(tuple(r), ex[:len(r)]) - - def test_window_functions_cumulative_sum(self): - df = self.spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"]) - from pyspark.sql import functions as F - - # Test cumulative sum - sel = df.select( - df.key, - F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding, 0))) - rs = sorted(sel.collect()) - expected = [("one", 1), ("two", 3)] - for r, ex in zip(rs, expected): - self.assertEqual(tuple(r), ex[:len(r)]) - - # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow - sel = df.select( - df.key, - F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding - 1, 0))) - rs = sorted(sel.collect()) - expected = [("one", 1), ("two", 3)] - for r, ex in zip(rs, expected): - self.assertEqual(tuple(r), ex[:len(r)]) - - # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow - frame_end = Window.unboundedFollowing + 1 - sel = df.select( - df.key, - F.sum(df.value).over(Window.rowsBetween(Window.currentRow, frame_end))) - rs = sorted(sel.collect()) - expected = [("one", 3), ("two", 2)] - for r, ex in zip(rs, expected): - self.assertEqual(tuple(r), ex[:len(r)]) - - def test_collect_functions(self): - df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) - from pyspark.sql import functions - - self.assertEqual( - sorted(df.select(functions.collect_set(df.key).alias('r')).collect()[0].r), - [1, 2]) - self.assertEqual( - sorted(df.select(functions.collect_list(df.key).alias('r')).collect()[0].r), - [1, 1, 1, 2]) - self.assertEqual( - sorted(df.select(functions.collect_set(df.value).alias('r')).collect()[0].r), - ["1", "2"]) - self.assertEqual( - 
sorted(df.select(functions.collect_list(df.value).alias('r')).collect()[0].r), - ["1", "2", "2", "2"]) - - def test_limit_and_take(self): - df = self.spark.range(1, 1000, numPartitions=10) - - def assert_runs_only_one_job_stage_and_task(job_group_name, f): - tracker = self.sc.statusTracker() - self.sc.setJobGroup(job_group_name, description="") - f() - jobs = tracker.getJobIdsForGroup(job_group_name) - self.assertEqual(1, len(jobs)) - stages = tracker.getJobInfo(jobs[0]).stageIds - self.assertEqual(1, len(stages)) - self.assertEqual(1, tracker.getStageInfo(stages[0]).numTasks) - - # Regression test for SPARK-10731: take should delegate to Scala implementation - assert_runs_only_one_job_stage_and_task("take", lambda: df.take(1)) - # Regression test for SPARK-17514: limit(n).collect() should the perform same as take(n) - assert_runs_only_one_job_stage_and_task("collect_limit", lambda: df.limit(1).collect()) - - def test_datetime_functions(self): - from pyspark.sql import functions - from datetime import date, datetime - df = self.spark.range(1).selectExpr("'2017-01-22' as dateCol") - parse_result = df.select(functions.to_date(functions.col("dateCol"))).first() - self.assertEquals(date(2017, 1, 22), parse_result['to_date(`dateCol`)']) - - @unittest.skipIf(sys.version_info < (3, 3), "Unittest < 3.3 doesn't support mocking") - def test_unbounded_frames(self): - from unittest.mock import patch - from pyspark.sql import functions as F - from pyspark.sql import window - import importlib - - df = self.spark.range(0, 3) - - def rows_frame_match(): - return "ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" in df.select( - F.count("*").over(window.Window.rowsBetween(-sys.maxsize, sys.maxsize)) - ).columns[0] - - def range_frame_match(): - return "RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" in df.select( - F.count("*").over(window.Window.rangeBetween(-sys.maxsize, sys.maxsize)) - ).columns[0] - - with patch("sys.maxsize", 2 ** 31 - 1): - importlib.reload(window) - self.assertTrue(rows_frame_match()) - self.assertTrue(range_frame_match()) - - with patch("sys.maxsize", 2 ** 63 - 1): - importlib.reload(window) - self.assertTrue(rows_frame_match()) - self.assertTrue(range_frame_match()) - - with patch("sys.maxsize", 2 ** 127 - 1): - importlib.reload(window) - self.assertTrue(rows_frame_match()) - self.assertTrue(range_frame_match()) - - importlib.reload(window) - - -class DataTypeVerificationTests(unittest.TestCase): - - def test_verify_type_exception_msg(self): - self.assertRaisesRegexp( - ValueError, - "test_name", - lambda: _make_type_verifier(StringType(), nullable=False, name="test_name")(None)) - - schema = StructType([StructField('a', StructType([StructField('b', IntegerType())]))]) - self.assertRaisesRegexp( - TypeError, - "field b in field a", - lambda: _make_type_verifier(schema)([["data"]])) - - def test_verify_type_ok_nullable(self): - obj = None - types = [IntegerType(), FloatType(), StringType(), StructType([])] - for data_type in types: - try: - _make_type_verifier(data_type, nullable=True)(obj) - except Exception: - self.fail("verify_type(%s, %s, nullable=True)" % (obj, data_type)) - - def test_verify_type_not_nullable(self): - import array - import datetime - import decimal - - schema = StructType([ - StructField('s', StringType(), nullable=False), - StructField('i', IntegerType(), nullable=True)]) - - class MyObj: - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - - # obj, data_type - success_spec = [ - # String - ("", 
StringType()), - (u"", StringType()), - (1, StringType()), - (1.0, StringType()), - ([], StringType()), - ({}, StringType()), - - # UDT - (ExamplePoint(1.0, 2.0), ExamplePointUDT()), - - # Boolean - (True, BooleanType()), - - # Byte - (-(2**7), ByteType()), - (2**7 - 1, ByteType()), - - # Short - (-(2**15), ShortType()), - (2**15 - 1, ShortType()), - - # Integer - (-(2**31), IntegerType()), - (2**31 - 1, IntegerType()), - - # Long - (2**64, LongType()), - - # Float & Double - (1.0, FloatType()), - (1.0, DoubleType()), - - # Decimal - (decimal.Decimal("1.0"), DecimalType()), - - # Binary - (bytearray([1, 2]), BinaryType()), - - # Date/Timestamp - (datetime.date(2000, 1, 2), DateType()), - (datetime.datetime(2000, 1, 2, 3, 4), DateType()), - (datetime.datetime(2000, 1, 2, 3, 4), TimestampType()), - - # Array - ([], ArrayType(IntegerType())), - (["1", None], ArrayType(StringType(), containsNull=True)), - ([1, 2], ArrayType(IntegerType())), - ((1, 2), ArrayType(IntegerType())), - (array.array('h', [1, 2]), ArrayType(IntegerType())), - - # Map - ({}, MapType(StringType(), IntegerType())), - ({"a": 1}, MapType(StringType(), IntegerType())), - ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=True)), - - # Struct - ({"s": "a", "i": 1}, schema), - ({"s": "a", "i": None}, schema), - ({"s": "a"}, schema), - ({"s": "a", "f": 1.0}, schema), - (Row(s="a", i=1), schema), - (Row(s="a", i=None), schema), - (Row(s="a", i=1, f=1.0), schema), - (["a", 1], schema), - (["a", None], schema), - (("a", 1), schema), - (MyObj(s="a", i=1), schema), - (MyObj(s="a", i=None), schema), - (MyObj(s="a"), schema), - ] - - # obj, data_type, exception class - failure_spec = [ - # String (match anything but None) - (None, StringType(), ValueError), - - # UDT - (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError), - - # Boolean - (1, BooleanType(), TypeError), - ("True", BooleanType(), TypeError), - ([1], BooleanType(), TypeError), - - # Byte - (-(2**7) - 1, ByteType(), ValueError), - (2**7, ByteType(), ValueError), - ("1", ByteType(), TypeError), - (1.0, ByteType(), TypeError), - - # Short - (-(2**15) - 1, ShortType(), ValueError), - (2**15, ShortType(), ValueError), - - # Integer - (-(2**31) - 1, IntegerType(), ValueError), - (2**31, IntegerType(), ValueError), - - # Float & Double - (1, FloatType(), TypeError), - (1, DoubleType(), TypeError), - - # Decimal - (1.0, DecimalType(), TypeError), - (1, DecimalType(), TypeError), - ("1.0", DecimalType(), TypeError), - - # Binary - (1, BinaryType(), TypeError), - - # Date/Timestamp - ("2000-01-02", DateType(), TypeError), - (946811040, TimestampType(), TypeError), - - # Array - (["1", None], ArrayType(StringType(), containsNull=False), ValueError), - ([1, "2"], ArrayType(IntegerType()), TypeError), - - # Map - ({"a": 1}, MapType(IntegerType(), IntegerType()), TypeError), - ({"a": "1"}, MapType(StringType(), IntegerType()), TypeError), - ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=False), - ValueError), - - # Struct - ({"s": "a", "i": "1"}, schema, TypeError), - (Row(s="a"), schema, ValueError), # Row can't have missing field - (Row(s="a", i="1"), schema, TypeError), - (["a"], schema, ValueError), - (["a", "1"], schema, TypeError), - (MyObj(s="a", i="1"), schema, TypeError), - (MyObj(s=None, i="1"), schema, ValueError), - ] - - # Check success cases - for obj, data_type in success_spec: - try: - _make_type_verifier(data_type, nullable=False)(obj) - except Exception: - self.fail("verify_type(%s, %s, nullable=False)" % (obj, data_type)) - - 
# Check failure cases - for obj, data_type, exp in failure_spec: - msg = "verify_type(%s, %s, nullable=False) == %s" % (obj, data_type, exp) - with self.assertRaises(exp, msg=msg): - _make_type_verifier(data_type, nullable=False)(obj) - - -@unittest.skipIf( - not _have_pandas or not _have_pyarrow, - _pandas_requirement_message or _pyarrow_requirement_message) -class ArrowTests(ReusedSQLTestCase): - - @classmethod - def setUpClass(cls): - from datetime import date, datetime - from decimal import Decimal - from distutils.version import LooseVersion - import pyarrow as pa - super(ArrowTests, cls).setUpClass() - cls.warnings_lock = threading.Lock() - - # Synchronize default timezone between Python and Java - cls.tz_prev = os.environ.get("TZ", None) # save current tz if set - tz = "America/Los_Angeles" - os.environ["TZ"] = tz - time.tzset() - - cls.spark.conf.set("spark.sql.session.timeZone", tz) - cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true") - # Disable fallback by default to easily detect the failures. - cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false") - cls.schema = StructType([ - StructField("1_str_t", StringType(), True), - StructField("2_int_t", IntegerType(), True), - StructField("3_long_t", LongType(), True), - StructField("4_float_t", FloatType(), True), - StructField("5_double_t", DoubleType(), True), - StructField("6_decimal_t", DecimalType(38, 18), True), - StructField("7_date_t", DateType(), True), - StructField("8_timestamp_t", TimestampType(), True)]) - cls.data = [(u"a", 1, 10, 0.2, 2.0, Decimal("2.0"), - date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)), - (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"), - date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)), - (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"), - date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3))] - - # TODO: remove version check once minimum pyarrow version is 0.10.0 - if LooseVersion("0.10.0") <= LooseVersion(pa.__version__): - cls.schema.add(StructField("9_binary_t", BinaryType(), True)) - cls.data[0] = cls.data[0] + (bytearray(b"a"),) - cls.data[1] = cls.data[1] + (bytearray(b"bb"),) - cls.data[2] = cls.data[2] + (bytearray(b"ccc"),) - - @classmethod - def tearDownClass(cls): - del os.environ["TZ"] - if cls.tz_prev is not None: - os.environ["TZ"] = cls.tz_prev - time.tzset() - super(ArrowTests, cls).tearDownClass() - - def create_pandas_data_frame(self): - import pandas as pd - import numpy as np - data_dict = {} - for j, name in enumerate(self.schema.names): - data_dict[name] = [self.data[i][j] for i in range(len(self.data))] - # need to convert these to numpy types first - data_dict["2_int_t"] = np.int32(data_dict["2_int_t"]) - data_dict["4_float_t"] = np.float32(data_dict["4_float_t"]) - return pd.DataFrame(data=data_dict) - - def test_toPandas_fallback_enabled(self): - import pandas as pd - - with self.sql_conf({"spark.sql.execution.arrow.fallback.enabled": True}): - schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) - df = self.spark.createDataFrame([({u'a': 1},)], schema=schema) - with QuietTest(self.sc): - with self.warnings_lock: - with warnings.catch_warnings(record=True) as warns: - # we want the warnings to appear even if this test is run from a subclass - warnings.simplefilter("always") - pdf = df.toPandas() - # Catch and check the last UserWarning. 
- user_warns = [ - warn.message for warn in warns if isinstance(warn.message, UserWarning)] - self.assertTrue(len(user_warns) > 0) - self.assertTrue( - "Attempting non-optimization" in _exception_message(user_warns[-1])) - self.assertPandasEqual(pdf, pd.DataFrame({u'map': [{u'a': 1}]})) - - def test_toPandas_fallback_disabled(self): - from distutils.version import LooseVersion - import pyarrow as pa - - schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) - df = self.spark.createDataFrame([(None,)], schema=schema) - with QuietTest(self.sc): - with self.warnings_lock: - with self.assertRaisesRegexp(Exception, 'Unsupported type'): - df.toPandas() - - # TODO: remove BinaryType check once minimum pyarrow version is 0.10.0 - if LooseVersion(pa.__version__) < LooseVersion("0.10.0"): - schema = StructType([StructField("binary", BinaryType(), True)]) - df = self.spark.createDataFrame([(None,)], schema=schema) - with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Unsupported type.*BinaryType'): - df.toPandas() - - def test_null_conversion(self): - df_null = self.spark.createDataFrame([tuple([None for _ in range(len(self.data[0]))])] + - self.data) - pdf = df_null.toPandas() - null_counts = pdf.isnull().sum().tolist() - self.assertTrue(all([c == 1 for c in null_counts])) - - def _toPandas_arrow_toggle(self, df): - with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): - pdf = df.toPandas() - - pdf_arrow = df.toPandas() - - return pdf, pdf_arrow - - def test_toPandas_arrow_toggle(self): - df = self.spark.createDataFrame(self.data, schema=self.schema) - pdf, pdf_arrow = self._toPandas_arrow_toggle(df) - expected = self.create_pandas_data_frame() - self.assertPandasEqual(expected, pdf) - self.assertPandasEqual(expected, pdf_arrow) - - def test_toPandas_respect_session_timezone(self): - df = self.spark.createDataFrame(self.data, schema=self.schema) - - timezone = "America/New_York" - with self.sql_conf({ - "spark.sql.execution.pandas.respectSessionTimeZone": False, - "spark.sql.session.timeZone": timezone}): - pdf_la, pdf_arrow_la = self._toPandas_arrow_toggle(df) - self.assertPandasEqual(pdf_arrow_la, pdf_la) - - with self.sql_conf({ - "spark.sql.execution.pandas.respectSessionTimeZone": True, - "spark.sql.session.timeZone": timezone}): - pdf_ny, pdf_arrow_ny = self._toPandas_arrow_toggle(df) - self.assertPandasEqual(pdf_arrow_ny, pdf_ny) - - self.assertFalse(pdf_ny.equals(pdf_la)) - - from pyspark.sql.types import _check_series_convert_timestamps_local_tz - pdf_la_corrected = pdf_la.copy() - for field in self.schema: - if isinstance(field.dataType, TimestampType): - pdf_la_corrected[field.name] = _check_series_convert_timestamps_local_tz( - pdf_la_corrected[field.name], timezone) - self.assertPandasEqual(pdf_ny, pdf_la_corrected) - - def test_pandas_round_trip(self): - pdf = self.create_pandas_data_frame() - df = self.spark.createDataFrame(self.data, schema=self.schema) - pdf_arrow = df.toPandas() - self.assertPandasEqual(pdf_arrow, pdf) - - def test_filtered_frame(self): - df = self.spark.range(3).toDF("i") - pdf = df.filter("i < 0").toPandas() - self.assertEqual(len(pdf.columns), 1) - self.assertEqual(pdf.columns[0], "i") - self.assertTrue(pdf.empty) - - def _createDataFrame_toggle(self, pdf, schema=None): - with self.sql_conf({"spark.sql.execution.arrow.enabled": False}): - df_no_arrow = self.spark.createDataFrame(pdf, schema=schema) - - df_arrow = self.spark.createDataFrame(pdf, schema=schema) - - return df_no_arrow, df_arrow - - def 
test_createDataFrame_toggle(self): - pdf = self.create_pandas_data_frame() - df_no_arrow, df_arrow = self._createDataFrame_toggle(pdf, schema=self.schema) - self.assertEquals(df_no_arrow.collect(), df_arrow.collect()) - - def test_createDataFrame_respect_session_timezone(self): - from datetime import timedelta - pdf = self.create_pandas_data_frame() - timezone = "America/New_York" - with self.sql_conf({ - "spark.sql.execution.pandas.respectSessionTimeZone": False, - "spark.sql.session.timeZone": timezone}): - df_no_arrow_la, df_arrow_la = self._createDataFrame_toggle(pdf, schema=self.schema) - result_la = df_no_arrow_la.collect() - result_arrow_la = df_arrow_la.collect() - self.assertEqual(result_la, result_arrow_la) - - with self.sql_conf({ - "spark.sql.execution.pandas.respectSessionTimeZone": True, - "spark.sql.session.timeZone": timezone}): - df_no_arrow_ny, df_arrow_ny = self._createDataFrame_toggle(pdf, schema=self.schema) - result_ny = df_no_arrow_ny.collect() - result_arrow_ny = df_arrow_ny.collect() - self.assertEqual(result_ny, result_arrow_ny) - - self.assertNotEqual(result_ny, result_la) - - # Correct result_la by adjusting 3 hours difference between Los Angeles and New York - result_la_corrected = [Row(**{k: v - timedelta(hours=3) if k == '8_timestamp_t' else v - for k, v in row.asDict().items()}) - for row in result_la] - self.assertEqual(result_ny, result_la_corrected) - - def test_createDataFrame_with_schema(self): - pdf = self.create_pandas_data_frame() - df = self.spark.createDataFrame(pdf, schema=self.schema) - self.assertEquals(self.schema, df.schema) - pdf_arrow = df.toPandas() - self.assertPandasEqual(pdf_arrow, pdf) - - def test_createDataFrame_with_incorrect_schema(self): - pdf = self.create_pandas_data_frame() - fields = list(self.schema) - fields[0], fields[7] = fields[7], fields[0] # swap str with timestamp - wrong_schema = StructType(fields) - with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, ".*No cast.*string.*timestamp.*"): - self.spark.createDataFrame(pdf, schema=wrong_schema) - - def test_createDataFrame_with_names(self): - pdf = self.create_pandas_data_frame() - new_names = list(map(str, range(len(self.schema.fieldNames())))) - # Test that schema as a list of column names gets applied - df = self.spark.createDataFrame(pdf, schema=list(new_names)) - self.assertEquals(df.schema.fieldNames(), new_names) - # Test that schema as tuple of column names gets applied - df = self.spark.createDataFrame(pdf, schema=tuple(new_names)) - self.assertEquals(df.schema.fieldNames(), new_names) - - def test_createDataFrame_column_name_encoding(self): - import pandas as pd - pdf = pd.DataFrame({u'a': [1]}) - columns = self.spark.createDataFrame(pdf).columns - self.assertTrue(isinstance(columns[0], str)) - self.assertEquals(columns[0], 'a') - columns = self.spark.createDataFrame(pdf, [u'b']).columns - self.assertTrue(isinstance(columns[0], str)) - self.assertEquals(columns[0], 'b') - - def test_createDataFrame_with_single_data_type(self): - import pandas as pd - with QuietTest(self.sc): - with self.assertRaisesRegexp(ValueError, ".*IntegerType.*not supported.*"): - self.spark.createDataFrame(pd.DataFrame({"a": [1]}), schema="int") - - def test_createDataFrame_does_not_modify_input(self): - import pandas as pd - # Some series get converted for Spark to consume, this makes sure input is unchanged - pdf = self.create_pandas_data_frame() - # Use a nanosecond value to make sure it is not truncated - pdf.ix[0, '8_timestamp_t'] = pd.Timestamp(1) - # Integers with 
nulls will get NaNs filled with 0 and will be casted - pdf.ix[1, '2_int_t'] = None - pdf_copy = pdf.copy(deep=True) - self.spark.createDataFrame(pdf, schema=self.schema) - self.assertTrue(pdf.equals(pdf_copy)) - - def test_schema_conversion_roundtrip(self): - from pyspark.sql.types import from_arrow_schema, to_arrow_schema - arrow_schema = to_arrow_schema(self.schema) - schema_rt = from_arrow_schema(arrow_schema) - self.assertEquals(self.schema, schema_rt) - - def test_createDataFrame_with_array_type(self): - import pandas as pd - pdf = pd.DataFrame({"a": [[1, 2], [3, 4]], "b": [[u"x", u"y"], [u"y", u"z"]]}) - df, df_arrow = self._createDataFrame_toggle(pdf) - result = df.collect() - result_arrow = df_arrow.collect() - expected = [tuple(list(e) for e in rec) for rec in pdf.to_records(index=False)] - for r in range(len(expected)): - for e in range(len(expected[r])): - self.assertTrue(expected[r][e] == result_arrow[r][e] and - result[r][e] == result_arrow[r][e]) - - def test_toPandas_with_array_type(self): - expected = [([1, 2], [u"x", u"y"]), ([3, 4], [u"y", u"z"])] - array_schema = StructType([StructField("a", ArrayType(IntegerType())), - StructField("b", ArrayType(StringType()))]) - df = self.spark.createDataFrame(expected, schema=array_schema) - pdf, pdf_arrow = self._toPandas_arrow_toggle(df) - result = [tuple(list(e) for e in rec) for rec in pdf.to_records(index=False)] - result_arrow = [tuple(list(e) for e in rec) for rec in pdf_arrow.to_records(index=False)] - for r in range(len(expected)): - for e in range(len(expected[r])): - self.assertTrue(expected[r][e] == result_arrow[r][e] and - result[r][e] == result_arrow[r][e]) - - def test_createDataFrame_with_int_col_names(self): - import numpy as np - import pandas as pd - pdf = pd.DataFrame(np.random.rand(4, 2)) - df, df_arrow = self._createDataFrame_toggle(pdf) - pdf_col_names = [str(c) for c in pdf.columns] - self.assertEqual(pdf_col_names, df.columns) - self.assertEqual(pdf_col_names, df_arrow.columns) - - def test_createDataFrame_fallback_enabled(self): - import pandas as pd - - with QuietTest(self.sc): - with self.sql_conf({"spark.sql.execution.arrow.fallback.enabled": True}): - with warnings.catch_warnings(record=True) as warns: - # we want the warnings to appear even if this test is run from a subclass - warnings.simplefilter("always") - df = self.spark.createDataFrame( - pd.DataFrame([[{u'a': 1}]]), "a: map") - # Catch and check the last UserWarning. 
- user_warns = [ - warn.message for warn in warns if isinstance(warn.message, UserWarning)] - self.assertTrue(len(user_warns) > 0) - self.assertTrue( - "Attempting non-optimization" in _exception_message(user_warns[-1])) - self.assertEqual(df.collect(), [Row(a={u'a': 1})]) - - def test_createDataFrame_fallback_disabled(self): - from distutils.version import LooseVersion - import pandas as pd - import pyarrow as pa - - with QuietTest(self.sc): - with self.assertRaisesRegexp(TypeError, 'Unsupported type'): - self.spark.createDataFrame( - pd.DataFrame([[{u'a': 1}]]), "a: map") - - # TODO: remove BinaryType check once minimum pyarrow version is 0.10.0 - if LooseVersion(pa.__version__) < LooseVersion("0.10.0"): - with QuietTest(self.sc): - with self.assertRaisesRegexp(TypeError, 'Unsupported type.*BinaryType'): - self.spark.createDataFrame( - pd.DataFrame([[{'a': b'aaa'}]]), "a: binary") - - # Regression test for SPARK-23314 - def test_timestamp_dst(self): - import pandas as pd - # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am - dt = [datetime.datetime(2015, 11, 1, 0, 30), - datetime.datetime(2015, 11, 1, 1, 30), - datetime.datetime(2015, 11, 1, 2, 30)] - pdf = pd.DataFrame({'time': dt}) - - df_from_python = self.spark.createDataFrame(dt, 'timestamp').toDF('time') - df_from_pandas = self.spark.createDataFrame(pdf) - - self.assertPandasEqual(pdf, df_from_python.toPandas()) - self.assertPandasEqual(pdf, df_from_pandas.toPandas()) - - -class EncryptionArrowTests(ArrowTests): - - @classmethod - def conf(cls): - return super(EncryptionArrowTests, cls).conf().set("spark.io.encryption.enabled", "true") - - -@unittest.skipIf( - not _have_pandas or not _have_pyarrow, - _pandas_requirement_message or _pyarrow_requirement_message) -class PandasUDFTests(ReusedSQLTestCase): - - def test_pandas_udf_basic(self): - from pyspark.rdd import PythonEvalType - from pyspark.sql.functions import pandas_udf, PandasUDFType - - udf = pandas_udf(lambda x: x, DoubleType()) - self.assertEqual(udf.returnType, DoubleType()) - self.assertEqual(udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - - udf = pandas_udf(lambda x: x, DoubleType(), PandasUDFType.SCALAR) - self.assertEqual(udf.returnType, DoubleType()) - self.assertEqual(udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - - udf = pandas_udf(lambda x: x, 'double', PandasUDFType.SCALAR) - self.assertEqual(udf.returnType, DoubleType()) - self.assertEqual(udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - - udf = pandas_udf(lambda x: x, StructType([StructField("v", DoubleType())]), - PandasUDFType.GROUPED_MAP) - self.assertEqual(udf.returnType, StructType([StructField("v", DoubleType())])) - self.assertEqual(udf.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - - udf = pandas_udf(lambda x: x, 'v double', PandasUDFType.GROUPED_MAP) - self.assertEqual(udf.returnType, StructType([StructField("v", DoubleType())])) - self.assertEqual(udf.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - - udf = pandas_udf(lambda x: x, 'v double', - functionType=PandasUDFType.GROUPED_MAP) - self.assertEqual(udf.returnType, StructType([StructField("v", DoubleType())])) - self.assertEqual(udf.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - - udf = pandas_udf(lambda x: x, returnType='v double', - functionType=PandasUDFType.GROUPED_MAP) - self.assertEqual(udf.returnType, StructType([StructField("v", DoubleType())])) - self.assertEqual(udf.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - - def test_pandas_udf_decorator(self): - from pyspark.rdd 
import PythonEvalType - from pyspark.sql.functions import pandas_udf, PandasUDFType - from pyspark.sql.types import StructType, StructField, DoubleType - - @pandas_udf(DoubleType()) - def foo(x): - return x - self.assertEqual(foo.returnType, DoubleType()) - self.assertEqual(foo.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - - @pandas_udf(returnType=DoubleType()) - def foo(x): - return x - self.assertEqual(foo.returnType, DoubleType()) - self.assertEqual(foo.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - - schema = StructType([StructField("v", DoubleType())]) - - @pandas_udf(schema, PandasUDFType.GROUPED_MAP) - def foo(x): - return x - self.assertEqual(foo.returnType, schema) - self.assertEqual(foo.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - - @pandas_udf('v double', PandasUDFType.GROUPED_MAP) - def foo(x): - return x - self.assertEqual(foo.returnType, schema) - self.assertEqual(foo.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - - @pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP) - def foo(x): - return x - self.assertEqual(foo.returnType, schema) - self.assertEqual(foo.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - - @pandas_udf(returnType='double', functionType=PandasUDFType.SCALAR) - def foo(x): - return x - self.assertEqual(foo.returnType, DoubleType()) - self.assertEqual(foo.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - - @pandas_udf(returnType=schema, functionType=PandasUDFType.GROUPED_MAP) - def foo(x): - return x - self.assertEqual(foo.returnType, schema) - self.assertEqual(foo.evalType, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF) - - def test_udf_wrong_arg(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - with QuietTest(self.sc): - with self.assertRaises(ParseException): - @pandas_udf('blah') - def foo(x): - return x - with self.assertRaisesRegexp(ValueError, 'Invalid returnType.*None'): - @pandas_udf(functionType=PandasUDFType.SCALAR) - def foo(x): - return x - with self.assertRaisesRegexp(ValueError, 'Invalid functionType'): - @pandas_udf('double', 100) - def foo(x): - return x - - with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'): - pandas_udf(lambda: 1, LongType(), PandasUDFType.SCALAR) - with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'): - @pandas_udf(LongType(), PandasUDFType.SCALAR) - def zero_with_type(): - return 1 - - with self.assertRaisesRegexp(TypeError, 'Invalid returnType'): - @pandas_udf(returnType=PandasUDFType.GROUPED_MAP) - def foo(df): - return df - with self.assertRaisesRegexp(TypeError, 'Invalid returnType'): - @pandas_udf(returnType='double', functionType=PandasUDFType.GROUPED_MAP) - def foo(df): - return df - with self.assertRaisesRegexp(ValueError, 'Invalid function'): - @pandas_udf(returnType='k int, v double', functionType=PandasUDFType.GROUPED_MAP) - def foo(k, v, w): - return k - - def test_stopiteration_in_udf(self): - from pyspark.sql.functions import udf, pandas_udf, PandasUDFType - from py4j.protocol import Py4JJavaError - - def foo(x): - raise StopIteration() - - def foofoo(x, y): - raise StopIteration() - - exc_message = "Caught StopIteration thrown from user's code; failing the task" - df = self.spark.range(0, 100) - - # plain udf (test for SPARK-23754) - self.assertRaisesRegexp( - Py4JJavaError, - exc_message, - df.withColumn('v', udf(foo)('id')).collect - ) - - # pandas scalar udf - self.assertRaisesRegexp( - Py4JJavaError, - exc_message, - df.withColumn( - 'v', pandas_udf(foo, 'double', PandasUDFType.SCALAR)('id') - 
).collect - ) - - # pandas grouped map - self.assertRaisesRegexp( - Py4JJavaError, - exc_message, - df.groupBy('id').apply( - pandas_udf(foo, df.schema, PandasUDFType.GROUPED_MAP) - ).collect - ) - - self.assertRaisesRegexp( - Py4JJavaError, - exc_message, - df.groupBy('id').apply( - pandas_udf(foofoo, df.schema, PandasUDFType.GROUPED_MAP) - ).collect - ) - - # pandas grouped agg - self.assertRaisesRegexp( - Py4JJavaError, - exc_message, - df.groupBy('id').agg( - pandas_udf(foo, 'double', PandasUDFType.GROUPED_AGG)('id') - ).collect - ) - - -@unittest.skipIf( - not _have_pandas or not _have_pyarrow, - _pandas_requirement_message or _pyarrow_requirement_message) -class ScalarPandasUDFTests(ReusedSQLTestCase): - - @classmethod - def setUpClass(cls): - ReusedSQLTestCase.setUpClass() - - # Synchronize default timezone between Python and Java - cls.tz_prev = os.environ.get("TZ", None) # save current tz if set - tz = "America/Los_Angeles" - os.environ["TZ"] = tz - time.tzset() - - cls.sc.environment["TZ"] = tz - cls.spark.conf.set("spark.sql.session.timeZone", tz) - - @classmethod - def tearDownClass(cls): - del os.environ["TZ"] - if cls.tz_prev is not None: - os.environ["TZ"] = cls.tz_prev - time.tzset() - ReusedSQLTestCase.tearDownClass() - - @property - def nondeterministic_vectorized_udf(self): - from pyspark.sql.functions import pandas_udf - - @pandas_udf('double') - def random_udf(v): - import pandas as pd - import numpy as np - return pd.Series(np.random.random(len(v))) - random_udf = random_udf.asNondeterministic() - return random_udf - - def test_pandas_udf_tokenize(self): - from pyspark.sql.functions import pandas_udf - tokenize = pandas_udf(lambda s: s.apply(lambda str: str.split(' ')), - ArrayType(StringType())) - self.assertEqual(tokenize.returnType, ArrayType(StringType())) - df = self.spark.createDataFrame([("hi boo",), ("bye boo",)], ["vals"]) - result = df.select(tokenize("vals").alias("hi")) - self.assertEqual([Row(hi=[u'hi', u'boo']), Row(hi=[u'bye', u'boo'])], result.collect()) - - def test_pandas_udf_nested_arrays(self): - from pyspark.sql.functions import pandas_udf - tokenize = pandas_udf(lambda s: s.apply(lambda str: [str.split(' ')]), - ArrayType(ArrayType(StringType()))) - self.assertEqual(tokenize.returnType, ArrayType(ArrayType(StringType()))) - df = self.spark.createDataFrame([("hi boo",), ("bye boo",)], ["vals"]) - result = df.select(tokenize("vals").alias("hi")) - self.assertEqual([Row(hi=[[u'hi', u'boo']]), Row(hi=[[u'bye', u'boo']])], result.collect()) - - def test_vectorized_udf_basic(self): - from pyspark.sql.functions import pandas_udf, col, array - df = self.spark.range(10).select( - col('id').cast('string').alias('str'), - col('id').cast('int').alias('int'), - col('id').alias('long'), - col('id').cast('float').alias('float'), - col('id').cast('double').alias('double'), - col('id').cast('decimal').alias('decimal'), - col('id').cast('boolean').alias('bool'), - array(col('id')).alias('array_long')) - f = lambda x: x - str_f = pandas_udf(f, StringType()) - int_f = pandas_udf(f, IntegerType()) - long_f = pandas_udf(f, LongType()) - float_f = pandas_udf(f, FloatType()) - double_f = pandas_udf(f, DoubleType()) - decimal_f = pandas_udf(f, DecimalType()) - bool_f = pandas_udf(f, BooleanType()) - array_long_f = pandas_udf(f, ArrayType(LongType())) - res = df.select(str_f(col('str')), int_f(col('int')), - long_f(col('long')), float_f(col('float')), - double_f(col('double')), decimal_f('decimal'), - bool_f(col('bool')), array_long_f('array_long')) - 
self.assertEquals(df.collect(), res.collect()) - - def test_register_nondeterministic_vectorized_udf_basic(self): - from pyspark.sql.functions import pandas_udf - from pyspark.rdd import PythonEvalType - import random - random_pandas_udf = pandas_udf( - lambda x: random.randint(6, 6) + x, IntegerType()).asNondeterministic() - self.assertEqual(random_pandas_udf.deterministic, False) - self.assertEqual(random_pandas_udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - nondeterministic_pandas_udf = self.spark.catalog.registerFunction( - "randomPandasUDF", random_pandas_udf) - self.assertEqual(nondeterministic_pandas_udf.deterministic, False) - self.assertEqual(nondeterministic_pandas_udf.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - [row] = self.spark.sql("SELECT randomPandasUDF(1)").collect() - self.assertEqual(row[0], 7) - - def test_vectorized_udf_null_boolean(self): - from pyspark.sql.functions import pandas_udf, col - data = [(True,), (True,), (None,), (False,)] - schema = StructType().add("bool", BooleanType()) - df = self.spark.createDataFrame(data, schema) - bool_f = pandas_udf(lambda x: x, BooleanType()) - res = df.select(bool_f(col('bool'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_null_byte(self): - from pyspark.sql.functions import pandas_udf, col - data = [(None,), (2,), (3,), (4,)] - schema = StructType().add("byte", ByteType()) - df = self.spark.createDataFrame(data, schema) - byte_f = pandas_udf(lambda x: x, ByteType()) - res = df.select(byte_f(col('byte'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_null_short(self): - from pyspark.sql.functions import pandas_udf, col - data = [(None,), (2,), (3,), (4,)] - schema = StructType().add("short", ShortType()) - df = self.spark.createDataFrame(data, schema) - short_f = pandas_udf(lambda x: x, ShortType()) - res = df.select(short_f(col('short'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_null_int(self): - from pyspark.sql.functions import pandas_udf, col - data = [(None,), (2,), (3,), (4,)] - schema = StructType().add("int", IntegerType()) - df = self.spark.createDataFrame(data, schema) - int_f = pandas_udf(lambda x: x, IntegerType()) - res = df.select(int_f(col('int'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_null_long(self): - from pyspark.sql.functions import pandas_udf, col - data = [(None,), (2,), (3,), (4,)] - schema = StructType().add("long", LongType()) - df = self.spark.createDataFrame(data, schema) - long_f = pandas_udf(lambda x: x, LongType()) - res = df.select(long_f(col('long'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_null_float(self): - from pyspark.sql.functions import pandas_udf, col - data = [(3.0,), (5.0,), (-1.0,), (None,)] - schema = StructType().add("float", FloatType()) - df = self.spark.createDataFrame(data, schema) - float_f = pandas_udf(lambda x: x, FloatType()) - res = df.select(float_f(col('float'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_null_double(self): - from pyspark.sql.functions import pandas_udf, col - data = [(3.0,), (5.0,), (-1.0,), (None,)] - schema = StructType().add("double", DoubleType()) - df = self.spark.createDataFrame(data, schema) - double_f = pandas_udf(lambda x: x, DoubleType()) - res = df.select(double_f(col('double'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_null_decimal(self): - from decimal import Decimal - from 
pyspark.sql.functions import pandas_udf, col - data = [(Decimal(3.0),), (Decimal(5.0),), (Decimal(-1.0),), (None,)] - schema = StructType().add("decimal", DecimalType(38, 18)) - df = self.spark.createDataFrame(data, schema) - decimal_f = pandas_udf(lambda x: x, DecimalType(38, 18)) - res = df.select(decimal_f(col('decimal'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_null_string(self): - from pyspark.sql.functions import pandas_udf, col - data = [("foo",), (None,), ("bar",), ("bar",)] - schema = StructType().add("str", StringType()) - df = self.spark.createDataFrame(data, schema) - str_f = pandas_udf(lambda x: x, StringType()) - res = df.select(str_f(col('str'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_string_in_udf(self): - from pyspark.sql.functions import pandas_udf, col - import pandas as pd - df = self.spark.range(10) - str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType()) - actual = df.select(str_f(col('id'))) - expected = df.select(col('id').cast('string')) - self.assertEquals(expected.collect(), actual.collect()) - - def test_vectorized_udf_datatype_string(self): - from pyspark.sql.functions import pandas_udf, col - df = self.spark.range(10).select( - col('id').cast('string').alias('str'), - col('id').cast('int').alias('int'), - col('id').alias('long'), - col('id').cast('float').alias('float'), - col('id').cast('double').alias('double'), - col('id').cast('decimal').alias('decimal'), - col('id').cast('boolean').alias('bool')) - f = lambda x: x - str_f = pandas_udf(f, 'string') - int_f = pandas_udf(f, 'integer') - long_f = pandas_udf(f, 'long') - float_f = pandas_udf(f, 'float') - double_f = pandas_udf(f, 'double') - decimal_f = pandas_udf(f, 'decimal(38, 18)') - bool_f = pandas_udf(f, 'boolean') - res = df.select(str_f(col('str')), int_f(col('int')), - long_f(col('long')), float_f(col('float')), - double_f(col('double')), decimal_f('decimal'), - bool_f(col('bool'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_null_binary(self): - from distutils.version import LooseVersion - import pyarrow as pa - from pyspark.sql.functions import pandas_udf, col - if LooseVersion(pa.__version__) < LooseVersion("0.10.0"): - with QuietTest(self.sc): - with self.assertRaisesRegexp( - NotImplementedError, - 'Invalid returnType.*scalar Pandas UDF.*BinaryType'): - pandas_udf(lambda x: x, BinaryType()) - else: - data = [(bytearray(b"a"),), (None,), (bytearray(b"bb"),), (bytearray(b"ccc"),)] - schema = StructType().add("binary", BinaryType()) - df = self.spark.createDataFrame(data, schema) - str_f = pandas_udf(lambda x: x, BinaryType()) - res = df.select(str_f(col('binary'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_array_type(self): - from pyspark.sql.functions import pandas_udf, col - data = [([1, 2],), ([3, 4],)] - array_schema = StructType([StructField("array", ArrayType(IntegerType()))]) - df = self.spark.createDataFrame(data, schema=array_schema) - array_f = pandas_udf(lambda x: x, ArrayType(IntegerType())) - result = df.select(array_f(col('array'))) - self.assertEquals(df.collect(), result.collect()) - - def test_vectorized_udf_null_array(self): - from pyspark.sql.functions import pandas_udf, col - data = [([1, 2],), (None,), (None,), ([3, 4],), (None,)] - array_schema = StructType([StructField("array", ArrayType(IntegerType()))]) - df = self.spark.createDataFrame(data, schema=array_schema) - array_f = pandas_udf(lambda x: x, 
ArrayType(IntegerType())) - result = df.select(array_f(col('array'))) - self.assertEquals(df.collect(), result.collect()) - - def test_vectorized_udf_complex(self): - from pyspark.sql.functions import pandas_udf, col, expr - df = self.spark.range(10).select( - col('id').cast('int').alias('a'), - col('id').cast('int').alias('b'), - col('id').cast('double').alias('c')) - add = pandas_udf(lambda x, y: x + y, IntegerType()) - power2 = pandas_udf(lambda x: 2 ** x, IntegerType()) - mul = pandas_udf(lambda x, y: x * y, DoubleType()) - res = df.select(add(col('a'), col('b')), power2(col('a')), mul(col('b'), col('c'))) - expected = df.select(expr('a + b'), expr('power(2, a)'), expr('b * c')) - self.assertEquals(expected.collect(), res.collect()) - - def test_vectorized_udf_exception(self): - from pyspark.sql.functions import pandas_udf, col - df = self.spark.range(10) - raise_exception = pandas_udf(lambda x: x * (1 / 0), LongType()) - with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'division( or modulo)? by zero'): - df.select(raise_exception(col('id'))).collect() - - def test_vectorized_udf_invalid_length(self): - from pyspark.sql.functions import pandas_udf, col - import pandas as pd - df = self.spark.range(10) - raise_exception = pandas_udf(lambda _: pd.Series(1), LongType()) - with QuietTest(self.sc): - with self.assertRaisesRegexp( - Exception, - 'Result vector from pandas_udf was not the required length'): - df.select(raise_exception(col('id'))).collect() - - def test_vectorized_udf_chained(self): - from pyspark.sql.functions import pandas_udf, col - df = self.spark.range(10) - f = pandas_udf(lambda x: x + 1, LongType()) - g = pandas_udf(lambda x: x - 1, LongType()) - res = df.select(g(f(col('id')))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_wrong_return_type(self): - from pyspark.sql.functions import pandas_udf, col - with QuietTest(self.sc): - with self.assertRaisesRegexp( - NotImplementedError, - 'Invalid returnType.*scalar Pandas UDF.*MapType'): - pandas_udf(lambda x: x * 1.0, MapType(LongType(), LongType())) - - def test_vectorized_udf_return_scalar(self): - from pyspark.sql.functions import pandas_udf, col - df = self.spark.range(10) - f = pandas_udf(lambda x: 1.0, DoubleType()) - with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Return.*type.*Series'): - df.select(f(col('id'))).collect() - - def test_vectorized_udf_decorator(self): - from pyspark.sql.functions import pandas_udf, col - df = self.spark.range(10) - - @pandas_udf(returnType=LongType()) - def identity(x): - return x - res = df.select(identity(col('id'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_empty_partition(self): - from pyspark.sql.functions import pandas_udf, col - df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2)) - f = pandas_udf(lambda x: x, LongType()) - res = df.select(f(col('id'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_varargs(self): - from pyspark.sql.functions import pandas_udf, col - df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2)) - f = pandas_udf(lambda *v: v[0], LongType()) - res = df.select(f(col('id'))) - self.assertEquals(df.collect(), res.collect()) - - def test_vectorized_udf_unsupported_types(self): - from pyspark.sql.functions import pandas_udf - with QuietTest(self.sc): - with self.assertRaisesRegexp( - NotImplementedError, - 'Invalid returnType.*scalar Pandas UDF.*MapType'): - pandas_udf(lambda x: x, 
MapType(StringType(), IntegerType())) - - def test_vectorized_udf_dates(self): - from pyspark.sql.functions import pandas_udf, col - from datetime import date - schema = StructType().add("idx", LongType()).add("date", DateType()) - data = [(0, date(1969, 1, 1),), - (1, date(2012, 2, 2),), - (2, None,), - (3, date(2100, 4, 4),)] - df = self.spark.createDataFrame(data, schema=schema) - - date_copy = pandas_udf(lambda t: t, returnType=DateType()) - df = df.withColumn("date_copy", date_copy(col("date"))) - - @pandas_udf(returnType=StringType()) - def check_data(idx, date, date_copy): - import pandas as pd - msgs = [] - is_equal = date.isnull() - for i in range(len(idx)): - if (is_equal[i] and data[idx[i]][1] is None) or \ - date[i] == data[idx[i]][1]: - msgs.append(None) - else: - msgs.append( - "date values are not equal (date='%s': data[%d][1]='%s')" - % (date[i], idx[i], data[idx[i]][1])) - return pd.Series(msgs) - - result = df.withColumn("check_data", - check_data(col("idx"), col("date"), col("date_copy"))).collect() - - self.assertEquals(len(data), len(result)) - for i in range(len(result)): - self.assertEquals(data[i][1], result[i][1]) # "date" col - self.assertEquals(data[i][1], result[i][2]) # "date_copy" col - self.assertIsNone(result[i][3]) # "check_data" col - - def test_vectorized_udf_timestamps(self): - from pyspark.sql.functions import pandas_udf, col - from datetime import datetime - schema = StructType([ - StructField("idx", LongType(), True), - StructField("timestamp", TimestampType(), True)]) - data = [(0, datetime(1969, 1, 1, 1, 1, 1)), - (1, datetime(2012, 2, 2, 2, 2, 2)), - (2, None), - (3, datetime(2100, 3, 3, 3, 3, 3))] - - df = self.spark.createDataFrame(data, schema=schema) - - # Check that a timestamp passed through a pandas_udf will not be altered by timezone calc - f_timestamp_copy = pandas_udf(lambda t: t, returnType=TimestampType()) - df = df.withColumn("timestamp_copy", f_timestamp_copy(col("timestamp"))) - - @pandas_udf(returnType=StringType()) - def check_data(idx, timestamp, timestamp_copy): - import pandas as pd - msgs = [] - is_equal = timestamp.isnull() # use this array to check values are equal - for i in range(len(idx)): - # Check that timestamps are as expected in the UDF - if (is_equal[i] and data[idx[i]][1] is None) or \ - timestamp[i].to_pydatetime() == data[idx[i]][1]: - msgs.append(None) - else: - msgs.append( - "timestamp values are not equal (timestamp='%s': data[%d][1]='%s')" - % (timestamp[i], idx[i], data[idx[i]][1])) - return pd.Series(msgs) - - result = df.withColumn("check_data", check_data(col("idx"), col("timestamp"), - col("timestamp_copy"))).collect() - # Check that collection values are correct - self.assertEquals(len(data), len(result)) - for i in range(len(result)): - self.assertEquals(data[i][1], result[i][1]) # "timestamp" col - self.assertEquals(data[i][1], result[i][2]) # "timestamp_copy" col - self.assertIsNone(result[i][3]) # "check_data" col - - def test_vectorized_udf_return_timestamp_tz(self): - from pyspark.sql.functions import pandas_udf, col - import pandas as pd - df = self.spark.range(10) - - @pandas_udf(returnType=TimestampType()) - def gen_timestamps(id): - ts = [pd.Timestamp(i, unit='D', tz='America/Los_Angeles') for i in id] - return pd.Series(ts) - - result = df.withColumn("ts", gen_timestamps(col("id"))).collect() - spark_ts_t = TimestampType() - for r in result: - i, ts = r - ts_tz = pd.Timestamp(i, unit='D', tz='America/Los_Angeles').to_pydatetime() - expected = 
spark_ts_t.fromInternal(spark_ts_t.toInternal(ts_tz)) - self.assertEquals(expected, ts) - - def test_vectorized_udf_check_config(self): - from pyspark.sql.functions import pandas_udf, col - import pandas as pd - with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 3}): - df = self.spark.range(10, numPartitions=1) - - @pandas_udf(returnType=LongType()) - def check_records_per_batch(x): - return pd.Series(x.size).repeat(x.size) - - result = df.select(check_records_per_batch(col("id"))).collect() - for (r,) in result: - self.assertTrue(r <= 3) - - def test_vectorized_udf_timestamps_respect_session_timezone(self): - from pyspark.sql.functions import pandas_udf, col - from datetime import datetime - import pandas as pd - schema = StructType([ - StructField("idx", LongType(), True), - StructField("timestamp", TimestampType(), True)]) - data = [(1, datetime(1969, 1, 1, 1, 1, 1)), - (2, datetime(2012, 2, 2, 2, 2, 2)), - (3, None), - (4, datetime(2100, 3, 3, 3, 3, 3))] - df = self.spark.createDataFrame(data, schema=schema) - - f_timestamp_copy = pandas_udf(lambda ts: ts, TimestampType()) - internal_value = pandas_udf( - lambda ts: ts.apply(lambda ts: ts.value if ts is not pd.NaT else None), LongType()) - - timezone = "America/New_York" - with self.sql_conf({ - "spark.sql.execution.pandas.respectSessionTimeZone": False, - "spark.sql.session.timeZone": timezone}): - df_la = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))) \ - .withColumn("internal_value", internal_value(col("timestamp"))) - result_la = df_la.select(col("idx"), col("internal_value")).collect() - # Correct result_la by adjusting 3 hours difference between Los Angeles and New York - diff = 3 * 60 * 60 * 1000 * 1000 * 1000 - result_la_corrected = \ - df_la.select(col("idx"), col("tscopy"), col("internal_value") + diff).collect() - - with self.sql_conf({ - "spark.sql.execution.pandas.respectSessionTimeZone": True, - "spark.sql.session.timeZone": timezone}): - df_ny = df.withColumn("tscopy", f_timestamp_copy(col("timestamp"))) \ - .withColumn("internal_value", internal_value(col("timestamp"))) - result_ny = df_ny.select(col("idx"), col("tscopy"), col("internal_value")).collect() - - self.assertNotEqual(result_ny, result_la) - self.assertEqual(result_ny, result_la_corrected) - - def test_nondeterministic_vectorized_udf(self): - # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations - from pyspark.sql.functions import udf, pandas_udf, col - - @pandas_udf('double') - def plus_ten(v): - return v + 10 - random_udf = self.nondeterministic_vectorized_udf - - df = self.spark.range(10).withColumn('rand', random_udf(col('id'))) - result1 = df.withColumn('plus_ten(rand)', plus_ten(df['rand'])).toPandas() - - self.assertEqual(random_udf.deterministic, False) - self.assertTrue(result1['plus_ten(rand)'].equals(result1['rand'] + 10)) - - def test_nondeterministic_vectorized_udf_in_aggregate(self): - from pyspark.sql.functions import pandas_udf, sum - - df = self.spark.range(10) - random_udf = self.nondeterministic_vectorized_udf - - with QuietTest(self.sc): - with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): - df.groupby(df.id).agg(sum(random_udf(df.id))).collect() - with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): - df.agg(sum(random_udf(df.id))).collect() - - def test_register_vectorized_udf_basic(self): - from pyspark.rdd import PythonEvalType - from pyspark.sql.functions import pandas_udf, col, expr - df = self.spark.range(10).select( - 
col('id').cast('int').alias('a'), - col('id').cast('int').alias('b')) - original_add = pandas_udf(lambda x, y: x + y, IntegerType()) - self.assertEqual(original_add.deterministic, True) - self.assertEqual(original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF) - new_add = self.spark.catalog.registerFunction("add1", original_add) - res1 = df.select(new_add(col('a'), col('b'))) - res2 = self.spark.sql( - "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t") - expected = df.select(expr('a + b')) - self.assertEquals(expected.collect(), res1.collect()) - self.assertEquals(expected.collect(), res2.collect()) - - # Regression test for SPARK-23314 - def test_timestamp_dst(self): - from pyspark.sql.functions import pandas_udf - # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am - dt = [datetime.datetime(2015, 11, 1, 0, 30), - datetime.datetime(2015, 11, 1, 1, 30), - datetime.datetime(2015, 11, 1, 2, 30)] - df = self.spark.createDataFrame(dt, 'timestamp').toDF('time') - foo_udf = pandas_udf(lambda x: x, 'timestamp') - result = df.withColumn('time', foo_udf(df.time)) - self.assertEquals(df.collect(), result.collect()) - - @unittest.skipIf(sys.version_info[:2] < (3, 5), "Type hints are supported from Python 3.5.") - def test_type_annotation(self): - from pyspark.sql.functions import pandas_udf - # Regression test to check if type hints can be used. See SPARK-23569. - # Note that it throws an error during compilation in lower Python versions if 'exec' - # is not used. Also, note that we explicitly use another dictionary to avoid modifications - # in the current 'locals()'. - # - # Hyukjin: I think it's an ugly way to test issues about syntax specific in - # higher versions of Python, which we shouldn't encourage. This was the last resort - # I could come up with at that time. - _locals = {} - exec( - "import pandas as pd\ndef noop(col: pd.Series) -> pd.Series: return col", - _locals) - df = self.spark.range(1).select(pandas_udf(f=_locals['noop'], returnType='bigint')('id')) - self.assertEqual(df.first()[0], 0) - - def test_mixed_udf(self): - import pandas as pd - from pyspark.sql.functions import col, udf, pandas_udf - - df = self.spark.range(0, 1).toDF('v') - - # Test mixture of multiple UDFs and Pandas UDFs. 
- - @udf('int') - def f1(x): - assert type(x) == int - return x + 1 - - @pandas_udf('int') - def f2(x): - assert type(x) == pd.Series - return x + 10 - - @udf('int') - def f3(x): - assert type(x) == int - return x + 100 - - @pandas_udf('int') - def f4(x): - assert type(x) == pd.Series - return x + 1000 - - # Test single expression with chained UDFs - df_chained_1 = df.withColumn('f2_f1', f2(f1(df['v']))) - df_chained_2 = df.withColumn('f3_f2_f1', f3(f2(f1(df['v'])))) - df_chained_3 = df.withColumn('f4_f3_f2_f1', f4(f3(f2(f1(df['v']))))) - df_chained_4 = df.withColumn('f4_f2_f1', f4(f2(f1(df['v'])))) - df_chained_5 = df.withColumn('f4_f3_f1', f4(f3(f1(df['v'])))) - - expected_chained_1 = df.withColumn('f2_f1', df['v'] + 11) - expected_chained_2 = df.withColumn('f3_f2_f1', df['v'] + 111) - expected_chained_3 = df.withColumn('f4_f3_f2_f1', df['v'] + 1111) - expected_chained_4 = df.withColumn('f4_f2_f1', df['v'] + 1011) - expected_chained_5 = df.withColumn('f4_f3_f1', df['v'] + 1101) - - self.assertEquals(expected_chained_1.collect(), df_chained_1.collect()) - self.assertEquals(expected_chained_2.collect(), df_chained_2.collect()) - self.assertEquals(expected_chained_3.collect(), df_chained_3.collect()) - self.assertEquals(expected_chained_4.collect(), df_chained_4.collect()) - self.assertEquals(expected_chained_5.collect(), df_chained_5.collect()) - - # Test multiple mixed UDF expressions in a single projection - df_multi_1 = df \ - .withColumn('f1', f1(col('v'))) \ - .withColumn('f2', f2(col('v'))) \ - .withColumn('f3', f3(col('v'))) \ - .withColumn('f4', f4(col('v'))) \ - .withColumn('f2_f1', f2(col('f1'))) \ - .withColumn('f3_f1', f3(col('f1'))) \ - .withColumn('f4_f1', f4(col('f1'))) \ - .withColumn('f3_f2', f3(col('f2'))) \ - .withColumn('f4_f2', f4(col('f2'))) \ - .withColumn('f4_f3', f4(col('f3'))) \ - .withColumn('f3_f2_f1', f3(col('f2_f1'))) \ - .withColumn('f4_f2_f1', f4(col('f2_f1'))) \ - .withColumn('f4_f3_f1', f4(col('f3_f1'))) \ - .withColumn('f4_f3_f2', f4(col('f3_f2'))) \ - .withColumn('f4_f3_f2_f1', f4(col('f3_f2_f1'))) - - # Test mixed udfs in a single expression - df_multi_2 = df \ - .withColumn('f1', f1(col('v'))) \ - .withColumn('f2', f2(col('v'))) \ - .withColumn('f3', f3(col('v'))) \ - .withColumn('f4', f4(col('v'))) \ - .withColumn('f2_f1', f2(f1(col('v')))) \ - .withColumn('f3_f1', f3(f1(col('v')))) \ - .withColumn('f4_f1', f4(f1(col('v')))) \ - .withColumn('f3_f2', f3(f2(col('v')))) \ - .withColumn('f4_f2', f4(f2(col('v')))) \ - .withColumn('f4_f3', f4(f3(col('v')))) \ - .withColumn('f3_f2_f1', f3(f2(f1(col('v'))))) \ - .withColumn('f4_f2_f1', f4(f2(f1(col('v'))))) \ - .withColumn('f4_f3_f1', f4(f3(f1(col('v'))))) \ - .withColumn('f4_f3_f2', f4(f3(f2(col('v'))))) \ - .withColumn('f4_f3_f2_f1', f4(f3(f2(f1(col('v')))))) - - expected = df \ - .withColumn('f1', df['v'] + 1) \ - .withColumn('f2', df['v'] + 10) \ - .withColumn('f3', df['v'] + 100) \ - .withColumn('f4', df['v'] + 1000) \ - .withColumn('f2_f1', df['v'] + 11) \ - .withColumn('f3_f1', df['v'] + 101) \ - .withColumn('f4_f1', df['v'] + 1001) \ - .withColumn('f3_f2', df['v'] + 110) \ - .withColumn('f4_f2', df['v'] + 1010) \ - .withColumn('f4_f3', df['v'] + 1100) \ - .withColumn('f3_f2_f1', df['v'] + 111) \ - .withColumn('f4_f2_f1', df['v'] + 1011) \ - .withColumn('f4_f3_f1', df['v'] + 1101) \ - .withColumn('f4_f3_f2', df['v'] + 1110) \ - .withColumn('f4_f3_f2_f1', df['v'] + 1111) - - self.assertEquals(expected.collect(), df_multi_1.collect()) - self.assertEquals(expected.collect(), df_multi_2.collect()) - - def 
test_mixed_udf_and_sql(self): - import pandas as pd - from pyspark.sql import Column - from pyspark.sql.functions import udf, pandas_udf - - df = self.spark.range(0, 1).toDF('v') - - # Test mixture of UDFs, Pandas UDFs and SQL expression. - - @udf('int') - def f1(x): - assert type(x) == int - return x + 1 - - def f2(x): - assert type(x) == Column - return x + 10 - - @pandas_udf('int') - def f3(x): - assert type(x) == pd.Series - return x + 100 - - df1 = df.withColumn('f1', f1(df['v'])) \ - .withColumn('f2', f2(df['v'])) \ - .withColumn('f3', f3(df['v'])) \ - .withColumn('f1_f2', f1(f2(df['v']))) \ - .withColumn('f1_f3', f1(f3(df['v']))) \ - .withColumn('f2_f1', f2(f1(df['v']))) \ - .withColumn('f2_f3', f2(f3(df['v']))) \ - .withColumn('f3_f1', f3(f1(df['v']))) \ - .withColumn('f3_f2', f3(f2(df['v']))) \ - .withColumn('f1_f2_f3', f1(f2(f3(df['v'])))) \ - .withColumn('f1_f3_f2', f1(f3(f2(df['v'])))) \ - .withColumn('f2_f1_f3', f2(f1(f3(df['v'])))) \ - .withColumn('f2_f3_f1', f2(f3(f1(df['v'])))) \ - .withColumn('f3_f1_f2', f3(f1(f2(df['v'])))) \ - .withColumn('f3_f2_f1', f3(f2(f1(df['v'])))) - - expected = df.withColumn('f1', df['v'] + 1) \ - .withColumn('f2', df['v'] + 10) \ - .withColumn('f3', df['v'] + 100) \ - .withColumn('f1_f2', df['v'] + 11) \ - .withColumn('f1_f3', df['v'] + 101) \ - .withColumn('f2_f1', df['v'] + 11) \ - .withColumn('f2_f3', df['v'] + 110) \ - .withColumn('f3_f1', df['v'] + 101) \ - .withColumn('f3_f2', df['v'] + 110) \ - .withColumn('f1_f2_f3', df['v'] + 111) \ - .withColumn('f1_f3_f2', df['v'] + 111) \ - .withColumn('f2_f1_f3', df['v'] + 111) \ - .withColumn('f2_f3_f1', df['v'] + 111) \ - .withColumn('f3_f1_f2', df['v'] + 111) \ - .withColumn('f3_f2_f1', df['v'] + 111) - - self.assertEquals(expected.collect(), df1.collect()) - - # SPARK-24721 - @unittest.skipIf(not _test_compiled, _test_not_compiled_message) - def test_datasource_with_udf(self): - # Same as SQLTests.test_datasource_with_udf, but with Pandas UDF - # This needs to a separate test because Arrow dependency is optional - import pandas as pd - import numpy as np - from pyspark.sql.functions import pandas_udf, lit, col - - path = tempfile.mkdtemp() - shutil.rmtree(path) - - try: - self.spark.range(1).write.mode("overwrite").format('csv').save(path) - filesource_df = self.spark.read.option('inferSchema', True).csv(path).toDF('i') - datasource_df = self.spark.read \ - .format("org.apache.spark.sql.sources.SimpleScanSource") \ - .option('from', 0).option('to', 1).load().toDF('i') - datasource_v2_df = self.spark.read \ - .format("org.apache.spark.sql.sources.v2.SimpleDataSourceV2") \ - .load().toDF('i', 'j') - - c1 = pandas_udf(lambda x: x + 1, 'int')(lit(1)) - c2 = pandas_udf(lambda x: x + 1, 'int')(col('i')) - - f1 = pandas_udf(lambda x: pd.Series(np.repeat(False, len(x))), 'boolean')(lit(1)) - f2 = pandas_udf(lambda x: pd.Series(np.repeat(False, len(x))), 'boolean')(col('i')) - - for df in [filesource_df, datasource_df, datasource_v2_df]: - result = df.withColumn('c', c1) - expected = df.withColumn('c', lit(2)) - self.assertEquals(expected.collect(), result.collect()) - - for df in [filesource_df, datasource_df, datasource_v2_df]: - result = df.withColumn('c', c2) - expected = df.withColumn('c', col('i') + 1) - self.assertEquals(expected.collect(), result.collect()) - - for df in [filesource_df, datasource_df, datasource_v2_df]: - for f in [f1, f2]: - result = df.filter(f) - self.assertEquals(0, result.count()) - finally: - shutil.rmtree(path) - - -@unittest.skipIf( - not _have_pandas or not 
_have_pyarrow, - _pandas_requirement_message or _pyarrow_requirement_message) -class GroupedMapPandasUDFTests(ReusedSQLTestCase): - - @property - def data(self): - from pyspark.sql.functions import array, explode, col, lit - return self.spark.range(10).toDF('id') \ - .withColumn("vs", array([lit(i) for i in range(20, 30)])) \ - .withColumn("v", explode(col('vs'))).drop('vs') - - def test_supported_types(self): - from decimal import Decimal - from distutils.version import LooseVersion - import pyarrow as pa - from pyspark.sql.functions import pandas_udf, PandasUDFType - - values = [ - 1, 2, 3, - 4, 5, 1.1, - 2.2, Decimal(1.123), - [1, 2, 2], True, 'hello' - ] - output_fields = [ - ('id', IntegerType()), ('byte', ByteType()), ('short', ShortType()), - ('int', IntegerType()), ('long', LongType()), ('float', FloatType()), - ('double', DoubleType()), ('decim', DecimalType(10, 3)), - ('array', ArrayType(IntegerType())), ('bool', BooleanType()), ('str', StringType()) - ] - - # TODO: Add BinaryType to variables above once minimum pyarrow version is 0.10.0 - if LooseVersion(pa.__version__) >= LooseVersion("0.10.0"): - values.append(bytearray([0x01, 0x02])) - output_fields.append(('bin', BinaryType())) - - output_schema = StructType([StructField(*x) for x in output_fields]) - df = self.spark.createDataFrame([values], schema=output_schema) - - # Different forms of group map pandas UDF, results of these are the same - udf1 = pandas_udf( - lambda pdf: pdf.assign( - byte=pdf.byte * 2, - short=pdf.short * 2, - int=pdf.int * 2, - long=pdf.long * 2, - float=pdf.float * 2, - double=pdf.double * 2, - decim=pdf.decim * 2, - bool=False if pdf.bool else True, - str=pdf.str + 'there', - array=pdf.array, - ), - output_schema, - PandasUDFType.GROUPED_MAP - ) - - udf2 = pandas_udf( - lambda _, pdf: pdf.assign( - byte=pdf.byte * 2, - short=pdf.short * 2, - int=pdf.int * 2, - long=pdf.long * 2, - float=pdf.float * 2, - double=pdf.double * 2, - decim=pdf.decim * 2, - bool=False if pdf.bool else True, - str=pdf.str + 'there', - array=pdf.array, - ), - output_schema, - PandasUDFType.GROUPED_MAP - ) - - udf3 = pandas_udf( - lambda key, pdf: pdf.assign( - id=key[0], - byte=pdf.byte * 2, - short=pdf.short * 2, - int=pdf.int * 2, - long=pdf.long * 2, - float=pdf.float * 2, - double=pdf.double * 2, - decim=pdf.decim * 2, - bool=False if pdf.bool else True, - str=pdf.str + 'there', - array=pdf.array, - ), - output_schema, - PandasUDFType.GROUPED_MAP - ) - - result1 = df.groupby('id').apply(udf1).sort('id').toPandas() - expected1 = df.toPandas().groupby('id').apply(udf1.func).reset_index(drop=True) - - result2 = df.groupby('id').apply(udf2).sort('id').toPandas() - expected2 = expected1 - - result3 = df.groupby('id').apply(udf3).sort('id').toPandas() - expected3 = expected1 - - self.assertPandasEqual(expected1, result1) - self.assertPandasEqual(expected2, result2) - self.assertPandasEqual(expected3, result3) - - def test_array_type_correct(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType, array, col - - df = self.data.withColumn("arr", array(col("id"))).repartition(1, "id") - - output_schema = StructType( - [StructField('id', LongType()), - StructField('v', IntegerType()), - StructField('arr', ArrayType(LongType()))]) - - udf = pandas_udf( - lambda pdf: pdf, - output_schema, - PandasUDFType.GROUPED_MAP - ) - - result = df.groupby('id').apply(udf).sort('id').toPandas() - expected = df.toPandas().groupby('id').apply(udf.func).reset_index(drop=True) - self.assertPandasEqual(expected, result) - - def 
test_register_grouped_map_udf(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP) - with QuietTest(self.sc): - with self.assertRaisesRegexp( - ValueError, - 'f.*SQL_BATCHED_UDF.*SQL_SCALAR_PANDAS_UDF.*SQL_GROUPED_AGG_PANDAS_UDF.*'): - self.spark.catalog.registerFunction("foo_udf", foo_udf) - - def test_decorator(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - df = self.data - - @pandas_udf( - 'id long, v int, v1 double, v2 long', - PandasUDFType.GROUPED_MAP - ) - def foo(pdf): - return pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id) - - result = df.groupby('id').apply(foo).sort('id').toPandas() - expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) - self.assertPandasEqual(expected, result) - - def test_coerce(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - df = self.data - - foo = pandas_udf( - lambda pdf: pdf, - 'id long, v double', - PandasUDFType.GROUPED_MAP - ) - - result = df.groupby('id').apply(foo).sort('id').toPandas() - expected = df.toPandas().groupby('id').apply(foo.func).reset_index(drop=True) - expected = expected.assign(v=expected.v.astype('float64')) - self.assertPandasEqual(expected, result) - - def test_complex_groupby(self): - from pyspark.sql.functions import pandas_udf, col, PandasUDFType - df = self.data - - @pandas_udf( - 'id long, v int, norm double', - PandasUDFType.GROUPED_MAP - ) - def normalize(pdf): - v = pdf.v - return pdf.assign(norm=(v - v.mean()) / v.std()) - - result = df.groupby(col('id') % 2 == 0).apply(normalize).sort('id', 'v').toPandas() - pdf = df.toPandas() - expected = pdf.groupby(pdf['id'] % 2 == 0).apply(normalize.func) - expected = expected.sort_values(['id', 'v']).reset_index(drop=True) - expected = expected.assign(norm=expected.norm.astype('float64')) - self.assertPandasEqual(expected, result) - - def test_empty_groupby(self): - from pyspark.sql.functions import pandas_udf, col, PandasUDFType - df = self.data - - @pandas_udf( - 'id long, v int, norm double', - PandasUDFType.GROUPED_MAP - ) - def normalize(pdf): - v = pdf.v - return pdf.assign(norm=(v - v.mean()) / v.std()) - - result = df.groupby().apply(normalize).sort('id', 'v').toPandas() - pdf = df.toPandas() - expected = normalize.func(pdf) - expected = expected.sort_values(['id', 'v']).reset_index(drop=True) - expected = expected.assign(norm=expected.norm.astype('float64')) - self.assertPandasEqual(expected, result) - - def test_datatype_string(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - df = self.data - - foo_udf = pandas_udf( - lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id), - 'id long, v int, v1 double, v2 long', - PandasUDFType.GROUPED_MAP - ) - - result = df.groupby('id').apply(foo_udf).sort('id').toPandas() - expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) - self.assertPandasEqual(expected, result) - - def test_wrong_return_type(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - with QuietTest(self.sc): - with self.assertRaisesRegexp( - NotImplementedError, - 'Invalid returnType.*grouped map Pandas UDF.*MapType'): - pandas_udf( - lambda pdf: pdf, - 'id long, v map', - PandasUDFType.GROUPED_MAP) - - def test_wrong_args(self): - from pyspark.sql.functions import udf, pandas_udf, sum, PandasUDFType - df = self.data - - with QuietTest(self.sc): - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): - 
df.groupby('id').apply(lambda x: x) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): - df.groupby('id').apply(udf(lambda x: x, DoubleType())) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): - df.groupby('id').apply(sum(df.v)) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): - df.groupby('id').apply(df.v + 1) - with self.assertRaisesRegexp(ValueError, 'Invalid function'): - df.groupby('id').apply( - pandas_udf(lambda: 1, StructType([StructField("d", DoubleType())]))) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): - df.groupby('id').apply(pandas_udf(lambda x, y: x, DoubleType())) - with self.assertRaisesRegexp(ValueError, 'Invalid udf.*GROUPED_MAP'): - df.groupby('id').apply( - pandas_udf(lambda x, y: x, DoubleType(), PandasUDFType.SCALAR)) - - def test_unsupported_types(self): - from distutils.version import LooseVersion - import pyarrow as pa - from pyspark.sql.functions import pandas_udf, PandasUDFType - - common_err_msg = 'Invalid returnType.*grouped map Pandas UDF.*' - unsupported_types = [ - StructField('map', MapType(StringType(), IntegerType())), - StructField('arr_ts', ArrayType(TimestampType())), - StructField('null', NullType()), - ] - - # TODO: Remove this if-statement once minimum pyarrow version is 0.10.0 - if LooseVersion(pa.__version__) < LooseVersion("0.10.0"): - unsupported_types.append(StructField('bin', BinaryType())) - - for unsupported_type in unsupported_types: - schema = StructType([StructField('id', LongType(), True), unsupported_type]) - with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, common_err_msg): - pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP) - - # Regression test for SPARK-23314 - def test_timestamp_dst(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - # Daylight saving time for Los Angeles for 2015 is Sun, Nov 1 at 2:00 am - dt = [datetime.datetime(2015, 11, 1, 0, 30), - datetime.datetime(2015, 11, 1, 1, 30), - datetime.datetime(2015, 11, 1, 2, 30)] - df = self.spark.createDataFrame(dt, 'timestamp').toDF('time') - foo_udf = pandas_udf(lambda pdf: pdf, 'time timestamp', PandasUDFType.GROUPED_MAP) - result = df.groupby('time').apply(foo_udf).sort('time') - self.assertPandasEqual(df.toPandas(), result.toPandas()) - - def test_udf_with_key(self): - from pyspark.sql.functions import pandas_udf, col, PandasUDFType - df = self.data - pdf = df.toPandas() - - def foo1(key, pdf): - import numpy as np - assert type(key) == tuple - assert type(key[0]) == np.int64 - - return pdf.assign(v1=key[0], - v2=pdf.v * key[0], - v3=pdf.v * pdf.id, - v4=pdf.v * pdf.id.mean()) - - def foo2(key, pdf): - import numpy as np - assert type(key) == tuple - assert type(key[0]) == np.int64 - assert type(key[1]) == np.int32 - - return pdf.assign(v1=key[0], - v2=key[1], - v3=pdf.v * key[0], - v4=pdf.v + key[1]) - - def foo3(key, pdf): - assert type(key) == tuple - assert len(key) == 0 - return pdf.assign(v1=pdf.v * pdf.id) - - # v2 is int because numpy.int64 * pd.Series results in pd.Series - # v3 is long because pd.Series * pd.Series results in pd.Series - udf1 = pandas_udf( - foo1, - 'id long, v int, v1 long, v2 int, v3 long, v4 double', - PandasUDFType.GROUPED_MAP) - - udf2 = pandas_udf( - foo2, - 'id long, v int, v1 long, v2 int, v3 int, v4 int', - PandasUDFType.GROUPED_MAP) - - udf3 = pandas_udf( - foo3, - 'id long, v int, v1 long', - PandasUDFType.GROUPED_MAP) - - # Test groupby column - result1 = df.groupby('id').apply(udf1).sort('id', 'v').toPandas() - expected1 = 
pdf.groupby('id')\ - .apply(lambda x: udf1.func((x.id.iloc[0],), x))\ - .sort_values(['id', 'v']).reset_index(drop=True) - self.assertPandasEqual(expected1, result1) - - # Test groupby expression - result2 = df.groupby(df.id % 2).apply(udf1).sort('id', 'v').toPandas() - expected2 = pdf.groupby(pdf.id % 2)\ - .apply(lambda x: udf1.func((x.id.iloc[0] % 2,), x))\ - .sort_values(['id', 'v']).reset_index(drop=True) - self.assertPandasEqual(expected2, result2) - - # Test complex groupby - result3 = df.groupby(df.id, df.v % 2).apply(udf2).sort('id', 'v').toPandas() - expected3 = pdf.groupby([pdf.id, pdf.v % 2])\ - .apply(lambda x: udf2.func((x.id.iloc[0], (x.v % 2).iloc[0],), x))\ - .sort_values(['id', 'v']).reset_index(drop=True) - self.assertPandasEqual(expected3, result3) - - # Test empty groupby - result4 = df.groupby().apply(udf3).sort('id', 'v').toPandas() - expected4 = udf3.func((), pdf) - self.assertPandasEqual(expected4, result4) - - def test_column_order(self): - from collections import OrderedDict - import pandas as pd - from pyspark.sql.functions import pandas_udf, PandasUDFType - - # Helper function to set column names from a list - def rename_pdf(pdf, names): - pdf.rename(columns={old: new for old, new in - zip(pd_result.columns, names)}, inplace=True) - - df = self.data - grouped_df = df.groupby('id') - grouped_pdf = df.toPandas().groupby('id') - - # Function returns a pdf with required column names, but order could be arbitrary using dict - def change_col_order(pdf): - # Constructing a DataFrame from a dict should result in the same order, - # but use from_items to ensure the pdf column order is different than schema - return pd.DataFrame.from_items([ - ('id', pdf.id), - ('u', pdf.v * 2), - ('v', pdf.v)]) - - ordered_udf = pandas_udf( - change_col_order, - 'id long, v int, u int', - PandasUDFType.GROUPED_MAP - ) - - # The UDF result should assign columns by name from the pdf - result = grouped_df.apply(ordered_udf).sort('id', 'v')\ - .select('id', 'u', 'v').toPandas() - pd_result = grouped_pdf.apply(change_col_order) - expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True) - self.assertPandasEqual(expected, result) - - # Function returns a pdf with positional columns, indexed by range - def range_col_order(pdf): - # Create a DataFrame with positional columns, fix types to long - return pd.DataFrame(list(zip(pdf.id, pdf.v * 3, pdf.v)), dtype='int64') - - range_udf = pandas_udf( - range_col_order, - 'id long, u long, v long', - PandasUDFType.GROUPED_MAP - ) - - # The UDF result uses positional columns from the pdf - result = grouped_df.apply(range_udf).sort('id', 'v') \ - .select('id', 'u', 'v').toPandas() - pd_result = grouped_pdf.apply(range_col_order) - rename_pdf(pd_result, ['id', 'u', 'v']) - expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True) - self.assertPandasEqual(expected, result) - - # Function returns a pdf with columns indexed with integers - def int_index(pdf): - return pd.DataFrame(OrderedDict([(0, pdf.id), (1, pdf.v * 4), (2, pdf.v)])) - - int_index_udf = pandas_udf( - int_index, - 'id long, u int, v int', - PandasUDFType.GROUPED_MAP - ) - - # The UDF result should assign columns by position of integer index - result = grouped_df.apply(int_index_udf).sort('id', 'v') \ - .select('id', 'u', 'v').toPandas() - pd_result = grouped_pdf.apply(int_index) - rename_pdf(pd_result, ['id', 'u', 'v']) - expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True) - self.assertPandasEqual(expected, result) - - @pandas_udf('id long, v int', 
PandasUDFType.GROUPED_MAP) - def column_name_typo(pdf): - return pd.DataFrame({'iid': pdf.id, 'v': pdf.v}) - - @pandas_udf('id long, v int', PandasUDFType.GROUPED_MAP) - def invalid_positional_types(pdf): - return pd.DataFrame([(u'a', 1.2)]) - - with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "KeyError: 'id'"): - grouped_df.apply(column_name_typo).collect() - from distutils.version import LooseVersion - import pyarrow as pa - if LooseVersion(pa.__version__) < LooseVersion("0.11.0"): - # TODO: see ARROW-1949. Remove when the minimum PyArrow version becomes 0.11.0. - with self.assertRaisesRegexp(Exception, "No cast implemented"): - grouped_df.apply(invalid_positional_types).collect() - else: - with self.assertRaisesRegexp(Exception, "an integer is required"): - grouped_df.apply(invalid_positional_types).collect() - - def test_positional_assignment_conf(self): - import pandas as pd - from pyspark.sql.functions import pandas_udf, PandasUDFType - - with self.sql_conf({ - "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName": False}): - - @pandas_udf("a string, b float", PandasUDFType.GROUPED_MAP) - def foo(_): - return pd.DataFrame([('hi', 1)], columns=['x', 'y']) - - df = self.data - result = df.groupBy('id').apply(foo).select('a', 'b').collect() - for r in result: - self.assertEqual(r.a, 'hi') - self.assertEqual(r.b, 1) - - def test_self_join_with_pandas(self): - import pyspark.sql.functions as F - - @F.pandas_udf('key long, col string', F.PandasUDFType.GROUPED_MAP) - def dummy_pandas_udf(df): - return df[['key', 'col']] - - df = self.spark.createDataFrame([Row(key=1, col='A'), Row(key=1, col='B'), - Row(key=2, col='C')]) - df_with_pandas = df.groupBy('key').apply(dummy_pandas_udf) - - # this was throwing an AnalysisException before SPARK-24208 - res = df_with_pandas.alias('temp0').join(df_with_pandas.alias('temp1'), - F.col('temp0.key') == F.col('temp1.key')) - self.assertEquals(res.count(), 5) - - def test_mixed_scalar_udfs_followed_by_grouby_apply(self): - import pandas as pd - from pyspark.sql.functions import udf, pandas_udf, PandasUDFType - - df = self.spark.range(0, 10).toDF('v1') - df = df.withColumn('v2', udf(lambda x: x + 1, 'int')(df['v1'])) \ - .withColumn('v3', pandas_udf(lambda x: x + 2, 'int')(df['v1'])) - - result = df.groupby() \ - .apply(pandas_udf(lambda x: pd.DataFrame([x.sum().sum()]), - 'sum int', - PandasUDFType.GROUPED_MAP)) - - self.assertEquals(result.collect()[0]['sum'], 165) - - -@unittest.skipIf( - not _have_pandas or not _have_pyarrow, - _pandas_requirement_message or _pyarrow_requirement_message) -class GroupedAggPandasUDFTests(ReusedSQLTestCase): - - @property - def data(self): - from pyspark.sql.functions import array, explode, col, lit - return self.spark.range(10).toDF('id') \ - .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \ - .withColumn("v", explode(col('vs'))) \ - .drop('vs') \ - .withColumn('w', lit(1.0)) - - @property - def python_plus_one(self): - from pyspark.sql.functions import udf - - @udf('double') - def plus_one(v): - assert isinstance(v, (int, float)) - return v + 1 - return plus_one - - @property - def pandas_scalar_plus_two(self): - import pandas as pd - from pyspark.sql.functions import pandas_udf, PandasUDFType - - @pandas_udf('double', PandasUDFType.SCALAR) - def plus_two(v): - assert isinstance(v, pd.Series) - return v + 2 - return plus_two - - @property - def pandas_agg_mean_udf(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - @pandas_udf('double', 
PandasUDFType.GROUPED_AGG) - def avg(v): - return v.mean() - return avg - - @property - def pandas_agg_sum_udf(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - @pandas_udf('double', PandasUDFType.GROUPED_AGG) - def sum(v): - return v.sum() - return sum - - @property - def pandas_agg_weighted_mean_udf(self): - import numpy as np - from pyspark.sql.functions import pandas_udf, PandasUDFType - - @pandas_udf('double', PandasUDFType.GROUPED_AGG) - def weighted_mean(v, w): - return np.average(v, weights=w) - return weighted_mean - - def test_manual(self): - from pyspark.sql.functions import pandas_udf, array - - df = self.data - sum_udf = self.pandas_agg_sum_udf - mean_udf = self.pandas_agg_mean_udf - mean_arr_udf = pandas_udf( - self.pandas_agg_mean_udf.func, - ArrayType(self.pandas_agg_mean_udf.returnType), - self.pandas_agg_mean_udf.evalType) - - result1 = df.groupby('id').agg( - sum_udf(df.v), - mean_udf(df.v), - mean_arr_udf(array(df.v))).sort('id') - expected1 = self.spark.createDataFrame( - [[0, 245.0, 24.5, [24.5]], - [1, 255.0, 25.5, [25.5]], - [2, 265.0, 26.5, [26.5]], - [3, 275.0, 27.5, [27.5]], - [4, 285.0, 28.5, [28.5]], - [5, 295.0, 29.5, [29.5]], - [6, 305.0, 30.5, [30.5]], - [7, 315.0, 31.5, [31.5]], - [8, 325.0, 32.5, [32.5]], - [9, 335.0, 33.5, [33.5]]], - ['id', 'sum(v)', 'avg(v)', 'avg(array(v))']) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - - def test_basic(self): - from pyspark.sql.functions import col, lit, sum, mean - - df = self.data - weighted_mean_udf = self.pandas_agg_weighted_mean_udf - - # Groupby one column and aggregate one UDF with literal - result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id') - expected1 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id') - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - - # Groupby one expression and aggregate one UDF with literal - result2 = df.groupby((col('id') + 1)).agg(weighted_mean_udf(df.v, lit(1.0)))\ - .sort(df.id + 1) - expected2 = df.groupby((col('id') + 1))\ - .agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort(df.id + 1) - self.assertPandasEqual(expected2.toPandas(), result2.toPandas()) - - # Groupby one column and aggregate one UDF without literal - result3 = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id') - expected3 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, w)')).sort('id') - self.assertPandasEqual(expected3.toPandas(), result3.toPandas()) - - # Groupby one expression and aggregate one UDF without literal - result4 = df.groupby((col('id') + 1).alias('id'))\ - .agg(weighted_mean_udf(df.v, df.w))\ - .sort('id') - expected4 = df.groupby((col('id') + 1).alias('id'))\ - .agg(mean(df.v).alias('weighted_mean(v, w)'))\ - .sort('id') - self.assertPandasEqual(expected4.toPandas(), result4.toPandas()) - - def test_unsupported_types(self): - from pyspark.sql.types import DoubleType, MapType - from pyspark.sql.functions import pandas_udf, PandasUDFType - - with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, 'not supported'): - pandas_udf( - lambda x: x, - ArrayType(ArrayType(TimestampType())), - PandasUDFType.GROUPED_AGG) - - with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, 'not supported'): - @pandas_udf('mean double, std double', PandasUDFType.GROUPED_AGG) - def mean_and_std_udf(v): - return v.mean(), v.std() - - with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, 'not supported'): - 
@pandas_udf(MapType(DoubleType(), DoubleType()), PandasUDFType.GROUPED_AGG) - def mean_and_std_udf(v): - return {v.mean(): v.std()} - - def test_alias(self): - from pyspark.sql.functions import mean - - df = self.data - mean_udf = self.pandas_agg_mean_udf - - result1 = df.groupby('id').agg(mean_udf(df.v).alias('mean_alias')) - expected1 = df.groupby('id').agg(mean(df.v).alias('mean_alias')) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - - def test_mixed_sql(self): - """ - Test mixing group aggregate pandas UDF with sql expression. - """ - from pyspark.sql.functions import sum, mean - - df = self.data - sum_udf = self.pandas_agg_sum_udf - - # Mix group aggregate pandas UDF with sql expression - result1 = (df.groupby('id') - .agg(sum_udf(df.v) + 1) - .sort('id')) - expected1 = (df.groupby('id') - .agg(sum(df.v) + 1) - .sort('id')) - - # Mix group aggregate pandas UDF with sql expression (order swapped) - result2 = (df.groupby('id') - .agg(sum_udf(df.v + 1)) - .sort('id')) - - expected2 = (df.groupby('id') - .agg(sum(df.v + 1)) - .sort('id')) - - # Wrap group aggregate pandas UDF with two sql expressions - result3 = (df.groupby('id') - .agg(sum_udf(df.v + 1) + 2) - .sort('id')) - expected3 = (df.groupby('id') - .agg(sum(df.v + 1) + 2) - .sort('id')) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - self.assertPandasEqual(expected2.toPandas(), result2.toPandas()) - self.assertPandasEqual(expected3.toPandas(), result3.toPandas()) - - def test_mixed_udfs(self): - """ - Test mixing group aggregate pandas UDF with python UDF and scalar pandas UDF. - """ - from pyspark.sql.functions import sum, mean - - df = self.data - plus_one = self.python_plus_one - plus_two = self.pandas_scalar_plus_two - sum_udf = self.pandas_agg_sum_udf - - # Mix group aggregate pandas UDF and python UDF - result1 = (df.groupby('id') - .agg(plus_one(sum_udf(df.v))) - .sort('id')) - expected1 = (df.groupby('id') - .agg(plus_one(sum(df.v))) - .sort('id')) - - # Mix group aggregate pandas UDF and python UDF (order swapped) - result2 = (df.groupby('id') - .agg(sum_udf(plus_one(df.v))) - .sort('id')) - expected2 = (df.groupby('id') - .agg(sum(plus_one(df.v))) - .sort('id')) - - # Mix group aggregate pandas UDF and scalar pandas UDF - result3 = (df.groupby('id') - .agg(sum_udf(plus_two(df.v))) - .sort('id')) - expected3 = (df.groupby('id') - .agg(sum(plus_two(df.v))) - .sort('id')) - - # Mix group aggregate pandas UDF and scalar pandas UDF (order swapped) - result4 = (df.groupby('id') - .agg(plus_two(sum_udf(df.v))) - .sort('id')) - expected4 = (df.groupby('id') - .agg(plus_two(sum(df.v))) - .sort('id')) - - # Wrap group aggregate pandas UDF with two python UDFs and use python UDF in groupby - result5 = (df.groupby(plus_one(df.id)) - .agg(plus_one(sum_udf(plus_one(df.v)))) - .sort('plus_one(id)')) - expected5 = (df.groupby(plus_one(df.id)) - .agg(plus_one(sum(plus_one(df.v)))) - .sort('plus_one(id)')) - - # Wrap group aggregate pandas UDF with two scala pandas UDF and user scala pandas UDF in - # groupby - result6 = (df.groupby(plus_two(df.id)) - .agg(plus_two(sum_udf(plus_two(df.v)))) - .sort('plus_two(id)')) - expected6 = (df.groupby(plus_two(df.id)) - .agg(plus_two(sum(plus_two(df.v)))) - .sort('plus_two(id)')) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - self.assertPandasEqual(expected2.toPandas(), result2.toPandas()) - self.assertPandasEqual(expected3.toPandas(), result3.toPandas()) - self.assertPandasEqual(expected4.toPandas(), result4.toPandas()) - 
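# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the pattern test_mixed_udfs exercises
# above -- a grouped-aggregate pandas UDF composed with a plain Python UDF.
# These lines are illustrative only and are not part of the removed Spark
# test module; they assume pyspark 2.4.x with pandas and pyarrow installed,
# and the session, data and names are invented for the example.
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, udf, PandasUDFType

spark = SparkSession.builder.master("local[1]").appName("sketch").getOrCreate()
df = spark.createDataFrame([(0, 1.0), (0, 2.0), (1, 3.0)], ["id", "v"])

@pandas_udf("double", PandasUDFType.GROUPED_AGG)
def mean_udf(v):
    # receives one group's values as a pandas.Series and returns a scalar
    return v.mean()

@udf("double")
def plus_one(x):
    # ordinary row-at-a-time Python UDF
    return x + 1.0

# grouped-aggregate pandas UDF wrapped in a Python UDF, as asserted above
df.groupby("id").agg(plus_one(mean_udf(df["v"]))).show()
# ---------------------------------------------------------------------------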
self.assertPandasEqual(expected5.toPandas(), result5.toPandas()) - self.assertPandasEqual(expected6.toPandas(), result6.toPandas()) - - def test_multiple_udfs(self): - """ - Test multiple group aggregate pandas UDFs in one agg function. - """ - from pyspark.sql.functions import col, lit, sum, mean - - df = self.data - mean_udf = self.pandas_agg_mean_udf - sum_udf = self.pandas_agg_sum_udf - weighted_mean_udf = self.pandas_agg_weighted_mean_udf - - result1 = (df.groupBy('id') - .agg(mean_udf(df.v), - sum_udf(df.v), - weighted_mean_udf(df.v, df.w)) - .sort('id') - .toPandas()) - expected1 = (df.groupBy('id') - .agg(mean(df.v), - sum(df.v), - mean(df.v).alias('weighted_mean(v, w)')) - .sort('id') - .toPandas()) - - self.assertPandasEqual(expected1, result1) - - def test_complex_groupby(self): - from pyspark.sql.functions import lit, sum - - df = self.data - sum_udf = self.pandas_agg_sum_udf - plus_one = self.python_plus_one - plus_two = self.pandas_scalar_plus_two - - # groupby one expression - result1 = df.groupby(df.v % 2).agg(sum_udf(df.v)) - expected1 = df.groupby(df.v % 2).agg(sum(df.v)) - - # empty groupby - result2 = df.groupby().agg(sum_udf(df.v)) - expected2 = df.groupby().agg(sum(df.v)) - - # groupby one column and one sql expression - result3 = df.groupby(df.id, df.v % 2).agg(sum_udf(df.v)).orderBy(df.id, df.v % 2) - expected3 = df.groupby(df.id, df.v % 2).agg(sum(df.v)).orderBy(df.id, df.v % 2) - - # groupby one python UDF - result4 = df.groupby(plus_one(df.id)).agg(sum_udf(df.v)) - expected4 = df.groupby(plus_one(df.id)).agg(sum(df.v)) - - # groupby one scalar pandas UDF - result5 = df.groupby(plus_two(df.id)).agg(sum_udf(df.v)) - expected5 = df.groupby(plus_two(df.id)).agg(sum(df.v)) - - # groupby one expression and one python UDF - result6 = df.groupby(df.v % 2, plus_one(df.id)).agg(sum_udf(df.v)) - expected6 = df.groupby(df.v % 2, plus_one(df.id)).agg(sum(df.v)) - - # groupby one expression and one scalar pandas UDF - result7 = df.groupby(df.v % 2, plus_two(df.id)).agg(sum_udf(df.v)).sort('sum(v)') - expected7 = df.groupby(df.v % 2, plus_two(df.id)).agg(sum(df.v)).sort('sum(v)') - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - self.assertPandasEqual(expected2.toPandas(), result2.toPandas()) - self.assertPandasEqual(expected3.toPandas(), result3.toPandas()) - self.assertPandasEqual(expected4.toPandas(), result4.toPandas()) - self.assertPandasEqual(expected5.toPandas(), result5.toPandas()) - self.assertPandasEqual(expected6.toPandas(), result6.toPandas()) - self.assertPandasEqual(expected7.toPandas(), result7.toPandas()) - - def test_complex_expressions(self): - from pyspark.sql.functions import col, sum - - df = self.data - plus_one = self.python_plus_one - plus_two = self.pandas_scalar_plus_two - sum_udf = self.pandas_agg_sum_udf - - # Test complex expressions with sql expression, python UDF and - # group aggregate pandas UDF - result1 = (df.withColumn('v1', plus_one(df.v)) - .withColumn('v2', df.v + 2) - .groupby(df.id, df.v % 2) - .agg(sum_udf(col('v')), - sum_udf(col('v1') + 3), - sum_udf(col('v2')) + 5, - plus_one(sum_udf(col('v1'))), - sum_udf(plus_one(col('v2')))) - .sort('id') - .toPandas()) - - expected1 = (df.withColumn('v1', df.v + 1) - .withColumn('v2', df.v + 2) - .groupby(df.id, df.v % 2) - .agg(sum(col('v')), - sum(col('v1') + 3), - sum(col('v2')) + 5, - plus_one(sum(col('v1'))), - sum(plus_one(col('v2')))) - .sort('id') - .toPandas()) - - # Test complex expressions with sql expression, scala pandas UDF and - # group aggregate pandas UDF - 
result2 = (df.withColumn('v1', plus_one(df.v)) - .withColumn('v2', df.v + 2) - .groupby(df.id, df.v % 2) - .agg(sum_udf(col('v')), - sum_udf(col('v1') + 3), - sum_udf(col('v2')) + 5, - plus_two(sum_udf(col('v1'))), - sum_udf(plus_two(col('v2')))) - .sort('id') - .toPandas()) - - expected2 = (df.withColumn('v1', df.v + 1) - .withColumn('v2', df.v + 2) - .groupby(df.id, df.v % 2) - .agg(sum(col('v')), - sum(col('v1') + 3), - sum(col('v2')) + 5, - plus_two(sum(col('v1'))), - sum(plus_two(col('v2')))) - .sort('id') - .toPandas()) - - # Test sequential groupby aggregate - result3 = (df.groupby('id') - .agg(sum_udf(df.v).alias('v')) - .groupby('id') - .agg(sum_udf(col('v'))) - .sort('id') - .toPandas()) - - expected3 = (df.groupby('id') - .agg(sum(df.v).alias('v')) - .groupby('id') - .agg(sum(col('v'))) - .sort('id') - .toPandas()) - - self.assertPandasEqual(expected1, result1) - self.assertPandasEqual(expected2, result2) - self.assertPandasEqual(expected3, result3) - - def test_retain_group_columns(self): - from pyspark.sql.functions import sum, lit, col - with self.sql_conf({"spark.sql.retainGroupColumns": False}): - df = self.data - sum_udf = self.pandas_agg_sum_udf - - result1 = df.groupby(df.id).agg(sum_udf(df.v)) - expected1 = df.groupby(df.id).agg(sum(df.v)) - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - - def test_array_type(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - df = self.data - - array_udf = pandas_udf(lambda x: [1.0, 2.0], 'array', PandasUDFType.GROUPED_AGG) - result1 = df.groupby('id').agg(array_udf(df['v']).alias('v2')) - self.assertEquals(result1.first()['v2'], [1.0, 2.0]) - - def test_invalid_args(self): - from pyspark.sql.functions import mean - - df = self.data - plus_one = self.python_plus_one - mean_udf = self.pandas_agg_mean_udf - - with QuietTest(self.sc): - with self.assertRaisesRegexp( - AnalysisException, - 'nor.*aggregate function'): - df.groupby(df.id).agg(plus_one(df.v)).collect() - - with QuietTest(self.sc): - with self.assertRaisesRegexp( - AnalysisException, - 'aggregate function.*argument.*aggregate function'): - df.groupby(df.id).agg(mean_udf(mean_udf(df.v))).collect() - - with QuietTest(self.sc): - with self.assertRaisesRegexp( - AnalysisException, - 'mixture.*aggregate function.*group aggregate pandas UDF'): - df.groupby(df.id).agg(mean_udf(df.v), mean(df.v)).collect() - - def test_register_vectorized_udf_basic(self): - from pyspark.sql.functions import pandas_udf - from pyspark.rdd import PythonEvalType - - sum_pandas_udf = pandas_udf( - lambda v: v.sum(), "integer", PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF) - - self.assertEqual(sum_pandas_udf.evalType, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF) - group_agg_pandas_udf = self.spark.udf.register("sum_pandas_udf", sum_pandas_udf) - self.assertEqual(group_agg_pandas_udf.evalType, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF) - q = "SELECT sum_pandas_udf(v1) FROM VALUES (3, 0), (2, 0), (1, 1) tbl(v1, v2) GROUP BY v2" - actual = sorted(map(lambda r: r[0], self.spark.sql(q).collect())) - expected = [1, 5] - self.assertEqual(actual, expected) - - -@unittest.skipIf( - not _have_pandas or not _have_pyarrow, - _pandas_requirement_message or _pyarrow_requirement_message) -class WindowPandasUDFTests(ReusedSQLTestCase): - @property - def data(self): - from pyspark.sql.functions import array, explode, col, lit - return self.spark.range(10).toDF('id') \ - .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \ - .withColumn("v", explode(col('vs'))) \ - 
.drop('vs') \ - .withColumn('w', lit(1.0)) - - @property - def python_plus_one(self): - from pyspark.sql.functions import udf - return udf(lambda v: v + 1, 'double') - - @property - def pandas_scalar_time_two(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - return pandas_udf(lambda v: v * 2, 'double') - - @property - def pandas_agg_mean_udf(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - @pandas_udf('double', PandasUDFType.GROUPED_AGG) - def avg(v): - return v.mean() - return avg - - @property - def pandas_agg_max_udf(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - @pandas_udf('double', PandasUDFType.GROUPED_AGG) - def max(v): - return v.max() - return max - - @property - def pandas_agg_min_udf(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - @pandas_udf('double', PandasUDFType.GROUPED_AGG) - def min(v): - return v.min() - return min - - @property - def unbounded_window(self): - return Window.partitionBy('id') \ - .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) - - @property - def ordered_window(self): - return Window.partitionBy('id').orderBy('v') - - @property - def unpartitioned_window(self): - return Window.partitionBy() - - def test_simple(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType, percent_rank, mean, max - - df = self.data - w = self.unbounded_window - - mean_udf = self.pandas_agg_mean_udf - - result1 = df.withColumn('mean_v', mean_udf(df['v']).over(w)) - expected1 = df.withColumn('mean_v', mean(df['v']).over(w)) - - result2 = df.select(mean_udf(df['v']).over(w)) - expected2 = df.select(mean(df['v']).over(w)) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - self.assertPandasEqual(expected2.toPandas(), result2.toPandas()) - - def test_multiple_udfs(self): - from pyspark.sql.functions import max, min, mean - - df = self.data - w = self.unbounded_window - - result1 = df.withColumn('mean_v', self.pandas_agg_mean_udf(df['v']).over(w)) \ - .withColumn('max_v', self.pandas_agg_max_udf(df['v']).over(w)) \ - .withColumn('min_w', self.pandas_agg_min_udf(df['w']).over(w)) - - expected1 = df.withColumn('mean_v', mean(df['v']).over(w)) \ - .withColumn('max_v', max(df['v']).over(w)) \ - .withColumn('min_w', min(df['w']).over(w)) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - - def test_replace_existing(self): - from pyspark.sql.functions import mean - - df = self.data - w = self.unbounded_window - - result1 = df.withColumn('v', self.pandas_agg_mean_udf(df['v']).over(w)) - expected1 = df.withColumn('v', mean(df['v']).over(w)) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - - def test_mixed_sql(self): - from pyspark.sql.functions import mean - - df = self.data - w = self.unbounded_window - mean_udf = self.pandas_agg_mean_udf - - result1 = df.withColumn('v', mean_udf(df['v'] * 2).over(w) + 1) - expected1 = df.withColumn('v', mean(df['v'] * 2).over(w) + 1) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - - def test_mixed_udf(self): - from pyspark.sql.functions import mean - - df = self.data - w = self.unbounded_window - - plus_one = self.python_plus_one - time_two = self.pandas_scalar_time_two - mean_udf = self.pandas_agg_mean_udf - - result1 = df.withColumn( - 'v2', - plus_one(mean_udf(plus_one(df['v'])).over(w))) - expected1 = df.withColumn( - 'v2', - plus_one(mean(plus_one(df['v'])).over(w))) - - result2 = df.withColumn( - 'v2', - 
time_two(mean_udf(time_two(df['v'])).over(w))) - expected2 = df.withColumn( - 'v2', - time_two(mean(time_two(df['v'])).over(w))) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - self.assertPandasEqual(expected2.toPandas(), result2.toPandas()) - - def test_without_partitionBy(self): - from pyspark.sql.functions import mean - - df = self.data - w = self.unpartitioned_window - mean_udf = self.pandas_agg_mean_udf - - result1 = df.withColumn('v2', mean_udf(df['v']).over(w)) - expected1 = df.withColumn('v2', mean(df['v']).over(w)) - - result2 = df.select(mean_udf(df['v']).over(w)) - expected2 = df.select(mean(df['v']).over(w)) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - self.assertPandasEqual(expected2.toPandas(), result2.toPandas()) - - def test_mixed_sql_and_udf(self): - from pyspark.sql.functions import max, min, rank, col - - df = self.data - w = self.unbounded_window - ow = self.ordered_window - max_udf = self.pandas_agg_max_udf - min_udf = self.pandas_agg_min_udf - - result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w)) - expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w)) - - # Test mixing sql window function and window udf in the same expression - result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w)) - expected2 = expected1 - - # Test chaining sql aggregate function and udf - result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \ - .withColumn('min_v', min(df['v']).over(w)) \ - .withColumn('v_diff', col('max_v') - col('min_v')) \ - .drop('max_v', 'min_v') - expected3 = expected1 - - # Test mixing sql window function and udf - result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \ - .withColumn('rank', rank().over(ow)) - expected4 = df.withColumn('max_v', max(df['v']).over(w)) \ - .withColumn('rank', rank().over(ow)) - - self.assertPandasEqual(expected1.toPandas(), result1.toPandas()) - self.assertPandasEqual(expected2.toPandas(), result2.toPandas()) - self.assertPandasEqual(expected3.toPandas(), result3.toPandas()) - self.assertPandasEqual(expected4.toPandas(), result4.toPandas()) - - def test_array_type(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType - - df = self.data - w = self.unbounded_window - - array_udf = pandas_udf(lambda x: [1.0, 2.0], 'array', PandasUDFType.GROUPED_AGG) - result1 = df.withColumn('v2', array_udf(df['v']).over(w)) - self.assertEquals(result1.first()['v2'], [1.0, 2.0]) - - def test_invalid_args(self): - from pyspark.sql.functions import mean, pandas_udf, PandasUDFType - - df = self.data - w = self.unbounded_window - ow = self.ordered_window - mean_udf = self.pandas_agg_mean_udf - - with QuietTest(self.sc): - with self.assertRaisesRegexp( - AnalysisException, - '.*not supported within a window function'): - foo_udf = pandas_udf(lambda x: x, 'v double', PandasUDFType.GROUPED_MAP) - df.withColumn('v2', foo_udf(df['v']).over(w)) - - with QuietTest(self.sc): - with self.assertRaisesRegexp( - AnalysisException, - '.*Only unbounded window frame is supported.*'): - df.withColumn('mean_v', mean_udf(df['v']).over(ow)) - - -if __name__ == "__main__": - from pyspark.sql.tests import * - if xmlrunner: - unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2) - else: - unittest.main(verbosity=2) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py deleted file mode 100644 index 
1d24c40..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py +++ /dev/null @@ -1,1883 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import decimal -import time -import datetime -import calendar -import json -import re -import base64 -from array import array -import ctypes - -if sys.version >= "3": - long = int - basestring = unicode = str - -from py4j.protocol import register_input_converter -from py4j.java_gateway import JavaClass - -from pyspark import SparkContext -from pyspark.serializers import CloudPickleSerializer - -__all__ = [ - "DataType", "NullType", "StringType", "BinaryType", "BooleanType", "DateType", - "TimestampType", "DecimalType", "DoubleType", "FloatType", "ByteType", "IntegerType", - "LongType", "ShortType", "ArrayType", "MapType", "StructField", "StructType"] - - -class DataType(object): - """Base class for data types.""" - - def __repr__(self): - return self.__class__.__name__ - - def __hash__(self): - return hash(str(self)) - - def __eq__(self, other): - return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ - - def __ne__(self, other): - return not self.__eq__(other) - - @classmethod - def typeName(cls): - return cls.__name__[:-4].lower() - - def simpleString(self): - return self.typeName() - - def jsonValue(self): - return self.typeName() - - def json(self): - return json.dumps(self.jsonValue(), - separators=(',', ':'), - sort_keys=True) - - def needConversion(self): - """ - Does this type need to conversion between Python object and internal SQL object. - - This is used to avoid the unnecessary conversion for ArrayType/MapType/StructType. - """ - return False - - def toInternal(self, obj): - """ - Converts a Python object into an internal SQL object. - """ - return obj - - def fromInternal(self, obj): - """ - Converts an internal SQL object into a native Python object. - """ - return obj - - -# This singleton pattern does not work with pickle, you will get -# another object after pickle and unpickle -class DataTypeSingleton(type): - """Metaclass for DataType""" - - _instances = {} - - def __call__(cls): - if cls not in cls._instances: - cls._instances[cls] = super(DataTypeSingleton, cls).__call__() - return cls._instances[cls] - - -class NullType(DataType): - """Null type. - - The data type representing None, used for the types that cannot be inferred. - """ - - __metaclass__ = DataTypeSingleton - - -class AtomicType(DataType): - """An internal type used to represent everything that is not - null, UDTs, arrays, structs, and maps.""" - - -class NumericType(AtomicType): - """Numeric data types. - """ - - -class IntegralType(NumericType): - """Integral data types. 
- """ - - __metaclass__ = DataTypeSingleton - - -class FractionalType(NumericType): - """Fractional data types. - """ - - -class StringType(AtomicType): - """String data type. - """ - - __metaclass__ = DataTypeSingleton - - -class BinaryType(AtomicType): - """Binary (byte array) data type. - """ - - __metaclass__ = DataTypeSingleton - - -class BooleanType(AtomicType): - """Boolean data type. - """ - - __metaclass__ = DataTypeSingleton - - -class DateType(AtomicType): - """Date (datetime.date) data type. - """ - - __metaclass__ = DataTypeSingleton - - EPOCH_ORDINAL = datetime.datetime(1970, 1, 1).toordinal() - - def needConversion(self): - return True - - def toInternal(self, d): - if d is not None: - return d.toordinal() - self.EPOCH_ORDINAL - - def fromInternal(self, v): - if v is not None: - return datetime.date.fromordinal(v + self.EPOCH_ORDINAL) - - -class TimestampType(AtomicType): - """Timestamp (datetime.datetime) data type. - """ - - __metaclass__ = DataTypeSingleton - - def needConversion(self): - return True - - def toInternal(self, dt): - if dt is not None: - seconds = (calendar.timegm(dt.utctimetuple()) if dt.tzinfo - else time.mktime(dt.timetuple())) - return int(seconds) * 1000000 + dt.microsecond - - def fromInternal(self, ts): - if ts is not None: - # using int to avoid precision loss in float - return datetime.datetime.fromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000) - - -class DecimalType(FractionalType): - """Decimal (decimal.Decimal) data type. - - The DecimalType must have fixed precision (the maximum total number of digits) - and scale (the number of digits on the right of dot). For example, (5, 2) can - support the value from [-999.99 to 999.99]. - - The precision can be up to 38, the scale must be less or equal to precision. - - When create a DecimalType, the default precision and scale is (10, 0). When infer - schema from decimal.Decimal objects, it will be DecimalType(38, 18). - - :param precision: the maximum total number of digits (default: 10) - :param scale: the number of digits on right side of dot. (default: 0) - """ - - def __init__(self, precision=10, scale=0): - self.precision = precision - self.scale = scale - self.hasPrecisionInfo = True # this is public API - - def simpleString(self): - return "decimal(%d,%d)" % (self.precision, self.scale) - - def jsonValue(self): - return "decimal(%d,%d)" % (self.precision, self.scale) - - def __repr__(self): - return "DecimalType(%d,%d)" % (self.precision, self.scale) - - -class DoubleType(FractionalType): - """Double data type, representing double precision floats. - """ - - __metaclass__ = DataTypeSingleton - - -class FloatType(FractionalType): - """Float data type, representing single precision floats. - """ - - __metaclass__ = DataTypeSingleton - - -class ByteType(IntegralType): - """Byte data type, i.e. a signed integer in a single byte. - """ - def simpleString(self): - return 'tinyint' - - -class IntegerType(IntegralType): - """Int data type, i.e. a signed 32-bit integer. - """ - def simpleString(self): - return 'int' - - -class LongType(IntegralType): - """Long data type, i.e. a signed 64-bit integer. - - If the values are beyond the range of [-9223372036854775808, 9223372036854775807], - please use :class:`DecimalType`. - """ - def simpleString(self): - return 'bigint' - - -class ShortType(IntegralType): - """Short data type, i.e. a signed 16-bit integer. - """ - def simpleString(self): - return 'smallint' - - -class ArrayType(DataType): - """Array data type. 
- - :param elementType: :class:`DataType` of each element in the array. - :param containsNull: boolean, whether the array can contain null (None) values. - """ - - def __init__(self, elementType, containsNull=True): - """ - >>> ArrayType(StringType()) == ArrayType(StringType(), True) - True - >>> ArrayType(StringType(), False) == ArrayType(StringType()) - False - """ - assert isinstance(elementType, DataType),\ - "elementType %s should be an instance of %s" % (elementType, DataType) - self.elementType = elementType - self.containsNull = containsNull - - def simpleString(self): - return 'array<%s>' % self.elementType.simpleString() - - def __repr__(self): - return "ArrayType(%s,%s)" % (self.elementType, - str(self.containsNull).lower()) - - def jsonValue(self): - return {"type": self.typeName(), - "elementType": self.elementType.jsonValue(), - "containsNull": self.containsNull} - - @classmethod - def fromJson(cls, json): - return ArrayType(_parse_datatype_json_value(json["elementType"]), - json["containsNull"]) - - def needConversion(self): - return self.elementType.needConversion() - - def toInternal(self, obj): - if not self.needConversion(): - return obj - return obj and [self.elementType.toInternal(v) for v in obj] - - def fromInternal(self, obj): - if not self.needConversion(): - return obj - return obj and [self.elementType.fromInternal(v) for v in obj] - - -class MapType(DataType): - """Map data type. - - :param keyType: :class:`DataType` of the keys in the map. - :param valueType: :class:`DataType` of the values in the map. - :param valueContainsNull: indicates whether values can contain null (None) values. - - Keys in a map data type are not allowed to be null (None). - """ - - def __init__(self, keyType, valueType, valueContainsNull=True): - """ - >>> (MapType(StringType(), IntegerType()) - ... == MapType(StringType(), IntegerType(), True)) - True - >>> (MapType(StringType(), IntegerType(), False) - ... == MapType(StringType(), FloatType())) - False - """ - assert isinstance(keyType, DataType),\ - "keyType %s should be an instance of %s" % (keyType, DataType) - assert isinstance(valueType, DataType),\ - "valueType %s should be an instance of %s" % (valueType, DataType) - self.keyType = keyType - self.valueType = valueType - self.valueContainsNull = valueContainsNull - - def simpleString(self): - return 'map<%s,%s>' % (self.keyType.simpleString(), self.valueType.simpleString()) - - def __repr__(self): - return "MapType(%s,%s,%s)" % (self.keyType, self.valueType, - str(self.valueContainsNull).lower()) - - def jsonValue(self): - return {"type": self.typeName(), - "keyType": self.keyType.jsonValue(), - "valueType": self.valueType.jsonValue(), - "valueContainsNull": self.valueContainsNull} - - @classmethod - def fromJson(cls, json): - return MapType(_parse_datatype_json_value(json["keyType"]), - _parse_datatype_json_value(json["valueType"]), - json["valueContainsNull"]) - - def needConversion(self): - return self.keyType.needConversion() or self.valueType.needConversion() - - def toInternal(self, obj): - if not self.needConversion(): - return obj - return obj and dict((self.keyType.toInternal(k), self.valueType.toInternal(v)) - for k, v in obj.items()) - - def fromInternal(self, obj): - if not self.needConversion(): - return obj - return obj and dict((self.keyType.fromInternal(k), self.valueType.fromInternal(v)) - for k, v in obj.items()) - - -class StructField(DataType): - """A field in :class:`StructType`. - - :param name: string, name of the field. 
- :param dataType: :class:`DataType` of the field. - :param nullable: boolean, whether the field can be null (None) or not. - :param metadata: a dict from string to simple type that can be toInternald to JSON automatically - """ - - def __init__(self, name, dataType, nullable=True, metadata=None): - """ - >>> (StructField("f1", StringType(), True) - ... == StructField("f1", StringType(), True)) - True - >>> (StructField("f1", StringType(), True) - ... == StructField("f2", StringType(), True)) - False - """ - assert isinstance(dataType, DataType),\ - "dataType %s should be an instance of %s" % (dataType, DataType) - assert isinstance(name, basestring), "field name %s should be string" % (name) - if not isinstance(name, str): - name = name.encode('utf-8') - self.name = name - self.dataType = dataType - self.nullable = nullable - self.metadata = metadata or {} - - def simpleString(self): - return '%s:%s' % (self.name, self.dataType.simpleString()) - - def __repr__(self): - return "StructField(%s,%s,%s)" % (self.name, self.dataType, - str(self.nullable).lower()) - - def jsonValue(self): - return {"name": self.name, - "type": self.dataType.jsonValue(), - "nullable": self.nullable, - "metadata": self.metadata} - - @classmethod - def fromJson(cls, json): - return StructField(json["name"], - _parse_datatype_json_value(json["type"]), - json["nullable"], - json["metadata"]) - - def needConversion(self): - return self.dataType.needConversion() - - def toInternal(self, obj): - return self.dataType.toInternal(obj) - - def fromInternal(self, obj): - return self.dataType.fromInternal(obj) - - def typeName(self): - raise TypeError( - "StructField does not have typeName. " - "Use typeName on its type explicitly instead.") - - -class StructType(DataType): - """Struct type, consisting of a list of :class:`StructField`. - - This is the data type representing a :class:`Row`. - - Iterating a :class:`StructType` will iterate its :class:`StructField`\\s. - A contained :class:`StructField` can be accessed by name or position. - - >>> struct1 = StructType([StructField("f1", StringType(), True)]) - >>> struct1["f1"] - StructField(f1,StringType,true) - >>> struct1[0] - StructField(f1,StringType,true) - """ - def __init__(self, fields=None): - """ - >>> struct1 = StructType([StructField("f1", StringType(), True)]) - >>> struct2 = StructType([StructField("f1", StringType(), True)]) - >>> struct1 == struct2 - True - >>> struct1 = StructType([StructField("f1", StringType(), True)]) - >>> struct2 = StructType([StructField("f1", StringType(), True), - ... StructField("f2", IntegerType(), False)]) - >>> struct1 == struct2 - False - """ - if not fields: - self.fields = [] - self.names = [] - else: - self.fields = fields - self.names = [f.name for f in fields] - assert all(isinstance(f, StructField) for f in fields),\ - "fields should be a list of StructField" - # Precalculated list of fields that need conversion with fromInternal/toInternal functions - self._needConversion = [f.needConversion() for f in self] - self._needSerializeAnyField = any(self._needConversion) - - def add(self, field, data_type=None, nullable=True, metadata=None): - """ - Construct a StructType by adding new elements to it to define the schema. The method accepts - either: - - a) A single parameter which is a StructField object. - b) Between 2 and 4 parameters as (name, data_type, nullable (optional), - metadata(optional). The data_type parameter may be either a String or a - DataType object. 
- - >>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) - >>> struct2 = StructType([StructField("f1", StringType(), True), \\ - ... StructField("f2", StringType(), True, None)]) - >>> struct1 == struct2 - True - >>> struct1 = StructType().add(StructField("f1", StringType(), True)) - >>> struct2 = StructType([StructField("f1", StringType(), True)]) - >>> struct1 == struct2 - True - >>> struct1 = StructType().add("f1", "string", True) - >>> struct2 = StructType([StructField("f1", StringType(), True)]) - >>> struct1 == struct2 - True - - :param field: Either the name of the field or a StructField object - :param data_type: If present, the DataType of the StructField to create - :param nullable: Whether the field to add should be nullable (default True) - :param metadata: Any additional metadata (default None) - :return: a new updated StructType - """ - if isinstance(field, StructField): - self.fields.append(field) - self.names.append(field.name) - else: - if isinstance(field, str) and data_type is None: - raise ValueError("Must specify DataType if passing name of struct_field to create.") - - if isinstance(data_type, str): - data_type_f = _parse_datatype_json_value(data_type) - else: - data_type_f = data_type - self.fields.append(StructField(field, data_type_f, nullable, metadata)) - self.names.append(field) - # Precalculated list of fields that need conversion with fromInternal/toInternal functions - self._needConversion = [f.needConversion() for f in self] - self._needSerializeAnyField = any(self._needConversion) - return self - - def __iter__(self): - """Iterate the fields""" - return iter(self.fields) - - def __len__(self): - """Return the number of fields.""" - return len(self.fields) - - def __getitem__(self, key): - """Access fields by name or slice.""" - if isinstance(key, str): - for field in self: - if field.name == key: - return field - raise KeyError('No StructField named {0}'.format(key)) - elif isinstance(key, int): - try: - return self.fields[key] - except IndexError: - raise IndexError('StructType index out of range') - elif isinstance(key, slice): - return StructType(self.fields[key]) - else: - raise TypeError('StructType keys should be strings, integers or slices') - - def simpleString(self): - return 'struct<%s>' % (','.join(f.simpleString() for f in self)) - - def __repr__(self): - return ("StructType(List(%s))" % - ",".join(str(field) for field in self)) - - def jsonValue(self): - return {"type": self.typeName(), - "fields": [f.jsonValue() for f in self]} - - @classmethod - def fromJson(cls, json): - return StructType([StructField.fromJson(f) for f in json["fields"]]) - - def fieldNames(self): - """ - Returns all field names in a list. 
- - >>> struct = StructType([StructField("f1", StringType(), True)]) - >>> struct.fieldNames() - ['f1'] - """ - return list(self.names) - - def needConversion(self): - # We need convert Row()/namedtuple into tuple() - return True - - def toInternal(self, obj): - if obj is None: - return - - if self._needSerializeAnyField: - # Only calling toInternal function for fields that need conversion - if isinstance(obj, dict): - return tuple(f.toInternal(obj.get(n)) if c else obj.get(n) - for n, f, c in zip(self.names, self.fields, self._needConversion)) - elif isinstance(obj, (tuple, list)): - return tuple(f.toInternal(v) if c else v - for f, v, c in zip(self.fields, obj, self._needConversion)) - elif hasattr(obj, "__dict__"): - d = obj.__dict__ - return tuple(f.toInternal(d.get(n)) if c else d.get(n) - for n, f, c in zip(self.names, self.fields, self._needConversion)) - else: - raise ValueError("Unexpected tuple %r with StructType" % obj) - else: - if isinstance(obj, dict): - return tuple(obj.get(n) for n in self.names) - elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False): - return tuple(obj[n] for n in self.names) - elif isinstance(obj, (list, tuple)): - return tuple(obj) - elif hasattr(obj, "__dict__"): - d = obj.__dict__ - return tuple(d.get(n) for n in self.names) - else: - raise ValueError("Unexpected tuple %r with StructType" % obj) - - def fromInternal(self, obj): - if obj is None: - return - if isinstance(obj, Row): - # it's already converted by pickler - return obj - if self._needSerializeAnyField: - # Only calling fromInternal function for fields that need conversion - values = [f.fromInternal(v) if c else v - for f, v, c in zip(self.fields, obj, self._needConversion)] - else: - values = obj - return _create_row(self.names, values) - - -class UserDefinedType(DataType): - """User-defined type (UDT). - - .. note:: WARN: Spark Internal Use Only - """ - - @classmethod - def typeName(cls): - return cls.__name__.lower() - - @classmethod - def sqlType(cls): - """ - Underlying SQL storage type for this UDT. - """ - raise NotImplementedError("UDT must implement sqlType().") - - @classmethod - def module(cls): - """ - The Python module of the UDT. - """ - raise NotImplementedError("UDT must implement module().") - - @classmethod - def scalaUDT(cls): - """ - The class name of the paired Scala UDT (could be '', if there - is no corresponding one). - """ - return '' - - def needConversion(self): - return True - - @classmethod - def _cachedSqlType(cls): - """ - Cache the sqlType() into class, because it's heavy used in `toInternal`. - """ - if not hasattr(cls, "_cached_sql_type"): - cls._cached_sql_type = cls.sqlType() - return cls._cached_sql_type - - def toInternal(self, obj): - if obj is not None: - return self._cachedSqlType().toInternal(self.serialize(obj)) - - def fromInternal(self, obj): - v = self._cachedSqlType().fromInternal(obj) - if v is not None: - return self.deserialize(v) - - def serialize(self, obj): - """ - Converts the a user-type object into a SQL datum. - """ - raise NotImplementedError("UDT must implement toInternal().") - - def deserialize(self, datum): - """ - Converts a SQL datum into a user-type object. 
- """ - raise NotImplementedError("UDT must implement fromInternal().") - - def simpleString(self): - return 'udt' - - def json(self): - return json.dumps(self.jsonValue(), separators=(',', ':'), sort_keys=True) - - def jsonValue(self): - if self.scalaUDT(): - assert self.module() != '__main__', 'UDT in __main__ cannot work with ScalaUDT' - schema = { - "type": "udt", - "class": self.scalaUDT(), - "pyClass": "%s.%s" % (self.module(), type(self).__name__), - "sqlType": self.sqlType().jsonValue() - } - else: - ser = CloudPickleSerializer() - b = ser.dumps(type(self)) - schema = { - "type": "udt", - "pyClass": "%s.%s" % (self.module(), type(self).__name__), - "serializedClass": base64.b64encode(b).decode('utf8'), - "sqlType": self.sqlType().jsonValue() - } - return schema - - @classmethod - def fromJson(cls, json): - pyUDT = str(json["pyClass"]) # convert unicode to str - split = pyUDT.rfind(".") - pyModule = pyUDT[:split] - pyClass = pyUDT[split+1:] - m = __import__(pyModule, globals(), locals(), [pyClass]) - if not hasattr(m, pyClass): - s = base64.b64decode(json['serializedClass'].encode('utf-8')) - UDT = CloudPickleSerializer().loads(s) - else: - UDT = getattr(m, pyClass) - return UDT() - - def __eq__(self, other): - return type(self) == type(other) - - -_atomic_types = [StringType, BinaryType, BooleanType, DecimalType, FloatType, DoubleType, - ByteType, ShortType, IntegerType, LongType, DateType, TimestampType, NullType] -_all_atomic_types = dict((t.typeName(), t) for t in _atomic_types) -_all_complex_types = dict((v.typeName(), v) - for v in [ArrayType, MapType, StructType]) - - -_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)") - - -def _parse_datatype_string(s): - """ - Parses the given data type string to a :class:`DataType`. The data type string format equals - to :class:`DataType.simpleString`, except that top level struct type can omit - the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead - of ``tinyint`` for :class:`ByteType`. We can also use ``int`` as a short name - for :class:`IntegerType`. Since Spark 2.3, this also supports a schema in a DDL-formatted - string and case-insensitive strings. - - >>> _parse_datatype_string("int ") - IntegerType - >>> _parse_datatype_string("INT ") - IntegerType - >>> _parse_datatype_string("a: byte, b: decimal( 16 , 8 ) ") - StructType(List(StructField(a,ByteType,true),StructField(b,DecimalType(16,8),true))) - >>> _parse_datatype_string("a DOUBLE, b STRING") - StructType(List(StructField(a,DoubleType,true),StructField(b,StringType,true))) - >>> _parse_datatype_string("a: array< short>") - StructType(List(StructField(a,ArrayType(ShortType,true),true))) - >>> _parse_datatype_string(" map ") - MapType(StringType,StringType,true) - - >>> # Error cases - >>> _parse_datatype_string("blabla") # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ParseException:... - >>> _parse_datatype_string("a: int,") # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ParseException:... - >>> _parse_datatype_string("array>> _parse_datatype_string("map>") # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ParseException:... 
- """ - sc = SparkContext._active_spark_context - - def from_ddl_schema(type_str): - return _parse_datatype_json_string( - sc._jvm.org.apache.spark.sql.types.StructType.fromDDL(type_str).json()) - - def from_ddl_datatype(type_str): - return _parse_datatype_json_string( - sc._jvm.org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str).json()) - - try: - # DDL format, "fieldname datatype, fieldname datatype". - return from_ddl_schema(s) - except Exception as e: - try: - # For backwards compatibility, "integer", "struct" and etc. - return from_ddl_datatype(s) - except: - try: - # For backwards compatibility, "fieldname: datatype, fieldname: datatype" case. - return from_ddl_datatype("struct<%s>" % s.strip()) - except: - raise e - - -def _parse_datatype_json_string(json_string): - """Parses the given data type JSON string. - >>> import pickle - >>> def check_datatype(datatype): - ... pickled = pickle.loads(pickle.dumps(datatype)) - ... assert datatype == pickled - ... scala_datatype = spark._jsparkSession.parseDataType(datatype.json()) - ... python_datatype = _parse_datatype_json_string(scala_datatype.json()) - ... assert datatype == python_datatype - >>> for cls in _all_atomic_types.values(): - ... check_datatype(cls()) - - >>> # Simple ArrayType. - >>> simple_arraytype = ArrayType(StringType(), True) - >>> check_datatype(simple_arraytype) - - >>> # Simple MapType. - >>> simple_maptype = MapType(StringType(), LongType()) - >>> check_datatype(simple_maptype) - - >>> # Simple StructType. - >>> simple_structtype = StructType([ - ... StructField("a", DecimalType(), False), - ... StructField("b", BooleanType(), True), - ... StructField("c", LongType(), True), - ... StructField("d", BinaryType(), False)]) - >>> check_datatype(simple_structtype) - - >>> # Complex StructType. - >>> complex_structtype = StructType([ - ... StructField("simpleArray", simple_arraytype, True), - ... StructField("simpleMap", simple_maptype, True), - ... StructField("simpleStruct", simple_structtype, True), - ... StructField("boolean", BooleanType(), False), - ... StructField("withMeta", DoubleType(), False, {"name": "age"})]) - >>> check_datatype(complex_structtype) - - >>> # Complex ArrayType. - >>> complex_arraytype = ArrayType(complex_structtype, True) - >>> check_datatype(complex_arraytype) - - >>> # Complex MapType. - >>> complex_maptype = MapType(complex_structtype, - ... 
complex_arraytype, False) - >>> check_datatype(complex_maptype) - """ - return _parse_datatype_json_value(json.loads(json_string)) - - -def _parse_datatype_json_value(json_value): - if not isinstance(json_value, dict): - if json_value in _all_atomic_types.keys(): - return _all_atomic_types[json_value]() - elif json_value == 'decimal': - return DecimalType() - elif _FIXED_DECIMAL.match(json_value): - m = _FIXED_DECIMAL.match(json_value) - return DecimalType(int(m.group(1)), int(m.group(2))) - else: - raise ValueError("Could not parse datatype: %s" % json_value) - else: - tpe = json_value["type"] - if tpe in _all_complex_types: - return _all_complex_types[tpe].fromJson(json_value) - elif tpe == 'udt': - return UserDefinedType.fromJson(json_value) - else: - raise ValueError("not supported type: %s" % tpe) - - -# Mapping Python types to Spark SQL DataType -_type_mappings = { - type(None): NullType, - bool: BooleanType, - int: LongType, - float: DoubleType, - str: StringType, - bytearray: BinaryType, - decimal.Decimal: DecimalType, - datetime.date: DateType, - datetime.datetime: TimestampType, - datetime.time: TimestampType, -} - -if sys.version < "3": - _type_mappings.update({ - unicode: StringType, - long: LongType, - }) - -# Mapping Python array types to Spark SQL DataType -# We should be careful here. The size of these types in python depends on C -# implementation. We need to make sure that this conversion does not lose any -# precision. Also, JVM only support signed types, when converting unsigned types, -# keep in mind that it required 1 more bit when stored as singed types. -# -# Reference for C integer size, see: -# ISO/IEC 9899:201x specification, chapter 5.2.4.2.1 Sizes of integer types . -# Reference for python array typecode, see: -# https://docs.python.org/2/library/array.html -# https://docs.python.org/3.6/library/array.html -# Reference for JVM's supported integral types: -# http://docs.oracle.com/javase/specs/jvms/se8/html/jvms-2.html#jvms-2.3.1 - -_array_signed_int_typecode_ctype_mappings = { - 'b': ctypes.c_byte, - 'h': ctypes.c_short, - 'i': ctypes.c_int, - 'l': ctypes.c_long, -} - -_array_unsigned_int_typecode_ctype_mappings = { - 'B': ctypes.c_ubyte, - 'H': ctypes.c_ushort, - 'I': ctypes.c_uint, - 'L': ctypes.c_ulong -} - - -def _int_size_to_type(size): - """ - Return the Catalyst datatype from the size of integers. - """ - if size <= 8: - return ByteType - if size <= 16: - return ShortType - if size <= 32: - return IntegerType - if size <= 64: - return LongType - -# The list of all supported array typecodes is stored here -_array_type_mappings = { - # Warning: Actual properties for float and double in C is not specified in C. - # On almost every system supported by both python and JVM, they are IEEE 754 - # single-precision binary floating-point format and IEEE 754 double-precision - # binary floating-point format. And we do assume the same thing here for now. 
- 'f': FloatType, - 'd': DoubleType -} - -# compute array typecode mappings for signed integer types -for _typecode in _array_signed_int_typecode_ctype_mappings.keys(): - size = ctypes.sizeof(_array_signed_int_typecode_ctype_mappings[_typecode]) * 8 - dt = _int_size_to_type(size) - if dt is not None: - _array_type_mappings[_typecode] = dt - -# compute array typecode mappings for unsigned integer types -for _typecode in _array_unsigned_int_typecode_ctype_mappings.keys(): - # JVM does not have unsigned types, so use signed types that is at least 1 - # bit larger to store - size = ctypes.sizeof(_array_unsigned_int_typecode_ctype_mappings[_typecode]) * 8 + 1 - dt = _int_size_to_type(size) - if dt is not None: - _array_type_mappings[_typecode] = dt - -# Type code 'u' in Python's array is deprecated since version 3.3, and will be -# removed in version 4.0. See: https://docs.python.org/3/library/array.html -if sys.version_info[0] < 4: - _array_type_mappings['u'] = StringType - -# Type code 'c' are only available at python 2 -if sys.version_info[0] < 3: - _array_type_mappings['c'] = StringType - -# SPARK-21465: -# In python2, array of 'L' happened to be mistakenly partially supported. To -# avoid breaking user's code, we should keep this partial support. Below is a -# dirty hacking to keep this partial support and make the unit test passes -import platform -if sys.version_info[0] < 3 and platform.python_implementation() != 'PyPy': - if 'L' not in _array_type_mappings.keys(): - _array_type_mappings['L'] = LongType - _array_unsigned_int_typecode_ctype_mappings['L'] = ctypes.c_uint - - -def _infer_type(obj): - """Infer the DataType from obj - """ - if obj is None: - return NullType() - - if hasattr(obj, '__UDT__'): - return obj.__UDT__ - - dataType = _type_mappings.get(type(obj)) - if dataType is DecimalType: - # the precision and scale of `obj` may be different from row to row. 
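# Illustrative sketch only (the helper below is hypothetical, not part of pyspark):
# it mirrors the sizing rule used by _int_size_to_type above, including the extra
# bit needed to store an unsigned C type in a signed Catalyst type. Assumes the
# usual platform sizes (32-bit c_int), which holds on mainstream CPython builds.
import ctypes
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType

def int_type_for_bits(bits):
    # smallest signed Catalyst integral type wide enough for `bits` bits
    for threshold, catalyst_type in ((8, ByteType), (16, ShortType),
                                     (32, IntegerType), (64, LongType)):
        if bits <= threshold:
            return catalyst_type()
    raise TypeError("no Catalyst integral type wide enough for %d bits" % bits)

assert isinstance(int_type_for_bits(ctypes.sizeof(ctypes.c_int) * 8), IntegerType)
# unsigned 'I' needs one more bit when stored as a signed type, so it maps to long
assert isinstance(int_type_for_bits(ctypes.sizeof(ctypes.c_uint) * 8 + 1), LongType)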
- return DecimalType(38, 18) - elif dataType is not None: - return dataType() - - if isinstance(obj, dict): - for key, value in obj.items(): - if key is not None and value is not None: - return MapType(_infer_type(key), _infer_type(value), True) - else: - return MapType(NullType(), NullType(), True) - elif isinstance(obj, list): - for v in obj: - if v is not None: - return ArrayType(_infer_type(obj[0]), True) - else: - return ArrayType(NullType(), True) - elif isinstance(obj, array): - if obj.typecode in _array_type_mappings: - return ArrayType(_array_type_mappings[obj.typecode](), False) - else: - raise TypeError("not supported type: array(%s)" % obj.typecode) - else: - try: - return _infer_schema(obj) - except TypeError: - raise TypeError("not supported type: %s" % type(obj)) - - -def _infer_schema(row, names=None): - """Infer the schema from dict/namedtuple/object""" - if isinstance(row, dict): - items = sorted(row.items()) - - elif isinstance(row, (tuple, list)): - if hasattr(row, "__fields__"): # Row - items = zip(row.__fields__, tuple(row)) - elif hasattr(row, "_fields"): # namedtuple - items = zip(row._fields, tuple(row)) - else: - if names is None: - names = ['_%d' % i for i in range(1, len(row) + 1)] - elif len(names) < len(row): - names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1)) - items = zip(names, row) - - elif hasattr(row, "__dict__"): # object - items = sorted(row.__dict__.items()) - - else: - raise TypeError("Can not infer schema for type: %s" % type(row)) - - fields = [StructField(k, _infer_type(v), True) for k, v in items] - return StructType(fields) - - -def _has_nulltype(dt): - """ Return whether there is NullType in `dt` or not """ - if isinstance(dt, StructType): - return any(_has_nulltype(f.dataType) for f in dt.fields) - elif isinstance(dt, ArrayType): - return _has_nulltype((dt.elementType)) - elif isinstance(dt, MapType): - return _has_nulltype(dt.keyType) or _has_nulltype(dt.valueType) - else: - return isinstance(dt, NullType) - - -def _merge_type(a, b, name=None): - if name is None: - new_msg = lambda msg: msg - new_name = lambda n: "field %s" % n - else: - new_msg = lambda msg: "%s: %s" % (name, msg) - new_name = lambda n: "field %s in %s" % (n, name) - - if isinstance(a, NullType): - return b - elif isinstance(b, NullType): - return a - elif type(a) is not type(b): - # TODO: type cast (such as int -> long) - raise TypeError(new_msg("Can not merge type %s and %s" % (type(a), type(b)))) - - # same type - if isinstance(a, StructType): - nfs = dict((f.name, f.dataType) for f in b.fields) - fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()), - name=new_name(f.name))) - for f in a.fields] - names = set([f.name for f in fields]) - for n in nfs: - if n not in names: - fields.append(StructField(n, nfs[n])) - return StructType(fields) - - elif isinstance(a, ArrayType): - return ArrayType(_merge_type(a.elementType, b.elementType, - name='element in array %s' % name), True) - - elif isinstance(a, MapType): - return MapType(_merge_type(a.keyType, b.keyType, name='key of map %s' % name), - _merge_type(a.valueType, b.valueType, name='value of map %s' % name), - True) - else: - return a - - -def _need_converter(dataType): - if isinstance(dataType, StructType): - return True - elif isinstance(dataType, ArrayType): - return _need_converter(dataType.elementType) - elif isinstance(dataType, MapType): - return _need_converter(dataType.keyType) or _need_converter(dataType.valueType) - elif isinstance(dataType, NullType): - return 
True - else: - return False - - -def _create_converter(dataType): - """Create a converter to drop the names of fields in obj """ - if not _need_converter(dataType): - return lambda x: x - - if isinstance(dataType, ArrayType): - conv = _create_converter(dataType.elementType) - return lambda row: [conv(v) for v in row] - - elif isinstance(dataType, MapType): - kconv = _create_converter(dataType.keyType) - vconv = _create_converter(dataType.valueType) - return lambda row: dict((kconv(k), vconv(v)) for k, v in row.items()) - - elif isinstance(dataType, NullType): - return lambda x: None - - elif not isinstance(dataType, StructType): - return lambda x: x - - # dataType must be StructType - names = [f.name for f in dataType.fields] - converters = [_create_converter(f.dataType) for f in dataType.fields] - convert_fields = any(_need_converter(f.dataType) for f in dataType.fields) - - def convert_struct(obj): - if obj is None: - return - - if isinstance(obj, (tuple, list)): - if convert_fields: - return tuple(conv(v) for v, conv in zip(obj, converters)) - else: - return tuple(obj) - - if isinstance(obj, dict): - d = obj - elif hasattr(obj, "__dict__"): # object - d = obj.__dict__ - else: - raise TypeError("Unexpected obj type: %s" % type(obj)) - - if convert_fields: - return tuple([conv(d.get(name)) for name, conv in zip(names, converters)]) - else: - return tuple([d.get(name) for name in names]) - - return convert_struct - - -_acceptable_types = { - BooleanType: (bool,), - ByteType: (int, long), - ShortType: (int, long), - IntegerType: (int, long), - LongType: (int, long), - FloatType: (float,), - DoubleType: (float,), - DecimalType: (decimal.Decimal,), - StringType: (str, unicode), - BinaryType: (bytearray,), - DateType: (datetime.date, datetime.datetime), - TimestampType: (datetime.datetime,), - ArrayType: (list, tuple, array), - MapType: (dict,), - StructType: (tuple, list, dict), -} - - -def _make_type_verifier(dataType, nullable=True, name=None): - """ - Make a verifier that checks the type of obj against dataType and raises a TypeError if they do - not match. - - This verifier also checks the value of obj against datatype and raises a ValueError if it's not - within the allowed range, e.g. using 128 as ByteType will overflow. Note that, Python float is - not checked, so it will become infinity when cast to Java float if it overflows. - - >>> _make_type_verifier(StructType([]))(None) - >>> _make_type_verifier(StringType())("") - >>> _make_type_verifier(LongType())(0) - >>> _make_type_verifier(ArrayType(ShortType()))(list(range(3))) - >>> _make_type_verifier(ArrayType(StringType()))(set()) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - TypeError:... - >>> _make_type_verifier(MapType(StringType(), IntegerType()))({}) - >>> _make_type_verifier(StructType([]))(()) - >>> _make_type_verifier(StructType([]))([]) - >>> _make_type_verifier(StructType([]))([1]) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... - >>> # Check if numeric values are within the allowed range. - >>> _make_type_verifier(ByteType())(12) - >>> _make_type_verifier(ByteType())(1234) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... - >>> _make_type_verifier(ByteType(), False)(None) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... - >>> _make_type_verifier( - ... 
ArrayType(ShortType(), False))([1, None]) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... - >>> _make_type_verifier(MapType(StringType(), IntegerType()))({None: 1}) - Traceback (most recent call last): - ... - ValueError:... - >>> schema = StructType().add("a", IntegerType()).add("b", StringType(), False) - >>> _make_type_verifier(schema)((1, None)) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... - """ - - if name is None: - new_msg = lambda msg: msg - new_name = lambda n: "field %s" % n - else: - new_msg = lambda msg: "%s: %s" % (name, msg) - new_name = lambda n: "field %s in %s" % (n, name) - - def verify_nullability(obj): - if obj is None: - if nullable: - return True - else: - raise ValueError(new_msg("This field is not nullable, but got None")) - else: - return False - - _type = type(dataType) - - def assert_acceptable_types(obj): - assert _type in _acceptable_types, \ - new_msg("unknown datatype: %s for object %r" % (dataType, obj)) - - def verify_acceptable_types(obj): - # subclass of them can not be fromInternal in JVM - if type(obj) not in _acceptable_types[_type]: - raise TypeError(new_msg("%s can not accept object %r in type %s" - % (dataType, obj, type(obj)))) - - if isinstance(dataType, StringType): - # StringType can work with any types - verify_value = lambda _: _ - - elif isinstance(dataType, UserDefinedType): - verifier = _make_type_verifier(dataType.sqlType(), name=name) - - def verify_udf(obj): - if not (hasattr(obj, '__UDT__') and obj.__UDT__ == dataType): - raise ValueError(new_msg("%r is not an instance of type %r" % (obj, dataType))) - verifier(dataType.toInternal(obj)) - - verify_value = verify_udf - - elif isinstance(dataType, ByteType): - def verify_byte(obj): - assert_acceptable_types(obj) - verify_acceptable_types(obj) - if obj < -128 or obj > 127: - raise ValueError(new_msg("object of ByteType out of range, got: %s" % obj)) - - verify_value = verify_byte - - elif isinstance(dataType, ShortType): - def verify_short(obj): - assert_acceptable_types(obj) - verify_acceptable_types(obj) - if obj < -32768 or obj > 32767: - raise ValueError(new_msg("object of ShortType out of range, got: %s" % obj)) - - verify_value = verify_short - - elif isinstance(dataType, IntegerType): - def verify_integer(obj): - assert_acceptable_types(obj) - verify_acceptable_types(obj) - if obj < -2147483648 or obj > 2147483647: - raise ValueError( - new_msg("object of IntegerType out of range, got: %s" % obj)) - - verify_value = verify_integer - - elif isinstance(dataType, ArrayType): - element_verifier = _make_type_verifier( - dataType.elementType, dataType.containsNull, name="element in array %s" % name) - - def verify_array(obj): - assert_acceptable_types(obj) - verify_acceptable_types(obj) - for i in obj: - element_verifier(i) - - verify_value = verify_array - - elif isinstance(dataType, MapType): - key_verifier = _make_type_verifier(dataType.keyType, False, name="key of map %s" % name) - value_verifier = _make_type_verifier( - dataType.valueType, dataType.valueContainsNull, name="value of map %s" % name) - - def verify_map(obj): - assert_acceptable_types(obj) - verify_acceptable_types(obj) - for k, v in obj.items(): - key_verifier(k) - value_verifier(v) - - verify_value = verify_map - - elif isinstance(dataType, StructType): - verifiers = [] - for f in dataType.fields: - verifier = _make_type_verifier(f.dataType, f.nullable, name=new_name(f.name)) - verifiers.append((f.name, verifier)) - - def 
verify_struct(obj): - assert_acceptable_types(obj) - - if isinstance(obj, dict): - for f, verifier in verifiers: - verifier(obj.get(f)) - elif isinstance(obj, Row) and getattr(obj, "__from_dict__", False): - # the order in obj could be different than dataType.fields - for f, verifier in verifiers: - verifier(obj[f]) - elif isinstance(obj, (tuple, list)): - if len(obj) != len(verifiers): - raise ValueError( - new_msg("Length of object (%d) does not match with " - "length of fields (%d)" % (len(obj), len(verifiers)))) - for v, (_, verifier) in zip(obj, verifiers): - verifier(v) - elif hasattr(obj, "__dict__"): - d = obj.__dict__ - for f, verifier in verifiers: - verifier(d.get(f)) - else: - raise TypeError(new_msg("StructType can not accept object %r in type %s" - % (obj, type(obj)))) - verify_value = verify_struct - - else: - def verify_default(obj): - assert_acceptable_types(obj) - verify_acceptable_types(obj) - - verify_value = verify_default - - def verify(obj): - if not verify_nullability(obj): - verify_value(obj) - - return verify - - -# This is used to unpickle a Row from JVM -def _create_row_inbound_converter(dataType): - return lambda *a: dataType.fromInternal(a) - - -def _create_row(fields, values): - row = Row(*values) - row.__fields__ = fields - return row - - -class Row(tuple): - - """ - A row in L{DataFrame}. - The fields in it can be accessed: - - * like attributes (``row.key``) - * like dictionary values (``row[key]``) - - ``key in row`` will search through row keys. - - Row can be used to create a row object by using named arguments, - the fields will be sorted by names. It is not allowed to omit - a named argument to represent the value is None or missing. This should be - explicitly set to None in this case. - - >>> row = Row(name="Alice", age=11) - >>> row - Row(age=11, name='Alice') - >>> row['name'], row['age'] - ('Alice', 11) - >>> row.name, row.age - ('Alice', 11) - >>> 'name' in row - True - >>> 'wrong_key' in row - False - - Row also can be used to create another Row like class, then it - could be used to create Row objects, such as - - >>> Person = Row("name", "age") - >>> Person - - >>> 'name' in Person - True - >>> 'wrong_key' in Person - False - >>> Person("Alice", 11) - Row(name='Alice', age=11) - """ - - def __new__(self, *args, **kwargs): - if args and kwargs: - raise ValueError("Can not use both args " - "and kwargs to create Row") - if kwargs: - # create row objects - names = sorted(kwargs.keys()) - row = tuple.__new__(self, [kwargs[n] for n in names]) - row.__fields__ = names - row.__from_dict__ = True - return row - - else: - # create row class or objects - return tuple.__new__(self, args) - - def asDict(self, recursive=False): - """ - Return as an dict - - :param recursive: turns the nested Row as dict (default: False). 
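# Usage sketch for Row (pure Python, no SparkSession needed). As documented above,
# keyword-argument Rows sort their fields by name, and asDict(recursive=True)
# also converts nested Rows.
from pyspark.sql import Row

r = Row(name="Alice", age=11)              # stored as Row(age=11, name='Alice')
assert r.age == 11 and r["name"] == "Alice"
assert r.asDict() == {"age": 11, "name": "Alice"}

Person = Row("name", "age")                # Row used as a lightweight record class
alice = Person("Alice", 11)
assert alice.asDict(recursive=True) == {"name": "Alice", "age": 11}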
- - >>> Row(name="Alice", age=11).asDict() == {'name': 'Alice', 'age': 11} - True - >>> row = Row(key=1, value=Row(name='a', age=2)) - >>> row.asDict() == {'key': 1, 'value': Row(age=2, name='a')} - True - >>> row.asDict(True) == {'key': 1, 'value': {'name': 'a', 'age': 2}} - True - """ - if not hasattr(self, "__fields__"): - raise TypeError("Cannot convert a Row class into dict") - - if recursive: - def conv(obj): - if isinstance(obj, Row): - return obj.asDict(True) - elif isinstance(obj, list): - return [conv(o) for o in obj] - elif isinstance(obj, dict): - return dict((k, conv(v)) for k, v in obj.items()) - else: - return obj - return dict(zip(self.__fields__, (conv(o) for o in self))) - else: - return dict(zip(self.__fields__, self)) - - def __contains__(self, item): - if hasattr(self, "__fields__"): - return item in self.__fields__ - else: - return super(Row, self).__contains__(item) - - # let object acts like class - def __call__(self, *args): - """create new Row object""" - if len(args) > len(self): - raise ValueError("Can not create Row with fields %s, expected %d values " - "but got %s" % (self, len(self), args)) - return _create_row(self, args) - - def __getitem__(self, item): - if isinstance(item, (int, slice)): - return super(Row, self).__getitem__(item) - try: - # it will be slow when it has many fields, - # but this will not be used in normal cases - idx = self.__fields__.index(item) - return super(Row, self).__getitem__(idx) - except IndexError: - raise KeyError(item) - except ValueError: - raise ValueError(item) - - def __getattr__(self, item): - if item.startswith("__"): - raise AttributeError(item) - try: - # it will be slow when it has many fields, - # but this will not be used in normal cases - idx = self.__fields__.index(item) - return self[idx] - except IndexError: - raise AttributeError(item) - except ValueError: - raise AttributeError(item) - - def __setattr__(self, key, value): - if key != '__fields__' and key != "__from_dict__": - raise Exception("Row is read-only") - self.__dict__[key] = value - - def __reduce__(self): - """Returns a tuple so Python knows how to pickle Row.""" - if hasattr(self, "__fields__"): - return (_create_row, (self.__fields__, tuple(self))) - else: - return tuple.__reduce__(self) - - def __repr__(self): - """Printable representation of Row used in Python REPL.""" - if hasattr(self, "__fields__"): - return "Row(%s)" % ", ".join("%s=%r" % (k, v) - for k, v in zip(self.__fields__, tuple(self))) - else: - return "" % ", ".join(self) - - -class DateConverter(object): - def can_convert(self, obj): - return isinstance(obj, datetime.date) - - def convert(self, obj, gateway_client): - Date = JavaClass("java.sql.Date", gateway_client) - return Date.valueOf(obj.strftime("%Y-%m-%d")) - - -class DatetimeConverter(object): - def can_convert(self, obj): - return isinstance(obj, datetime.datetime) - - def convert(self, obj, gateway_client): - Timestamp = JavaClass("java.sql.Timestamp", gateway_client) - seconds = (calendar.timegm(obj.utctimetuple()) if obj.tzinfo - else time.mktime(obj.timetuple())) - t = Timestamp(int(seconds) * 1000) - t.setNanos(obj.microsecond * 1000) - return t - -# datetime is a subclass of date, we should register DatetimeConverter first -register_input_converter(DatetimeConverter()) -register_input_converter(DateConverter()) - - -def to_arrow_type(dt): - """ Convert Spark data type to pyarrow type - """ - from distutils.version import LooseVersion - import pyarrow as pa - if type(dt) == BooleanType: - arrow_type = pa.bool_() - 
elif type(dt) == ByteType: - arrow_type = pa.int8() - elif type(dt) == ShortType: - arrow_type = pa.int16() - elif type(dt) == IntegerType: - arrow_type = pa.int32() - elif type(dt) == LongType: - arrow_type = pa.int64() - elif type(dt) == FloatType: - arrow_type = pa.float32() - elif type(dt) == DoubleType: - arrow_type = pa.float64() - elif type(dt) == DecimalType: - arrow_type = pa.decimal128(dt.precision, dt.scale) - elif type(dt) == StringType: - arrow_type = pa.string() - elif type(dt) == BinaryType: - # TODO: remove version check once minimum pyarrow version is 0.10.0 - if LooseVersion(pa.__version__) < LooseVersion("0.10.0"): - raise TypeError("Unsupported type in conversion to Arrow: " + str(dt) + - "\nPlease install pyarrow >= 0.10.0 for BinaryType support.") - arrow_type = pa.binary() - elif type(dt) == DateType: - arrow_type = pa.date32() - elif type(dt) == TimestampType: - # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read - arrow_type = pa.timestamp('us', tz='UTC') - elif type(dt) == ArrayType: - if type(dt.elementType) == TimestampType: - raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) - arrow_type = pa.list_(to_arrow_type(dt.elementType)) - else: - raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) - return arrow_type - - -def to_arrow_schema(schema): - """ Convert a schema from Spark to Arrow - """ - import pyarrow as pa - fields = [pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable) - for field in schema] - return pa.schema(fields) - - -def from_arrow_type(at): - """ Convert pyarrow type to Spark data type. - """ - from distutils.version import LooseVersion - import pyarrow as pa - import pyarrow.types as types - if types.is_boolean(at): - spark_type = BooleanType() - elif types.is_int8(at): - spark_type = ByteType() - elif types.is_int16(at): - spark_type = ShortType() - elif types.is_int32(at): - spark_type = IntegerType() - elif types.is_int64(at): - spark_type = LongType() - elif types.is_float32(at): - spark_type = FloatType() - elif types.is_float64(at): - spark_type = DoubleType() - elif types.is_decimal(at): - spark_type = DecimalType(precision=at.precision, scale=at.scale) - elif types.is_string(at): - spark_type = StringType() - elif types.is_binary(at): - # TODO: remove version check once minimum pyarrow version is 0.10.0 - if LooseVersion(pa.__version__) < LooseVersion("0.10.0"): - raise TypeError("Unsupported type in conversion from Arrow: " + str(at) + - "\nPlease install pyarrow >= 0.10.0 for BinaryType support.") - spark_type = BinaryType() - elif types.is_date32(at): - spark_type = DateType() - elif types.is_timestamp(at): - spark_type = TimestampType() - elif types.is_list(at): - if types.is_timestamp(at.value_type): - raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) - spark_type = ArrayType(from_arrow_type(at.value_type)) - else: - raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) - return spark_type - - -def from_arrow_schema(arrow_schema): - """ Convert schema from Arrow to Spark. - """ - return StructType( - [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable) - for field in arrow_schema]) - - -def _check_series_convert_date(series, data_type): - """ - Cast the series to datetime.date if it's a date type, otherwise returns the original series. 
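# Illustrative round trip through the Arrow helpers defined above. These functions
# are internal to pyspark.sql.types and require pyarrow to be installed; shown here
# only as a sketch of the Spark <-> Arrow schema mapping.
from pyspark.sql.types import (StructType, StructField, IntegerType, StringType,
                               to_arrow_schema, from_arrow_schema)

spark_schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), True),
])
arrow_schema = to_arrow_schema(spark_schema)   # pyarrow schema: id int32 not null, name string
assert from_arrow_schema(arrow_schema) == spark_schema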
- - :param series: pandas.Series - :param data_type: a Spark data type for the series - """ - if type(data_type) == DateType: - return series.dt.date - else: - return series - - -def _check_dataframe_convert_date(pdf, schema): - """ Correct date type value to use datetime.date. - - Pandas DataFrame created from PyArrow uses datetime64[ns] for date type values, but we should - use datetime.date to match the behavior with when Arrow optimization is disabled. - - :param pdf: pandas.DataFrame - :param schema: a Spark schema of the pandas.DataFrame - """ - for field in schema: - pdf[field.name] = _check_series_convert_date(pdf[field.name], field.dataType) - return pdf - - -def _get_local_timezone(): - """ Get local timezone using pytz with environment variable, or dateutil. - - If there is a 'TZ' environment variable, pass it to pandas to use pytz and use it as timezone - string, otherwise use the special word 'dateutil/:' which means that pandas uses dateutil and - it reads system configuration to know the system local timezone. - - See also: - - https://github.com/pandas-dev/pandas/blob/0.19.x/pandas/tslib.pyx#L1753 - - https://github.com/dateutil/dateutil/blob/2.6.1/dateutil/tz/tz.py#L1338 - """ - import os - return os.environ.get('TZ', 'dateutil/:') - - -def _check_series_localize_timestamps(s, timezone): - """ - Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone. - - If the input series is not a timestamp series, then the same series is returned. If the input - series is a timestamp series, then a converted series is returned. - - :param s: pandas.Series - :param timezone: the timezone to convert. if None then use local timezone - :return pandas.Series that have been converted to tz-naive - """ - from pyspark.sql.utils import require_minimum_pandas_version - require_minimum_pandas_version() - - from pandas.api.types import is_datetime64tz_dtype - tz = timezone or _get_local_timezone() - # TODO: handle nested timestamps, such as ArrayType(TimestampType())? - if is_datetime64tz_dtype(s.dtype): - return s.dt.tz_convert(tz).dt.tz_localize(None) - else: - return s - - -def _check_dataframe_localize_timestamps(pdf, timezone): - """ - Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone - - :param pdf: pandas.DataFrame - :param timezone: the timezone to convert. if None then use local timezone - :return pandas.DataFrame where any timezone aware columns have been converted to tz-naive - """ - from pyspark.sql.utils import require_minimum_pandas_version - require_minimum_pandas_version() - - for column, series in pdf.iteritems(): - pdf[column] = _check_series_localize_timestamps(series, timezone) - return pdf - - -def _check_series_convert_timestamps_internal(s, timezone): - """ - Convert a tz-naive timestamp in the specified timezone or local timezone to UTC normalized for - Spark internal storage - - :param s: a pandas.Series - :param timezone: the timezone to convert. if None then use local timezone - :return pandas.Series where if it is a timestamp, has been UTC normalized without a time zone - """ - from pyspark.sql.utils import require_minimum_pandas_version - require_minimum_pandas_version() - - from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype - # TODO: handle nested timestamps, such as ArrayType(TimestampType())? 
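# Pandas-only sketch of the tz-aware -> tz-naive conversion performed by
# _check_series_localize_timestamps: convert to the target zone, then drop the tz.
import pandas as pd

s = pd.Series(pd.to_datetime(["2015-11-01 06:30:00"])).dt.tz_localize("UTC")
naive_local = s.dt.tz_convert("America/New_York").dt.tz_localize(None)
# 06:30 UTC on 2015-11-01 is 01:30 standard time in New York
assert str(naive_local[0]) == "2015-11-01 01:30:00"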
- if is_datetime64_dtype(s.dtype): - # When tz_localize a tz-naive timestamp, the result is ambiguous if the tz-naive - # timestamp is during the hour when the clock is adjusted backward during due to - # daylight saving time (dst). - # E.g., for America/New_York, the clock is adjusted backward on 2015-11-01 2:00 to - # 2015-11-01 1:00 from dst-time to standard time, and therefore, when tz_localize - # a tz-naive timestamp 2015-11-01 1:30 with America/New_York timezone, it can be either - # dst time (2015-01-01 1:30-0400) or standard time (2015-11-01 1:30-0500). - # - # Here we explicit choose to use standard time. This matches the default behavior of - # pytz. - # - # Here are some code to help understand this behavior: - # >>> import datetime - # >>> import pandas as pd - # >>> import pytz - # >>> - # >>> t = datetime.datetime(2015, 11, 1, 1, 30) - # >>> ts = pd.Series([t]) - # >>> tz = pytz.timezone('America/New_York') - # >>> - # >>> ts.dt.tz_localize(tz, ambiguous=True) - # 0 2015-11-01 01:30:00-04:00 - # dtype: datetime64[ns, America/New_York] - # >>> - # >>> ts.dt.tz_localize(tz, ambiguous=False) - # 0 2015-11-01 01:30:00-05:00 - # dtype: datetime64[ns, America/New_York] - # >>> - # >>> str(tz.localize(t)) - # '2015-11-01 01:30:00-05:00' - tz = timezone or _get_local_timezone() - return s.dt.tz_localize(tz, ambiguous=False).dt.tz_convert('UTC') - elif is_datetime64tz_dtype(s.dtype): - return s.dt.tz_convert('UTC') - else: - return s - - -def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone): - """ - Convert timestamp to timezone-naive in the specified timezone or local timezone - - :param s: a pandas.Series - :param from_timezone: the timezone to convert from. if None then use local timezone - :param to_timezone: the timezone to convert to. if None then use local timezone - :return pandas.Series where if it is a timestamp, has been converted to tz-naive - """ - from pyspark.sql.utils import require_minimum_pandas_version - require_minimum_pandas_version() - - import pandas as pd - from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype - from_tz = from_timezone or _get_local_timezone() - to_tz = to_timezone or _get_local_timezone() - # TODO: handle nested timestamps, such as ArrayType(TimestampType())? - if is_datetime64tz_dtype(s.dtype): - return s.dt.tz_convert(to_tz).dt.tz_localize(None) - elif is_datetime64_dtype(s.dtype) and from_tz != to_tz: - # `s.dt.tz_localize('tzlocal()')` doesn't work properly when including NaT. - return s.apply( - lambda ts: ts.tz_localize(from_tz, ambiguous=False).tz_convert(to_tz).tz_localize(None) - if ts is not pd.NaT else pd.NaT) - else: - return s - - -def _check_series_convert_timestamps_local_tz(s, timezone): - """ - Convert timestamp to timezone-naive in the specified timezone or local timezone - - :param s: a pandas.Series - :param timezone: the timezone to convert to. if None then use local timezone - :return pandas.Series where if it is a timestamp, has been converted to tz-naive - """ - return _check_series_convert_timestamps_localize(s, None, timezone) - - -def _check_series_convert_timestamps_tz_local(s, timezone): - """ - Convert timestamp to timezone-naive in the specified timezone or local timezone - - :param s: a pandas.Series - :param timezone: the timezone to convert from. 
if None then use local timezone - :return pandas.Series where if it is a timestamp, has been converted to tz-naive - """ - return _check_series_convert_timestamps_localize(s, timezone, None) - - -def _test(): - import doctest - from pyspark.context import SparkContext - from pyspark.sql import SparkSession - globs = globals() - sc = SparkContext('local[4]', 'PythonTest') - globs['sc'] = sc - globs['spark'] = SparkSession.builder.getOrCreate() - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/udf.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/udf.py deleted file mode 100644 index 58f4e0d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/udf.py +++ /dev/null @@ -1,414 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -""" -User-defined function related classes and functions -""" -import functools -import sys - -from pyspark import SparkContext, since -from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType, ignore_unicode_prefix -from pyspark.sql.column import Column, _to_java_column, _to_seq -from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string,\ - to_arrow_type, to_arrow_schema -from pyspark.util import _get_argspec - -__all__ = ["UDFRegistration"] - - -def _wrap_function(sc, func, returnType): - command = (func, returnType) - pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command) - return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec, - sc.pythonVer, broadcast_vars, sc._javaAccumulator) - - -def _create_udf(f, returnType, evalType): - - if evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, - PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, - PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF): - - from pyspark.sql.utils import require_minimum_pyarrow_version - require_minimum_pyarrow_version() - - argspec = _get_argspec(f) - - if evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF and len(argspec.args) == 0 and \ - argspec.varargs is None: - raise ValueError( - "Invalid function: 0-arg pandas_udfs are not supported. " - "Instead, create a 1-arg pandas_udf and ignore the arg in your function." 
- ) - - if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \ - and len(argspec.args) not in (1, 2): - raise ValueError( - "Invalid function: pandas_udfs with function type GROUPED_MAP " - "must take either one argument (data) or two arguments (key, data).") - - # Set the name of the UserDefinedFunction object to be the name of function f - udf_obj = UserDefinedFunction( - f, returnType=returnType, name=None, evalType=evalType, deterministic=True) - return udf_obj._wrapped() - - -class UserDefinedFunction(object): - """ - User defined function in Python - - .. versionadded:: 1.3 - """ - def __init__(self, func, - returnType=StringType(), - name=None, - evalType=PythonEvalType.SQL_BATCHED_UDF, - deterministic=True): - if not callable(func): - raise TypeError( - "Invalid function: not a function or callable (__call__ is not defined): " - "{0}".format(type(func))) - - if not isinstance(returnType, (DataType, str)): - raise TypeError( - "Invalid returnType: returnType should be DataType or str " - "but is {}".format(returnType)) - - if not isinstance(evalType, int): - raise TypeError( - "Invalid evalType: evalType should be an int but is {}".format(evalType)) - - self.func = func - self._returnType = returnType - # Stores UserDefinedPythonFunctions jobj, once initialized - self._returnType_placeholder = None - self._judf_placeholder = None - self._name = name or ( - func.__name__ if hasattr(func, '__name__') - else func.__class__.__name__) - self.evalType = evalType - self.deterministic = deterministic - - @property - def returnType(self): - # This makes sure this is called after SparkContext is initialized. - # ``_parse_datatype_string`` accesses to JVM for parsing a DDL formatted string. - if self._returnType_placeholder is None: - if isinstance(self._returnType, DataType): - self._returnType_placeholder = self._returnType - else: - self._returnType_placeholder = _parse_datatype_string(self._returnType) - - if self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF: - try: - to_arrow_type(self._returnType_placeholder) - except TypeError: - raise NotImplementedError( - "Invalid returnType with scalar Pandas UDFs: %s is " - "not supported" % str(self._returnType_placeholder)) - elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: - if isinstance(self._returnType_placeholder, StructType): - try: - to_arrow_schema(self._returnType_placeholder) - except TypeError: - raise NotImplementedError( - "Invalid returnType with grouped map Pandas UDFs: " - "%s is not supported" % str(self._returnType_placeholder)) - else: - raise TypeError("Invalid returnType for grouped map Pandas " - "UDFs: returnType must be a StructType.") - elif self.evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: - try: - to_arrow_type(self._returnType_placeholder) - except TypeError: - raise NotImplementedError( - "Invalid returnType with grouped aggregate Pandas UDFs: " - "%s is not supported" % str(self._returnType_placeholder)) - - return self._returnType_placeholder - - @property - def _judf(self): - # It is possible that concurrent access, to newly created UDF, - # will initialize multiple UserDefinedPythonFunctions. - # This is unlikely, doesn't affect correctness, - # and should have a minimal performance impact. 
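# Sketch of the two returnType forms handled above (assumes a SparkSession bound to
# `spark`): a DataType instance is used as-is, while a DDL-formatted string is parsed
# lazily via _parse_datatype_string once a SparkContext is available.
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

strlen_dt = udf(lambda s: len(s), IntegerType())
strlen_ddl = udf(lambda s: len(s), "int")        # same UDF, DDL-string return type

df = spark.createDataFrame([("spark",), ("arrow",)], ["word"])
df.select(strlen_dt(col("word")), strlen_ddl(col("word"))).show()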
- if self._judf_placeholder is None: - self._judf_placeholder = self._create_judf() - return self._judf_placeholder - - def _create_judf(self): - from pyspark.sql import SparkSession - - spark = SparkSession.builder.getOrCreate() - sc = spark.sparkContext - - wrapped_func = _wrap_function(sc, self.func, self.returnType) - jdt = spark._jsparkSession.parseDataType(self.returnType.json()) - judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( - self._name, wrapped_func, jdt, self.evalType, self.deterministic) - return judf - - def __call__(self, *cols): - judf = self._judf - sc = SparkContext._active_spark_context - return Column(judf.apply(_to_seq(sc, cols, _to_java_column))) - - # This function is for improving the online help system in the interactive interpreter. - # For example, the built-in help / pydoc.help. It wraps the UDF with the docstring and - # argument annotation. (See: SPARK-19161) - def _wrapped(self): - """ - Wrap this udf with a function and attach docstring from func - """ - - # It is possible for a callable instance without __name__ attribute or/and - # __module__ attribute to be wrapped here. For example, functools.partial. In this case, - # we should avoid wrapping the attributes from the wrapped function to the wrapper - # function. So, we take out these attribute names from the default names to set and - # then manually assign it after being wrapped. - assignments = tuple( - a for a in functools.WRAPPER_ASSIGNMENTS if a != '__name__' and a != '__module__') - - @functools.wraps(self.func, assigned=assignments) - def wrapper(*args): - return self(*args) - - wrapper.__name__ = self._name - wrapper.__module__ = (self.func.__module__ if hasattr(self.func, '__module__') - else self.func.__class__.__module__) - - wrapper.func = self.func - wrapper.returnType = self.returnType - wrapper.evalType = self.evalType - wrapper.deterministic = self.deterministic - wrapper.asNondeterministic = functools.wraps( - self.asNondeterministic)(lambda: self.asNondeterministic()._wrapped()) - return wrapper - - def asNondeterministic(self): - """ - Updates UserDefinedFunction to nondeterministic. - - .. versionadded:: 2.3 - """ - # Here, we explicitly clean the cache to create a JVM UDF instance - # with 'deterministic' updated. See SPARK-23233. - self._judf_placeholder = None - self.deterministic = False - return self - - -class UDFRegistration(object): - """ - Wrapper for user-defined function registration. This instance can be accessed by - :attr:`spark.udf` or :attr:`sqlContext.udf`. - - .. versionadded:: 1.3.1 - """ - - def __init__(self, sparkSession): - self.sparkSession = sparkSession - - @ignore_unicode_prefix - @since("1.3.1") - def register(self, name, f, returnType=None): - """Register a Python function (including lambda function) or a user-defined function - as a SQL function. - - :param name: name of the user-defined function in SQL statements. - :param f: a Python function, or a user-defined function. The user-defined function can - be either row-at-a-time or vectorized. See :meth:`pyspark.sql.functions.udf` and - :meth:`pyspark.sql.functions.pandas_udf`. - :param returnType: the return type of the registered user-defined function. The value can - be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. - :return: a user-defined function. - - To register a nondeterministic Python function, users need to first build - a nondeterministic user-defined function for the Python function and then register it - as a SQL function. 
- - `returnType` can be optionally specified when `f` is a Python function but not - when `f` is a user-defined function. Please see below. - - 1. When `f` is a Python function: - - `returnType` defaults to string type and can be optionally specified. The produced - object must match the specified type. In this case, this API works as if - `register(name, f, returnType=StringType())`. - - >>> strlen = spark.udf.register("stringLengthString", lambda x: len(x)) - >>> spark.sql("SELECT stringLengthString('test')").collect() - [Row(stringLengthString(test)=u'4')] - - >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect() - [Row(stringLengthString(text)=u'3')] - - >>> from pyspark.sql.types import IntegerType - >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) - >>> spark.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] - - >>> from pyspark.sql.types import IntegerType - >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) - >>> spark.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] - - 2. When `f` is a user-defined function: - - Spark uses the return type of the given user-defined function as the return type of - the registered user-defined function. `returnType` should not be specified. - In this case, this API works as if `register(name, f)`. - - >>> from pyspark.sql.types import IntegerType - >>> from pyspark.sql.functions import udf - >>> slen = udf(lambda s: len(s), IntegerType()) - >>> _ = spark.udf.register("slen", slen) - >>> spark.sql("SELECT slen('test')").collect() - [Row(slen(test)=4)] - - >>> import random - >>> from pyspark.sql.functions import udf - >>> from pyspark.sql.types import IntegerType - >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() - >>> new_random_udf = spark.udf.register("random_udf", random_udf) - >>> spark.sql("SELECT random_udf()").collect() # doctest: +SKIP - [Row(random_udf()=82)] - - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType - >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP - ... def add_one(x): - ... return x + 1 - ... - >>> _ = spark.udf.register("add_one", add_one) # doctest: +SKIP - >>> spark.sql("SELECT add_one(id) FROM range(3)").collect() # doctest: +SKIP - [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] - - >>> @pandas_udf("integer", PandasUDFType.GROUPED_AGG) # doctest: +SKIP - ... def sum_udf(v): - ... return v.sum() - ... - >>> _ = spark.udf.register("sum_udf", sum_udf) # doctest: +SKIP - >>> q = "SELECT sum_udf(v1) FROM VALUES (3, 0), (2, 0), (1, 1) tbl(v1, v2) GROUP BY v2" - >>> spark.sql(q).collect() # doctest: +SKIP - [Row(sum_udf(v1)=1), Row(sum_udf(v1)=5)] - - .. note:: Registration for a user-defined function (case 2.) was added from - Spark 2.3.0. - """ - - # This is to check whether the input function is from a user-defined function or - # Python function. - if hasattr(f, 'asNondeterministic'): - if returnType is not None: - raise TypeError( - "Invalid returnType: data type can not be specified when f is" - "a user-defined function, but got %s." 
% returnType) - if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF, - PythonEvalType.SQL_SCALAR_PANDAS_UDF, - PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]: - raise ValueError( - "Invalid f: f must be SQL_BATCHED_UDF, SQL_SCALAR_PANDAS_UDF or " - "SQL_GROUPED_AGG_PANDAS_UDF") - register_udf = UserDefinedFunction(f.func, returnType=f.returnType, name=name, - evalType=f.evalType, - deterministic=f.deterministic) - return_udf = f - else: - if returnType is None: - returnType = StringType() - register_udf = UserDefinedFunction(f, returnType=returnType, name=name, - evalType=PythonEvalType.SQL_BATCHED_UDF) - return_udf = register_udf._wrapped() - self.sparkSession._jsparkSession.udf().registerPython(name, register_udf._judf) - return return_udf - - @ignore_unicode_prefix - @since(2.3) - def registerJavaFunction(self, name, javaClassName, returnType=None): - """Register a Java user-defined function as a SQL function. - - In addition to a name and the function itself, the return type can be optionally specified. - When the return type is not specified we would infer it via reflection. - - :param name: name of the user-defined function - :param javaClassName: fully qualified name of java class - :param returnType: the return type of the registered Java function. The value can be either - a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. - - >>> from pyspark.sql.types import IntegerType - >>> spark.udf.registerJavaFunction( - ... "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType()) - >>> spark.sql("SELECT javaStringLength('test')").collect() - [Row(UDF:javaStringLength(test)=4)] - - >>> spark.udf.registerJavaFunction( - ... "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength") - >>> spark.sql("SELECT javaStringLength2('test')").collect() - [Row(UDF:javaStringLength2(test)=4)] - - >>> spark.udf.registerJavaFunction( - ... "javaStringLength3", "test.org.apache.spark.sql.JavaStringLength", "integer") - >>> spark.sql("SELECT javaStringLength3('test')").collect() - [Row(UDF:javaStringLength3(test)=4)] - """ - - jdt = None - if returnType is not None: - if not isinstance(returnType, DataType): - returnType = _parse_datatype_string(returnType) - jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) - self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) - - @ignore_unicode_prefix - @since(2.3) - def registerJavaUDAF(self, name, javaClassName): - """Register a Java user-defined aggregate function as a SQL function. 
- - :param name: name of the user-defined aggregate function - :param javaClassName: fully qualified name of java class - - >>> spark.udf.registerJavaUDAF("javaUDAF", "test.org.apache.spark.sql.MyDoubleAvg") - >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "a")],["id", "name"]) - >>> df.createOrReplaceTempView("df") - >>> spark.sql("SELECT name, javaUDAF(id) as avg from df group by name").collect() - [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)] - """ - - self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName) - - -def _test(): - import doctest - from pyspark.sql import SparkSession - import pyspark.sql.udf - globs = pyspark.sql.udf.__dict__.copy() - spark = SparkSession.builder\ - .master("local[4]")\ - .appName("sql.udf tests")\ - .getOrCreate() - globs['spark'] = spark - (failure_count, test_count) = doctest.testmod( - pyspark.sql.udf, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) - spark.stop() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/utils.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/utils.py deleted file mode 100644 index bdb3a14..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/utils.py +++ /dev/null @@ -1,194 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import py4j - - -class CapturedException(Exception): - def __init__(self, desc, stackTrace): - self.desc = desc - self.stackTrace = stackTrace - - def __str__(self): - return repr(self.desc) - - -class AnalysisException(CapturedException): - """ - Failed to analyze a SQL query plan. - """ - - -class ParseException(CapturedException): - """ - Failed to parse a SQL command. - """ - - -class IllegalArgumentException(CapturedException): - """ - Passed an illegal or inappropriate argument. - """ - - -class StreamingQueryException(CapturedException): - """ - Exception that stopped a :class:`StreamingQuery`. - """ - - -class QueryExecutionException(CapturedException): - """ - Failed to execute a query. 
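# Sketch of how these wrappers surface in user code (assumes a SparkSession bound to
# `spark`): once install_exception_handler() has patched py4j, an unresolved column
# raises AnalysisException rather than a raw Py4JJavaError.
from pyspark.sql.utils import AnalysisException

try:
    spark.sql("SELECT no_such_column FROM range(1)")
except AnalysisException as e:
    print("analysis failed:", e.desc)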
- """ - - -def capture_sql_exception(f): - def deco(*a, **kw): - try: - return f(*a, **kw) - except py4j.protocol.Py4JJavaError as e: - s = e.java_exception.toString() - stackTrace = '\n\t at '.join(map(lambda x: x.toString(), - e.java_exception.getStackTrace())) - if s.startswith('org.apache.spark.sql.AnalysisException: '): - raise AnalysisException(s.split(': ', 1)[1], stackTrace) - if s.startswith('org.apache.spark.sql.catalyst.analysis'): - raise AnalysisException(s.split(': ', 1)[1], stackTrace) - if s.startswith('org.apache.spark.sql.catalyst.parser.ParseException: '): - raise ParseException(s.split(': ', 1)[1], stackTrace) - if s.startswith('org.apache.spark.sql.streaming.StreamingQueryException: '): - raise StreamingQueryException(s.split(': ', 1)[1], stackTrace) - if s.startswith('org.apache.spark.sql.execution.QueryExecutionException: '): - raise QueryExecutionException(s.split(': ', 1)[1], stackTrace) - if s.startswith('java.lang.IllegalArgumentException: '): - raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace) - raise - return deco - - -def install_exception_handler(): - """ - Hook an exception handler into Py4j, which could capture some SQL exceptions in Java. - - When calling Java API, it will call `get_return_value` to parse the returned object. - If any exception happened in JVM, the result will be Java exception object, it raise - py4j.protocol.Py4JJavaError. We replace the original `get_return_value` with one that - could capture the Java exception and throw a Python one (with the same error message). - - It's idempotent, could be called multiple times. - """ - original = py4j.protocol.get_return_value - # The original `get_return_value` is not patched, it's idempotent. - patched = capture_sql_exception(original) - # only patch the one used in py4j.java_gateway (call Java API) - py4j.java_gateway.get_return_value = patched - - -def toJArray(gateway, jtype, arr): - """ - Convert python list to java type array - :param gateway: Py4j Gateway - :param jtype: java type of element in array - :param arr: python type list - """ - jarr = gateway.new_array(jtype, len(arr)) - for i in range(0, len(arr)): - jarr[i] = arr[i] - return jarr - - -def require_minimum_pandas_version(): - """ Raise ImportError if minimum version of Pandas is not installed - """ - # TODO(HyukjinKwon): Relocate and deduplicate the version specification. - minimum_pandas_version = "0.19.2" - - from distutils.version import LooseVersion - try: - import pandas - have_pandas = True - except ImportError: - have_pandas = False - if not have_pandas: - raise ImportError("Pandas >= %s must be installed; however, " - "it was not found." % minimum_pandas_version) - if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version): - raise ImportError("Pandas >= %s must be installed; however, " - "your version was %s." % (minimum_pandas_version, pandas.__version__)) - - -def require_minimum_pyarrow_version(): - """ Raise ImportError if minimum version of pyarrow is not installed - """ - # TODO(HyukjinKwon): Relocate and deduplicate the version specification. - minimum_pyarrow_version = "0.8.0" - - from distutils.version import LooseVersion - try: - import pyarrow - have_arrow = True - except ImportError: - have_arrow = False - if not have_arrow: - raise ImportError("PyArrow >= %s must be installed; however, " - "it was not found." 
% minimum_pyarrow_version) - if LooseVersion(pyarrow.__version__) < LooseVersion(minimum_pyarrow_version): - raise ImportError("PyArrow >= %s must be installed; however, " - "your version was %s." % (minimum_pyarrow_version, pyarrow.__version__)) - - -def require_test_compiled(): - """ Raise Exception if test classes are not compiled - """ - import os - import glob - try: - spark_home = os.environ['SPARK_HOME'] - except KeyError: - raise RuntimeError('SPARK_HOME is not defined in environment') - - test_class_path = os.path.join( - spark_home, 'sql', 'core', 'target', '*', 'test-classes') - paths = glob.glob(test_class_path) - - if len(paths) == 0: - raise RuntimeError( - "%s doesn't exist. Spark sql test classes are not compiled." % test_class_path) - - -class ForeachBatchFunction(object): - """ - This is the Python implementation of Java interface 'ForeachBatchFunction'. This wraps - the user-defined 'foreachBatch' function such that it can be called from the JVM when - the query is active. - """ - - def __init__(self, sql_ctx, func): - self.sql_ctx = sql_ctx - self.func = func - - def call(self, jdf, batch_id): - from pyspark.sql.dataframe import DataFrame - try: - self.func(DataFrame(jdf, self.sql_ctx), batch_id) - except Exception as e: - self.error = e - raise e - - class Java: - implements = ['org.apache.spark.sql.execution.streaming.sources.PythonForeachBatchFunction'] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/window.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/window.py deleted file mode 100644 index e76563d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/window.py +++ /dev/null @@ -1,241 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys - -from pyspark import since, SparkContext -from pyspark.sql.column import _to_seq, _to_java_column - -__all__ = ["Window", "WindowSpec"] - - -def _to_java_cols(cols): - sc = SparkContext._active_spark_context - if len(cols) == 1 and isinstance(cols[0], list): - cols = cols[0] - return _to_seq(sc, cols, _to_java_column) - - -class Window(object): - """ - Utility functions for defining window in DataFrames. - - For example: - - >>> # ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW - >>> window = Window.orderBy("date").rowsBetween(Window.unboundedPreceding, Window.currentRow) - - >>> # PARTITION BY country ORDER BY date RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING - >>> window = Window.orderBy("date").partitionBy("country").rangeBetween(-3, 3) - - .. note:: When ordering is not defined, an unbounded window frame (rowFrame, - unboundedPreceding, unboundedFollowing) is used by default. When ordering is defined, - a growing window frame (rangeFrame, unboundedPreceding, currentRow) is used by default. - - .. note:: Experimental - - .. 
versionadded:: 1.4 - """ - - _JAVA_MIN_LONG = -(1 << 63) # -9223372036854775808 - _JAVA_MAX_LONG = (1 << 63) - 1 # 9223372036854775807 - _PRECEDING_THRESHOLD = max(-sys.maxsize, _JAVA_MIN_LONG) - _FOLLOWING_THRESHOLD = min(sys.maxsize, _JAVA_MAX_LONG) - - unboundedPreceding = _JAVA_MIN_LONG - - unboundedFollowing = _JAVA_MAX_LONG - - currentRow = 0 - - @staticmethod - @since(1.4) - def partitionBy(*cols): - """ - Creates a :class:`WindowSpec` with the partitioning defined. - """ - sc = SparkContext._active_spark_context - jspec = sc._jvm.org.apache.spark.sql.expressions.Window.partitionBy(_to_java_cols(cols)) - return WindowSpec(jspec) - - @staticmethod - @since(1.4) - def orderBy(*cols): - """ - Creates a :class:`WindowSpec` with the ordering defined. - """ - sc = SparkContext._active_spark_context - jspec = sc._jvm.org.apache.spark.sql.expressions.Window.orderBy(_to_java_cols(cols)) - return WindowSpec(jspec) - - @staticmethod - @since(2.1) - def rowsBetween(start, end): - """ - Creates a :class:`WindowSpec` with the frame boundaries defined, - from `start` (inclusive) to `end` (inclusive). - - Both `start` and `end` are relative positions from the current row. - For example, "0" means "current row", while "-1" means the row before - the current row, and "5" means the fifth row after the current row. - - We recommend users use ``Window.unboundedPreceding``, ``Window.unboundedFollowing``, - and ``Window.currentRow`` to specify special boundary values, rather than using integral - values directly. - - :param start: boundary start, inclusive. - The frame is unbounded if this is ``Window.unboundedPreceding``, or - any value less than or equal to -9223372036854775808. - :param end: boundary end, inclusive. - The frame is unbounded if this is ``Window.unboundedFollowing``, or - any value greater than or equal to 9223372036854775807. - """ - if start <= Window._PRECEDING_THRESHOLD: - start = Window.unboundedPreceding - if end >= Window._FOLLOWING_THRESHOLD: - end = Window.unboundedFollowing - sc = SparkContext._active_spark_context - jspec = sc._jvm.org.apache.spark.sql.expressions.Window.rowsBetween(start, end) - return WindowSpec(jspec) - - @staticmethod - @since(2.1) - def rangeBetween(start, end): - """ - Creates a :class:`WindowSpec` with the frame boundaries defined, - from `start` (inclusive) to `end` (inclusive). - - Both `start` and `end` are relative from the current row. For example, - "0" means "current row", while "-1" means one off before the current row, - and "5" means the five off after the current row. - - We recommend users use ``Window.unboundedPreceding``, ``Window.unboundedFollowing``, - and ``Window.currentRow`` to specify special boundary values, rather than using integral - values directly. - - :param start: boundary start, inclusive. - The frame is unbounded if this is ``Window.unboundedPreceding``, or - any value less than or equal to max(-sys.maxsize, -9223372036854775808). - :param end: boundary end, inclusive. - The frame is unbounded if this is ``Window.unboundedFollowing``, or - any value greater than or equal to min(sys.maxsize, 9223372036854775807). 
- """ - if start <= Window._PRECEDING_THRESHOLD: - start = Window.unboundedPreceding - if end >= Window._FOLLOWING_THRESHOLD: - end = Window.unboundedFollowing - sc = SparkContext._active_spark_context - jspec = sc._jvm.org.apache.spark.sql.expressions.Window.rangeBetween(start, end) - return WindowSpec(jspec) - - -class WindowSpec(object): - """ - A window specification that defines the partitioning, ordering, - and frame boundaries. - - Use the static methods in :class:`Window` to create a :class:`WindowSpec`. - - .. note:: Experimental - - .. versionadded:: 1.4 - """ - - def __init__(self, jspec): - self._jspec = jspec - - @since(1.4) - def partitionBy(self, *cols): - """ - Defines the partitioning columns in a :class:`WindowSpec`. - - :param cols: names of columns or expressions - """ - return WindowSpec(self._jspec.partitionBy(_to_java_cols(cols))) - - @since(1.4) - def orderBy(self, *cols): - """ - Defines the ordering columns in a :class:`WindowSpec`. - - :param cols: names of columns or expressions - """ - return WindowSpec(self._jspec.orderBy(_to_java_cols(cols))) - - @since(1.4) - def rowsBetween(self, start, end): - """ - Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive). - - Both `start` and `end` are relative positions from the current row. - For example, "0" means "current row", while "-1" means the row before - the current row, and "5" means the fifth row after the current row. - - We recommend users use ``Window.unboundedPreceding``, ``Window.unboundedFollowing``, - and ``Window.currentRow`` to specify special boundary values, rather than using integral - values directly. - - :param start: boundary start, inclusive. - The frame is unbounded if this is ``Window.unboundedPreceding``, or - any value less than or equal to max(-sys.maxsize, -9223372036854775808). - :param end: boundary end, inclusive. - The frame is unbounded if this is ``Window.unboundedFollowing``, or - any value greater than or equal to min(sys.maxsize, 9223372036854775807). - """ - if start <= Window._PRECEDING_THRESHOLD: - start = Window.unboundedPreceding - if end >= Window._FOLLOWING_THRESHOLD: - end = Window.unboundedFollowing - return WindowSpec(self._jspec.rowsBetween(start, end)) - - @since(1.4) - def rangeBetween(self, start, end): - """ - Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive). - - Both `start` and `end` are relative from the current row. For example, - "0" means "current row", while "-1" means one off before the current row, - and "5" means the five off after the current row. - - We recommend users use ``Window.unboundedPreceding``, ``Window.unboundedFollowing``, - and ``Window.currentRow`` to specify special boundary values, rather than using integral - values directly. - - :param start: boundary start, inclusive. - The frame is unbounded if this is ``Window.unboundedPreceding``, or - any value less than or equal to max(-sys.maxsize, -9223372036854775808). - :param end: boundary end, inclusive. - The frame is unbounded if this is ``Window.unboundedFollowing``, or - any value greater than or equal to min(sys.maxsize, 9223372036854775807). 
- """ - if start <= Window._PRECEDING_THRESHOLD: - start = Window.unboundedPreceding - if end >= Window._FOLLOWING_THRESHOLD: - end = Window.unboundedFollowing - return WindowSpec(self._jspec.rangeBetween(start, end)) - - -def _test(): - import doctest - SparkContext('local[4]', 'PythonTest') - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/statcounter.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/statcounter.py deleted file mode 100644 index 03ea0b6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/statcounter.py +++ /dev/null @@ -1,158 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This file is ported from spark/util/StatCounter.scala - -import copy -import math - -try: - from numpy import maximum, minimum, sqrt -except ImportError: - maximum = max - minimum = min - sqrt = math.sqrt - - -class StatCounter(object): - - def __init__(self, values=None): - if values is None: - values = list() - self.n = 0 # Running count of our values - self.mu = 0.0 # Running mean of our values - self.m2 = 0.0 # Running variance numerator (sum of (x - mean)^2) - self.maxValue = float("-inf") - self.minValue = float("inf") - - for v in values: - self.merge(v) - - # Add a value into this StatCounter, updating the internal statistics. - def merge(self, value): - delta = value - self.mu - self.n += 1 - self.mu += delta / self.n - self.m2 += delta * (value - self.mu) - self.maxValue = maximum(self.maxValue, value) - self.minValue = minimum(self.minValue, value) - - return self - - # Merge another StatCounter into this one, adding up the internal statistics. 
- def mergeStats(self, other): - if not isinstance(other, StatCounter): - raise Exception("Can only merge Statcounters!") - - if other is self: # reference equality holds - self.merge(copy.deepcopy(other)) # Avoid overwriting fields in a weird order - else: - if self.n == 0: - self.mu = other.mu - self.m2 = other.m2 - self.n = other.n - self.maxValue = other.maxValue - self.minValue = other.minValue - - elif other.n != 0: - delta = other.mu - self.mu - if other.n * 10 < self.n: - self.mu = self.mu + (delta * other.n) / (self.n + other.n) - elif self.n * 10 < other.n: - self.mu = other.mu - (delta * self.n) / (self.n + other.n) - else: - self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n) - - self.maxValue = maximum(self.maxValue, other.maxValue) - self.minValue = minimum(self.minValue, other.minValue) - - self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n) - self.n += other.n - return self - - # Clone this StatCounter - def copy(self): - return copy.deepcopy(self) - - def count(self): - return int(self.n) - - def mean(self): - return self.mu - - def sum(self): - return self.n * self.mu - - def min(self): - return self.minValue - - def max(self): - return self.maxValue - - # Return the variance of the values. - def variance(self): - if self.n == 0: - return float('nan') - else: - return self.m2 / self.n - - # - # Return the sample variance, which corrects for bias in estimating the variance by dividing - # by N-1 instead of N. - # - def sampleVariance(self): - if self.n <= 1: - return float('nan') - else: - return self.m2 / (self.n - 1) - - # Return the standard deviation of the values. - def stdev(self): - return sqrt(self.variance()) - - # - # Return the sample standard deviation of the values, which corrects for bias in estimating the - # variance by dividing by N-1 instead of N. - # - def sampleStdev(self): - return sqrt(self.sampleVariance()) - - def asDict(self, sample=False): - """Returns the :class:`StatCounter` members as a ``dict``. - - >>> sc.parallelize([1., 2., 3., 4.]).stats().asDict() - {'count': 4L, - 'max': 4.0, - 'mean': 2.5, - 'min': 1.0, - 'stdev': 1.2909944487358056, - 'sum': 10.0, - 'variance': 1.6666666666666667} - """ - return { - 'count': self.count(), - 'mean': self.mean(), - 'sum': self.sum(), - 'min': self.min(), - 'max': self.max(), - 'stdev': self.stdev() if sample else self.sampleStdev(), - 'variance': self.variance() if sample else self.sampleVariance() - } - - def __repr__(self): - return ("(count: %s, mean: %s, stdev: %s, max: %s, min: %s)" % - (self.count(), self.mean(), self.stdev(), self.max(), self.min())) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/status.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/status.py deleted file mode 100644 index a6fa7dd..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/status.py +++ /dev/null @@ -1,96 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from collections import namedtuple - -__all__ = ["SparkJobInfo", "SparkStageInfo", "StatusTracker"] - - -class SparkJobInfo(namedtuple("SparkJobInfo", "jobId stageIds status")): - """ - Exposes information about Spark Jobs. - """ - - -class SparkStageInfo(namedtuple("SparkStageInfo", - "stageId currentAttemptId name numTasks numActiveTasks " - "numCompletedTasks numFailedTasks")): - """ - Exposes information about Spark Stages. - """ - - -class StatusTracker(object): - """ - Low-level status reporting APIs for monitoring job and stage progress. - - These APIs intentionally provide very weak consistency semantics; - consumers of these APIs should be prepared to handle empty / missing - information. For example, a job's stage ids may be known but the status - API may not have any information about the details of those stages, so - `getStageInfo` could potentially return `None` for a valid stage id. - - To limit memory usage, these APIs only provide information on recent - jobs / stages. These APIs will provide information for the last - `spark.ui.retainedStages` stages and `spark.ui.retainedJobs` jobs. - """ - def __init__(self, jtracker): - self._jtracker = jtracker - - def getJobIdsForGroup(self, jobGroup=None): - """ - Return a list of all known jobs in a particular job group. If - `jobGroup` is None, then returns all known jobs that are not - associated with a job group. - - The returned list may contain running, failed, and completed jobs, - and may vary across invocations of this method. This method does - not guarantee the order of the elements in its result. - """ - return list(self._jtracker.getJobIdsForGroup(jobGroup)) - - def getActiveStageIds(self): - """ - Returns an array containing the ids of all active stages. - """ - return sorted(list(self._jtracker.getActiveStageIds())) - - def getActiveJobsIds(self): - """ - Returns an array containing the ids of all active jobs. - """ - return sorted((list(self._jtracker.getActiveJobIds()))) - - def getJobInfo(self, jobId): - """ - Returns a :class:`SparkJobInfo` object, or None if the job info - could not be found or was garbage collected. - """ - job = self._jtracker.getJobInfo(jobId) - if job is not None: - return SparkJobInfo(jobId, job.stageIds(), str(job.status())) - - def getStageInfo(self, stageId): - """ - Returns a :class:`SparkStageInfo` object, or None if the stage - info could not be found or was garbage collected. - """ - stage = self._jtracker.getStageInfo(stageId) - if stage is not None: - # TODO: fetch them in batch for better performance - attrs = [getattr(stage, f)() for f in SparkStageInfo._fields[1:]] - return SparkStageInfo(stageId, *attrs) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/storagelevel.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/storagelevel.py deleted file mode 100644 index 7f29646..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/storagelevel.py +++ /dev/null @@ -1,71 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -__all__ = ["StorageLevel"] - - -class StorageLevel(object): - - """ - Flags for controlling the storage of an RDD. Each StorageLevel records whether to use memory, - whether to drop the RDD to disk if it falls out of memory, whether to keep the data in memory - in a JAVA-specific serialized format, and whether to replicate the RDD partitions on multiple - nodes. Also contains static constants for some commonly used storage levels, MEMORY_ONLY. - Since the data is always serialized on the Python side, all the constants use the serialized - formats. - """ - - def __init__(self, useDisk, useMemory, useOffHeap, deserialized, replication=1): - self.useDisk = useDisk - self.useMemory = useMemory - self.useOffHeap = useOffHeap - self.deserialized = deserialized - self.replication = replication - - def __repr__(self): - return "StorageLevel(%s, %s, %s, %s, %s)" % ( - self.useDisk, self.useMemory, self.useOffHeap, self.deserialized, self.replication) - - def __str__(self): - result = "" - result += "Disk " if self.useDisk else "" - result += "Memory " if self.useMemory else "" - result += "OffHeap " if self.useOffHeap else "" - result += "Deserialized " if self.deserialized else "Serialized " - result += "%sx Replicated" % self.replication - return result - -StorageLevel.DISK_ONLY = StorageLevel(True, False, False, False) -StorageLevel.DISK_ONLY_2 = StorageLevel(True, False, False, False, 2) -StorageLevel.MEMORY_ONLY = StorageLevel(False, True, False, False) -StorageLevel.MEMORY_ONLY_2 = StorageLevel(False, True, False, False, 2) -StorageLevel.MEMORY_AND_DISK = StorageLevel(True, True, False, False) -StorageLevel.MEMORY_AND_DISK_2 = StorageLevel(True, True, False, False, 2) -StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1) - -""" -.. note:: The following four storage level constants are deprecated in 2.0, since the records - will always be serialized in Python. -""" -StorageLevel.MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY -""".. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY`` instead.""" -StorageLevel.MEMORY_ONLY_SER_2 = StorageLevel.MEMORY_ONLY_2 -""".. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY_2`` instead.""" -StorageLevel.MEMORY_AND_DISK_SER = StorageLevel.MEMORY_AND_DISK -""".. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_AND_DISK`` instead.""" -StorageLevel.MEMORY_AND_DISK_SER_2 = StorageLevel.MEMORY_AND_DISK_2 -""".. 
note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_AND_DISK_2`` instead.""" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/__init__.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/__init__.py deleted file mode 100644 index 66e8f8e..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from pyspark.streaming.context import StreamingContext -from pyspark.streaming.dstream import DStream -from pyspark.streaming.listener import StreamingListener - -__all__ = ['StreamingContext', 'DStream', 'StreamingListener'] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/context.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/context.py deleted file mode 100644 index 3fa57ca..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/context.py +++ /dev/null @@ -1,356 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -import os -import sys - -from py4j.java_gateway import java_import, JavaObject - -from pyspark import RDD, SparkConf -from pyspark.serializers import NoOpSerializer, UTF8Deserializer, CloudPickleSerializer -from pyspark.context import SparkContext -from pyspark.storagelevel import StorageLevel -from pyspark.streaming.dstream import DStream -from pyspark.streaming.util import TransformFunction, TransformFunctionSerializer - -__all__ = ["StreamingContext"] - - -class StreamingContext(object): - """ - Main entry point for Spark Streaming functionality. A StreamingContext - represents the connection to a Spark cluster, and can be used to create - L{DStream} various input sources. It can be from an existing L{SparkContext}. - After creating and transforming DStreams, the streaming computation can - be started and stopped using `context.start()` and `context.stop()`, - respectively. 
`context.awaitTermination()` allows the current thread - to wait for the termination of the context by `stop()` or by an exception. - """ - _transformerSerializer = None - - # Reference to a currently active StreamingContext - _activeContext = None - - def __init__(self, sparkContext, batchDuration=None, jssc=None): - """ - Create a new StreamingContext. - - @param sparkContext: L{SparkContext} object. - @param batchDuration: the time interval (in seconds) at which streaming - data will be divided into batches - """ - - self._sc = sparkContext - self._jvm = self._sc._jvm - self._jssc = jssc or self._initialize_context(self._sc, batchDuration) - - def _initialize_context(self, sc, duration): - self._ensure_initialized() - return self._jvm.JavaStreamingContext(sc._jsc, self._jduration(duration)) - - def _jduration(self, seconds): - """ - Create Duration object given number of seconds - """ - return self._jvm.Duration(int(seconds * 1000)) - - @classmethod - def _ensure_initialized(cls): - SparkContext._ensure_initialized() - gw = SparkContext._gateway - - java_import(gw.jvm, "org.apache.spark.streaming.*") - java_import(gw.jvm, "org.apache.spark.streaming.api.java.*") - java_import(gw.jvm, "org.apache.spark.streaming.api.python.*") - - from pyspark.java_gateway import ensure_callback_server_started - ensure_callback_server_started(gw) - - # register serializer for TransformFunction - # it happens before creating SparkContext when loading from checkpointing - cls._transformerSerializer = TransformFunctionSerializer( - SparkContext._active_spark_context, CloudPickleSerializer(), gw) - - @classmethod - def getOrCreate(cls, checkpointPath, setupFunc): - """ - Either recreate a StreamingContext from checkpoint data or create a new StreamingContext. - If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be - recreated from the checkpoint data. If the data does not exist, then the provided setupFunc - will be used to create a new context. - - @param checkpointPath: Checkpoint directory used in an earlier streaming program - @param setupFunc: Function to create a new context and setup DStreams - """ - cls._ensure_initialized() - gw = SparkContext._gateway - - # Check whether valid checkpoint information exists in the given path - ssc_option = gw.jvm.StreamingContextPythonHelper().tryRecoverFromCheckpoint(checkpointPath) - if ssc_option.isEmpty(): - ssc = setupFunc() - ssc.checkpoint(checkpointPath) - return ssc - - jssc = gw.jvm.JavaStreamingContext(ssc_option.get()) - - # If there is already an active instance of Python SparkContext use it, or create a new one - if not SparkContext._active_spark_context: - jsc = jssc.sparkContext() - conf = SparkConf(_jconf=jsc.getConf()) - SparkContext(conf=conf, gateway=gw, jsc=jsc) - - sc = SparkContext._active_spark_context - - # update ctx in serializer - cls._transformerSerializer.ctx = sc - return StreamingContext(sc, None, jssc) - - @classmethod - def getActive(cls): - """ - Return either the currently active StreamingContext (i.e., if there is a context started - but not stopped) or None. 
- """ - activePythonContext = cls._activeContext - if activePythonContext is not None: - # Verify that the current running Java StreamingContext is active and is the same one - # backing the supposedly active Python context - activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode() - activeJvmContextOption = activePythonContext._jvm.StreamingContext.getActive() - - if activeJvmContextOption.isEmpty(): - cls._activeContext = None - elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId: - cls._activeContext = None - raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext " - "backing the action Python StreamingContext. This is unexpected.") - return cls._activeContext - - @classmethod - def getActiveOrCreate(cls, checkpointPath, setupFunc): - """ - Either return the active StreamingContext (i.e. currently started but not stopped), - or recreate a StreamingContext from checkpoint data or create a new StreamingContext - using the provided setupFunc function. If the checkpointPath is None or does not contain - valid checkpoint data, then setupFunc will be called to create a new context and setup - DStreams. - - @param checkpointPath: Checkpoint directory used in an earlier streaming program. Can be - None if the intention is to always create a new context when there - is no active context. - @param setupFunc: Function to create a new JavaStreamingContext and setup DStreams - """ - - if setupFunc is None: - raise Exception("setupFunc cannot be None") - activeContext = cls.getActive() - if activeContext is not None: - return activeContext - elif checkpointPath is not None: - return cls.getOrCreate(checkpointPath, setupFunc) - else: - return setupFunc() - - @property - def sparkContext(self): - """ - Return SparkContext which is associated with this StreamingContext. - """ - return self._sc - - def start(self): - """ - Start the execution of the streams. - """ - self._jssc.start() - StreamingContext._activeContext = self - - def awaitTermination(self, timeout=None): - """ - Wait for the execution to stop. - - @param timeout: time to wait in seconds - """ - if timeout is None: - self._jssc.awaitTermination() - else: - self._jssc.awaitTerminationOrTimeout(int(timeout * 1000)) - - def awaitTerminationOrTimeout(self, timeout): - """ - Wait for the execution to stop. Return `true` if it's stopped; or - throw the reported error during the execution; or `false` if the - waiting time elapsed before returning from the method. - - @param timeout: time to wait in seconds - """ - return self._jssc.awaitTerminationOrTimeout(int(timeout * 1000)) - - def stop(self, stopSparkContext=True, stopGraceFully=False): - """ - Stop the execution of the streams, with option of ensuring all - received data has been processed. - - @param stopSparkContext: Stop the associated SparkContext or not - @param stopGracefully: Stop gracefully by waiting for the processing - of all received data to be completed - """ - self._jssc.stop(stopSparkContext, stopGraceFully) - StreamingContext._activeContext = None - if stopSparkContext: - self._sc.stop() - - def remember(self, duration): - """ - Set each DStreams in this context to remember RDDs it generated - in the last given duration. DStreams remember RDDs only for a - limited duration of time and releases them for garbage collection. - This method allows the developer to specify how long to remember - the RDDs (if the developer wishes to query old data outside the - DStream computation). 
- - @param duration: Minimum duration (in seconds) that each DStream - should remember its RDDs - """ - self._jssc.remember(self._jduration(duration)) - - def checkpoint(self, directory): - """ - Sets the context to periodically checkpoint the DStream operations for master - fault-tolerance. The graph will be checkpointed every batch interval. - - @param directory: HDFS-compatible directory where the checkpoint data - will be reliably stored - """ - self._jssc.checkpoint(directory) - - def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_DISK_2): - """ - Create an input from TCP source hostname:port. Data is received using - a TCP socket and receive byte is interpreted as UTF8 encoded ``\\n`` delimited - lines. - - @param hostname: Hostname to connect to for receiving data - @param port: Port to connect to for receiving data - @param storageLevel: Storage level to use for storing the received objects - """ - jlevel = self._sc._getJavaStorageLevel(storageLevel) - return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self, - UTF8Deserializer()) - - def textFileStream(self, directory): - """ - Create an input stream that monitors a Hadoop-compatible file system - for new files and reads them as text files. Files must be wrriten to the - monitored directory by "moving" them from another location within the same - file system. File names starting with . are ignored. - """ - return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer()) - - def binaryRecordsStream(self, directory, recordLength): - """ - Create an input stream that monitors a Hadoop-compatible file system - for new files and reads them as flat binary files with records of - fixed length. Files must be written to the monitored directory by "moving" - them from another location within the same file system. - File names starting with . are ignored. - - @param directory: Directory to load data from - @param recordLength: Length of each record in bytes - """ - return DStream(self._jssc.binaryRecordsStream(directory, recordLength), self, - NoOpSerializer()) - - def _check_serializers(self, rdds): - # make sure they have same serializer - if len(set(rdd._jrdd_deserializer for rdd in rdds)) > 1: - for i in range(len(rdds)): - # reset them to sc.serializer - rdds[i] = rdds[i]._reserialize() - - def queueStream(self, rdds, oneAtATime=True, default=None): - """ - Create an input stream from a queue of RDDs or list. In each batch, - it will process either one or all of the RDDs returned by the queue. - - .. note:: Changes to the queue after the stream is created will not be recognized. - - @param rdds: Queue of RDDs - @param oneAtATime: pick one rdd each time or pick all of them once. 
- @param default: The default rdd if no more in rdds - """ - if default and not isinstance(default, RDD): - default = self._sc.parallelize(default) - - if not rdds and default: - rdds = [rdds] - - if rdds and not isinstance(rdds[0], RDD): - rdds = [self._sc.parallelize(input) for input in rdds] - self._check_serializers(rdds) - - queue = self._jvm.PythonDStream.toRDDQueue([r._jrdd for r in rdds]) - if default: - default = default._reserialize(rdds[0]._jrdd_deserializer) - jdstream = self._jssc.queueStream(queue, oneAtATime, default._jrdd) - else: - jdstream = self._jssc.queueStream(queue, oneAtATime) - return DStream(jdstream, self, rdds[0]._jrdd_deserializer) - - def transform(self, dstreams, transformFunc): - """ - Create a new DStream in which each RDD is generated by applying - a function on RDDs of the DStreams. The order of the JavaRDDs in - the transform function parameter will be the same as the order - of corresponding DStreams in the list. - """ - jdstreams = [d._jdstream for d in dstreams] - # change the final serializer to sc.serializer - func = TransformFunction(self._sc, - lambda t, *rdds: transformFunc(rdds), - *[d._jrdd_deserializer for d in dstreams]) - jfunc = self._jvm.TransformFunction(func) - jdstream = self._jssc.transform(jdstreams, jfunc) - return DStream(jdstream, self, self._sc.serializer) - - def union(self, *dstreams): - """ - Create a unified DStream from multiple DStreams of the same - type and same slide duration. - """ - if not dstreams: - raise ValueError("should have at least one DStream to union") - if len(dstreams) == 1: - return dstreams[0] - if len(set(s._jrdd_deserializer for s in dstreams)) > 1: - raise ValueError("All DStreams should have same serializer") - if len(set(s._slideDuration for s in dstreams)) > 1: - raise ValueError("All DStreams should have same slide duration") - first = dstreams[0] - jrest = [d._jdstream for d in dstreams[1:]] - return DStream(self._jssc.union(first._jdstream, jrest), self, first._jrdd_deserializer) - - def addStreamingListener(self, streamingListener): - """ - Add a [[org.apache.spark.streaming.scheduler.StreamingListener]] object for - receiving system events related to streaming. - """ - self._jssc.addStreamingListener(self._jvm.JavaStreamingListenerWrapper( - self._jvm.PythonStreamingListenerWrapper(streamingListener))) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/dstream.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/dstream.py deleted file mode 100644 index ce42a85..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/dstream.py +++ /dev/null @@ -1,647 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import sys -import operator -import time -from itertools import chain -from datetime import datetime - -if sys.version < "3": - from itertools import imap as map, ifilter as filter -else: - long = int - -from py4j.protocol import Py4JJavaError - -from pyspark import RDD -from pyspark.storagelevel import StorageLevel -from pyspark.streaming.util import rddToFileName, TransformFunction -from pyspark.rdd import portable_hash -from pyspark.resultiterable import ResultIterable - -__all__ = ["DStream"] - - -class DStream(object): - """ - A Discretized Stream (DStream), the basic abstraction in Spark Streaming, - is a continuous sequence of RDDs (of the same type) representing a - continuous stream of data (see L{RDD} in the Spark core documentation - for more details on RDDs). - - DStreams can either be created from live data (such as, data from TCP - sockets, Kafka, Flume, etc.) using a L{StreamingContext} or it can be - generated by transforming existing DStreams using operations such as - `map`, `window` and `reduceByKeyAndWindow`. While a Spark Streaming - program is running, each DStream periodically generates a RDD, either - from live data or by transforming the RDD generated by a parent DStream. - - DStreams internally is characterized by a few basic properties: - - A list of other DStreams that the DStream depends on - - A time interval at which the DStream generates an RDD - - A function that is used to generate an RDD after each time interval - """ - def __init__(self, jdstream, ssc, jrdd_deserializer): - self._jdstream = jdstream - self._ssc = ssc - self._sc = ssc._sc - self._jrdd_deserializer = jrdd_deserializer - self.is_cached = False - self.is_checkpointed = False - - def context(self): - """ - Return the StreamingContext associated with this DStream - """ - return self._ssc - - def count(self): - """ - Return a new DStream in which each RDD has a single element - generated by counting each RDD of this DStream. - """ - return self.mapPartitions(lambda i: [sum(1 for _ in i)]).reduce(operator.add) - - def filter(self, f): - """ - Return a new DStream containing only the elements that satisfy predicate. - """ - def func(iterator): - return filter(f, iterator) - return self.mapPartitions(func, True) - - def flatMap(self, f, preservesPartitioning=False): - """ - Return a new DStream by applying a function to all elements of - this DStream, and then flattening the results - """ - def func(s, iterator): - return chain.from_iterable(map(f, iterator)) - return self.mapPartitionsWithIndex(func, preservesPartitioning) - - def map(self, f, preservesPartitioning=False): - """ - Return a new DStream by applying a function to each element of DStream. - """ - def func(iterator): - return map(f, iterator) - return self.mapPartitions(func, preservesPartitioning) - - def mapPartitions(self, f, preservesPartitioning=False): - """ - Return a new DStream in which each RDD is generated by applying - mapPartitions() to each RDDs of this DStream. - """ - def func(s, iterator): - return f(iterator) - return self.mapPartitionsWithIndex(func, preservesPartitioning) - - def mapPartitionsWithIndex(self, f, preservesPartitioning=False): - """ - Return a new DStream in which each RDD is generated by applying - mapPartitionsWithIndex() to each RDDs of this DStream. - """ - return self.transform(lambda rdd: rdd.mapPartitionsWithIndex(f, preservesPartitioning)) - - def reduce(self, func): - """ - Return a new DStream in which each RDD has a single element - generated by reducing each RDD of this DStream. 
- """ - return self.map(lambda x: (None, x)).reduceByKey(func, 1).map(lambda x: x[1]) - - def reduceByKey(self, func, numPartitions=None): - """ - Return a new DStream by applying reduceByKey to each RDD. - """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - return self.combineByKey(lambda x: x, func, func, numPartitions) - - def combineByKey(self, createCombiner, mergeValue, mergeCombiners, - numPartitions=None): - """ - Return a new DStream by applying combineByKey to each RDD. - """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - - def func(rdd): - return rdd.combineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions) - return self.transform(func) - - def partitionBy(self, numPartitions, partitionFunc=portable_hash): - """ - Return a copy of the DStream in which each RDD are partitioned - using the specified partitioner. - """ - return self.transform(lambda rdd: rdd.partitionBy(numPartitions, partitionFunc)) - - def foreachRDD(self, func): - """ - Apply a function to each RDD in this DStream. - """ - if func.__code__.co_argcount == 1: - old_func = func - func = lambda t, rdd: old_func(rdd) - jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer) - api = self._ssc._jvm.PythonDStream - api.callForeachRDD(self._jdstream, jfunc) - - def pprint(self, num=10): - """ - Print the first num elements of each RDD generated in this DStream. - - @param num: the number of elements from the first will be printed. - """ - def takeAndPrint(time, rdd): - taken = rdd.take(num + 1) - print("-------------------------------------------") - print("Time: %s" % time) - print("-------------------------------------------") - for record in taken[:num]: - print(record) - if len(taken) > num: - print("...") - print("") - - self.foreachRDD(takeAndPrint) - - def mapValues(self, f): - """ - Return a new DStream by applying a map function to the value of - each key-value pairs in this DStream without changing the key. - """ - map_values_fn = lambda kv: (kv[0], f(kv[1])) - return self.map(map_values_fn, preservesPartitioning=True) - - def flatMapValues(self, f): - """ - Return a new DStream by applying a flatmap function to the value - of each key-value pairs in this DStream without changing the key. - """ - flat_map_fn = lambda kv: ((kv[0], x) for x in f(kv[1])) - return self.flatMap(flat_map_fn, preservesPartitioning=True) - - def glom(self): - """ - Return a new DStream in which RDD is generated by applying glom() - to RDD of this DStream. - """ - def func(iterator): - yield list(iterator) - return self.mapPartitions(func) - - def cache(self): - """ - Persist the RDDs of this DStream with the default storage level - (C{MEMORY_ONLY}). - """ - self.is_cached = True - self.persist(StorageLevel.MEMORY_ONLY) - return self - - def persist(self, storageLevel): - """ - Persist the RDDs of this DStream with the given storage level - """ - self.is_cached = True - javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) - self._jdstream.persist(javaStorageLevel) - return self - - def checkpoint(self, interval): - """ - Enable periodic checkpointing of RDDs of this DStream - - @param interval: time in seconds, after each period of that, generated - RDD will be checkpointed - """ - self.is_checkpointed = True - self._jdstream.checkpoint(self._ssc._jduration(interval)) - return self - - def groupByKey(self, numPartitions=None): - """ - Return a new DStream by applying groupByKey on each RDD. 
- """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - return self.transform(lambda rdd: rdd.groupByKey(numPartitions)) - - def countByValue(self): - """ - Return a new DStream in which each RDD contains the counts of each - distinct value in each RDD of this DStream. - """ - return self.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y) - - def saveAsTextFiles(self, prefix, suffix=None): - """ - Save each RDD in this DStream as at text file, using string - representation of elements. - """ - def saveAsTextFile(t, rdd): - path = rddToFileName(prefix, suffix, t) - try: - rdd.saveAsTextFile(path) - except Py4JJavaError as e: - # after recovered from checkpointing, the foreachRDD may - # be called twice - if 'FileAlreadyExistsException' not in str(e): - raise - return self.foreachRDD(saveAsTextFile) - - # TODO: uncomment this until we have ssc.pickleFileStream() - # def saveAsPickleFiles(self, prefix, suffix=None): - # """ - # Save each RDD in this DStream as at binary file, the elements are - # serialized by pickle. - # """ - # def saveAsPickleFile(t, rdd): - # path = rddToFileName(prefix, suffix, t) - # try: - # rdd.saveAsPickleFile(path) - # except Py4JJavaError as e: - # # after recovered from checkpointing, the foreachRDD may - # # be called twice - # if 'FileAlreadyExistsException' not in str(e): - # raise - # return self.foreachRDD(saveAsPickleFile) - - def transform(self, func): - """ - Return a new DStream in which each RDD is generated by applying a function - on each RDD of this DStream. - - `func` can have one argument of `rdd`, or have two arguments of - (`time`, `rdd`) - """ - if func.__code__.co_argcount == 1: - oldfunc = func - func = lambda t, rdd: oldfunc(rdd) - assert func.__code__.co_argcount == 2, "func should take one or two arguments" - return TransformedDStream(self, func) - - def transformWith(self, func, other, keepSerializer=False): - """ - Return a new DStream in which each RDD is generated by applying a function - on each RDD of this DStream and 'other' DStream. - - `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three - arguments of (`time`, `rdd_a`, `rdd_b`) - """ - if func.__code__.co_argcount == 2: - oldfunc = func - func = lambda t, a, b: oldfunc(a, b) - assert func.__code__.co_argcount == 3, "func should take two or three arguments" - jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer, other._jrdd_deserializer) - dstream = self._sc._jvm.PythonTransformed2DStream(self._jdstream.dstream(), - other._jdstream.dstream(), jfunc) - jrdd_serializer = self._jrdd_deserializer if keepSerializer else self._sc.serializer - return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer) - - def repartition(self, numPartitions): - """ - Return a new DStream with an increased or decreased level of parallelism. - """ - return self.transform(lambda rdd: rdd.repartition(numPartitions)) - - @property - def _slideDuration(self): - """ - Return the slideDuration in seconds of this DStream - """ - return self._jdstream.dstream().slideDuration().milliseconds() / 1000.0 - - def union(self, other): - """ - Return a new DStream by unifying data of another DStream with this DStream. - - @param other: Another DStream having the same interval (i.e., slideDuration) - as this DStream. 
- """ - if self._slideDuration != other._slideDuration: - raise ValueError("the two DStream should have same slide duration") - return self.transformWith(lambda a, b: a.union(b), other, True) - - def cogroup(self, other, numPartitions=None): - """ - Return a new DStream by applying 'cogroup' between RDDs of this - DStream and `other` DStream. - - Hash partitioning is used to generate the RDDs with `numPartitions` partitions. - """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - return self.transformWith(lambda a, b: a.cogroup(b, numPartitions), other) - - def join(self, other, numPartitions=None): - """ - Return a new DStream by applying 'join' between RDDs of this DStream and - `other` DStream. - - Hash partitioning is used to generate the RDDs with `numPartitions` - partitions. - """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - return self.transformWith(lambda a, b: a.join(b, numPartitions), other) - - def leftOuterJoin(self, other, numPartitions=None): - """ - Return a new DStream by applying 'left outer join' between RDDs of this DStream and - `other` DStream. - - Hash partitioning is used to generate the RDDs with `numPartitions` - partitions. - """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - return self.transformWith(lambda a, b: a.leftOuterJoin(b, numPartitions), other) - - def rightOuterJoin(self, other, numPartitions=None): - """ - Return a new DStream by applying 'right outer join' between RDDs of this DStream and - `other` DStream. - - Hash partitioning is used to generate the RDDs with `numPartitions` - partitions. - """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - return self.transformWith(lambda a, b: a.rightOuterJoin(b, numPartitions), other) - - def fullOuterJoin(self, other, numPartitions=None): - """ - Return a new DStream by applying 'full outer join' between RDDs of this DStream and - `other` DStream. - - Hash partitioning is used to generate the RDDs with `numPartitions` - partitions. - """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - return self.transformWith(lambda a, b: a.fullOuterJoin(b, numPartitions), other) - - def _jtime(self, timestamp): - """ Convert datetime or unix_timestamp into Time - """ - if isinstance(timestamp, datetime): - timestamp = time.mktime(timestamp.timetuple()) - return self._sc._jvm.Time(long(timestamp * 1000)) - - def slice(self, begin, end): - """ - Return all the RDDs between 'begin' to 'end' (both included) - - `begin`, `end` could be datetime.datetime() or unix_timestamp - """ - jrdds = self._jdstream.slice(self._jtime(begin), self._jtime(end)) - return [RDD(jrdd, self._sc, self._jrdd_deserializer) for jrdd in jrdds] - - def _validate_window_param(self, window, slide): - duration = self._jdstream.dstream().slideDuration().milliseconds() - if int(window * 1000) % duration != 0: - raise ValueError("windowDuration must be multiple of the slide duration (%d ms)" - % duration) - if slide and int(slide * 1000) % duration != 0: - raise ValueError("slideDuration must be multiple of the slide duration (%d ms)" - % duration) - - def window(self, windowDuration, slideDuration=None): - """ - Return a new DStream in which each RDD contains all the elements in seen in a - sliding window of time over this DStream. 
- - @param windowDuration: width of the window; must be a multiple of this DStream's - batching interval - @param slideDuration: sliding interval of the window (i.e., the interval after which - the new DStream will generate RDDs); must be a multiple of this - DStream's batching interval - """ - self._validate_window_param(windowDuration, slideDuration) - d = self._ssc._jduration(windowDuration) - if slideDuration is None: - return DStream(self._jdstream.window(d), self._ssc, self._jrdd_deserializer) - s = self._ssc._jduration(slideDuration) - return DStream(self._jdstream.window(d, s), self._ssc, self._jrdd_deserializer) - - def reduceByWindow(self, reduceFunc, invReduceFunc, windowDuration, slideDuration): - """ - Return a new DStream in which each RDD has a single element generated by reducing all - elements in a sliding window over this DStream. - - if `invReduceFunc` is not None, the reduction is done incrementally - using the old window's reduced value : - - 1. reduce the new values that entered the window (e.g., adding new counts) - - 2. "inverse reduce" the old values that left the window (e.g., subtracting old counts) - This is more efficient than `invReduceFunc` is None. - - @param reduceFunc: associative and commutative reduce function - @param invReduceFunc: inverse reduce function of `reduceFunc`; such that for all y, - and invertible x: - `invReduceFunc(reduceFunc(x, y), x) = y` - @param windowDuration: width of the window; must be a multiple of this DStream's - batching interval - @param slideDuration: sliding interval of the window (i.e., the interval after which - the new DStream will generate RDDs); must be a multiple of this - DStream's batching interval - """ - keyed = self.map(lambda x: (1, x)) - reduced = keyed.reduceByKeyAndWindow(reduceFunc, invReduceFunc, - windowDuration, slideDuration, 1) - return reduced.map(lambda kv: kv[1]) - - def countByWindow(self, windowDuration, slideDuration): - """ - Return a new DStream in which each RDD has a single element generated - by counting the number of elements in a window over this DStream. - windowDuration and slideDuration are as defined in the window() operation. - - This is equivalent to window(windowDuration, slideDuration).count(), - but will be more efficient if window is large. - """ - return self.map(lambda x: 1).reduceByWindow(operator.add, operator.sub, - windowDuration, slideDuration) - - def countByValueAndWindow(self, windowDuration, slideDuration, numPartitions=None): - """ - Return a new DStream in which each RDD contains the count of distinct elements in - RDDs in a sliding window over this DStream. - - @param windowDuration: width of the window; must be a multiple of this DStream's - batching interval - @param slideDuration: sliding interval of the window (i.e., the interval after which - the new DStream will generate RDDs); must be a multiple of this - DStream's batching interval - @param numPartitions: number of partitions of each RDD in the new DStream. - """ - keyed = self.map(lambda x: (x, 1)) - counted = keyed.reduceByKeyAndWindow(operator.add, operator.sub, - windowDuration, slideDuration, numPartitions) - return counted.filter(lambda kv: kv[1] > 0) - - def groupByKeyAndWindow(self, windowDuration, slideDuration, numPartitions=None): - """ - Return a new DStream by applying `groupByKey` over a sliding window. - Similar to `DStream.groupByKey()`, but applies it over a sliding window. 
- - @param windowDuration: width of the window; must be a multiple of this DStream's - batching interval - @param slideDuration: sliding interval of the window (i.e., the interval after which - the new DStream will generate RDDs); must be a multiple of this - DStream's batching interval - @param numPartitions: Number of partitions of each RDD in the new DStream. - """ - ls = self.mapValues(lambda x: [x]) - grouped = ls.reduceByKeyAndWindow(lambda a, b: a.extend(b) or a, lambda a, b: a[len(b):], - windowDuration, slideDuration, numPartitions) - return grouped.mapValues(ResultIterable) - - def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None, - numPartitions=None, filterFunc=None): - """ - Return a new DStream by applying incremental `reduceByKey` over a sliding window. - - The reduced value of over a new window is calculated using the old window's reduce value : - 1. reduce the new values that entered the window (e.g., adding new counts) - 2. "inverse reduce" the old values that left the window (e.g., subtracting old counts) - - `invFunc` can be None, then it will reduce all the RDDs in window, could be slower - than having `invFunc`. - - @param func: associative and commutative reduce function - @param invFunc: inverse function of `reduceFunc` - @param windowDuration: width of the window; must be a multiple of this DStream's - batching interval - @param slideDuration: sliding interval of the window (i.e., the interval after which - the new DStream will generate RDDs); must be a multiple of this - DStream's batching interval - @param numPartitions: number of partitions of each RDD in the new DStream. - @param filterFunc: function to filter expired key-value pairs; - only pairs that satisfy the function are retained - set this to null if you do not want to filter - """ - self._validate_window_param(windowDuration, slideDuration) - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - - reduced = self.reduceByKey(func, numPartitions) - - if invFunc: - def reduceFunc(t, a, b): - b = b.reduceByKey(func, numPartitions) - r = a.union(b).reduceByKey(func, numPartitions) if a else b - if filterFunc: - r = r.filter(filterFunc) - return r - - def invReduceFunc(t, a, b): - b = b.reduceByKey(func, numPartitions) - joined = a.leftOuterJoin(b, numPartitions) - return joined.mapValues(lambda kv: invFunc(kv[0], kv[1]) - if kv[1] is not None else kv[0]) - - jreduceFunc = TransformFunction(self._sc, reduceFunc, reduced._jrdd_deserializer) - jinvReduceFunc = TransformFunction(self._sc, invReduceFunc, reduced._jrdd_deserializer) - if slideDuration is None: - slideDuration = self._slideDuration - dstream = self._sc._jvm.PythonReducedWindowedDStream( - reduced._jdstream.dstream(), - jreduceFunc, jinvReduceFunc, - self._ssc._jduration(windowDuration), - self._ssc._jduration(slideDuration)) - return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer) - else: - return reduced.window(windowDuration, slideDuration).reduceByKey(func, numPartitions) - - def updateStateByKey(self, updateFunc, numPartitions=None, initialRDD=None): - """ - Return a new "state" DStream where the state for each key is updated by applying - the given function on the previous state of the key and the new values of the key. - - @param updateFunc: State update function. If this function returns None, then - corresponding state key-value pair will be eliminated. 
- """ - if numPartitions is None: - numPartitions = self._sc.defaultParallelism - - if initialRDD and not isinstance(initialRDD, RDD): - initialRDD = self._sc.parallelize(initialRDD) - - def reduceFunc(t, a, b): - if a is None: - g = b.groupByKey(numPartitions).mapValues(lambda vs: (list(vs), None)) - else: - g = a.cogroup(b.partitionBy(numPartitions), numPartitions) - g = g.mapValues(lambda ab: (list(ab[1]), list(ab[0])[0] if len(ab[0]) else None)) - state = g.mapValues(lambda vs_s: updateFunc(vs_s[0], vs_s[1])) - return state.filter(lambda k_v: k_v[1] is not None) - - jreduceFunc = TransformFunction(self._sc, reduceFunc, - self._sc.serializer, self._jrdd_deserializer) - if initialRDD: - initialRDD = initialRDD._reserialize(self._jrdd_deserializer) - dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc, - initialRDD._jrdd) - else: - dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc) - - return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer) - - -class TransformedDStream(DStream): - """ - TransformedDStream is a DStream generated by an Python function - transforming each RDD of a DStream to another RDDs. - - Multiple continuous transformations of DStream can be combined into - one transformation. - """ - def __init__(self, prev, func): - self._ssc = prev._ssc - self._sc = self._ssc._sc - self._jrdd_deserializer = self._sc.serializer - self.is_cached = False - self.is_checkpointed = False - self._jdstream_val = None - - # Using type() to avoid folding the functions and compacting the DStreams which is not - # not strictly an object of TransformedDStream. - # Changed here is to avoid bug in KafkaTransformedDStream when calling offsetRanges(). - if (type(prev) is TransformedDStream and - not prev.is_cached and not prev.is_checkpointed): - prev_func = prev.func - self.func = lambda t, rdd: func(t, prev_func(t, rdd)) - self.prev = prev.prev - else: - self.prev = prev - self.func = func - - @property - def _jdstream(self): - if self._jdstream_val is not None: - return self._jdstream_val - - jfunc = TransformFunction(self._sc, self.func, self.prev._jrdd_deserializer) - dstream = self._sc._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc) - self._jdstream_val = dstream.asJavaDStream() - return self._jdstream_val diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/flume.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/flume.py deleted file mode 100644 index 5de4481..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/flume.py +++ /dev/null @@ -1,156 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import sys -if sys.version >= "3": - from io import BytesIO -else: - from StringIO import StringIO -import warnings - -from py4j.protocol import Py4JJavaError - -from pyspark.storagelevel import StorageLevel -from pyspark.serializers import PairDeserializer, NoOpSerializer, UTF8Deserializer, read_int -from pyspark.streaming import DStream - -__all__ = ['FlumeUtils', 'utf8_decoder'] - - -def utf8_decoder(s): - """ Decode the unicode as UTF-8 """ - if s is None: - return None - return s.decode('utf-8') - - -class FlumeUtils(object): - - @staticmethod - def createStream(ssc, hostname, port, - storageLevel=StorageLevel.MEMORY_AND_DISK_2, - enableDecompression=False, - bodyDecoder=utf8_decoder): - """ - Create an input stream that pulls events from Flume. - - :param ssc: StreamingContext object - :param hostname: Hostname of the slave machine to which the flume data will be sent - :param port: Port of the slave machine to which the flume data will be sent - :param storageLevel: Storage level to use for storing the received objects - :param enableDecompression: Should netty server decompress input stream - :param bodyDecoder: A function used to decode body (default is utf8_decoder) - :return: A DStream object - - .. note:: Deprecated in 2.3.0. Flume support is deprecated as of Spark 2.3.0. - See SPARK-22142. - """ - warnings.warn( - "Deprecated in 2.3.0. Flume support is deprecated as of Spark 2.3.0. " - "See SPARK-22142.", - DeprecationWarning) - jlevel = ssc._sc._getJavaStorageLevel(storageLevel) - helper = FlumeUtils._get_helper(ssc._sc) - jstream = helper.createStream(ssc._jssc, hostname, port, jlevel, enableDecompression) - return FlumeUtils._toPythonDStream(ssc, jstream, bodyDecoder) - - @staticmethod - def createPollingStream(ssc, addresses, - storageLevel=StorageLevel.MEMORY_AND_DISK_2, - maxBatchSize=1000, - parallelism=5, - bodyDecoder=utf8_decoder): - """ - Creates an input stream that is to be used with the Spark Sink deployed on a Flume agent. - This stream will poll the sink for data and will pull events as they are available. - - :param ssc: StreamingContext object - :param addresses: List of (host, port)s on which the Spark Sink is running. - :param storageLevel: Storage level to use for storing the received objects - :param maxBatchSize: The maximum number of events to be pulled from the Spark sink - in a single RPC call - :param parallelism: Number of concurrent requests this stream should send to the sink. - Note that having a higher number of requests concurrently being pulled - will result in this stream using more threads - :param bodyDecoder: A function used to decode body (default is utf8_decoder) - :return: A DStream object - - .. note:: Deprecated in 2.3.0. Flume support is deprecated as of Spark 2.3.0. - See SPARK-22142. - """ - warnings.warn( - "Deprecated in 2.3.0. Flume support is deprecated as of Spark 2.3.0. 
" - "See SPARK-22142.", - DeprecationWarning) - jlevel = ssc._sc._getJavaStorageLevel(storageLevel) - hosts = [] - ports = [] - for (host, port) in addresses: - hosts.append(host) - ports.append(port) - helper = FlumeUtils._get_helper(ssc._sc) - jstream = helper.createPollingStream( - ssc._jssc, hosts, ports, jlevel, maxBatchSize, parallelism) - return FlumeUtils._toPythonDStream(ssc, jstream, bodyDecoder) - - @staticmethod - def _toPythonDStream(ssc, jstream, bodyDecoder): - ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) - stream = DStream(jstream, ssc, ser) - - def func(event): - headersBytes = BytesIO(event[0]) if sys.version >= "3" else StringIO(event[0]) - headers = {} - strSer = UTF8Deserializer() - for i in range(0, read_int(headersBytes)): - key = strSer.loads(headersBytes) - value = strSer.loads(headersBytes) - headers[key] = value - body = bodyDecoder(event[1]) - return (headers, body) - return stream.map(func) - - @staticmethod - def _get_helper(sc): - try: - return sc._jvm.org.apache.spark.streaming.flume.FlumeUtilsPythonHelper() - except TypeError as e: - if str(e) == "'JavaPackage' object is not callable": - FlumeUtils._printErrorMsg(sc) - raise - - @staticmethod - def _printErrorMsg(sc): - print(""" -________________________________________________________________________________________________ - - Spark Streaming's Flume libraries not found in class path. Try one of the following. - - 1. Include the Flume library and its dependencies with in the - spark-submit command as - - $ bin/spark-submit --packages org.apache.spark:spark-streaming-flume:%s ... - - 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, - Group Id = org.apache.spark, Artifact Id = spark-streaming-flume-assembly, Version = %s. - Then, include the jar in the spark-submit command as - - $ bin/spark-submit --jars ... - -________________________________________________________________________________________________ - -""" % (sc.version, sc.version)) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/kafka.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/kafka.py deleted file mode 100644 index ed2e0e7..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/kafka.py +++ /dev/null @@ -1,506 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import warnings - -from py4j.protocol import Py4JJavaError - -from pyspark.rdd import RDD -from pyspark.storagelevel import StorageLevel -from pyspark.serializers import AutoBatchedSerializer, PickleSerializer, PairDeserializer, \ - NoOpSerializer -from pyspark.streaming import DStream -from pyspark.streaming.dstream import TransformedDStream -from pyspark.streaming.util import TransformFunction - -__all__ = ['Broker', 'KafkaMessageAndMetadata', 'KafkaUtils', 'OffsetRange', - 'TopicAndPartition', 'utf8_decoder'] - - -def utf8_decoder(s): - """ Decode the unicode as UTF-8 """ - if s is None: - return None - return s.decode('utf-8') - - -class KafkaUtils(object): - - @staticmethod - def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None, - storageLevel=StorageLevel.MEMORY_AND_DISK_2, - keyDecoder=utf8_decoder, valueDecoder=utf8_decoder): - """ - Create an input stream that pulls messages from a Kafka Broker. - - :param ssc: StreamingContext object - :param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..). - :param groupId: The group id for this consumer. - :param topics: Dict of (topic_name -> numPartitions) to consume. - Each partition is consumed in its own thread. - :param kafkaParams: Additional params for Kafka - :param storageLevel: RDD storage level. - :param keyDecoder: A function used to decode key (default is utf8_decoder) - :param valueDecoder: A function used to decode value (default is utf8_decoder) - :return: A DStream object - - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - warnings.warn( - "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " - "See SPARK-21893.", - DeprecationWarning) - if kafkaParams is None: - kafkaParams = dict() - kafkaParams.update({ - "zookeeper.connect": zkQuorum, - "group.id": groupId, - "zookeeper.connection.timeout.ms": "10000", - }) - if not isinstance(topics, dict): - raise TypeError("topics should be dict") - jlevel = ssc._sc._getJavaStorageLevel(storageLevel) - helper = KafkaUtils._get_helper(ssc._sc) - jstream = helper.createStream(ssc._jssc, kafkaParams, topics, jlevel) - ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) - stream = DStream(jstream, ssc, ser) - return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1]))) - - @staticmethod - def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None, - keyDecoder=utf8_decoder, valueDecoder=utf8_decoder, - messageHandler=None): - """ - Create an input stream that directly pulls messages from a Kafka Broker and specific offset. - - This is not a receiver based Kafka input stream, it directly pulls the message from Kafka - in each batch duration and processed without storing. - - This does not use Zookeeper to store offsets. The consumed offsets are tracked - by the stream itself. For interoperability with Kafka monitoring tools that depend on - Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application. - You can access the offsets used in each batch from the generated RDDs (see - - To recover from driver failures, you have to enable checkpointing in the StreamingContext. - The information on consumed offset can be recovered from the checkpoint. - See the programming guide for details (constraints, etc.). - - :param ssc: StreamingContext object. - :param topics: list of topic_name to consume. - :param kafkaParams: Additional params for Kafka. 
- :param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting - point of the stream (a dictionary mapping `TopicAndPartition` to - integers). - :param keyDecoder: A function used to decode key (default is utf8_decoder). - :param valueDecoder: A function used to decode value (default is utf8_decoder). - :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can assess - meta using messageHandler (default is None). - :return: A DStream object - - .. note:: Experimental - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - warnings.warn( - "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " - "See SPARK-21893.", - DeprecationWarning) - if fromOffsets is None: - fromOffsets = dict() - if not isinstance(topics, list): - raise TypeError("topics should be list") - if not isinstance(kafkaParams, dict): - raise TypeError("kafkaParams should be dict") - - def funcWithoutMessageHandler(k_v): - return (keyDecoder(k_v[0]), valueDecoder(k_v[1])) - - def funcWithMessageHandler(m): - m._set_key_decoder(keyDecoder) - m._set_value_decoder(valueDecoder) - return messageHandler(m) - - helper = KafkaUtils._get_helper(ssc._sc) - - jfromOffsets = dict([(k._jTopicAndPartition(helper), - v) for (k, v) in fromOffsets.items()]) - if messageHandler is None: - ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) - func = funcWithoutMessageHandler - jstream = helper.createDirectStreamWithoutMessageHandler( - ssc._jssc, kafkaParams, set(topics), jfromOffsets) - else: - ser = AutoBatchedSerializer(PickleSerializer()) - func = funcWithMessageHandler - jstream = helper.createDirectStreamWithMessageHandler( - ssc._jssc, kafkaParams, set(topics), jfromOffsets) - - stream = DStream(jstream, ssc, ser).map(func) - return KafkaDStream(stream._jdstream, ssc, stream._jrdd_deserializer) - - @staticmethod - def createRDD(sc, kafkaParams, offsetRanges, leaders=None, - keyDecoder=utf8_decoder, valueDecoder=utf8_decoder, - messageHandler=None): - """ - Create an RDD from Kafka using offset ranges for each topic and partition. - - :param sc: SparkContext object - :param kafkaParams: Additional params for Kafka - :param offsetRanges: list of offsetRange to specify topic:partition:[start, end) to consume - :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty - map, in which case leaders will be looked up on the driver. - :param keyDecoder: A function used to decode key (default is utf8_decoder) - :param valueDecoder: A function used to decode value (default is utf8_decoder) - :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can assess - meta using messageHandler (default is None). - :return: An RDD object - - .. note:: Experimental - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - warnings.warn( - "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. 
" - "See SPARK-21893.", - DeprecationWarning) - if leaders is None: - leaders = dict() - if not isinstance(kafkaParams, dict): - raise TypeError("kafkaParams should be dict") - if not isinstance(offsetRanges, list): - raise TypeError("offsetRanges should be list") - - def funcWithoutMessageHandler(k_v): - return (keyDecoder(k_v[0]), valueDecoder(k_v[1])) - - def funcWithMessageHandler(m): - m._set_key_decoder(keyDecoder) - m._set_value_decoder(valueDecoder) - return messageHandler(m) - - helper = KafkaUtils._get_helper(sc) - - joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges] - jleaders = dict([(k._jTopicAndPartition(helper), - v._jBroker(helper)) for (k, v) in leaders.items()]) - if messageHandler is None: - jrdd = helper.createRDDWithoutMessageHandler( - sc._jsc, kafkaParams, joffsetRanges, jleaders) - ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) - rdd = RDD(jrdd, sc, ser).map(funcWithoutMessageHandler) - else: - jrdd = helper.createRDDWithMessageHandler( - sc._jsc, kafkaParams, joffsetRanges, jleaders) - rdd = RDD(jrdd, sc).map(funcWithMessageHandler) - - return KafkaRDD(rdd._jrdd, sc, rdd._jrdd_deserializer) - - @staticmethod - def _get_helper(sc): - try: - return sc._jvm.org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper() - except TypeError as e: - if str(e) == "'JavaPackage' object is not callable": - KafkaUtils._printErrorMsg(sc) - raise - - @staticmethod - def _printErrorMsg(sc): - print(""" -________________________________________________________________________________________________ - - Spark Streaming's Kafka libraries not found in class path. Try one of the following. - - 1. Include the Kafka library and its dependencies with in the - spark-submit command as - - $ bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8:%s ... - - 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, - Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-0-8-assembly, Version = %s. - Then, include the jar in the spark-submit command as - - $ bin/spark-submit --jars ... - -________________________________________________________________________________________________ - -""" % (sc.version, sc.version)) - - -class OffsetRange(object): - """ - Represents a range of offsets from a single Kafka TopicAndPartition. - - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - - def __init__(self, topic, partition, fromOffset, untilOffset): - """ - Create an OffsetRange to represent range of offsets - :param topic: Kafka topic name. - :param partition: Kafka partition id. - :param fromOffset: Inclusive starting offset. - :param untilOffset: Exclusive ending offset. - """ - warnings.warn( - "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. 
" - "See SPARK-21893.", - DeprecationWarning) - self.topic = topic - self.partition = partition - self.fromOffset = fromOffset - self.untilOffset = untilOffset - - def __eq__(self, other): - if isinstance(other, self.__class__): - return (self.topic == other.topic - and self.partition == other.partition - and self.fromOffset == other.fromOffset - and self.untilOffset == other.untilOffset) - else: - return False - - def __ne__(self, other): - return not self.__eq__(other) - - def __str__(self): - return "OffsetRange(topic: %s, partition: %d, range: [%d -> %d]" \ - % (self.topic, self.partition, self.fromOffset, self.untilOffset) - - def _jOffsetRange(self, helper): - return helper.createOffsetRange(self.topic, self.partition, self.fromOffset, - self.untilOffset) - - -class TopicAndPartition(object): - """ - Represents a specific topic and partition for Kafka. - - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - - def __init__(self, topic, partition): - """ - Create a Python TopicAndPartition to map to the Java related object - :param topic: Kafka topic name. - :param partition: Kafka partition id. - """ - warnings.warn( - "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " - "See SPARK-21893.", - DeprecationWarning) - self._topic = topic - self._partition = partition - - def _jTopicAndPartition(self, helper): - return helper.createTopicAndPartition(self._topic, self._partition) - - def __eq__(self, other): - if isinstance(other, self.__class__): - return (self._topic == other._topic - and self._partition == other._partition) - else: - return False - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - return (self._topic, self._partition).__hash__() - - -class Broker(object): - """ - Represent the host and port info for a Kafka broker. - - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - - def __init__(self, host, port): - """ - Create a Python Broker to map to the Java related object. - :param host: Broker's hostname. - :param port: Broker's port. - """ - warnings.warn( - "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " - "See SPARK-21893.", - DeprecationWarning) - self._host = host - self._port = port - - def _jBroker(self, helper): - return helper.createBroker(self._host, self._port) - - -class KafkaRDD(RDD): - """ - A Python wrapper of KafkaRDD, to provide additional information on normal RDD. - - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - - def __init__(self, jrdd, ctx, jrdd_deserializer): - warnings.warn( - "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " - "See SPARK-21893.", - DeprecationWarning) - RDD.__init__(self, jrdd, ctx, jrdd_deserializer) - - def offsetRanges(self): - """ - Get the OffsetRange of specific KafkaRDD. - :return: A list of OffsetRange - """ - helper = KafkaUtils._get_helper(self.ctx) - joffsetRanges = helper.offsetRangesOfKafkaRDD(self._jrdd.rdd()) - ranges = [OffsetRange(o.topic(), o.partition(), o.fromOffset(), o.untilOffset()) - for o in joffsetRanges] - return ranges - - -class KafkaDStream(DStream): - """ - A Python wrapper of KafkaDStream - - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - - def __init__(self, jdstream, ssc, jrdd_deserializer): - warnings.warn( - "Deprecated in 2.3.0. 
Kafka 0.8 support is deprecated as of Spark 2.3.0. " - "See SPARK-21893.", - DeprecationWarning) - DStream.__init__(self, jdstream, ssc, jrdd_deserializer) - - def foreachRDD(self, func): - """ - Apply a function to each RDD in this DStream. - """ - if func.__code__.co_argcount == 1: - old_func = func - func = lambda r, rdd: old_func(rdd) - jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer) \ - .rdd_wrapper(lambda jrdd, ctx, ser: KafkaRDD(jrdd, ctx, ser)) - api = self._ssc._jvm.PythonDStream - api.callForeachRDD(self._jdstream, jfunc) - - def transform(self, func): - """ - Return a new DStream in which each RDD is generated by applying a function - on each RDD of this DStream. - - `func` can have one argument of `rdd`, or have two arguments of - (`time`, `rdd`) - """ - if func.__code__.co_argcount == 1: - oldfunc = func - func = lambda t, rdd: oldfunc(rdd) - assert func.__code__.co_argcount == 2, "func should take one or two arguments" - - return KafkaTransformedDStream(self, func) - - -class KafkaTransformedDStream(TransformedDStream): - """ - Kafka specific wrapper of TransformedDStream to transform on Kafka RDD. - - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - - def __init__(self, prev, func): - warnings.warn( - "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " - "See SPARK-21893.", - DeprecationWarning) - TransformedDStream.__init__(self, prev, func) - - @property - def _jdstream(self): - if self._jdstream_val is not None: - return self._jdstream_val - - jfunc = TransformFunction(self._sc, self.func, self.prev._jrdd_deserializer) \ - .rdd_wrapper(lambda jrdd, ctx, ser: KafkaRDD(jrdd, ctx, ser)) - dstream = self._sc._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc) - self._jdstream_val = dstream.asJavaDStream() - return self._jdstream_val - - -class KafkaMessageAndMetadata(object): - """ - Kafka message and metadata information. Including topic, partition, offset and message - - .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. - See SPARK-21893. - """ - - def __init__(self, topic, partition, offset, key, message): - """ - Python wrapper of Kafka MessageAndMetadata - :param topic: topic name of this Kafka message - :param partition: partition id of this Kafka message - :param offset: Offset of this Kafka message in the specific partition - :param key: key payload of this Kafka message, can be null if this Kafka message has no key - specified, the return data is undecoded bytearry. - :param message: actual message payload of this Kafka message, the return data is - undecoded bytearray. - """ - warnings.warn( - "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. 
" - "See SPARK-21893.", - DeprecationWarning) - self.topic = topic - self.partition = partition - self.offset = offset - self._rawKey = key - self._rawMessage = message - self._keyDecoder = utf8_decoder - self._valueDecoder = utf8_decoder - - def __str__(self): - return "KafkaMessageAndMetadata(topic: %s, partition: %d, offset: %d, key and message...)" \ - % (self.topic, self.partition, self.offset) - - def __repr__(self): - return self.__str__() - - def __reduce__(self): - return (KafkaMessageAndMetadata, - (self.topic, self.partition, self.offset, self._rawKey, self._rawMessage)) - - def _set_key_decoder(self, decoder): - self._keyDecoder = decoder - - def _set_value_decoder(self, decoder): - self._valueDecoder = decoder - - @property - def key(self): - return self._keyDecoder(self._rawKey) - - @property - def message(self): - return self._valueDecoder(self._rawMessage) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/kinesis.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/kinesis.py deleted file mode 100644 index b839859..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/kinesis.py +++ /dev/null @@ -1,120 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from py4j.protocol import Py4JJavaError - -from pyspark.serializers import PairDeserializer, NoOpSerializer -from pyspark.storagelevel import StorageLevel -from pyspark.streaming import DStream - -__all__ = ['KinesisUtils', 'InitialPositionInStream', 'utf8_decoder'] - - -def utf8_decoder(s): - """ Decode the unicode as UTF-8 """ - if s is None: - return None - return s.decode('utf-8') - - -class KinesisUtils(object): - - @staticmethod - def createStream(ssc, kinesisAppName, streamName, endpointUrl, regionName, - initialPositionInStream, checkpointInterval, - storageLevel=StorageLevel.MEMORY_AND_DISK_2, - awsAccessKeyId=None, awsSecretKey=None, decoder=utf8_decoder, - stsAssumeRoleArn=None, stsSessionName=None, stsExternalId=None): - """ - Create an input stream that pulls messages from a Kinesis stream. This uses the - Kinesis Client Library (KCL) to pull messages from Kinesis. - - .. note:: The given AWS credentials will get saved in DStream checkpoints if checkpointing - is enabled. Make sure that your checkpoint directory is secure. 
- - :param ssc: StreamingContext object - :param kinesisAppName: Kinesis application name used by the Kinesis Client Library (KCL) to - update DynamoDB - :param streamName: Kinesis stream name - :param endpointUrl: Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) - :param regionName: Name of region used by the Kinesis Client Library (KCL) to update - DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - :param initialPositionInStream: In the absence of Kinesis checkpoint info, this is the - worker's initial starting position in the stream. The - values are either the beginning of the stream per Kinesis' - limit of 24 hours (InitialPositionInStream.TRIM_HORIZON) or - the tip of the stream (InitialPositionInStream.LATEST). - :param checkpointInterval: Checkpoint interval for Kinesis checkpointing. See the Kinesis - Spark Streaming documentation for more details on the different - types of checkpoints. - :param storageLevel: Storage level to use for storing the received objects (default is - StorageLevel.MEMORY_AND_DISK_2) - :param awsAccessKeyId: AWS AccessKeyId (default is None. If None, will use - DefaultAWSCredentialsProviderChain) - :param awsSecretKey: AWS SecretKey (default is None. If None, will use - DefaultAWSCredentialsProviderChain) - :param decoder: A function used to decode value (default is utf8_decoder) - :param stsAssumeRoleArn: ARN of IAM role to assume when using STS sessions to read from - the Kinesis stream (default is None). - :param stsSessionName: Name to uniquely identify STS sessions used to read from Kinesis - stream, if STS is being used (default is None). - :param stsExternalId: External ID that can be used to validate against the assumed IAM - role's trust policy, if STS is being used (default is None). - :return: A DStream object - """ - jlevel = ssc._sc._getJavaStorageLevel(storageLevel) - jduration = ssc._jduration(checkpointInterval) - - try: - # Use KinesisUtilsPythonHelper to access Scala's KinesisUtils - helper = ssc._jvm.org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper() - except TypeError as e: - if str(e) == "'JavaPackage' object is not callable": - KinesisUtils._printErrorMsg(ssc.sparkContext) - raise - jstream = helper.createStream(ssc._jssc, kinesisAppName, streamName, endpointUrl, - regionName, initialPositionInStream, jduration, jlevel, - awsAccessKeyId, awsSecretKey, stsAssumeRoleArn, - stsSessionName, stsExternalId) - stream = DStream(jstream, ssc, NoOpSerializer()) - return stream.map(lambda v: decoder(v)) - - @staticmethod - def _printErrorMsg(sc): - print(""" -________________________________________________________________________________________________ - - Spark Streaming's Kinesis libraries not found in class path. Try one of the following. - - 1. Include the Kinesis library and its dependencies with in the - spark-submit command as - - $ bin/spark-submit --packages org.apache.spark:spark-streaming-kinesis-asl:%s ... - - 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, - Group Id = org.apache.spark, Artifact Id = spark-streaming-kinesis-asl-assembly, Version = %s. - Then, include the jar in the spark-submit command as - - $ bin/spark-submit --jars ... 
- -________________________________________________________________________________________________ - -""" % (sc.version, sc.version)) - - -class InitialPositionInStream(object): - LATEST, TRIM_HORIZON = (0, 1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/listener.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/listener.py deleted file mode 100644 index d4ecc21..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/listener.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -__all__ = ["StreamingListener"] - - -class StreamingListener(object): - - def __init__(self): - pass - - def onStreamingStarted(self, streamingStarted): - """ - Called when the streaming has been started. - """ - pass - - def onReceiverStarted(self, receiverStarted): - """ - Called when a receiver has been started - """ - pass - - def onReceiverError(self, receiverError): - """ - Called when a receiver has reported an error - """ - pass - - def onReceiverStopped(self, receiverStopped): - """ - Called when a receiver has been stopped - """ - pass - - def onBatchSubmitted(self, batchSubmitted): - """ - Called when a batch of jobs has been submitted for processing. - """ - pass - - def onBatchStarted(self, batchStarted): - """ - Called when processing of a batch of jobs has started. - """ - pass - - def onBatchCompleted(self, batchCompleted): - """ - Called when processing of a batch of jobs has completed. - """ - pass - - def onOutputOperationStarted(self, outputOperationStarted): - """ - Called when processing of a job of a batch has started. - """ - pass - - def onOutputOperationCompleted(self, outputOperationCompleted): - """ - Called when processing of a job of a batch has completed - """ - pass - - class Java: - implements = ["org.apache.spark.streaming.api.java.PythonStreamingListener"] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/tests.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/tests.py deleted file mode 100644 index 5cef621..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/tests.py +++ /dev/null @@ -1,1640 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
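To illustrate the StreamingListener hooks shown above, a small listener that records completed batches; the class name and the way it is attached are illustrative.

    from pyspark.streaming.listener import StreamingListener

    class BatchLogger(StreamingListener):
        def __init__(self):
            super(BatchLogger, self).__init__()
            self.completed = []

        def onBatchCompleted(self, batchCompleted):
            # batchInfo() exposes scheduling/processing delays and per-stream
            # input info, as exercised by the listener tests below.
            self.completed.append(batchCompleted.batchInfo())

    # `ssc` is assumed to be a StreamingContext, as in the earlier sketches.
    ssc.addStreamingListener(BatchLogger())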
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import glob -import os -import sys -from itertools import chain -import time -import operator -import tempfile -import random -import struct -import shutil -from functools import reduce - -try: - import xmlrunner -except ImportError: - xmlrunner = None - -if sys.version_info[:2] <= (2, 6): - try: - import unittest2 as unittest - except ImportError: - sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') - sys.exit(1) -else: - import unittest - -if sys.version >= "3": - long = int - -from pyspark.context import SparkConf, SparkContext, RDD -from pyspark.storagelevel import StorageLevel -from pyspark.streaming.context import StreamingContext -from pyspark.streaming.kafka import Broker, KafkaUtils, OffsetRange, TopicAndPartition -from pyspark.streaming.flume import FlumeUtils -from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream -from pyspark.streaming.listener import StreamingListener - - -class PySparkStreamingTestCase(unittest.TestCase): - - timeout = 30 # seconds - duration = .5 - - @classmethod - def setUpClass(cls): - class_name = cls.__name__ - conf = SparkConf().set("spark.default.parallelism", 1) - cls.sc = SparkContext(appName=class_name, conf=conf) - cls.sc.setCheckpointDir(tempfile.mkdtemp()) - - @classmethod - def tearDownClass(cls): - cls.sc.stop() - # Clean up in the JVM just in case there has been some issues in Python API - try: - jSparkContextOption = SparkContext._jvm.SparkContext.get() - if jSparkContextOption.nonEmpty(): - jSparkContextOption.get().stop() - except: - pass - - def setUp(self): - self.ssc = StreamingContext(self.sc, self.duration) - - def tearDown(self): - if self.ssc is not None: - self.ssc.stop(False) - # Clean up in the JVM just in case there has been some issues in Python API - try: - jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive() - if jStreamingContextOption.nonEmpty(): - jStreamingContextOption.get().stop(False) - except: - pass - - def wait_for(self, result, n): - start_time = time.time() - while len(result) < n and time.time() - start_time < self.timeout: - time.sleep(0.01) - if len(result) < n: - print("timeout after", self.timeout) - - def _take(self, dstream, n): - """ - Return the first `n` elements in the stream (will start and stop). - """ - results = [] - - def take(_, rdd): - if rdd and len(results) < n: - results.extend(rdd.take(n - len(results))) - - dstream.foreachRDD(take) - - self.ssc.start() - self.wait_for(results, n) - return results - - def _collect(self, dstream, n, block=True): - """ - Collect each RDDs into the returned list. - - :return: list, which will have the collected items. - """ - result = [] - - def get_output(_, rdd): - if rdd and len(result) < n: - r = rdd.collect() - if r: - result.append(r) - - dstream.foreachRDD(get_output) - - if not block: - return result - - self.ssc.start() - self.wait_for(result, n) - return result - - def _test_func(self, input, func, expected, sort=False, input2=None): - """ - @param input: dataset for the test. This should be list of lists. - @param func: wrapped function. 
This function should return PythonDStream object. - @param expected: expected output for this testcase. - """ - if not isinstance(input[0], RDD): - input = [self.sc.parallelize(d, 1) for d in input] - input_stream = self.ssc.queueStream(input) - if input2 and not isinstance(input2[0], RDD): - input2 = [self.sc.parallelize(d, 1) for d in input2] - input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None - - # Apply test function to stream. - if input2: - stream = func(input_stream, input_stream2) - else: - stream = func(input_stream) - - result = self._collect(stream, len(expected)) - if sort: - self._sort_result_based_on_key(result) - self._sort_result_based_on_key(expected) - self.assertEqual(expected, result) - - def _sort_result_based_on_key(self, outputs): - """Sort the list based on first value.""" - for output in outputs: - output.sort(key=lambda x: x[0]) - - -class BasicOperationTests(PySparkStreamingTestCase): - - def test_map(self): - """Basic operation test for DStream.map.""" - input = [range(1, 5), range(5, 9), range(9, 13)] - - def func(dstream): - return dstream.map(str) - expected = [list(map(str, x)) for x in input] - self._test_func(input, func, expected) - - def test_flatMap(self): - """Basic operation test for DStream.flatMap.""" - input = [range(1, 5), range(5, 9), range(9, 13)] - - def func(dstream): - return dstream.flatMap(lambda x: (x, x * 2)) - expected = [list(chain.from_iterable((map(lambda y: [y, y * 2], x)))) - for x in input] - self._test_func(input, func, expected) - - def test_filter(self): - """Basic operation test for DStream.filter.""" - input = [range(1, 5), range(5, 9), range(9, 13)] - - def func(dstream): - return dstream.filter(lambda x: x % 2 == 0) - expected = [[y for y in x if y % 2 == 0] for x in input] - self._test_func(input, func, expected) - - def test_count(self): - """Basic operation test for DStream.count.""" - input = [range(5), range(10), range(20)] - - def func(dstream): - return dstream.count() - expected = [[len(x)] for x in input] - self._test_func(input, func, expected) - - def test_slice(self): - """Basic operation test for DStream.slice.""" - import datetime as dt - self.ssc = StreamingContext(self.sc, 1.0) - self.ssc.remember(4.0) - input = [[1], [2], [3], [4]] - stream = self.ssc.queueStream([self.sc.parallelize(d, 1) for d in input]) - - time_vals = [] - - def get_times(t, rdd): - if rdd and len(time_vals) < len(input): - time_vals.append(t) - - stream.foreachRDD(get_times) - - self.ssc.start() - self.wait_for(time_vals, 4) - begin_time = time_vals[0] - - def get_sliced(begin_delta, end_delta): - begin = begin_time + dt.timedelta(seconds=begin_delta) - end = begin_time + dt.timedelta(seconds=end_delta) - rdds = stream.slice(begin, end) - result_list = [rdd.collect() for rdd in rdds] - return [r for result in result_list for r in result] - - self.assertEqual(set([1]), set(get_sliced(0, 0))) - self.assertEqual(set([2, 3]), set(get_sliced(1, 2))) - self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4))) - self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4))) - - def test_reduce(self): - """Basic operation test for DStream.reduce.""" - input = [range(1, 5), range(5, 9), range(9, 13)] - - def func(dstream): - return dstream.reduce(operator.add) - expected = [[reduce(operator.add, x)] for x in input] - self._test_func(input, func, expected) - - def test_reduceByKey(self): - """Basic operation test for DStream.reduceByKey.""" - input = [[("a", 1), ("a", 1), ("b", 1), ("b", 1)], - [("", 1), ("", 1), ("", 1), 
("", 1)], - [(1, 1), (1, 1), (2, 1), (2, 1), (3, 1)]] - - def func(dstream): - return dstream.reduceByKey(operator.add) - expected = [[("a", 2), ("b", 2)], [("", 4)], [(1, 2), (2, 2), (3, 1)]] - self._test_func(input, func, expected, sort=True) - - def test_mapValues(self): - """Basic operation test for DStream.mapValues.""" - input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)], - [(0, 4), (1, 1), (2, 2), (3, 3)], - [(1, 1), (2, 1), (3, 1), (4, 1)]] - - def func(dstream): - return dstream.mapValues(lambda x: x + 10) - expected = [[("a", 12), ("b", 12), ("c", 11), ("d", 11)], - [(0, 14), (1, 11), (2, 12), (3, 13)], - [(1, 11), (2, 11), (3, 11), (4, 11)]] - self._test_func(input, func, expected, sort=True) - - def test_flatMapValues(self): - """Basic operation test for DStream.flatMapValues.""" - input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)], - [(0, 4), (1, 1), (2, 1), (3, 1)], - [(1, 1), (2, 1), (3, 1), (4, 1)]] - - def func(dstream): - return dstream.flatMapValues(lambda x: (x, x + 10)) - expected = [[("a", 2), ("a", 12), ("b", 2), ("b", 12), - ("c", 1), ("c", 11), ("d", 1), ("d", 11)], - [(0, 4), (0, 14), (1, 1), (1, 11), (2, 1), (2, 11), (3, 1), (3, 11)], - [(1, 1), (1, 11), (2, 1), (2, 11), (3, 1), (3, 11), (4, 1), (4, 11)]] - self._test_func(input, func, expected) - - def test_glom(self): - """Basic operation test for DStream.glom.""" - input = [range(1, 5), range(5, 9), range(9, 13)] - rdds = [self.sc.parallelize(r, 2) for r in input] - - def func(dstream): - return dstream.glom() - expected = [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]] - self._test_func(rdds, func, expected) - - def test_mapPartitions(self): - """Basic operation test for DStream.mapPartitions.""" - input = [range(1, 5), range(5, 9), range(9, 13)] - rdds = [self.sc.parallelize(r, 2) for r in input] - - def func(dstream): - def f(iterator): - yield sum(iterator) - return dstream.mapPartitions(f) - expected = [[3, 7], [11, 15], [19, 23]] - self._test_func(rdds, func, expected) - - def test_countByValue(self): - """Basic operation test for DStream.countByValue.""" - input = [list(range(1, 5)) * 2, list(range(5, 7)) + list(range(5, 9)), ["a", "a", "b", ""]] - - def func(dstream): - return dstream.countByValue() - expected = [[(1, 2), (2, 2), (3, 2), (4, 2)], - [(5, 2), (6, 2), (7, 1), (8, 1)], - [("a", 2), ("b", 1), ("", 1)]] - self._test_func(input, func, expected, sort=True) - - def test_groupByKey(self): - """Basic operation test for DStream.groupByKey.""" - input = [[(1, 1), (2, 1), (3, 1), (4, 1)], - [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)], - [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]] - - def func(dstream): - return dstream.groupByKey().mapValues(list) - - expected = [[(1, [1]), (2, [1]), (3, [1]), (4, [1])], - [(1, [1, 1, 1]), (2, [1, 1]), (3, [1])], - [("a", [1, 1]), ("b", [1]), ("", [1, 1, 1])]] - self._test_func(input, func, expected, sort=True) - - def test_combineByKey(self): - """Basic operation test for DStream.combineByKey.""" - input = [[(1, 1), (2, 1), (3, 1), (4, 1)], - [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)], - [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]] - - def func(dstream): - def add(a, b): - return a + str(b) - return dstream.combineByKey(str, add, add) - expected = [[(1, "1"), (2, "1"), (3, "1"), (4, "1")], - [(1, "111"), (2, "11"), (3, "1")], - [("a", "11"), ("b", "1"), ("", "111")]] - self._test_func(input, func, expected, sort=True) - - def test_repartition(self): - input = [range(1, 5), range(5, 9)] - rdds = [self.sc.parallelize(r, 
2) for r in input] - - def func(dstream): - return dstream.repartition(1).glom() - expected = [[[1, 2, 3, 4]], [[5, 6, 7, 8]]] - self._test_func(rdds, func, expected) - - def test_union(self): - input1 = [range(3), range(5), range(6)] - input2 = [range(3, 6), range(5, 6)] - - def func(d1, d2): - return d1.union(d2) - - expected = [list(range(6)), list(range(6)), list(range(6))] - self._test_func(input1, func, expected, input2=input2) - - def test_cogroup(self): - input = [[(1, 1), (2, 1), (3, 1)], - [(1, 1), (1, 1), (1, 1), (2, 1)], - [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1)]] - input2 = [[(1, 2)], - [(4, 1)], - [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 2)]] - - def func(d1, d2): - return d1.cogroup(d2).mapValues(lambda vs: tuple(map(list, vs))) - - expected = [[(1, ([1], [2])), (2, ([1], [])), (3, ([1], []))], - [(1, ([1, 1, 1], [])), (2, ([1], [])), (4, ([], [1]))], - [("a", ([1, 1], [1, 1])), ("b", ([1], [1])), ("", ([1, 1], [1, 2]))]] - self._test_func(input, func, expected, sort=True, input2=input2) - - def test_join(self): - input = [[('a', 1), ('b', 2)]] - input2 = [[('b', 3), ('c', 4)]] - - def func(a, b): - return a.join(b) - - expected = [[('b', (2, 3))]] - self._test_func(input, func, expected, True, input2) - - def test_left_outer_join(self): - input = [[('a', 1), ('b', 2)]] - input2 = [[('b', 3), ('c', 4)]] - - def func(a, b): - return a.leftOuterJoin(b) - - expected = [[('a', (1, None)), ('b', (2, 3))]] - self._test_func(input, func, expected, True, input2) - - def test_right_outer_join(self): - input = [[('a', 1), ('b', 2)]] - input2 = [[('b', 3), ('c', 4)]] - - def func(a, b): - return a.rightOuterJoin(b) - - expected = [[('b', (2, 3)), ('c', (None, 4))]] - self._test_func(input, func, expected, True, input2) - - def test_full_outer_join(self): - input = [[('a', 1), ('b', 2)]] - input2 = [[('b', 3), ('c', 4)]] - - def func(a, b): - return a.fullOuterJoin(b) - - expected = [[('a', (1, None)), ('b', (2, 3)), ('c', (None, 4))]] - self._test_func(input, func, expected, True, input2) - - def test_update_state_by_key(self): - - def updater(vs, s): - if not s: - s = [] - s.extend(vs) - return s - - input = [[('k', i)] for i in range(5)] - - def func(dstream): - return dstream.updateStateByKey(updater) - - expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]] - expected = [[('k', v)] for v in expected] - self._test_func(input, func, expected) - - def test_update_state_by_key_initial_rdd(self): - - def updater(vs, s): - if not s: - s = [] - s.extend(vs) - return s - - initial = [('k', [0, 1])] - initial = self.sc.parallelize(initial, 1) - - input = [[('k', i)] for i in range(2, 5)] - - def func(dstream): - return dstream.updateStateByKey(updater, initialRDD=initial) - - expected = [[0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]] - expected = [[('k', v)] for v in expected] - self._test_func(input, func, expected) - - def test_failed_func(self): - # Test failure in - # TransformFunction.apply(rdd: Option[RDD[_]], time: Time) - input = [self.sc.parallelize([d], 1) for d in range(4)] - input_stream = self.ssc.queueStream(input) - - def failed_func(i): - raise ValueError("This is a special error") - - input_stream.map(failed_func).pprint() - self.ssc.start() - try: - self.ssc.awaitTerminationOrTimeout(10) - except: - import traceback - failure = traceback.format_exc() - self.assertTrue("This is a special error" in failure) - return - - self.fail("a failed func should throw an error") - - def test_failed_func2(self): - # Test failure in - # TransformFunction.apply(rdd: 
Option[RDD[_]], rdd2: Option[RDD[_]], time: Time) - input = [self.sc.parallelize([d], 1) for d in range(4)] - input_stream1 = self.ssc.queueStream(input) - input_stream2 = self.ssc.queueStream(input) - - def failed_func(rdd1, rdd2): - raise ValueError("This is a special error") - - input_stream1.transformWith(failed_func, input_stream2, True).pprint() - self.ssc.start() - try: - self.ssc.awaitTerminationOrTimeout(10) - except: - import traceback - failure = traceback.format_exc() - self.assertTrue("This is a special error" in failure) - return - - self.fail("a failed func should throw an error") - - def test_failed_func_with_reseting_failure(self): - input = [self.sc.parallelize([d], 1) for d in range(4)] - input_stream = self.ssc.queueStream(input) - - def failed_func(i): - if i == 1: - # Make it fail in the second batch - raise ValueError("This is a special error") - else: - return i - - # We should be able to see the results of the 3rd and 4th batches even if the second batch - # fails - expected = [[0], [2], [3]] - self.assertEqual(expected, self._collect(input_stream.map(failed_func), 3)) - try: - self.ssc.awaitTerminationOrTimeout(10) - except: - import traceback - failure = traceback.format_exc() - self.assertTrue("This is a special error" in failure) - return - - self.fail("a failed func should throw an error") - - -class StreamingListenerTests(PySparkStreamingTestCase): - - duration = .5 - - class BatchInfoCollector(StreamingListener): - - def __init__(self): - super(StreamingListener, self).__init__() - self.batchInfosCompleted = [] - self.batchInfosStarted = [] - self.batchInfosSubmitted = [] - self.streamingStartedTime = [] - - def onStreamingStarted(self, streamingStarted): - self.streamingStartedTime.append(streamingStarted.time) - - def onBatchSubmitted(self, batchSubmitted): - self.batchInfosSubmitted.append(batchSubmitted.batchInfo()) - - def onBatchStarted(self, batchStarted): - self.batchInfosStarted.append(batchStarted.batchInfo()) - - def onBatchCompleted(self, batchCompleted): - self.batchInfosCompleted.append(batchCompleted.batchInfo()) - - def test_batch_info_reports(self): - batch_collector = self.BatchInfoCollector() - self.ssc.addStreamingListener(batch_collector) - input = [[1], [2], [3], [4]] - - def func(dstream): - return dstream.map(int) - expected = [[1], [2], [3], [4]] - self._test_func(input, func, expected) - - batchInfosSubmitted = batch_collector.batchInfosSubmitted - batchInfosStarted = batch_collector.batchInfosStarted - batchInfosCompleted = batch_collector.batchInfosCompleted - streamingStartedTime = batch_collector.streamingStartedTime - - self.wait_for(batchInfosCompleted, 4) - - self.assertEqual(len(streamingStartedTime), 1) - - self.assertGreaterEqual(len(batchInfosSubmitted), 4) - for info in batchInfosSubmitted: - self.assertGreaterEqual(info.batchTime().milliseconds(), 0) - self.assertGreaterEqual(info.submissionTime(), 0) - - for streamId in info.streamIdToInputInfo(): - streamInputInfo = info.streamIdToInputInfo()[streamId] - self.assertGreaterEqual(streamInputInfo.inputStreamId(), 0) - self.assertGreaterEqual(streamInputInfo.numRecords, 0) - for key in streamInputInfo.metadata(): - self.assertIsNotNone(streamInputInfo.metadata()[key]) - self.assertIsNotNone(streamInputInfo.metadataDescription()) - - for outputOpId in info.outputOperationInfos(): - outputInfo = info.outputOperationInfos()[outputOpId] - self.assertGreaterEqual(outputInfo.batchTime().milliseconds(), 0) - self.assertGreaterEqual(outputInfo.id(), 0) - 
self.assertIsNotNone(outputInfo.name()) - self.assertIsNotNone(outputInfo.description()) - self.assertGreaterEqual(outputInfo.startTime(), -1) - self.assertGreaterEqual(outputInfo.endTime(), -1) - self.assertIsNone(outputInfo.failureReason()) - - self.assertEqual(info.schedulingDelay(), -1) - self.assertEqual(info.processingDelay(), -1) - self.assertEqual(info.totalDelay(), -1) - self.assertEqual(info.numRecords(), 0) - - self.assertGreaterEqual(len(batchInfosStarted), 4) - for info in batchInfosStarted: - self.assertGreaterEqual(info.batchTime().milliseconds(), 0) - self.assertGreaterEqual(info.submissionTime(), 0) - - for streamId in info.streamIdToInputInfo(): - streamInputInfo = info.streamIdToInputInfo()[streamId] - self.assertGreaterEqual(streamInputInfo.inputStreamId(), 0) - self.assertGreaterEqual(streamInputInfo.numRecords, 0) - for key in streamInputInfo.metadata(): - self.assertIsNotNone(streamInputInfo.metadata()[key]) - self.assertIsNotNone(streamInputInfo.metadataDescription()) - - for outputOpId in info.outputOperationInfos(): - outputInfo = info.outputOperationInfos()[outputOpId] - self.assertGreaterEqual(outputInfo.batchTime().milliseconds(), 0) - self.assertGreaterEqual(outputInfo.id(), 0) - self.assertIsNotNone(outputInfo.name()) - self.assertIsNotNone(outputInfo.description()) - self.assertGreaterEqual(outputInfo.startTime(), -1) - self.assertGreaterEqual(outputInfo.endTime(), -1) - self.assertIsNone(outputInfo.failureReason()) - - self.assertGreaterEqual(info.schedulingDelay(), 0) - self.assertEqual(info.processingDelay(), -1) - self.assertEqual(info.totalDelay(), -1) - self.assertEqual(info.numRecords(), 0) - - self.assertGreaterEqual(len(batchInfosCompleted), 4) - for info in batchInfosCompleted: - self.assertGreaterEqual(info.batchTime().milliseconds(), 0) - self.assertGreaterEqual(info.submissionTime(), 0) - - for streamId in info.streamIdToInputInfo(): - streamInputInfo = info.streamIdToInputInfo()[streamId] - self.assertGreaterEqual(streamInputInfo.inputStreamId(), 0) - self.assertGreaterEqual(streamInputInfo.numRecords, 0) - for key in streamInputInfo.metadata(): - self.assertIsNotNone(streamInputInfo.metadata()[key]) - self.assertIsNotNone(streamInputInfo.metadataDescription()) - - for outputOpId in info.outputOperationInfos(): - outputInfo = info.outputOperationInfos()[outputOpId] - self.assertGreaterEqual(outputInfo.batchTime().milliseconds(), 0) - self.assertGreaterEqual(outputInfo.id(), 0) - self.assertIsNotNone(outputInfo.name()) - self.assertIsNotNone(outputInfo.description()) - self.assertGreaterEqual(outputInfo.startTime(), 0) - self.assertGreaterEqual(outputInfo.endTime(), 0) - self.assertIsNone(outputInfo.failureReason()) - - self.assertGreaterEqual(info.schedulingDelay(), 0) - self.assertGreaterEqual(info.processingDelay(), 0) - self.assertGreaterEqual(info.totalDelay(), 0) - self.assertEqual(info.numRecords(), 0) - - -class WindowFunctionTests(PySparkStreamingTestCase): - - timeout = 15 - - def test_window(self): - input = [range(1), range(2), range(3), range(4), range(5)] - - def func(dstream): - return dstream.window(1.5, .5).count() - - expected = [[1], [3], [6], [9], [12], [9], [5]] - self._test_func(input, func, expected) - - def test_count_by_window(self): - input = [range(1), range(2), range(3), range(4), range(5)] - - def func(dstream): - return dstream.countByWindow(1.5, .5) - - expected = [[1], [3], [6], [9], [12], [9], [5]] - self._test_func(input, func, expected) - - def test_count_by_window_large(self): - input = [range(1), range(2), 
range(3), range(4), range(5), range(6)] - - def func(dstream): - return dstream.countByWindow(2.5, .5) - - expected = [[1], [3], [6], [10], [15], [20], [18], [15], [11], [6]] - self._test_func(input, func, expected) - - def test_count_by_value_and_window(self): - input = [range(1), range(2), range(3), range(4), range(5), range(6)] - - def func(dstream): - return dstream.countByValueAndWindow(2.5, .5) - - expected = [[(0, 1)], - [(0, 2), (1, 1)], - [(0, 3), (1, 2), (2, 1)], - [(0, 4), (1, 3), (2, 2), (3, 1)], - [(0, 5), (1, 4), (2, 3), (3, 2), (4, 1)], - [(0, 5), (1, 5), (2, 4), (3, 3), (4, 2), (5, 1)], - [(0, 4), (1, 4), (2, 4), (3, 3), (4, 2), (5, 1)], - [(0, 3), (1, 3), (2, 3), (3, 3), (4, 2), (5, 1)], - [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, 1)], - [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]] - self._test_func(input, func, expected) - - def test_group_by_key_and_window(self): - input = [[('a', i)] for i in range(5)] - - def func(dstream): - return dstream.groupByKeyAndWindow(1.5, .5).mapValues(list) - - expected = [[('a', [0])], [('a', [0, 1])], [('a', [0, 1, 2])], [('a', [1, 2, 3])], - [('a', [2, 3, 4])], [('a', [3, 4])], [('a', [4])]] - self._test_func(input, func, expected) - - def test_reduce_by_invalid_window(self): - input1 = [range(3), range(5), range(1), range(6)] - d1 = self.ssc.queueStream(input1) - self.assertRaises(ValueError, lambda: d1.reduceByKeyAndWindow(None, None, 0.1, 0.1)) - self.assertRaises(ValueError, lambda: d1.reduceByKeyAndWindow(None, None, 1, 0.1)) - - def test_reduce_by_key_and_window_with_none_invFunc(self): - input = [range(1), range(2), range(3), range(4), range(5), range(6)] - - def func(dstream): - return dstream.map(lambda x: (x, 1))\ - .reduceByKeyAndWindow(operator.add, None, 5, 1)\ - .filter(lambda kv: kv[1] > 0).count() - - expected = [[2], [4], [6], [6], [6], [6]] - self._test_func(input, func, expected) - - -class StreamingContextTests(PySparkStreamingTestCase): - - duration = 0.1 - setupCalled = False - - def _add_input_stream(self): - inputs = [range(1, x) for x in range(101)] - stream = self.ssc.queueStream(inputs) - self._collect(stream, 1, block=False) - - def test_stop_only_streaming_context(self): - self._add_input_stream() - self.ssc.start() - self.ssc.stop(False) - self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5) - - def test_stop_multiple_times(self): - self._add_input_stream() - self.ssc.start() - self.ssc.stop(False) - self.ssc.stop(False) - - def test_queue_stream(self): - input = [list(range(i + 1)) for i in range(3)] - dstream = self.ssc.queueStream(input) - result = self._collect(dstream, 3) - self.assertEqual(input, result) - - def test_text_file_stream(self): - d = tempfile.mkdtemp() - self.ssc = StreamingContext(self.sc, self.duration) - dstream2 = self.ssc.textFileStream(d).map(int) - result = self._collect(dstream2, 2, block=False) - self.ssc.start() - for name in ('a', 'b'): - time.sleep(1) - with open(os.path.join(d, name), "w") as f: - f.writelines(["%d\n" % i for i in range(10)]) - self.wait_for(result, 2) - self.assertEqual([list(range(10)), list(range(10))], result) - - def test_binary_records_stream(self): - d = tempfile.mkdtemp() - self.ssc = StreamingContext(self.sc, self.duration) - dstream = self.ssc.binaryRecordsStream(d, 10).map( - lambda v: struct.unpack("10b", bytes(v))) - result = self._collect(dstream, 2, block=False) - self.ssc.start() - for name in ('a', 'b'): - time.sleep(1) - with open(os.path.join(d, name), "wb") as f: - f.write(bytearray(range(10))) - 
self.wait_for(result, 2) - self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result]) - - def test_union(self): - input = [list(range(i + 1)) for i in range(3)] - dstream = self.ssc.queueStream(input) - dstream2 = self.ssc.queueStream(input) - dstream3 = self.ssc.union(dstream, dstream2) - result = self._collect(dstream3, 3) - expected = [i * 2 for i in input] - self.assertEqual(expected, result) - - def test_transform(self): - dstream1 = self.ssc.queueStream([[1]]) - dstream2 = self.ssc.queueStream([[2]]) - dstream3 = self.ssc.queueStream([[3]]) - - def func(rdds): - rdd1, rdd2, rdd3 = rdds - return rdd2.union(rdd3).union(rdd1) - - dstream = self.ssc.transform([dstream1, dstream2, dstream3], func) - - self.assertEqual([2, 3, 1], self._take(dstream, 3)) - - def test_transform_pairrdd(self): - # This regression test case is for SPARK-17756. - dstream = self.ssc.queueStream( - [[1], [2], [3]]).transform(lambda rdd: rdd.cartesian(rdd)) - self.assertEqual([(1, 1), (2, 2), (3, 3)], self._take(dstream, 3)) - - def test_get_active(self): - self.assertEqual(StreamingContext.getActive(), None) - - # Verify that getActive() returns the active context - self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) - self.ssc.start() - self.assertEqual(StreamingContext.getActive(), self.ssc) - - # Verify that getActive() returns None - self.ssc.stop(False) - self.assertEqual(StreamingContext.getActive(), None) - - # Verify that if the Java context is stopped, then getActive() returns None - self.ssc = StreamingContext(self.sc, self.duration) - self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) - self.ssc.start() - self.assertEqual(StreamingContext.getActive(), self.ssc) - self.ssc._jssc.stop(False) - self.assertEqual(StreamingContext.getActive(), None) - - def test_get_active_or_create(self): - # Test StreamingContext.getActiveOrCreate() without checkpoint data - # See CheckpointTests for tests with checkpoint data - self.ssc = None - self.assertEqual(StreamingContext.getActive(), None) - - def setupFunc(): - ssc = StreamingContext(self.sc, self.duration) - ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) - self.setupCalled = True - return ssc - - # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active - self.setupCalled = False - self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc) - self.assertTrue(self.setupCalled) - - # Verify that getActiveOrCreate() returns active context and does not call the setupFunc - self.ssc.start() - self.setupCalled = False - self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc) - self.assertFalse(self.setupCalled) - - # Verify that getActiveOrCreate() calls setupFunc after active context is stopped - self.ssc.stop(False) - self.setupCalled = False - self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc) - self.assertTrue(self.setupCalled) - - # Verify that if the Java context is stopped, then getActive() returns None - self.ssc = StreamingContext(self.sc, self.duration) - self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) - self.ssc.start() - self.assertEqual(StreamingContext.getActive(), self.ssc) - self.ssc._jssc.stop(False) - self.setupCalled = False - self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc) - self.assertTrue(self.setupCalled) - - def test_await_termination_or_timeout(self): - self._add_input_stream() - self.ssc.start() - self.assertFalse(self.ssc.awaitTerminationOrTimeout(0.001)) - self.ssc.stop(False) 
- self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001)) - - -class CheckpointTests(unittest.TestCase): - - setupCalled = False - - @staticmethod - def tearDownClass(): - # Clean up in the JVM just in case there has been some issues in Python API - if SparkContext._jvm is not None: - jStreamingContextOption = \ - SparkContext._jvm.org.apache.spark.streaming.StreamingContext.getActive() - if jStreamingContextOption.nonEmpty(): - jStreamingContextOption.get().stop() - - def setUp(self): - self.ssc = None - self.sc = None - self.cpd = None - - def tearDown(self): - if self.ssc is not None: - self.ssc.stop(True) - if self.sc is not None: - self.sc.stop() - if self.cpd is not None: - shutil.rmtree(self.cpd) - - def test_transform_function_serializer_failure(self): - inputd = tempfile.mkdtemp() - self.cpd = tempfile.mkdtemp("test_transform_function_serializer_failure") - - def setup(): - conf = SparkConf().set("spark.default.parallelism", 1) - sc = SparkContext(conf=conf) - ssc = StreamingContext(sc, 0.5) - - # A function that cannot be serialized - def process(time, rdd): - sc.parallelize(range(1, 10)) - - ssc.textFileStream(inputd).foreachRDD(process) - return ssc - - self.ssc = StreamingContext.getOrCreate(self.cpd, setup) - try: - self.ssc.start() - except: - import traceback - failure = traceback.format_exc() - self.assertTrue( - "It appears that you are attempting to reference SparkContext" in failure) - return - - self.fail("using SparkContext in process should fail because it's not Serializable") - - def test_get_or_create_and_get_active_or_create(self): - inputd = tempfile.mkdtemp() - outputd = tempfile.mkdtemp() + "/" - - def updater(vs, s): - return sum(vs, s or 0) - - def setup(): - conf = SparkConf().set("spark.default.parallelism", 1) - sc = SparkContext(conf=conf) - ssc = StreamingContext(sc, 2) - dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1)) - wc = dstream.updateStateByKey(updater) - wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test") - wc.checkpoint(2) - self.setupCalled = True - return ssc - - # Verify that getOrCreate() calls setup() in absence of checkpoint files - self.cpd = tempfile.mkdtemp("test_streaming_cps") - self.setupCalled = False - self.ssc = StreamingContext.getOrCreate(self.cpd, setup) - self.assertTrue(self.setupCalled) - - self.ssc.start() - - def check_output(n): - while not os.listdir(outputd): - if self.ssc.awaitTerminationOrTimeout(0.5): - raise Exception("ssc stopped") - time.sleep(1) # make sure mtime is larger than the previous one - with open(os.path.join(inputd, str(n)), 'w') as f: - f.writelines(["%d\n" % i for i in range(10)]) - - while True: - if self.ssc.awaitTerminationOrTimeout(0.5): - raise Exception("ssc stopped") - p = os.path.join(outputd, max(os.listdir(outputd))) - if '_SUCCESS' not in os.listdir(p): - # not finished - continue - ordd = self.ssc.sparkContext.textFile(p).map(lambda line: line.split(",")) - d = ordd.values().map(int).collect() - if not d: - continue - self.assertEqual(10, len(d)) - s = set(d) - self.assertEqual(1, len(s)) - m = s.pop() - if n > m: - continue - self.assertEqual(n, m) - break - - check_output(1) - check_output(2) - - # Verify the getOrCreate() recovers from checkpoint files - self.ssc.stop(True, True) - time.sleep(1) - self.setupCalled = False - self.ssc = StreamingContext.getOrCreate(self.cpd, setup) - self.assertFalse(self.setupCalled) - self.ssc.start() - check_output(3) - - # Verify that getOrCreate() uses existing SparkContext - self.ssc.stop(True, True) - time.sleep(1) - 
self.sc = SparkContext(conf=SparkConf()) - self.setupCalled = False - self.ssc = StreamingContext.getOrCreate(self.cpd, setup) - self.assertFalse(self.setupCalled) - self.assertTrue(self.ssc.sparkContext == self.sc) - - # Verify the getActiveOrCreate() recovers from checkpoint files - self.ssc.stop(True, True) - time.sleep(1) - self.setupCalled = False - self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup) - self.assertFalse(self.setupCalled) - self.ssc.start() - check_output(4) - - # Verify that getActiveOrCreate() returns active context - self.setupCalled = False - self.assertEqual(StreamingContext.getActiveOrCreate(self.cpd, setup), self.ssc) - self.assertFalse(self.setupCalled) - - # Verify that getActiveOrCreate() uses existing SparkContext - self.ssc.stop(True, True) - time.sleep(1) - self.sc = SparkContext(conf=SparkConf()) - self.setupCalled = False - self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup) - self.assertFalse(self.setupCalled) - self.assertTrue(self.ssc.sparkContext == self.sc) - - # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files - self.ssc.stop(True, True) - shutil.rmtree(self.cpd) # delete checkpoint directory - time.sleep(1) - self.setupCalled = False - self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup) - self.assertTrue(self.setupCalled) - - # Stop everything - self.ssc.stop(True, True) - - -class KafkaStreamTests(PySparkStreamingTestCase): - timeout = 20 # seconds - duration = 1 - - def setUp(self): - super(KafkaStreamTests, self).setUp() - self._kafkaTestUtils = self.ssc._jvm.org.apache.spark.streaming.kafka.KafkaTestUtils() - self._kafkaTestUtils.setup() - - def tearDown(self): - super(KafkaStreamTests, self).tearDown() - - if self._kafkaTestUtils is not None: - self._kafkaTestUtils.teardown() - self._kafkaTestUtils = None - - def _randomTopic(self): - return "topic-%d" % random.randint(0, 10000) - - def _validateStreamResult(self, sendData, stream): - result = {} - for i in chain.from_iterable(self._collect(stream.map(lambda x: x[1]), - sum(sendData.values()))): - result[i] = result.get(i, 0) + 1 - - self.assertEqual(sendData, result) - - def _validateRddResult(self, sendData, rdd): - result = {} - for i in rdd.map(lambda x: x[1]).collect(): - result[i] = result.get(i, 0) + 1 - self.assertEqual(sendData, result) - - def test_kafka_stream(self): - """Test the Python Kafka stream API.""" - topic = self._randomTopic() - sendData = {"a": 3, "b": 5, "c": 10} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - - stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(), - "test-streaming-consumer", {topic: 1}, - {"auto.offset.reset": "smallest"}) - self._validateStreamResult(sendData, stream) - - def test_kafka_direct_stream(self): - """Test the Python direct Kafka stream API.""" - topic = self._randomTopic() - sendData = {"a": 1, "b": 2, "c": 3} - kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(), - "auto.offset.reset": "smallest"} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - - stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams) - self._validateStreamResult(sendData, stream) - - def test_kafka_direct_stream_from_offset(self): - """Test the Python direct Kafka stream API with start offset specified.""" - topic = self._randomTopic() - sendData = {"a": 1, "b": 2, "c": 3} - fromOffsets = {TopicAndPartition(topic, 0): long(0)} - kafkaParams = 
{"metadata.broker.list": self._kafkaTestUtils.brokerAddress()} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - - stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets) - self._validateStreamResult(sendData, stream) - - def test_kafka_rdd(self): - """Test the Python direct Kafka RDD API.""" - topic = self._randomTopic() - sendData = {"a": 1, "b": 2} - offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))] - kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges) - self._validateRddResult(sendData, rdd) - - def test_kafka_rdd_with_leaders(self): - """Test the Python direct Kafka RDD API with leaders.""" - topic = self._randomTopic() - sendData = {"a": 1, "b": 2, "c": 3} - offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))] - kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()} - address = self._kafkaTestUtils.brokerAddress().split(":") - leaders = {TopicAndPartition(topic, 0): Broker(address[0], int(address[1]))} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, leaders) - self._validateRddResult(sendData, rdd) - - def test_kafka_rdd_get_offsetRanges(self): - """Test Python direct Kafka RDD get OffsetRanges.""" - topic = self._randomTopic() - sendData = {"a": 3, "b": 4, "c": 5} - offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))] - kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges) - self.assertEqual(offsetRanges, rdd.offsetRanges()) - - def test_kafka_direct_stream_foreach_get_offsetRanges(self): - """Test the Python direct Kafka stream foreachRDD get offsetRanges.""" - topic = self._randomTopic() - sendData = {"a": 1, "b": 2, "c": 3} - kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(), - "auto.offset.reset": "smallest"} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - - stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams) - - offsetRanges = [] - - def getOffsetRanges(_, rdd): - for o in rdd.offsetRanges(): - offsetRanges.append(o) - - stream.foreachRDD(getOffsetRanges) - self.ssc.start() - self.wait_for(offsetRanges, 1) - - self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))]) - - def test_kafka_direct_stream_transform_get_offsetRanges(self): - """Test the Python direct Kafka stream transform get offsetRanges.""" - topic = self._randomTopic() - sendData = {"a": 1, "b": 2, "c": 3} - kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(), - "auto.offset.reset": "smallest"} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - - stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams) - - offsetRanges = [] - - def transformWithOffsetRanges(rdd): - for o in rdd.offsetRanges(): - offsetRanges.append(o) - return rdd - - # Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together, - # only the TransformedDstreams can be folded 
together. - stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint() - self.ssc.start() - self.wait_for(offsetRanges, 1) - - self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))]) - - def test_topic_and_partition_equality(self): - topic_and_partition_a = TopicAndPartition("foo", 0) - topic_and_partition_b = TopicAndPartition("foo", 0) - topic_and_partition_c = TopicAndPartition("bar", 0) - topic_and_partition_d = TopicAndPartition("foo", 1) - - self.assertEqual(topic_and_partition_a, topic_and_partition_b) - self.assertNotEqual(topic_and_partition_a, topic_and_partition_c) - self.assertNotEqual(topic_and_partition_a, topic_and_partition_d) - - def test_kafka_direct_stream_transform_with_checkpoint(self): - """Test the Python direct Kafka stream transform with checkpoint correctly recovered.""" - topic = self._randomTopic() - sendData = {"a": 1, "b": 2, "c": 3} - kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(), - "auto.offset.reset": "smallest"} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - - offsetRanges = [] - - def transformWithOffsetRanges(rdd): - for o in rdd.offsetRanges(): - offsetRanges.append(o) - return rdd - - self.ssc.stop(False) - self.ssc = None - tmpdir = "checkpoint-test-%d" % random.randint(0, 10000) - - def setup(): - ssc = StreamingContext(self.sc, 0.5) - ssc.checkpoint(tmpdir) - stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams) - stream.transform(transformWithOffsetRanges).count().pprint() - return ssc - - try: - ssc1 = StreamingContext.getOrCreate(tmpdir, setup) - ssc1.start() - self.wait_for(offsetRanges, 1) - self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))]) - - # To make sure some checkpoint is written - time.sleep(3) - ssc1.stop(False) - ssc1 = None - - # Restart again to make sure the checkpoint is recovered correctly - ssc2 = StreamingContext.getOrCreate(tmpdir, setup) - ssc2.start() - ssc2.awaitTermination(3) - ssc2.stop(stopSparkContext=False, stopGraceFully=True) - ssc2 = None - finally: - shutil.rmtree(tmpdir) - - def test_kafka_rdd_message_handler(self): - """Test Python direct Kafka RDD MessageHandler.""" - topic = self._randomTopic() - sendData = {"a": 1, "b": 1, "c": 2} - offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))] - kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()} - - def getKeyAndDoubleMessage(m): - return m and (m.key, m.message * 2) - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges, - messageHandler=getKeyAndDoubleMessage) - self._validateRddResult({"aa": 1, "bb": 1, "cc": 2}, rdd) - - def test_kafka_direct_stream_message_handler(self): - """Test the Python direct Kafka stream MessageHandler.""" - topic = self._randomTopic() - sendData = {"a": 1, "b": 2, "c": 3} - kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(), - "auto.offset.reset": "smallest"} - - self._kafkaTestUtils.createTopic(topic) - self._kafkaTestUtils.sendMessages(topic, sendData) - - def getKeyAndDoubleMessage(m): - return m and (m.key, m.message * 2) - - stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, - messageHandler=getKeyAndDoubleMessage) - self._validateStreamResult({"aa": 1, "bb": 2, "cc": 3}, stream) - - -class FlumeStreamTests(PySparkStreamingTestCase): - timeout = 20 # seconds - duration = 1 - - def 
setUp(self): - super(FlumeStreamTests, self).setUp() - self._utils = self.ssc._jvm.org.apache.spark.streaming.flume.FlumeTestUtils() - - def tearDown(self): - if self._utils is not None: - self._utils.close() - self._utils = None - - super(FlumeStreamTests, self).tearDown() - - def _startContext(self, n, compressed): - # Start the StreamingContext and also collect the result - dstream = FlumeUtils.createStream(self.ssc, "localhost", self._utils.getTestPort(), - enableDecompression=compressed) - result = [] - - def get_output(_, rdd): - for event in rdd.collect(): - if len(result) < n: - result.append(event) - dstream.foreachRDD(get_output) - self.ssc.start() - return result - - def _validateResult(self, input, result): - # Validate both the header and the body - header = {"test": "header"} - self.assertEqual(len(input), len(result)) - for i in range(0, len(input)): - self.assertEqual(header, result[i][0]) - self.assertEqual(input[i], result[i][1]) - - def _writeInput(self, input, compressed): - # Try to write input to the receiver until success or timeout - start_time = time.time() - while True: - try: - self._utils.writeInput(input, compressed) - break - except: - if time.time() - start_time < self.timeout: - time.sleep(0.01) - else: - raise - - def test_flume_stream(self): - input = [str(i) for i in range(1, 101)] - result = self._startContext(len(input), False) - self._writeInput(input, False) - self.wait_for(result, len(input)) - self._validateResult(input, result) - - def test_compressed_flume_stream(self): - input = [str(i) for i in range(1, 101)] - result = self._startContext(len(input), True) - self._writeInput(input, True) - self.wait_for(result, len(input)) - self._validateResult(input, result) - - -class FlumePollingStreamTests(PySparkStreamingTestCase): - timeout = 20 # seconds - duration = 1 - maxAttempts = 5 - - def setUp(self): - self._utils = self.sc._jvm.org.apache.spark.streaming.flume.PollingFlumeTestUtils() - - def tearDown(self): - if self._utils is not None: - self._utils.close() - self._utils = None - - def _writeAndVerify(self, ports): - # Set up the streaming context and input streams - ssc = StreamingContext(self.sc, self.duration) - try: - addresses = [("localhost", port) for port in ports] - dstream = FlumeUtils.createPollingStream( - ssc, - addresses, - maxBatchSize=self._utils.eventsPerBatch(), - parallelism=5) - outputBuffer = [] - - def get_output(_, rdd): - for e in rdd.collect(): - outputBuffer.append(e) - - dstream.foreachRDD(get_output) - ssc.start() - self._utils.sendDataAndEnsureAllDataHasBeenReceived() - - self.wait_for(outputBuffer, self._utils.getTotalEvents()) - outputHeaders = [event[0] for event in outputBuffer] - outputBodies = [event[1] for event in outputBuffer] - self._utils.assertOutput(outputHeaders, outputBodies) - finally: - ssc.stop(False) - - def _testMultipleTimes(self, f): - attempt = 0 - while True: - try: - f() - break - except: - attempt += 1 - if attempt >= self.maxAttempts: - raise - else: - import traceback - traceback.print_exc() - - def _testFlumePolling(self): - try: - port = self._utils.startSingleSink() - self._writeAndVerify([port]) - self._utils.assertChannelsAreEmpty() - finally: - self._utils.close() - - def _testFlumePollingMultipleHosts(self): - try: - port = self._utils.startSingleSink() - self._writeAndVerify([port]) - self._utils.assertChannelsAreEmpty() - finally: - self._utils.close() - - def test_flume_polling(self): - self._testMultipleTimes(self._testFlumePolling) - - def 
test_flume_polling_multiple_hosts(self): - self._testMultipleTimes(self._testFlumePollingMultipleHosts) - - -class KinesisStreamTests(PySparkStreamingTestCase): - - def test_kinesis_stream_api(self): - # Don't start the StreamingContext because we cannot test it in Jenkins - kinesisStream1 = KinesisUtils.createStream( - self.ssc, "myAppNam", "mySparkStream", - "https://kinesis.us-west-2.amazonaws.com", "us-west-2", - InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2) - kinesisStream2 = KinesisUtils.createStream( - self.ssc, "myAppNam", "mySparkStream", - "https://kinesis.us-west-2.amazonaws.com", "us-west-2", - InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2, - "awsAccessKey", "awsSecretKey") - - def test_kinesis_stream(self): - if not are_kinesis_tests_enabled: - sys.stderr.write( - "Skipped test_kinesis_stream (enable by setting environment variable %s=1" - % kinesis_test_environ_var) - return - - import random - kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000))) - kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(2) - try: - kinesisTestUtils.createStream() - aWSCredentials = kinesisTestUtils.getAWSCredentials() - stream = KinesisUtils.createStream( - self.ssc, kinesisAppName, kinesisTestUtils.streamName(), - kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(), - InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY, - aWSCredentials.getAWSAccessKeyId(), aWSCredentials.getAWSSecretKey()) - - outputBuffer = [] - - def get_output(_, rdd): - for e in rdd.collect(): - outputBuffer.append(e) - - stream.foreachRDD(get_output) - self.ssc.start() - - testData = [i for i in range(1, 11)] - expectedOutput = set([str(i) for i in testData]) - start_time = time.time() - while time.time() - start_time < 120: - kinesisTestUtils.pushData(testData) - if expectedOutput == set(outputBuffer): - break - time.sleep(10) - self.assertEqual(expectedOutput, set(outputBuffer)) - except: - import traceback - traceback.print_exc() - raise - finally: - self.ssc.stop(False) - kinesisTestUtils.deleteStream() - kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) - - -# Search jar in the project dir using the jar name_prefix for both sbt build and maven build because -# the artifact jars are in different directories. -def search_jar(dir, name_prefix): - # We should ignore the following jars - ignored_jar_suffixes = ("javadoc.jar", "sources.jar", "test-sources.jar", "tests.jar") - jars = (glob.glob(os.path.join(dir, "target/scala-*/" + name_prefix + "-*.jar")) + # sbt build - glob.glob(os.path.join(dir, "target/" + name_prefix + "_*.jar"))) # maven build - return [jar for jar in jars if not jar.endswith(ignored_jar_suffixes)] - - -def search_kafka_assembly_jar(): - SPARK_HOME = os.environ["SPARK_HOME"] - kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-0-8-assembly") - jars = search_jar(kafka_assembly_dir, "spark-streaming-kafka-0-8-assembly") - if not jars: - raise Exception( - ("Failed to find Spark Streaming kafka assembly jar in %s. 
" % kafka_assembly_dir) + - "You need to build Spark with " - "'build/sbt -Pkafka-0-8 assembly/package streaming-kafka-0-8-assembly/assembly' or " - "'build/mvn -DskipTests -Pkafka-0-8 package' before running this test.") - elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Kafka assembly JARs: %s; please " - "remove all but one") % (", ".join(jars))) - else: - return jars[0] - - -def search_flume_assembly_jar(): - SPARK_HOME = os.environ["SPARK_HOME"] - flume_assembly_dir = os.path.join(SPARK_HOME, "external/flume-assembly") - jars = search_jar(flume_assembly_dir, "spark-streaming-flume-assembly") - if not jars: - raise Exception( - ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + - "You need to build Spark with " - "'build/sbt -Pflume assembly/package streaming-flume-assembly/assembly' or " - "'build/mvn -DskipTests -Pflume package' before running this test.") - elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Flume assembly JARs: %s; please " - "remove all but one") % (", ".join(jars))) - else: - return jars[0] - - -def _kinesis_asl_assembly_dir(): - SPARK_HOME = os.environ["SPARK_HOME"] - return os.path.join(SPARK_HOME, "external/kinesis-asl-assembly") - - -def search_kinesis_asl_assembly_jar(): - jars = search_jar(_kinesis_asl_assembly_dir(), "spark-streaming-kinesis-asl-assembly") - if not jars: - return None - elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs: %s; please " - "remove all but one") % (", ".join(jars))) - else: - return jars[0] - - -# Must be same as the variable and condition defined in modules.py -flume_test_environ_var = "ENABLE_FLUME_TESTS" -are_flume_tests_enabled = os.environ.get(flume_test_environ_var) == '1' -# Must be same as the variable and condition defined in modules.py -kafka_test_environ_var = "ENABLE_KAFKA_0_8_TESTS" -are_kafka_tests_enabled = os.environ.get(kafka_test_environ_var) == '1' -# Must be same as the variable and condition defined in KinesisTestUtils.scala and modules.py -kinesis_test_environ_var = "ENABLE_KINESIS_TESTS" -are_kinesis_tests_enabled = os.environ.get(kinesis_test_environ_var) == '1' - -if __name__ == "__main__": - from pyspark.streaming.tests import * - kafka_assembly_jar = search_kafka_assembly_jar() - flume_assembly_jar = search_flume_assembly_jar() - kinesis_asl_assembly_jar = search_kinesis_asl_assembly_jar() - - if kinesis_asl_assembly_jar is None: - kinesis_jar_present = False - jars = "%s,%s" % (kafka_assembly_jar, flume_assembly_jar) - else: - kinesis_jar_present = True - jars = "%s,%s,%s" % (kafka_assembly_jar, flume_assembly_jar, kinesis_asl_assembly_jar) - - existing_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell") - jars_args = "--jars %s" % jars - os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([jars_args, existing_args]) - testcases = [BasicOperationTests, WindowFunctionTests, StreamingContextTests, CheckpointTests, - StreamingListenerTests] - - if are_flume_tests_enabled: - testcases.append(FlumeStreamTests) - testcases.append(FlumePollingStreamTests) - else: - sys.stderr.write( - "Skipped test_flume_stream (enable by setting environment variable %s=1" - % flume_test_environ_var) - - if are_kafka_tests_enabled: - testcases.append(KafkaStreamTests) - else: - sys.stderr.write( - "Skipped test_kafka_stream (enable by setting environment variable %s=1" - % kafka_test_environ_var) - - if kinesis_jar_present is True: - testcases.append(KinesisStreamTests) - elif are_kinesis_tests_enabled 
is False: - sys.stderr.write("Skipping all Kinesis Python tests as the optional Kinesis project was " - "not compiled into a JAR. To run these tests, " - "you need to build Spark with 'build/sbt -Pkinesis-asl assembly/package " - "streaming-kinesis-asl-assembly/assembly' or " - "'build/mvn -Pkinesis-asl package' before running this test.") - else: - raise Exception( - ("Failed to find Spark Streaming Kinesis assembly jar in %s. " - % _kinesis_asl_assembly_dir()) + - "You need to build Spark with 'build/sbt -Pkinesis-asl " - "assembly/package streaming-kinesis-asl-assembly/assembly'" - "or 'build/mvn -Pkinesis-asl package' before running this test.") - - sys.stderr.write("Running tests: %s \n" % (str(testcases))) - failed = False - for testcase in testcases: - sys.stderr.write("[Running %s]\n" % (testcase)) - tests = unittest.TestLoader().loadTestsFromTestCase(testcase) - if xmlrunner: - result = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2).run(tests) - if not result.wasSuccessful(): - failed = True - else: - result = unittest.TextTestRunner(verbosity=2).run(tests) - if not result.wasSuccessful(): - failed = True - sys.exit(failed) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/util.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/util.py deleted file mode 100644 index b4b9f97..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/streaming/util.py +++ /dev/null @@ -1,160 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import time -from datetime import datetime -import traceback -import sys - -from py4j.java_gateway import is_instance_of - -from pyspark import SparkContext, RDD - - -class TransformFunction(object): - """ - This class wraps a function RDD[X] -> RDD[Y] that was passed to - DStream.transform(), allowing it to be called from Java via Py4J's - callback server. - - Java calls this function with a sequence of JavaRDDs and this function - returns a single JavaRDD pointer back to Java. 
- """ - _emptyRDD = None - - def __init__(self, ctx, func, *deserializers): - self.ctx = ctx - self.func = func - self.deserializers = deserializers - self.rdd_wrap_func = lambda jrdd, ctx, ser: RDD(jrdd, ctx, ser) - self.failure = None - - def rdd_wrapper(self, func): - self.rdd_wrap_func = func - return self - - def call(self, milliseconds, jrdds): - # Clear the failure - self.failure = None - try: - if self.ctx is None: - self.ctx = SparkContext._active_spark_context - if not self.ctx or not self.ctx._jsc: - # stopped - return - - # extend deserializers with the first one - sers = self.deserializers - if len(sers) < len(jrdds): - sers += (sers[0],) * (len(jrdds) - len(sers)) - - rdds = [self.rdd_wrap_func(jrdd, self.ctx, ser) if jrdd else None - for jrdd, ser in zip(jrdds, sers)] - t = datetime.fromtimestamp(milliseconds / 1000.0) - r = self.func(t, *rdds) - if r: - # Here, we work around to ensure `_jrdd` is `JavaRDD` by wrapping it by `map`. - # org.apache.spark.streaming.api.python.PythonTransformFunction requires to return - # `JavaRDD`; however, this could be `JavaPairRDD` by some APIs, for example, `zip`. - # See SPARK-17756. - if is_instance_of(self.ctx._gateway, r._jrdd, "org.apache.spark.api.java.JavaRDD"): - return r._jrdd - else: - return r.map(lambda x: x)._jrdd - except: - self.failure = traceback.format_exc() - - def getLastFailure(self): - return self.failure - - def __repr__(self): - return "TransformFunction(%s)" % self.func - - class Java: - implements = ['org.apache.spark.streaming.api.python.PythonTransformFunction'] - - -class TransformFunctionSerializer(object): - """ - This class implements a serializer for PythonTransformFunction Java - objects. - - This is necessary because the Java PythonTransformFunction objects are - actually Py4J references to Python objects and thus are not directly - serializable. When Java needs to serialize a PythonTransformFunction, - it uses this class to invoke Python, which returns the serialized function - as a byte array. - """ - def __init__(self, ctx, serializer, gateway=None): - self.ctx = ctx - self.serializer = serializer - self.gateway = gateway or self.ctx._gateway - self.gateway.jvm.PythonDStream.registerSerializer(self) - self.failure = None - - def dumps(self, id): - # Clear the failure - self.failure = None - try: - func = self.gateway.gateway_property.pool[id] - return bytearray(self.serializer.dumps(( - func.func, func.rdd_wrap_func, func.deserializers))) - except: - self.failure = traceback.format_exc() - - def loads(self, data): - # Clear the failure - self.failure = None - try: - f, wrap_func, deserializers = self.serializer.loads(bytes(data)) - return TransformFunction(self.ctx, f, *deserializers).rdd_wrapper(wrap_func) - except: - self.failure = traceback.format_exc() - - def getLastFailure(self): - return self.failure - - def __repr__(self): - return "TransformFunctionSerializer(%s)" % self.serializer - - class Java: - implements = ['org.apache.spark.streaming.api.python.PythonTransformFunctionSerializer'] - - -def rddToFileName(prefix, suffix, timestamp): - """ - Return string prefix-time(.suffix) - - >>> rddToFileName("spark", None, 12345678910) - 'spark-12345678910' - >>> rddToFileName("spark", "tmp", 12345678910) - 'spark-12345678910.tmp' - """ - if isinstance(timestamp, datetime): - seconds = time.mktime(timestamp.timetuple()) - timestamp = int(seconds * 1000) + timestamp.microsecond // 1000 - if suffix is None: - return prefix + "-" + str(timestamp) - else: - return prefix + "-" + str(timestamp) + "." 
+ suffix - - -if __name__ == "__main__": - import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/taskcontext.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/taskcontext.py deleted file mode 100644 index 98b505c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/taskcontext.py +++ /dev/null @@ -1,225 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function -import socket - -from pyspark.java_gateway import local_connect_and_auth -from pyspark.serializers import write_int, UTF8Deserializer - - -class TaskContext(object): - - """ - .. note:: Experimental - - Contextual information about a task which can be read or mutated during - execution. To access the TaskContext for a running task, use: - L{TaskContext.get()}. - """ - - _taskContext = None - - _attemptNumber = None - _partitionId = None - _stageId = None - _taskAttemptId = None - _localProperties = None - - def __new__(cls): - """Even if users construct TaskContext instead of using get, give them the singleton.""" - taskContext = cls._taskContext - if taskContext is not None: - return taskContext - cls._taskContext = taskContext = object.__new__(cls) - return taskContext - - def __init__(self): - """Construct a TaskContext, use get instead""" - pass - - @classmethod - def _getOrCreate(cls): - """Internal function to get or create global TaskContext.""" - if cls._taskContext is None: - cls._taskContext = TaskContext() - return cls._taskContext - - @classmethod - def get(cls): - """ - Return the currently active TaskContext. This can be called inside of - user functions to access contextual information about running tasks. - - .. note:: Must be called on the worker, not the driver. Returns None if not initialized. - """ - return cls._taskContext - - def stageId(self): - """The ID of the stage that this task belong to.""" - return self._stageId - - def partitionId(self): - """ - The ID of the RDD partition that is computed by this task. - """ - return self._partitionId - - def attemptNumber(self): - """" - How many times this task has been attempted. The first task attempt will be assigned - attemptNumber = 0, and subsequent attempts will have increasing attempt numbers. - """ - return self._attemptNumber - - def taskAttemptId(self): - """ - An ID that is unique to this task attempt (within the same SparkContext, no two task - attempts will share the same attempt ID). This is roughly equivalent to Hadoop's - TaskAttemptID. - """ - return self._taskAttemptId - - def getLocalProperty(self, key): - """ - Get a local property set upstream in the driver, or None if it is missing. 
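# A short usage sketch for the TaskContext accessors described above, assuming
# a local SparkContext; TaskContext.get() only returns meaningful values inside
# functions executed on the workers, and the printed pairing is one example run.

from pyspark import SparkContext
from pyspark.taskcontext import TaskContext

sc = SparkContext.getOrCreate()
# Tag every record with the id of the partition that produced it.
pairs = (sc.parallelize(range(4), 2)
           .map(lambda x: (x, TaskContext.get().partitionId()))
           .collect())
print(pairs)  # e.g. [(0, 0), (1, 0), (2, 1), (3, 1)]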
- """ - return self._localProperties.get(key, None) - - -BARRIER_FUNCTION = 1 - - -def _load_from_socket(port, auth_secret): - """ - Load data from a given socket, this is a blocking method thus only return when the socket - connection has been closed. - """ - (sockfile, sock) = local_connect_and_auth(port, auth_secret) - # The barrier() call may block forever, so no timeout - sock.settimeout(None) - # Make a barrier() function call. - write_int(BARRIER_FUNCTION, sockfile) - sockfile.flush() - - # Collect result. - res = UTF8Deserializer().loads(sockfile) - - # Release resources. - sockfile.close() - sock.close() - - return res - - -class BarrierTaskContext(TaskContext): - - """ - .. note:: Experimental - - A :class:`TaskContext` with extra contextual info and tooling for tasks in a barrier stage. - Use :func:`BarrierTaskContext.get` to obtain the barrier context for a running barrier task. - - .. versionadded:: 2.4.0 - """ - - _port = None - _secret = None - - def __init__(self): - """Construct a BarrierTaskContext, use get instead""" - pass - - @classmethod - def _getOrCreate(cls): - """Internal function to get or create global BarrierTaskContext.""" - if not isinstance(cls._taskContext, BarrierTaskContext): - cls._taskContext = object.__new__(cls) - return cls._taskContext - - @classmethod - def get(cls): - """ - .. note:: Experimental - - Return the currently active :class:`BarrierTaskContext`. - This can be called inside of user functions to access contextual information about - running tasks. - - .. note:: Must be called on the worker, not the driver. Returns None if not initialized. - """ - return cls._taskContext - - @classmethod - def _initialize(cls, port, secret): - """ - Initialize BarrierTaskContext, other methods within BarrierTaskContext can only be called - after BarrierTaskContext is initialized. - """ - cls._port = port - cls._secret = secret - - def barrier(self): - """ - .. note:: Experimental - - Sets a global barrier and waits until all tasks in this stage hit this barrier. - Similar to `MPI_Barrier` function in MPI, this function blocks until all tasks - in the same stage have reached this routine. - - .. warning:: In a barrier stage, each task much have the same number of `barrier()` - calls, in all possible code branches. - Otherwise, you may get the job hanging or a SparkException after timeout. - - .. versionadded:: 2.4.0 - """ - if self._port is None or self._secret is None: - raise Exception("Not supported to call barrier() before initialize " + - "BarrierTaskContext.") - else: - _load_from_socket(self._port, self._secret) - - def getTaskInfos(self): - """ - .. note:: Experimental - - Returns :class:`BarrierTaskInfo` for all tasks in this barrier stage, - ordered by partition ID. - - .. versionadded:: 2.4.0 - """ - if self._port is None or self._secret is None: - raise Exception("Not supported to call getTaskInfos() before initialize " + - "BarrierTaskContext.") - else: - addresses = self._localProperties.get("addresses", "") - return [BarrierTaskInfo(h.strip()) for h in addresses.split(",")] - - -class BarrierTaskInfo(object): - """ - .. note:: Experimental - - Carries all task infos of a barrier task. - - :var address: The IPv4 address (host:port) of the executor that the barrier task is running on - - .. 
versionadded:: 2.4.0 - """ - - def __init__(self, address): - self.address = address diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/test_broadcast.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/test_broadcast.py deleted file mode 100644 index 4b6dbf7..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/test_broadcast.py +++ /dev/null @@ -1,141 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import random -import tempfile -import unittest - -try: - import xmlrunner -except ImportError: - xmlrunner = None - -from pyspark.broadcast import Broadcast -from pyspark.conf import SparkConf -from pyspark.context import SparkContext -from pyspark.java_gateway import launch_gateway -from pyspark.serializers import ChunkedStream - - -class BroadcastTest(unittest.TestCase): - - def tearDown(self): - if getattr(self, "sc", None) is not None: - self.sc.stop() - self.sc = None - - def _test_encryption_helper(self, vs): - """ - Creates a broadcast variables for each value in vs, and runs a simple job to make sure the - value is the same when it's read in the executors. Also makes sure there are no task - failures. - """ - bs = [self.sc.broadcast(value=v) for v in vs] - exec_values = self.sc.parallelize(range(2)).map(lambda x: [b.value for b in bs]).collect() - for ev in exec_values: - self.assertEqual(ev, vs) - # make sure there are no task failures - status = self.sc.statusTracker() - for jid in status.getJobIdsForGroup(): - for sid in status.getJobInfo(jid).stageIds: - stage_info = status.getStageInfo(sid) - self.assertEqual(0, stage_info.numFailedTasks) - - def _test_multiple_broadcasts(self, *extra_confs): - """ - Test broadcast variables make it OK to the executors. Tests multiple broadcast variables, - and also multiple jobs. 
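# A compact sketch of the broadcast round-trip these tests check, assuming a
# local SparkContext: sc.broadcast() ships the value to each executor once, and
# tasks read it back through .value; the dictionary payload is illustrative.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
settings = sc.broadcast({"threshold": 10})
hits = (sc.parallelize(range(20))
          .filter(lambda x: x > settings.value["threshold"])
          .count())
print(hits)  # 9 of the values 0..19 exceed the threshold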
- """ - conf = SparkConf() - for key, value in extra_confs: - conf.set(key, value) - conf.setMaster("local-cluster[2,1,1024]") - self.sc = SparkContext(conf=conf) - self._test_encryption_helper([5]) - self._test_encryption_helper([5, 10, 20]) - - def test_broadcast_with_encryption(self): - self._test_multiple_broadcasts(("spark.io.encryption.enabled", "true")) - - def test_broadcast_no_encryption(self): - self._test_multiple_broadcasts() - - def _test_broadcast_on_driver(self, *extra_confs): - conf = SparkConf() - for key, value in extra_confs: - conf.set(key, value) - conf.setMaster("local-cluster[2,1,1024]") - self.sc = SparkContext(conf=conf) - bs = self.sc.broadcast(value=5) - self.assertEqual(5, bs.value) - - def test_broadcast_value_driver_no_encryption(self): - self._test_broadcast_on_driver() - - def test_broadcast_value_driver_encryption(self): - self._test_broadcast_on_driver(("spark.io.encryption.enabled", "true")) - - -class BroadcastFrameProtocolTest(unittest.TestCase): - - @classmethod - def setUpClass(cls): - gateway = launch_gateway(SparkConf()) - cls._jvm = gateway.jvm - cls.longMessage = True - random.seed(42) - - def _test_chunked_stream(self, data, py_buf_size): - # write data using the chunked protocol from python. - chunked_file = tempfile.NamedTemporaryFile(delete=False) - dechunked_file = tempfile.NamedTemporaryFile(delete=False) - dechunked_file.close() - try: - out = ChunkedStream(chunked_file, py_buf_size) - out.write(data) - out.close() - # now try to read it in java - jin = self._jvm.java.io.FileInputStream(chunked_file.name) - jout = self._jvm.java.io.FileOutputStream(dechunked_file.name) - self._jvm.DechunkedInputStream.dechunkAndCopyToOutput(jin, jout) - # java should have decoded it back to the original data - self.assertEqual(len(data), os.stat(dechunked_file.name).st_size) - with open(dechunked_file.name, "rb") as f: - byte = f.read(1) - idx = 0 - while byte: - self.assertEqual(data[idx], bytearray(byte)[0], msg="idx = " + str(idx)) - byte = f.read(1) - idx += 1 - finally: - os.unlink(chunked_file.name) - os.unlink(dechunked_file.name) - - def test_chunked_stream(self): - def random_bytes(n): - return bytearray(random.getrandbits(8) for _ in range(n)) - for data_length in [1, 10, 100, 10000]: - for buffer_length in [1, 2, 5, 8192]: - self._test_chunked_stream(random_bytes(data_length), buffer_length) - -if __name__ == '__main__': - from pyspark.test_broadcast import * - if xmlrunner: - unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2) - else: - unittest.main(verbosity=2) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/test_serializers.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/test_serializers.py deleted file mode 100644 index 5b43729..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/test_serializers.py +++ /dev/null @@ -1,90 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import io -import math -import struct -import sys -import unittest - -try: - import xmlrunner -except ImportError: - xmlrunner = None - -from pyspark import serializers - - -def read_int(b): - return struct.unpack("!i", b)[0] - - -def write_int(i): - return struct.pack("!i", i) - - -class SerializersTest(unittest.TestCase): - - def test_chunked_stream(self): - original_bytes = bytearray(range(100)) - for data_length in [1, 10, 100]: - for buffer_length in [1, 2, 3, 5, 20, 99, 100, 101, 500]: - dest = ByteArrayOutput() - stream_out = serializers.ChunkedStream(dest, buffer_length) - stream_out.write(original_bytes[:data_length]) - stream_out.close() - num_chunks = int(math.ceil(float(data_length) / buffer_length)) - # length for each chunk, and a final -1 at the very end - exp_size = (num_chunks + 1) * 4 + data_length - self.assertEqual(len(dest.buffer), exp_size) - dest_pos = 0 - data_pos = 0 - for chunk_idx in range(num_chunks): - chunk_length = read_int(dest.buffer[dest_pos:(dest_pos + 4)]) - if chunk_idx == num_chunks - 1: - exp_length = data_length % buffer_length - if exp_length == 0: - exp_length = buffer_length - else: - exp_length = buffer_length - self.assertEqual(chunk_length, exp_length) - dest_pos += 4 - dest_chunk = dest.buffer[dest_pos:dest_pos + chunk_length] - orig_chunk = original_bytes[data_pos:data_pos + chunk_length] - self.assertEqual(dest_chunk, orig_chunk) - dest_pos += chunk_length - data_pos += chunk_length - # ends with a -1 - self.assertEqual(dest.buffer[-4:], write_int(-1)) - - -class ByteArrayOutput(object): - def __init__(self): - self.buffer = bytearray() - - def write(self, b): - self.buffer += b - - def close(self): - pass - -if __name__ == '__main__': - from pyspark.test_serializers import * - if xmlrunner: - unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2) - else: - unittest.main(verbosity=2) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/tests.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/tests.py deleted file mode 100644 index a2d825b..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/tests.py +++ /dev/null @@ -1,2534 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Unit tests for PySpark; additional tests are implemented as doctests in -individual modules. 
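# The chunked framing verified above (each chunk is a 4-byte big-endian length
# followed by that many payload bytes, with a final length of -1 marking the
# end) can be read back with standard-library Python; a sketch only:

import io
import struct

def dechunk(stream):
    # Yield each chunk's payload until the -1 terminator is reached.
    while True:
        (length,) = struct.unpack("!i", stream.read(4))
        if length == -1:
            return
        yield stream.read(length)

# Two chunks ("abc", "de") followed by the -1 terminator.
framed = b"\x00\x00\x00\x03abc" + b"\x00\x00\x00\x02de" + b"\xff\xff\xff\xff"
print(b"".join(dechunk(io.BytesIO(framed))))  # b'abcde'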
-""" - -from array import array -from glob import glob -import os -import re -import shutil -import subprocess -import sys -import tempfile -import time -import zipfile -import random -import threading -import hashlib - -from py4j.protocol import Py4JJavaError -try: - import xmlrunner -except ImportError: - xmlrunner = None - -if sys.version_info[:2] <= (2, 6): - try: - import unittest2 as unittest - except ImportError: - sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') - sys.exit(1) -else: - import unittest - if sys.version_info[0] >= 3: - xrange = range - basestring = str - -if sys.version >= "3": - from io import StringIO -else: - from StringIO import StringIO - - -from pyspark import keyword_only -from pyspark.conf import SparkConf -from pyspark.context import SparkContext -from pyspark.java_gateway import _launch_gateway -from pyspark.rdd import RDD -from pyspark.files import SparkFiles -from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer, PickleSerializer, \ - CloudPickleSerializer, CompressedSerializer, UTF8Deserializer, NoOpSerializer, \ - PairDeserializer, CartesianDeserializer, AutoBatchedSerializer, AutoSerializer, \ - FlattenedValuesSerializer -from pyspark.shuffle import Aggregator, ExternalMerger, ExternalSorter -from pyspark import shuffle -from pyspark.profiler import BasicProfiler -from pyspark.taskcontext import BarrierTaskContext, TaskContext - -_have_scipy = False -_have_numpy = False -try: - import scipy.sparse - _have_scipy = True -except: - # No SciPy, but that's okay, we'll skip those tests - pass -try: - import numpy as np - _have_numpy = True -except: - # No NumPy, but that's okay, we'll skip those tests - pass - - -SPARK_HOME = os.environ["SPARK_HOME"] - - -class MergerTests(unittest.TestCase): - - def setUp(self): - self.N = 1 << 12 - self.l = [i for i in xrange(self.N)] - self.data = list(zip(self.l, self.l)) - self.agg = Aggregator(lambda x: [x], - lambda x, y: x.append(y) or x, - lambda x, y: x.extend(y) or x) - - def test_small_dataset(self): - m = ExternalMerger(self.agg, 1000) - m.mergeValues(self.data) - self.assertEqual(m.spills, 0) - self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) - - m = ExternalMerger(self.agg, 1000) - m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), self.data)) - self.assertEqual(m.spills, 0) - self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) - - def test_medium_dataset(self): - m = ExternalMerger(self.agg, 20) - m.mergeValues(self.data) - self.assertTrue(m.spills >= 1) - self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N))) - - m = ExternalMerger(self.agg, 10) - m.mergeCombiners(map(lambda x_y2: (x_y2[0], [x_y2[1]]), self.data * 3)) - self.assertTrue(m.spills >= 1) - self.assertEqual(sum(sum(v) for k, v in m.items()), - sum(xrange(self.N)) * 3) - - def test_huge_dataset(self): - m = ExternalMerger(self.agg, 5, partitions=3) - m.mergeCombiners(map(lambda k_v: (k_v[0], [str(k_v[1])]), self.data * 10)) - self.assertTrue(m.spills >= 1) - self.assertEqual(sum(len(v) for k, v in m.items()), - self.N * 10) - m._cleanup() - - def test_group_by_key(self): - - def gen_data(N, step): - for i in range(1, N + 1, step): - for j in range(i): - yield (i, [j]) - - def gen_gs(N, step=1): - return shuffle.GroupByKey(gen_data(N, step)) - - self.assertEqual(1, len(list(gen_gs(1)))) - self.assertEqual(2, len(list(gen_gs(2)))) - self.assertEqual(100, len(list(gen_gs(100)))) - self.assertEqual(list(range(1, 101)), [k 
for k, _ in gen_gs(100)]) - self.assertTrue(all(list(range(k)) == list(vs) for k, vs in gen_gs(100))) - - for k, vs in gen_gs(50002, 10000): - self.assertEqual(k, len(vs)) - self.assertEqual(list(range(k)), list(vs)) - - ser = PickleSerializer() - l = ser.loads(ser.dumps(list(gen_gs(50002, 30000)))) - for k, vs in l: - self.assertEqual(k, len(vs)) - self.assertEqual(list(range(k)), list(vs)) - - def test_stopiteration_is_raised(self): - - def stopit(*args, **kwargs): - raise StopIteration() - - def legit_create_combiner(x): - return [x] - - def legit_merge_value(x, y): - return x.append(y) or x - - def legit_merge_combiners(x, y): - return x.extend(y) or x - - data = [(x % 2, x) for x in range(100)] - - # wrong create combiner - m = ExternalMerger(Aggregator(stopit, legit_merge_value, legit_merge_combiners), 20) - with self.assertRaises((Py4JJavaError, RuntimeError)) as cm: - m.mergeValues(data) - - # wrong merge value - m = ExternalMerger(Aggregator(legit_create_combiner, stopit, legit_merge_combiners), 20) - with self.assertRaises((Py4JJavaError, RuntimeError)) as cm: - m.mergeValues(data) - - # wrong merge combiners - m = ExternalMerger(Aggregator(legit_create_combiner, legit_merge_value, stopit), 20) - with self.assertRaises((Py4JJavaError, RuntimeError)) as cm: - m.mergeCombiners(map(lambda x_y1: (x_y1[0], [x_y1[1]]), data)) - - -class SorterTests(unittest.TestCase): - def test_in_memory_sort(self): - l = list(range(1024)) - random.shuffle(l) - sorter = ExternalSorter(1024) - self.assertEqual(sorted(l), list(sorter.sorted(l))) - self.assertEqual(sorted(l, reverse=True), list(sorter.sorted(l, reverse=True))) - self.assertEqual(sorted(l, key=lambda x: -x), list(sorter.sorted(l, key=lambda x: -x))) - self.assertEqual(sorted(l, key=lambda x: -x, reverse=True), - list(sorter.sorted(l, key=lambda x: -x, reverse=True))) - - def test_external_sort(self): - class CustomizedSorter(ExternalSorter): - def _next_limit(self): - return self.memory_limit - l = list(range(1024)) - random.shuffle(l) - sorter = CustomizedSorter(1) - self.assertEqual(sorted(l), list(sorter.sorted(l))) - self.assertGreater(shuffle.DiskBytesSpilled, 0) - last = shuffle.DiskBytesSpilled - self.assertEqual(sorted(l, reverse=True), list(sorter.sorted(l, reverse=True))) - self.assertGreater(shuffle.DiskBytesSpilled, last) - last = shuffle.DiskBytesSpilled - self.assertEqual(sorted(l, key=lambda x: -x), list(sorter.sorted(l, key=lambda x: -x))) - self.assertGreater(shuffle.DiskBytesSpilled, last) - last = shuffle.DiskBytesSpilled - self.assertEqual(sorted(l, key=lambda x: -x, reverse=True), - list(sorter.sorted(l, key=lambda x: -x, reverse=True))) - self.assertGreater(shuffle.DiskBytesSpilled, last) - - def test_external_sort_in_rdd(self): - conf = SparkConf().set("spark.python.worker.memory", "1m") - sc = SparkContext(conf=conf) - l = list(range(10240)) - random.shuffle(l) - rdd = sc.parallelize(l, 4) - self.assertEqual(sorted(l), rdd.sortBy(lambda x: x).collect()) - sc.stop() - - -class SerializationTestCase(unittest.TestCase): - - def test_namedtuple(self): - from collections import namedtuple - from pickle import dumps, loads - P = namedtuple("P", "x y") - p1 = P(1, 3) - p2 = loads(dumps(p1, 2)) - self.assertEqual(p1, p2) - - from pyspark.cloudpickle import dumps - P2 = loads(dumps(P)) - p3 = P2(1, 3) - self.assertEqual(p1, p3) - - def test_itemgetter(self): - from operator import itemgetter - ser = CloudPickleSerializer() - d = range(10) - getter = itemgetter(1) - getter2 = ser.loads(ser.dumps(getter)) - 
self.assertEqual(getter(d), getter2(d)) - - getter = itemgetter(0, 3) - getter2 = ser.loads(ser.dumps(getter)) - self.assertEqual(getter(d), getter2(d)) - - def test_function_module_name(self): - ser = CloudPickleSerializer() - func = lambda x: x - func2 = ser.loads(ser.dumps(func)) - self.assertEqual(func.__module__, func2.__module__) - - def test_attrgetter(self): - from operator import attrgetter - ser = CloudPickleSerializer() - - class C(object): - def __getattr__(self, item): - return item - d = C() - getter = attrgetter("a") - getter2 = ser.loads(ser.dumps(getter)) - self.assertEqual(getter(d), getter2(d)) - getter = attrgetter("a", "b") - getter2 = ser.loads(ser.dumps(getter)) - self.assertEqual(getter(d), getter2(d)) - - d.e = C() - getter = attrgetter("e.a") - getter2 = ser.loads(ser.dumps(getter)) - self.assertEqual(getter(d), getter2(d)) - getter = attrgetter("e.a", "e.b") - getter2 = ser.loads(ser.dumps(getter)) - self.assertEqual(getter(d), getter2(d)) - - # Regression test for SPARK-3415 - def test_pickling_file_handles(self): - # to be corrected with SPARK-11160 - if not xmlrunner: - ser = CloudPickleSerializer() - out1 = sys.stderr - out2 = ser.loads(ser.dumps(out1)) - self.assertEqual(out1, out2) - - def test_func_globals(self): - - class Unpicklable(object): - def __reduce__(self): - raise Exception("not picklable") - - global exit - exit = Unpicklable() - - ser = CloudPickleSerializer() - self.assertRaises(Exception, lambda: ser.dumps(exit)) - - def foo(): - sys.exit(0) - - self.assertTrue("exit" in foo.__code__.co_names) - ser.dumps(foo) - - def test_compressed_serializer(self): - ser = CompressedSerializer(PickleSerializer()) - try: - from StringIO import StringIO - except ImportError: - from io import BytesIO as StringIO - io = StringIO() - ser.dump_stream(["abc", u"123", range(5)], io) - io.seek(0) - self.assertEqual(["abc", u"123", range(5)], list(ser.load_stream(io))) - ser.dump_stream(range(1000), io) - io.seek(0) - self.assertEqual(["abc", u"123", range(5)] + list(range(1000)), list(ser.load_stream(io))) - io.close() - - def test_hash_serializer(self): - hash(NoOpSerializer()) - hash(UTF8Deserializer()) - hash(PickleSerializer()) - hash(MarshalSerializer()) - hash(AutoSerializer()) - hash(BatchedSerializer(PickleSerializer())) - hash(AutoBatchedSerializer(MarshalSerializer())) - hash(PairDeserializer(NoOpSerializer(), UTF8Deserializer())) - hash(CartesianDeserializer(NoOpSerializer(), UTF8Deserializer())) - hash(CompressedSerializer(PickleSerializer())) - hash(FlattenedValuesSerializer(PickleSerializer())) - - -class QuietTest(object): - def __init__(self, sc): - self.log4j = sc._jvm.org.apache.log4j - - def __enter__(self): - self.old_level = self.log4j.LogManager.getRootLogger().getLevel() - self.log4j.LogManager.getRootLogger().setLevel(self.log4j.Level.FATAL) - - def __exit__(self, exc_type, exc_val, exc_tb): - self.log4j.LogManager.getRootLogger().setLevel(self.old_level) - - -class PySparkTestCase(unittest.TestCase): - - def setUp(self): - self._old_sys_path = list(sys.path) - class_name = self.__class__.__name__ - self.sc = SparkContext('local[4]', class_name) - - def tearDown(self): - self.sc.stop() - sys.path = self._old_sys_path - - -class ReusedPySparkTestCase(unittest.TestCase): - - @classmethod - def conf(cls): - """ - Override this in subclasses to supply a more specific conf - """ - return SparkConf() - - @classmethod - def setUpClass(cls): - cls.sc = SparkContext('local[4]', cls.__name__, conf=cls.conf()) - - @classmethod - def 
tearDownClass(cls): - cls.sc.stop() - - -class CheckpointTests(ReusedPySparkTestCase): - - def setUp(self): - self.checkpointDir = tempfile.NamedTemporaryFile(delete=False) - os.unlink(self.checkpointDir.name) - self.sc.setCheckpointDir(self.checkpointDir.name) - - def tearDown(self): - shutil.rmtree(self.checkpointDir.name) - - def test_basic_checkpointing(self): - parCollection = self.sc.parallelize([1, 2, 3, 4]) - flatMappedRDD = parCollection.flatMap(lambda x: range(1, x + 1)) - - self.assertFalse(flatMappedRDD.isCheckpointed()) - self.assertTrue(flatMappedRDD.getCheckpointFile() is None) - - flatMappedRDD.checkpoint() - result = flatMappedRDD.collect() - time.sleep(1) # 1 second - self.assertTrue(flatMappedRDD.isCheckpointed()) - self.assertEqual(flatMappedRDD.collect(), result) - self.assertEqual("file:" + self.checkpointDir.name, - os.path.dirname(os.path.dirname(flatMappedRDD.getCheckpointFile()))) - - def test_checkpoint_and_restore(self): - parCollection = self.sc.parallelize([1, 2, 3, 4]) - flatMappedRDD = parCollection.flatMap(lambda x: [x]) - - self.assertFalse(flatMappedRDD.isCheckpointed()) - self.assertTrue(flatMappedRDD.getCheckpointFile() is None) - - flatMappedRDD.checkpoint() - flatMappedRDD.count() # forces a checkpoint to be computed - time.sleep(1) # 1 second - - self.assertTrue(flatMappedRDD.getCheckpointFile() is not None) - recovered = self.sc._checkpointFile(flatMappedRDD.getCheckpointFile(), - flatMappedRDD._jrdd_deserializer) - self.assertEqual([1, 2, 3, 4], recovered.collect()) - - -class LocalCheckpointTests(ReusedPySparkTestCase): - - def test_basic_localcheckpointing(self): - parCollection = self.sc.parallelize([1, 2, 3, 4]) - flatMappedRDD = parCollection.flatMap(lambda x: range(1, x + 1)) - - self.assertFalse(flatMappedRDD.isCheckpointed()) - self.assertFalse(flatMappedRDD.isLocallyCheckpointed()) - - flatMappedRDD.localCheckpoint() - result = flatMappedRDD.collect() - time.sleep(1) # 1 second - self.assertTrue(flatMappedRDD.isCheckpointed()) - self.assertTrue(flatMappedRDD.isLocallyCheckpointed()) - self.assertEqual(flatMappedRDD.collect(), result) - - -class AddFileTests(PySparkTestCase): - - def test_add_py_file(self): - # To ensure that we're actually testing addPyFile's effects, check that - # this job fails due to `userlibrary` not being on the Python path: - # disable logging in log4j temporarily - def func(x): - from userlibrary import UserClass - return UserClass().hello() - with QuietTest(self.sc): - self.assertRaises(Exception, self.sc.parallelize(range(2)).map(func).first) - - # Add the file, so the job should now succeed: - path = os.path.join(SPARK_HOME, "python/test_support/userlibrary.py") - self.sc.addPyFile(path) - res = self.sc.parallelize(range(2)).map(func).first() - self.assertEqual("Hello World!", res) - - def test_add_file_locally(self): - path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt") - self.sc.addFile(path) - download_path = SparkFiles.get("hello.txt") - self.assertNotEqual(path, download_path) - with open(download_path) as test_file: - self.assertEqual("Hello World!\n", test_file.readline()) - - def test_add_file_recursively_locally(self): - path = os.path.join(SPARK_HOME, "python/test_support/hello") - self.sc.addFile(path, True) - download_path = SparkFiles.get("hello") - self.assertNotEqual(path, download_path) - with open(download_path + "/hello.txt") as test_file: - self.assertEqual("Hello World!\n", test_file.readline()) - with open(download_path + "/sub_hello/sub_hello.txt") as test_file: - 
self.assertEqual("Sub Hello World!\n", test_file.readline()) - - def test_add_py_file_locally(self): - # To ensure that we're actually testing addPyFile's effects, check that - # this fails due to `userlibrary` not being on the Python path: - def func(): - from userlibrary import UserClass - self.assertRaises(ImportError, func) - path = os.path.join(SPARK_HOME, "python/test_support/userlibrary.py") - self.sc.addPyFile(path) - from userlibrary import UserClass - self.assertEqual("Hello World!", UserClass().hello()) - - def test_add_egg_file_locally(self): - # To ensure that we're actually testing addPyFile's effects, check that - # this fails due to `userlibrary` not being on the Python path: - def func(): - from userlib import UserClass - self.assertRaises(ImportError, func) - path = os.path.join(SPARK_HOME, "python/test_support/userlib-0.1.zip") - self.sc.addPyFile(path) - from userlib import UserClass - self.assertEqual("Hello World from inside a package!", UserClass().hello()) - - def test_overwrite_system_module(self): - self.sc.addPyFile(os.path.join(SPARK_HOME, "python/test_support/SimpleHTTPServer.py")) - - import SimpleHTTPServer - self.assertEqual("My Server", SimpleHTTPServer.__name__) - - def func(x): - import SimpleHTTPServer - return SimpleHTTPServer.__name__ - - self.assertEqual(["My Server"], self.sc.parallelize(range(1)).map(func).collect()) - - -class TaskContextTests(PySparkTestCase): - - def setUp(self): - self._old_sys_path = list(sys.path) - class_name = self.__class__.__name__ - # Allow retries even though they are normally disabled in local mode - self.sc = SparkContext('local[4, 2]', class_name) - - def test_stage_id(self): - """Test the stage ids are available and incrementing as expected.""" - rdd = self.sc.parallelize(range(10)) - stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0] - stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0] - # Test using the constructor directly rather than the get() - stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0] - self.assertEqual(stage1 + 1, stage2) - self.assertEqual(stage1 + 2, stage3) - self.assertEqual(stage2 + 1, stage3) - - def test_partition_id(self): - """Test the partition id.""" - rdd1 = self.sc.parallelize(range(10), 1) - rdd2 = self.sc.parallelize(range(10), 2) - pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect() - pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect() - self.assertEqual(0, pids1[0]) - self.assertEqual(0, pids1[9]) - self.assertEqual(0, pids2[0]) - self.assertEqual(1, pids2[9]) - - def test_attempt_number(self): - """Verify the attempt numbers are correctly reported.""" - rdd = self.sc.parallelize(range(10)) - # Verify a simple job with no failures - attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect() - map(lambda attempt: self.assertEqual(0, attempt), attempt_numbers) - - def fail_on_first(x): - """Fail on the first attempt so we get a positive attempt number""" - tc = TaskContext.get() - attempt_number = tc.attemptNumber() - partition_id = tc.partitionId() - attempt_id = tc.taskAttemptId() - if attempt_number == 0 and partition_id == 0: - raise Exception("Failing on first attempt") - else: - return [x, partition_id, attempt_number, attempt_id] - result = rdd.map(fail_on_first).collect() - # We should re-submit the first partition to it but other partitions should be attempt 0 - self.assertEqual([0, 0, 1], result[0][0:3]) - self.assertEqual([9, 3, 0], result[9][0:3]) - first_partition = 
filter(lambda x: x[1] == 0, result) - map(lambda x: self.assertEqual(1, x[2]), first_partition) - other_partitions = filter(lambda x: x[1] != 0, result) - map(lambda x: self.assertEqual(0, x[2]), other_partitions) - # The task attempt id should be different - self.assertTrue(result[0][3] != result[9][3]) - - def test_tc_on_driver(self): - """Verify that getting the TaskContext on the driver returns None.""" - tc = TaskContext.get() - self.assertTrue(tc is None) - - def test_get_local_property(self): - """Verify that local properties set on the driver are available in TaskContext.""" - key = "testkey" - value = "testvalue" - self.sc.setLocalProperty(key, value) - try: - rdd = self.sc.parallelize(range(1), 1) - prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0] - self.assertEqual(prop1, value) - prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0] - self.assertTrue(prop2 is None) - finally: - self.sc.setLocalProperty(key, None) - - def test_barrier(self): - """ - Verify that BarrierTaskContext.barrier() performs global sync among all barrier tasks - within a stage. - """ - rdd = self.sc.parallelize(range(10), 4) - - def f(iterator): - yield sum(iterator) - - def context_barrier(x): - tc = BarrierTaskContext.get() - time.sleep(random.randint(1, 10)) - tc.barrier() - return time.time() - - times = rdd.barrier().mapPartitions(f).map(context_barrier).collect() - self.assertTrue(max(times) - min(times) < 1) - - def test_barrier_with_python_worker_reuse(self): - """ - Verify that BarrierTaskContext.barrier() with reused python worker. - """ - self.sc._conf.set("spark.python.work.reuse", "true") - rdd = self.sc.parallelize(range(4), 4) - # start a normal job first to start all worker - result = rdd.map(lambda x: x ** 2).collect() - self.assertEqual([0, 1, 4, 9], result) - # make sure `spark.python.work.reuse=true` - self.assertEqual(self.sc._conf.get("spark.python.work.reuse"), "true") - - # worker will be reused in this barrier job - self.test_barrier() - - def test_barrier_infos(self): - """ - Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the - barrier stage. 
- """ - rdd = self.sc.parallelize(range(10), 4) - - def f(iterator): - yield sum(iterator) - - taskInfos = rdd.barrier().mapPartitions(f).map(lambda x: BarrierTaskContext.get() - .getTaskInfos()).collect() - self.assertTrue(len(taskInfos) == 4) - self.assertTrue(len(taskInfos[0]) == 4) - - -class RDDTests(ReusedPySparkTestCase): - - def test_range(self): - self.assertEqual(self.sc.range(1, 1).count(), 0) - self.assertEqual(self.sc.range(1, 0, -1).count(), 1) - self.assertEqual(self.sc.range(0, 1 << 40, 1 << 39).count(), 2) - - def test_id(self): - rdd = self.sc.parallelize(range(10)) - id = rdd.id() - self.assertEqual(id, rdd.id()) - rdd2 = rdd.map(str).filter(bool) - id2 = rdd2.id() - self.assertEqual(id + 1, id2) - self.assertEqual(id2, rdd2.id()) - - def test_empty_rdd(self): - rdd = self.sc.emptyRDD() - self.assertTrue(rdd.isEmpty()) - - def test_sum(self): - self.assertEqual(0, self.sc.emptyRDD().sum()) - self.assertEqual(6, self.sc.parallelize([1, 2, 3]).sum()) - - def test_to_localiterator(self): - from time import sleep - rdd = self.sc.parallelize([1, 2, 3]) - it = rdd.toLocalIterator() - sleep(5) - self.assertEqual([1, 2, 3], sorted(it)) - - rdd2 = rdd.repartition(1000) - it2 = rdd2.toLocalIterator() - sleep(5) - self.assertEqual([1, 2, 3], sorted(it2)) - - def test_save_as_textfile_with_unicode(self): - # Regression test for SPARK-970 - x = u"\u00A1Hola, mundo!" - data = self.sc.parallelize([x]) - tempFile = tempfile.NamedTemporaryFile(delete=True) - tempFile.close() - data.saveAsTextFile(tempFile.name) - raw_contents = b''.join(open(p, 'rb').read() - for p in glob(tempFile.name + "/part-0000*")) - self.assertEqual(x, raw_contents.strip().decode("utf-8")) - - def test_save_as_textfile_with_utf8(self): - x = u"\u00A1Hola, mundo!" - data = self.sc.parallelize([x.encode("utf-8")]) - tempFile = tempfile.NamedTemporaryFile(delete=True) - tempFile.close() - data.saveAsTextFile(tempFile.name) - raw_contents = b''.join(open(p, 'rb').read() - for p in glob(tempFile.name + "/part-0000*")) - self.assertEqual(x, raw_contents.strip().decode('utf8')) - - def test_transforming_cartesian_result(self): - # Regression test for SPARK-1034 - rdd1 = self.sc.parallelize([1, 2]) - rdd2 = self.sc.parallelize([3, 4]) - cart = rdd1.cartesian(rdd2) - result = cart.map(lambda x_y3: x_y3[0] + x_y3[1]).collect() - - def test_transforming_pickle_file(self): - # Regression test for SPARK-2601 - data = self.sc.parallelize([u"Hello", u"World!"]) - tempFile = tempfile.NamedTemporaryFile(delete=True) - tempFile.close() - data.saveAsPickleFile(tempFile.name) - pickled_file = self.sc.pickleFile(tempFile.name) - pickled_file.map(lambda x: x).collect() - - def test_cartesian_on_textfile(self): - # Regression test for - path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt") - a = self.sc.textFile(path) - result = a.cartesian(a).collect() - (x, y) = result[0] - self.assertEqual(u"Hello World!", x.strip()) - self.assertEqual(u"Hello World!", y.strip()) - - def test_cartesian_chaining(self): - # Tests for SPARK-16589 - rdd = self.sc.parallelize(range(10), 2) - self.assertSetEqual( - set(rdd.cartesian(rdd).cartesian(rdd).collect()), - set([((x, y), z) for x in range(10) for y in range(10) for z in range(10)]) - ) - - self.assertSetEqual( - set(rdd.cartesian(rdd.cartesian(rdd)).collect()), - set([(x, (y, z)) for x in range(10) for y in range(10) for z in range(10)]) - ) - - self.assertSetEqual( - set(rdd.cartesian(rdd.zip(rdd)).collect()), - set([(x, (y, y)) for x in range(10) for y in range(10)]) - ) - - def 
test_zip_chaining(self): - # Tests for SPARK-21985 - rdd = self.sc.parallelize('abc', 2) - self.assertSetEqual( - set(rdd.zip(rdd).zip(rdd).collect()), - set([((x, x), x) for x in 'abc']) - ) - self.assertSetEqual( - set(rdd.zip(rdd.zip(rdd)).collect()), - set([(x, (x, x)) for x in 'abc']) - ) - - def test_deleting_input_files(self): - # Regression test for SPARK-1025 - tempFile = tempfile.NamedTemporaryFile(delete=False) - tempFile.write(b"Hello World!") - tempFile.close() - data = self.sc.textFile(tempFile.name) - filtered_data = data.filter(lambda x: True) - self.assertEqual(1, filtered_data.count()) - os.unlink(tempFile.name) - with QuietTest(self.sc): - self.assertRaises(Exception, lambda: filtered_data.count()) - - def test_sampling_default_seed(self): - # Test for SPARK-3995 (default seed setting) - data = self.sc.parallelize(xrange(1000), 1) - subset = data.takeSample(False, 10) - self.assertEqual(len(subset), 10) - - def test_aggregate_mutable_zero_value(self): - # Test for SPARK-9021; uses aggregate and treeAggregate to build dict - # representing a counter of ints - # NOTE: dict is used instead of collections.Counter for Python 2.6 - # compatibility - from collections import defaultdict - - # Show that single or multiple partitions work - data1 = self.sc.range(10, numSlices=1) - data2 = self.sc.range(10, numSlices=2) - - def seqOp(x, y): - x[y] += 1 - return x - - def comboOp(x, y): - for key, val in y.items(): - x[key] += val - return x - - counts1 = data1.aggregate(defaultdict(int), seqOp, comboOp) - counts2 = data2.aggregate(defaultdict(int), seqOp, comboOp) - counts3 = data1.treeAggregate(defaultdict(int), seqOp, comboOp, 2) - counts4 = data2.treeAggregate(defaultdict(int), seqOp, comboOp, 2) - - ground_truth = defaultdict(int, dict((i, 1) for i in range(10))) - self.assertEqual(counts1, ground_truth) - self.assertEqual(counts2, ground_truth) - self.assertEqual(counts3, ground_truth) - self.assertEqual(counts4, ground_truth) - - def test_aggregate_by_key_mutable_zero_value(self): - # Test for SPARK-9021; uses aggregateByKey to make a pair RDD that - # contains lists of all values for each key in the original RDD - - # list(range(...)) for Python 3.x compatibility (can't use * operator - # on a range object) - # list(zip(...)) for Python 3.x compatibility (want to parallelize a - # collection, not a zip object) - tuples = list(zip(list(range(10))*2, [1]*20)) - # Show that single or multiple partitions work - data1 = self.sc.parallelize(tuples, 1) - data2 = self.sc.parallelize(tuples, 2) - - def seqOp(x, y): - x.append(y) - return x - - def comboOp(x, y): - x.extend(y) - return x - - values1 = data1.aggregateByKey([], seqOp, comboOp).collect() - values2 = data2.aggregateByKey([], seqOp, comboOp).collect() - # Sort lists to ensure clean comparison with ground_truth - values1.sort() - values2.sort() - - ground_truth = [(i, [1]*2) for i in range(10)] - self.assertEqual(values1, ground_truth) - self.assertEqual(values2, ground_truth) - - def test_fold_mutable_zero_value(self): - # Test for SPARK-9021; uses fold to merge an RDD of dict counters into - # a single dict - # NOTE: dict is used instead of collections.Counter for Python 2.6 - # compatibility - from collections import defaultdict - - counts1 = defaultdict(int, dict((i, 1) for i in range(10))) - counts2 = defaultdict(int, dict((i, 1) for i in range(3, 8))) - counts3 = defaultdict(int, dict((i, 1) for i in range(4, 7))) - counts4 = defaultdict(int, dict((i, 1) for i in range(5, 6))) - all_counts = [counts1, counts2, 
counts3, counts4] - # Show that single or multiple partitions work - data1 = self.sc.parallelize(all_counts, 1) - data2 = self.sc.parallelize(all_counts, 2) - - def comboOp(x, y): - for key, val in y.items(): - x[key] += val - return x - - fold1 = data1.fold(defaultdict(int), comboOp) - fold2 = data2.fold(defaultdict(int), comboOp) - - ground_truth = defaultdict(int) - for counts in all_counts: - for key, val in counts.items(): - ground_truth[key] += val - self.assertEqual(fold1, ground_truth) - self.assertEqual(fold2, ground_truth) - - def test_fold_by_key_mutable_zero_value(self): - # Test for SPARK-9021; uses foldByKey to make a pair RDD that contains - # lists of all values for each key in the original RDD - - tuples = [(i, range(i)) for i in range(10)]*2 - # Show that single or multiple partitions work - data1 = self.sc.parallelize(tuples, 1) - data2 = self.sc.parallelize(tuples, 2) - - def comboOp(x, y): - x.extend(y) - return x - - values1 = data1.foldByKey([], comboOp).collect() - values2 = data2.foldByKey([], comboOp).collect() - # Sort lists to ensure clean comparison with ground_truth - values1.sort() - values2.sort() - - # list(range(...)) for Python 3.x compatibility - ground_truth = [(i, list(range(i))*2) for i in range(10)] - self.assertEqual(values1, ground_truth) - self.assertEqual(values2, ground_truth) - - def test_aggregate_by_key(self): - data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2) - - def seqOp(x, y): - x.add(y) - return x - - def combOp(x, y): - x |= y - return x - - sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect()) - self.assertEqual(3, len(sets)) - self.assertEqual(set([1]), sets[1]) - self.assertEqual(set([2]), sets[3]) - self.assertEqual(set([1, 3]), sets[5]) - - def test_itemgetter(self): - rdd = self.sc.parallelize([range(10)]) - from operator import itemgetter - self.assertEqual([1], rdd.map(itemgetter(1)).collect()) - self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect()) - - def test_namedtuple_in_rdd(self): - from collections import namedtuple - Person = namedtuple("Person", "id firstName lastName") - jon = Person(1, "Jon", "Doe") - jane = Person(2, "Jane", "Doe") - theDoes = self.sc.parallelize([jon, jane]) - self.assertEqual([jon, jane], theDoes.collect()) - - def test_large_broadcast(self): - N = 10000 - data = [[float(i) for i in range(300)] for i in range(N)] - bdata = self.sc.broadcast(data) # 27MB - m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum() - self.assertEqual(N, m) - - def test_unpersist(self): - N = 1000 - data = [[float(i) for i in range(300)] for i in range(N)] - bdata = self.sc.broadcast(data) # 3MB - bdata.unpersist() - m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum() - self.assertEqual(N, m) - bdata.destroy() - try: - self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum() - except Exception as e: - pass - else: - raise Exception("job should fail after destroy the broadcast") - - def test_multiple_broadcasts(self): - N = 1 << 21 - b1 = self.sc.broadcast(set(range(N))) # multiple blocks in JVM - r = list(range(1 << 15)) - random.shuffle(r) - s = str(r).encode() - checksum = hashlib.md5(s).hexdigest() - b2 = self.sc.broadcast(s) - r = list(set(self.sc.parallelize(range(10), 10).map( - lambda x: (len(b1.value), hashlib.md5(b2.value).hexdigest())).collect())) - self.assertEqual(1, len(r)) - size, csum = r[0] - self.assertEqual(N, size) - self.assertEqual(checksum, csum) - - random.shuffle(r) - s = str(r).encode() - checksum 
= hashlib.md5(s).hexdigest() - b2 = self.sc.broadcast(s) - r = list(set(self.sc.parallelize(range(10), 10).map( - lambda x: (len(b1.value), hashlib.md5(b2.value).hexdigest())).collect())) - self.assertEqual(1, len(r)) - size, csum = r[0] - self.assertEqual(N, size) - self.assertEqual(checksum, csum) - - def test_multithread_broadcast_pickle(self): - import threading - - b1 = self.sc.broadcast(list(range(3))) - b2 = self.sc.broadcast(list(range(3))) - - def f1(): - return b1.value - - def f2(): - return b2.value - - funcs_num_pickled = {f1: None, f2: None} - - def do_pickle(f, sc): - command = (f, None, sc.serializer, sc.serializer) - ser = CloudPickleSerializer() - ser.dumps(command) - - def process_vars(sc): - broadcast_vars = list(sc._pickled_broadcast_vars) - num_pickled = len(broadcast_vars) - sc._pickled_broadcast_vars.clear() - return num_pickled - - def run(f, sc): - do_pickle(f, sc) - funcs_num_pickled[f] = process_vars(sc) - - # pickle f1, adds b1 to sc._pickled_broadcast_vars in main thread local storage - do_pickle(f1, self.sc) - - # run all for f2, should only add/count/clear b2 from worker thread local storage - t = threading.Thread(target=run, args=(f2, self.sc)) - t.start() - t.join() - - # count number of vars pickled in main thread, only b1 should be counted and cleared - funcs_num_pickled[f1] = process_vars(self.sc) - - self.assertEqual(funcs_num_pickled[f1], 1) - self.assertEqual(funcs_num_pickled[f2], 1) - self.assertEqual(len(list(self.sc._pickled_broadcast_vars)), 0) - - def test_large_closure(self): - N = 200000 - data = [float(i) for i in xrange(N)] - rdd = self.sc.parallelize(range(1), 1).map(lambda x: len(data)) - self.assertEqual(N, rdd.first()) - # regression test for SPARK-6886 - self.assertEqual(1, rdd.map(lambda x: (x, 1)).groupByKey().count()) - - def test_zip_with_different_serializers(self): - a = self.sc.parallelize(range(5)) - b = self.sc.parallelize(range(100, 105)) - self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]) - a = a._reserialize(BatchedSerializer(PickleSerializer(), 2)) - b = b._reserialize(MarshalSerializer()) - self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]) - # regression test for SPARK-4841 - path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt") - t = self.sc.textFile(path) - cnt = t.count() - self.assertEqual(cnt, t.zip(t).count()) - rdd = t.map(str) - self.assertEqual(cnt, t.zip(rdd).count()) - # regression test for bug in _reserializer() - self.assertEqual(cnt, t.zip(rdd).count()) - - def test_zip_with_different_object_sizes(self): - # regress test for SPARK-5973 - a = self.sc.parallelize(xrange(10000)).map(lambda i: '*' * i) - b = self.sc.parallelize(xrange(10000, 20000)).map(lambda i: '*' * i) - self.assertEqual(10000, a.zip(b).count()) - - def test_zip_with_different_number_of_items(self): - a = self.sc.parallelize(range(5), 2) - # different number of partitions - b = self.sc.parallelize(range(100, 106), 3) - self.assertRaises(ValueError, lambda: a.zip(b)) - with QuietTest(self.sc): - # different number of batched items in JVM - b = self.sc.parallelize(range(100, 104), 2) - self.assertRaises(Exception, lambda: a.zip(b).count()) - # different number of items in one pair - b = self.sc.parallelize(range(100, 106), 2) - self.assertRaises(Exception, lambda: a.zip(b).count()) - # same total number of items, but different distributions - a = self.sc.parallelize([2, 3], 2).flatMap(range) - b = self.sc.parallelize([3, 2], 2).flatMap(range) - 
self.assertEqual(a.count(), b.count()) - self.assertRaises(Exception, lambda: a.zip(b).count()) - - def test_count_approx_distinct(self): - rdd = self.sc.parallelize(xrange(1000)) - self.assertTrue(950 < rdd.countApproxDistinct(0.03) < 1050) - self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.03) < 1050) - self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.03) < 1050) - self.assertTrue(950 < rdd.map(lambda x: (x, -x)).countApproxDistinct(0.03) < 1050) - - rdd = self.sc.parallelize([i % 20 for i in range(1000)], 7) - self.assertTrue(18 < rdd.countApproxDistinct() < 22) - self.assertTrue(18 < rdd.map(float).countApproxDistinct() < 22) - self.assertTrue(18 < rdd.map(str).countApproxDistinct() < 22) - self.assertTrue(18 < rdd.map(lambda x: (x, -x)).countApproxDistinct() < 22) - - self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.00000001)) - - def test_histogram(self): - # empty - rdd = self.sc.parallelize([]) - self.assertEqual([0], rdd.histogram([0, 10])[1]) - self.assertEqual([0, 0], rdd.histogram([0, 4, 10])[1]) - self.assertRaises(ValueError, lambda: rdd.histogram(1)) - - # out of range - rdd = self.sc.parallelize([10.01, -0.01]) - self.assertEqual([0], rdd.histogram([0, 10])[1]) - self.assertEqual([0, 0], rdd.histogram((0, 4, 10))[1]) - - # in range with one bucket - rdd = self.sc.parallelize(range(1, 5)) - self.assertEqual([4], rdd.histogram([0, 10])[1]) - self.assertEqual([3, 1], rdd.histogram([0, 4, 10])[1]) - - # in range with one bucket exact match - self.assertEqual([4], rdd.histogram([1, 4])[1]) - - # out of range with two buckets - rdd = self.sc.parallelize([10.01, -0.01]) - self.assertEqual([0, 0], rdd.histogram([0, 5, 10])[1]) - - # out of range with two uneven buckets - rdd = self.sc.parallelize([10.01, -0.01]) - self.assertEqual([0, 0], rdd.histogram([0, 4, 10])[1]) - - # in range with two buckets - rdd = self.sc.parallelize([1, 2, 3, 5, 6]) - self.assertEqual([3, 2], rdd.histogram([0, 5, 10])[1]) - - # in range with two bucket and None - rdd = self.sc.parallelize([1, 2, 3, 5, 6, None, float('nan')]) - self.assertEqual([3, 2], rdd.histogram([0, 5, 10])[1]) - - # in range with two uneven buckets - rdd = self.sc.parallelize([1, 2, 3, 5, 6]) - self.assertEqual([3, 2], rdd.histogram([0, 5, 11])[1]) - - # mixed range with two uneven buckets - rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01]) - self.assertEqual([4, 3], rdd.histogram([0, 5, 11])[1]) - - # mixed range with four uneven buckets - rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1]) - self.assertEqual([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1]) - - # mixed range with uneven buckets and NaN - rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, - 199.0, 200.0, 200.1, None, float('nan')]) - self.assertEqual([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1]) - - # out of range with infinite buckets - rdd = self.sc.parallelize([10.01, -0.01, float('nan'), float("inf")]) - self.assertEqual([1, 2], rdd.histogram([float('-inf'), 0, float('inf')])[1]) - - # invalid buckets - self.assertRaises(ValueError, lambda: rdd.histogram([])) - self.assertRaises(ValueError, lambda: rdd.histogram([1])) - self.assertRaises(ValueError, lambda: rdd.histogram(0)) - self.assertRaises(TypeError, lambda: rdd.histogram({})) - - # without buckets - rdd = self.sc.parallelize(range(1, 5)) - self.assertEqual(([1, 4], [4]), rdd.histogram(1)) - - # without buckets single element - rdd = self.sc.parallelize([1]) - 
self.assertEqual(([1, 1], [1]), rdd.histogram(1)) - - # without bucket no range - rdd = self.sc.parallelize([1] * 4) - self.assertEqual(([1, 1], [4]), rdd.histogram(1)) - - # without buckets basic two - rdd = self.sc.parallelize(range(1, 5)) - self.assertEqual(([1, 2.5, 4], [2, 2]), rdd.histogram(2)) - - # without buckets with more requested than elements - rdd = self.sc.parallelize([1, 2]) - buckets = [1 + 0.2 * i for i in range(6)] - hist = [1, 0, 0, 0, 1] - self.assertEqual((buckets, hist), rdd.histogram(5)) - - # invalid RDDs - rdd = self.sc.parallelize([1, float('inf')]) - self.assertRaises(ValueError, lambda: rdd.histogram(2)) - rdd = self.sc.parallelize([float('nan')]) - self.assertRaises(ValueError, lambda: rdd.histogram(2)) - - # string - rdd = self.sc.parallelize(["ab", "ac", "b", "bd", "ef"], 2) - self.assertEqual([2, 2], rdd.histogram(["a", "b", "c"])[1]) - self.assertEqual((["ab", "ef"], [5]), rdd.histogram(1)) - self.assertRaises(TypeError, lambda: rdd.histogram(2)) - - def test_repartitionAndSortWithinPartitions_asc(self): - rdd = self.sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)], 2) - - repartitioned = rdd.repartitionAndSortWithinPartitions(2, lambda key: key % 2, True) - partitions = repartitioned.glom().collect() - self.assertEqual(partitions[0], [(0, 5), (0, 8), (2, 6)]) - self.assertEqual(partitions[1], [(1, 3), (3, 8), (3, 8)]) - - def test_repartitionAndSortWithinPartitions_desc(self): - rdd = self.sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)], 2) - - repartitioned = rdd.repartitionAndSortWithinPartitions(2, lambda key: key % 2, False) - partitions = repartitioned.glom().collect() - self.assertEqual(partitions[0], [(2, 6), (0, 5), (0, 8)]) - self.assertEqual(partitions[1], [(3, 8), (3, 8), (1, 3)]) - - def test_repartition_no_skewed(self): - num_partitions = 20 - a = self.sc.parallelize(range(int(1000)), 2) - l = a.repartition(num_partitions).glom().map(len).collect() - zeros = len([x for x in l if x == 0]) - self.assertTrue(zeros == 0) - l = a.coalesce(num_partitions, True).glom().map(len).collect() - zeros = len([x for x in l if x == 0]) - self.assertTrue(zeros == 0) - - def test_repartition_on_textfile(self): - path = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt") - rdd = self.sc.textFile(path) - result = rdd.repartition(1).collect() - self.assertEqual(u"Hello World!", result[0]) - - def test_distinct(self): - rdd = self.sc.parallelize((1, 2, 3)*10, 10) - self.assertEqual(rdd.getNumPartitions(), 10) - self.assertEqual(rdd.distinct().count(), 3) - result = rdd.distinct(5) - self.assertEqual(result.getNumPartitions(), 5) - self.assertEqual(result.count(), 3) - - def test_external_group_by_key(self): - self.sc._conf.set("spark.python.worker.memory", "1m") - N = 200001 - kv = self.sc.parallelize(xrange(N)).map(lambda x: (x % 3, x)) - gkv = kv.groupByKey().cache() - self.assertEqual(3, gkv.count()) - filtered = gkv.filter(lambda kv: kv[0] == 1) - self.assertEqual(1, filtered.count()) - self.assertEqual([(1, N // 3)], filtered.mapValues(len).collect()) - self.assertEqual([(N // 3, N // 3)], - filtered.values().map(lambda x: (len(x), len(list(x)))).collect()) - result = filtered.collect()[0][1] - self.assertEqual(N // 3, len(result)) - self.assertTrue(isinstance(result.data, shuffle.ExternalListOfList)) - - def test_sort_on_empty_rdd(self): - self.assertEqual([], self.sc.parallelize(zip([], [])).sortByKey().collect()) - - def test_sample(self): - rdd = self.sc.parallelize(range(0, 100), 4) - wo = rdd.sample(False, 0.1, 
2).collect() - wo_dup = rdd.sample(False, 0.1, 2).collect() - self.assertSetEqual(set(wo), set(wo_dup)) - wr = rdd.sample(True, 0.2, 5).collect() - wr_dup = rdd.sample(True, 0.2, 5).collect() - self.assertSetEqual(set(wr), set(wr_dup)) - wo_s10 = rdd.sample(False, 0.3, 10).collect() - wo_s20 = rdd.sample(False, 0.3, 20).collect() - self.assertNotEqual(set(wo_s10), set(wo_s20)) - wr_s11 = rdd.sample(True, 0.4, 11).collect() - wr_s21 = rdd.sample(True, 0.4, 21).collect() - self.assertNotEqual(set(wr_s11), set(wr_s21)) - - def test_null_in_rdd(self): - jrdd = self.sc._jvm.PythonUtils.generateRDDWithNull(self.sc._jsc) - rdd = RDD(jrdd, self.sc, UTF8Deserializer()) - self.assertEqual([u"a", None, u"b"], rdd.collect()) - rdd = RDD(jrdd, self.sc, NoOpSerializer()) - self.assertEqual([b"a", None, b"b"], rdd.collect()) - - def test_multiple_python_java_RDD_conversions(self): - # Regression test for SPARK-5361 - data = [ - (u'1', {u'director': u'David Lean'}), - (u'2', {u'director': u'Andrew Dominik'}) - ] - data_rdd = self.sc.parallelize(data) - data_java_rdd = data_rdd._to_java_object_rdd() - data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd) - converted_rdd = RDD(data_python_rdd, self.sc) - self.assertEqual(2, converted_rdd.count()) - - # conversion between python and java RDD threw exceptions - data_java_rdd = converted_rdd._to_java_object_rdd() - data_python_rdd = self.sc._jvm.SerDeUtil.javaToPython(data_java_rdd) - converted_rdd = RDD(data_python_rdd, self.sc) - self.assertEqual(2, converted_rdd.count()) - - def test_narrow_dependency_in_join(self): - rdd = self.sc.parallelize(range(10)).map(lambda x: (x, x)) - parted = rdd.partitionBy(2) - self.assertEqual(2, parted.union(parted).getNumPartitions()) - self.assertEqual(rdd.getNumPartitions() + 2, parted.union(rdd).getNumPartitions()) - self.assertEqual(rdd.getNumPartitions() + 2, rdd.union(parted).getNumPartitions()) - - tracker = self.sc.statusTracker() - - self.sc.setJobGroup("test1", "test", True) - d = sorted(parted.join(parted).collect()) - self.assertEqual(10, len(d)) - self.assertEqual((0, (0, 0)), d[0]) - jobId = tracker.getJobIdsForGroup("test1")[0] - self.assertEqual(2, len(tracker.getJobInfo(jobId).stageIds)) - - self.sc.setJobGroup("test2", "test", True) - d = sorted(parted.join(rdd).collect()) - self.assertEqual(10, len(d)) - self.assertEqual((0, (0, 0)), d[0]) - jobId = tracker.getJobIdsForGroup("test2")[0] - self.assertEqual(3, len(tracker.getJobInfo(jobId).stageIds)) - - self.sc.setJobGroup("test3", "test", True) - d = sorted(parted.cogroup(parted).collect()) - self.assertEqual(10, len(d)) - self.assertEqual([[0], [0]], list(map(list, d[0][1]))) - jobId = tracker.getJobIdsForGroup("test3")[0] - self.assertEqual(2, len(tracker.getJobInfo(jobId).stageIds)) - - self.sc.setJobGroup("test4", "test", True) - d = sorted(parted.cogroup(rdd).collect()) - self.assertEqual(10, len(d)) - self.assertEqual([[0], [0]], list(map(list, d[0][1]))) - jobId = tracker.getJobIdsForGroup("test4")[0] - self.assertEqual(3, len(tracker.getJobInfo(jobId).stageIds)) - - # Regression test for SPARK-6294 - def test_take_on_jrdd(self): - rdd = self.sc.parallelize(xrange(1 << 20)).map(lambda x: str(x)) - rdd._jrdd.first() - - def test_sortByKey_uses_all_partitions_not_only_first_and_last(self): - # Regression test for SPARK-5969 - seq = [(i * 59 % 101, i) for i in range(101)] # unsorted sequence - rdd = self.sc.parallelize(seq) - for ascending in [True, False]: - sort = rdd.sortByKey(ascending=ascending, numPartitions=5) - 
self.assertEqual(sort.collect(), sorted(seq, reverse=not ascending)) - sizes = sort.glom().map(len).collect() - for size in sizes: - self.assertGreater(size, 0) - - def test_pipe_functions(self): - data = ['1', '2', '3'] - rdd = self.sc.parallelize(data) - with QuietTest(self.sc): - self.assertEqual([], rdd.pipe('cc').collect()) - self.assertRaises(Py4JJavaError, rdd.pipe('cc', checkCode=True).collect) - result = rdd.pipe('cat').collect() - result.sort() - for x, y in zip(data, result): - self.assertEqual(x, y) - self.assertRaises(Py4JJavaError, rdd.pipe('grep 4', checkCode=True).collect) - self.assertEqual([], rdd.pipe('grep 4').collect()) - - def test_pipe_unicode(self): - # Regression test for SPARK-20947 - data = [u'\u6d4b\u8bd5', '1'] - rdd = self.sc.parallelize(data) - result = rdd.pipe('cat').collect() - self.assertEqual(data, result) - - def test_stopiteration_in_user_code(self): - - def stopit(*x): - raise StopIteration() - - seq_rdd = self.sc.parallelize(range(10)) - keyed_rdd = self.sc.parallelize((x % 2, x) for x in range(10)) - msg = "Caught StopIteration thrown from user's code; failing the task" - - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.map(stopit).collect) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.filter(stopit).collect) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.foreach, stopit) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.reduce, stopit) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.fold, 0, stopit) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.foreach, stopit) - self.assertRaisesRegexp(Py4JJavaError, msg, - seq_rdd.cartesian(seq_rdd).flatMap(stopit).collect) - - # these methods call the user function both in the driver and in the executor - # the exception raised is different according to where the StopIteration happens - # RuntimeError is raised if in the driver - # Py4JJavaError is raised if in the executor (wraps the RuntimeError raised in the worker) - self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg, - keyed_rdd.reduceByKeyLocally, stopit) - self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg, - seq_rdd.aggregate, 0, stopit, lambda *x: 1) - self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg, - seq_rdd.aggregate, 0, lambda *x: 1, stopit) - - -class ProfilerTests(PySparkTestCase): - - def setUp(self): - self._old_sys_path = list(sys.path) - class_name = self.__class__.__name__ - conf = SparkConf().set("spark.python.profile", "true") - self.sc = SparkContext('local[4]', class_name, conf=conf) - - def test_profiler(self): - self.do_computation() - - profilers = self.sc.profiler_collector.profilers - self.assertEqual(1, len(profilers)) - id, profiler, _ = profilers[0] - stats = profiler.stats() - self.assertTrue(stats is not None) - width, stat_list = stats.get_print_list([]) - func_names = [func_name for fname, n, func_name in stat_list] - self.assertTrue("heavy_foo" in func_names) - - old_stdout = sys.stdout - sys.stdout = io = StringIO() - self.sc.show_profiles() - self.assertTrue("heavy_foo" in io.getvalue()) - sys.stdout = old_stdout - - d = tempfile.gettempdir() - self.sc.dump_profiles(d) - self.assertTrue("rdd_%d.pstats" % id in os.listdir(d)) - - def test_custom_profiler(self): - class TestCustomProfiler(BasicProfiler): - def show(self, id): - self.result = "Custom formatting" - - self.sc.profiler_collector.profiler_cls = TestCustomProfiler - - self.do_computation() - - profilers = self.sc.profiler_collector.profilers - self.assertEqual(1, len(profilers)) - _, profiler, _ = 
profilers[0] - self.assertTrue(isinstance(profiler, TestCustomProfiler)) - - self.sc.show_profiles() - self.assertEqual("Custom formatting", profiler.result) - - def do_computation(self): - def heavy_foo(x): - for i in range(1 << 18): - x = 1 - - rdd = self.sc.parallelize(range(100)) - rdd.foreach(heavy_foo) - - -class ProfilerTests2(unittest.TestCase): - def test_profiler_disabled(self): - sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false")) - try: - self.assertRaisesRegexp( - RuntimeError, - "'spark.python.profile' configuration must be set", - lambda: sc.show_profiles()) - self.assertRaisesRegexp( - RuntimeError, - "'spark.python.profile' configuration must be set", - lambda: sc.dump_profiles("/tmp/abc")) - finally: - sc.stop() - - -class InputFormatTests(ReusedPySparkTestCase): - - @classmethod - def setUpClass(cls): - ReusedPySparkTestCase.setUpClass() - cls.tempdir = tempfile.NamedTemporaryFile(delete=False) - os.unlink(cls.tempdir.name) - cls.sc._jvm.WriteInputFormatTestDataGenerator.generateData(cls.tempdir.name, cls.sc._jsc) - - @classmethod - def tearDownClass(cls): - ReusedPySparkTestCase.tearDownClass() - shutil.rmtree(cls.tempdir.name) - - @unittest.skipIf(sys.version >= "3", "serialize array of byte") - def test_sequencefiles(self): - basepath = self.tempdir.name - ints = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfint/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] - self.assertEqual(ints, ei) - - doubles = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfdouble/", - "org.apache.hadoop.io.DoubleWritable", - "org.apache.hadoop.io.Text").collect()) - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] - self.assertEqual(doubles, ed) - - bytes = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbytes/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BytesWritable").collect()) - ebs = [(1, bytearray('aa', 'utf-8')), - (1, bytearray('aa', 'utf-8')), - (2, bytearray('aa', 'utf-8')), - (2, bytearray('bb', 'utf-8')), - (2, bytearray('bb', 'utf-8')), - (3, bytearray('cc', 'utf-8'))] - self.assertEqual(bytes, ebs) - - text = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sftext/", - "org.apache.hadoop.io.Text", - "org.apache.hadoop.io.Text").collect()) - et = [(u'1', u'aa'), - (u'1', u'aa'), - (u'2', u'aa'), - (u'2', u'bb'), - (u'2', u'bb'), - (u'3', u'cc')] - self.assertEqual(text, et) - - bools = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfbool/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BooleanWritable").collect()) - eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] - self.assertEqual(bools, eb) - - nulls = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfnull/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.BooleanWritable").collect()) - en = [(1, None), (1, None), (2, None), (2, None), (2, None), (3, None)] - self.assertEqual(nulls, en) - - maps = self.sc.sequenceFile(basepath + "/sftestdata/sfmap/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.MapWritable").collect() - em = [(1, {}), - (1, {3.0: u'bb'}), - (2, {1.0: u'aa'}), - (2, {1.0: u'cc'}), - (3, {2.0: u'dd'})] - for v in maps: - self.assertTrue(v in em) - - # arrays get pickled to tuples by default - tuples = sorted(self.sc.sequenceFile( - basepath + "/sftestdata/sfarray/", - "org.apache.hadoop.io.IntWritable", - 
"org.apache.spark.api.python.DoubleArrayWritable").collect()) - et = [(1, ()), - (2, (3.0, 4.0, 5.0)), - (3, (4.0, 5.0, 6.0))] - self.assertEqual(tuples, et) - - # with custom converters, primitive arrays can stay as arrays - arrays = sorted(self.sc.sequenceFile( - basepath + "/sftestdata/sfarray/", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect()) - ea = [(1, array('d')), - (2, array('d', [3.0, 4.0, 5.0])), - (3, array('d', [4.0, 5.0, 6.0]))] - self.assertEqual(arrays, ea) - - clazz = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfclass/", - "org.apache.hadoop.io.Text", - "org.apache.spark.api.python.TestWritable").collect()) - cname = u'org.apache.spark.api.python.TestWritable' - ec = [(u'1', {u'__class__': cname, u'double': 1.0, u'int': 1, u'str': u'test1'}), - (u'2', {u'__class__': cname, u'double': 2.3, u'int': 2, u'str': u'test2'}), - (u'3', {u'__class__': cname, u'double': 3.1, u'int': 3, u'str': u'test3'}), - (u'4', {u'__class__': cname, u'double': 4.2, u'int': 4, u'str': u'test4'}), - (u'5', {u'__class__': cname, u'double': 5.5, u'int': 5, u'str': u'test56'})] - self.assertEqual(clazz, ec) - - unbatched_clazz = sorted(self.sc.sequenceFile(basepath + "/sftestdata/sfclass/", - "org.apache.hadoop.io.Text", - "org.apache.spark.api.python.TestWritable", - ).collect()) - self.assertEqual(unbatched_clazz, ec) - - def test_oldhadoop(self): - basepath = self.tempdir.name - ints = sorted(self.sc.hadoopFile(basepath + "/sftestdata/sfint/", - "org.apache.hadoop.mapred.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] - self.assertEqual(ints, ei) - - hellopath = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt") - oldconf = {"mapreduce.input.fileinputformat.inputdir": hellopath} - hello = self.sc.hadoopRDD("org.apache.hadoop.mapred.TextInputFormat", - "org.apache.hadoop.io.LongWritable", - "org.apache.hadoop.io.Text", - conf=oldconf).collect() - result = [(0, u'Hello World!')] - self.assertEqual(hello, result) - - def test_newhadoop(self): - basepath = self.tempdir.name - ints = sorted(self.sc.newAPIHadoopFile( - basepath + "/sftestdata/sfint/", - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text").collect()) - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] - self.assertEqual(ints, ei) - - hellopath = os.path.join(SPARK_HOME, "python/test_support/hello/hello.txt") - newconf = {"mapreduce.input.fileinputformat.inputdir": hellopath} - hello = self.sc.newAPIHadoopRDD("org.apache.hadoop.mapreduce.lib.input.TextInputFormat", - "org.apache.hadoop.io.LongWritable", - "org.apache.hadoop.io.Text", - conf=newconf).collect() - result = [(0, u'Hello World!')] - self.assertEqual(hello, result) - - def test_newolderror(self): - basepath = self.tempdir.name - self.assertRaises(Exception, lambda: self.sc.hadoopFile( - basepath + "/sftestdata/sfint/", - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text")) - - self.assertRaises(Exception, lambda: self.sc.newAPIHadoopFile( - basepath + "/sftestdata/sfint/", - "org.apache.hadoop.mapred.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text")) - - def 
test_bad_inputs(self): - basepath = self.tempdir.name - self.assertRaises(Exception, lambda: self.sc.sequenceFile( - basepath + "/sftestdata/sfint/", - "org.apache.hadoop.io.NotValidWritable", - "org.apache.hadoop.io.Text")) - self.assertRaises(Exception, lambda: self.sc.hadoopFile( - basepath + "/sftestdata/sfint/", - "org.apache.hadoop.mapred.NotValidInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text")) - self.assertRaises(Exception, lambda: self.sc.newAPIHadoopFile( - basepath + "/sftestdata/sfint/", - "org.apache.hadoop.mapreduce.lib.input.NotValidInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text")) - - def test_converters(self): - # use of custom converters - basepath = self.tempdir.name - maps = sorted(self.sc.sequenceFile( - basepath + "/sftestdata/sfmap/", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.MapWritable", - keyConverter="org.apache.spark.api.python.TestInputKeyConverter", - valueConverter="org.apache.spark.api.python.TestInputValueConverter").collect()) - em = [(u'\x01', []), - (u'\x01', [3.0]), - (u'\x02', [1.0]), - (u'\x02', [1.0]), - (u'\x03', [2.0])] - self.assertEqual(maps, em) - - def test_binary_files(self): - path = os.path.join(self.tempdir.name, "binaryfiles") - os.mkdir(path) - data = b"short binary data" - with open(os.path.join(path, "part-0000"), 'wb') as f: - f.write(data) - [(p, d)] = self.sc.binaryFiles(path).collect() - self.assertTrue(p.endswith("part-0000")) - self.assertEqual(d, data) - - def test_binary_records(self): - path = os.path.join(self.tempdir.name, "binaryrecords") - os.mkdir(path) - with open(os.path.join(path, "part-0000"), 'w') as f: - for i in range(100): - f.write('%04d' % i) - result = self.sc.binaryRecords(path, 4).map(int).collect() - self.assertEqual(list(range(100)), result) - - -class OutputFormatTests(ReusedPySparkTestCase): - - def setUp(self): - self.tempdir = tempfile.NamedTemporaryFile(delete=False) - os.unlink(self.tempdir.name) - - def tearDown(self): - shutil.rmtree(self.tempdir.name, ignore_errors=True) - - @unittest.skipIf(sys.version >= "3", "serialize array of byte") - def test_sequencefiles(self): - basepath = self.tempdir.name - ei = [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')] - self.sc.parallelize(ei).saveAsSequenceFile(basepath + "/sfint/") - ints = sorted(self.sc.sequenceFile(basepath + "/sfint/").collect()) - self.assertEqual(ints, ei) - - ed = [(1.0, u'aa'), (1.0, u'aa'), (2.0, u'aa'), (2.0, u'bb'), (2.0, u'bb'), (3.0, u'cc')] - self.sc.parallelize(ed).saveAsSequenceFile(basepath + "/sfdouble/") - doubles = sorted(self.sc.sequenceFile(basepath + "/sfdouble/").collect()) - self.assertEqual(doubles, ed) - - ebs = [(1, bytearray(b'\x00\x07spam\x08')), (2, bytearray(b'\x00\x07spam\x08'))] - self.sc.parallelize(ebs).saveAsSequenceFile(basepath + "/sfbytes/") - bytes = sorted(self.sc.sequenceFile(basepath + "/sfbytes/").collect()) - self.assertEqual(bytes, ebs) - - et = [(u'1', u'aa'), - (u'2', u'bb'), - (u'3', u'cc')] - self.sc.parallelize(et).saveAsSequenceFile(basepath + "/sftext/") - text = sorted(self.sc.sequenceFile(basepath + "/sftext/").collect()) - self.assertEqual(text, et) - - eb = [(1, False), (1, True), (2, False), (2, False), (2, True), (3, True)] - self.sc.parallelize(eb).saveAsSequenceFile(basepath + "/sfbool/") - bools = sorted(self.sc.sequenceFile(basepath + "/sfbool/").collect()) - self.assertEqual(bools, eb) - - en = [(1, None), (1, None), (2, None), (2, None), (2, None), (3, None)] - 
self.sc.parallelize(en).saveAsSequenceFile(basepath + "/sfnull/") - nulls = sorted(self.sc.sequenceFile(basepath + "/sfnull/").collect()) - self.assertEqual(nulls, en) - - em = [(1, {}), - (1, {3.0: u'bb'}), - (2, {1.0: u'aa'}), - (2, {1.0: u'cc'}), - (3, {2.0: u'dd'})] - self.sc.parallelize(em).saveAsSequenceFile(basepath + "/sfmap/") - maps = self.sc.sequenceFile(basepath + "/sfmap/").collect() - for v in maps: - self.assertTrue(v, em) - - def test_oldhadoop(self): - basepath = self.tempdir.name - dict_data = [(1, {}), - (1, {"row1": 1.0}), - (2, {"row2": 2.0})] - self.sc.parallelize(dict_data).saveAsHadoopFile( - basepath + "/oldhadoop/", - "org.apache.hadoop.mapred.SequenceFileOutputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.MapWritable") - result = self.sc.hadoopFile( - basepath + "/oldhadoop/", - "org.apache.hadoop.mapred.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.MapWritable").collect() - for v in result: - self.assertTrue(v, dict_data) - - conf = { - "mapred.output.format.class": "org.apache.hadoop.mapred.SequenceFileOutputFormat", - "mapreduce.job.output.key.class": "org.apache.hadoop.io.IntWritable", - "mapreduce.job.output.value.class": "org.apache.hadoop.io.MapWritable", - "mapreduce.output.fileoutputformat.outputdir": basepath + "/olddataset/" - } - self.sc.parallelize(dict_data).saveAsHadoopDataset(conf) - input_conf = {"mapreduce.input.fileinputformat.inputdir": basepath + "/olddataset/"} - result = self.sc.hadoopRDD( - "org.apache.hadoop.mapred.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.MapWritable", - conf=input_conf).collect() - for v in result: - self.assertTrue(v, dict_data) - - def test_newhadoop(self): - basepath = self.tempdir.name - data = [(1, ""), - (1, "a"), - (2, "bcdf")] - self.sc.parallelize(data).saveAsNewAPIHadoopFile( - basepath + "/newhadoop/", - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text") - result = sorted(self.sc.newAPIHadoopFile( - basepath + "/newhadoop/", - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text").collect()) - self.assertEqual(result, data) - - conf = { - "mapreduce.job.outputformat.class": - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "mapreduce.job.output.key.class": "org.apache.hadoop.io.IntWritable", - "mapreduce.job.output.value.class": "org.apache.hadoop.io.Text", - "mapreduce.output.fileoutputformat.outputdir": basepath + "/newdataset/" - } - self.sc.parallelize(data).saveAsNewAPIHadoopDataset(conf) - input_conf = {"mapreduce.input.fileinputformat.inputdir": basepath + "/newdataset/"} - new_dataset = sorted(self.sc.newAPIHadoopRDD( - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.hadoop.io.Text", - conf=input_conf).collect()) - self.assertEqual(new_dataset, data) - - @unittest.skipIf(sys.version >= "3", "serialize of array") - def test_newhadoop_with_array(self): - basepath = self.tempdir.name - # use custom ArrayWritable types and converters to handle arrays - array_data = [(1, array('d')), - (1, array('d', [1.0, 2.0, 3.0])), - (2, array('d', [3.0, 4.0, 5.0]))] - self.sc.parallelize(array_data).saveAsNewAPIHadoopFile( - basepath + "/newhadoop/", - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "org.apache.hadoop.io.IntWritable", - 
"org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter") - result = sorted(self.sc.newAPIHadoopFile( - basepath + "/newhadoop/", - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter").collect()) - self.assertEqual(result, array_data) - - conf = { - "mapreduce.job.outputformat.class": - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "mapreduce.job.output.key.class": "org.apache.hadoop.io.IntWritable", - "mapreduce.job.output.value.class": "org.apache.spark.api.python.DoubleArrayWritable", - "mapreduce.output.fileoutputformat.outputdir": basepath + "/newdataset/" - } - self.sc.parallelize(array_data).saveAsNewAPIHadoopDataset( - conf, - valueConverter="org.apache.spark.api.python.DoubleArrayToWritableConverter") - input_conf = {"mapreduce.input.fileinputformat.inputdir": basepath + "/newdataset/"} - new_dataset = sorted(self.sc.newAPIHadoopRDD( - "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", - "org.apache.hadoop.io.IntWritable", - "org.apache.spark.api.python.DoubleArrayWritable", - valueConverter="org.apache.spark.api.python.WritableToDoubleArrayConverter", - conf=input_conf).collect()) - self.assertEqual(new_dataset, array_data) - - def test_newolderror(self): - basepath = self.tempdir.name - rdd = self.sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x)) - self.assertRaises(Exception, lambda: rdd.saveAsHadoopFile( - basepath + "/newolderror/saveAsHadoopFile/", - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")) - self.assertRaises(Exception, lambda: rdd.saveAsNewAPIHadoopFile( - basepath + "/newolderror/saveAsNewAPIHadoopFile/", - "org.apache.hadoop.mapred.SequenceFileOutputFormat")) - - def test_bad_inputs(self): - basepath = self.tempdir.name - rdd = self.sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x)) - self.assertRaises(Exception, lambda: rdd.saveAsHadoopFile( - basepath + "/badinputs/saveAsHadoopFile/", - "org.apache.hadoop.mapred.NotValidOutputFormat")) - self.assertRaises(Exception, lambda: rdd.saveAsNewAPIHadoopFile( - basepath + "/badinputs/saveAsNewAPIHadoopFile/", - "org.apache.hadoop.mapreduce.lib.output.NotValidOutputFormat")) - - def test_converters(self): - # use of custom converters - basepath = self.tempdir.name - data = [(1, {3.0: u'bb'}), - (2, {1.0: u'aa'}), - (3, {2.0: u'dd'})] - self.sc.parallelize(data).saveAsNewAPIHadoopFile( - basepath + "/converters/", - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - keyConverter="org.apache.spark.api.python.TestOutputKeyConverter", - valueConverter="org.apache.spark.api.python.TestOutputValueConverter") - converted = sorted(self.sc.sequenceFile(basepath + "/converters/").collect()) - expected = [(u'1', 3.0), - (u'2', 1.0), - (u'3', 2.0)] - self.assertEqual(converted, expected) - - def test_reserialization(self): - basepath = self.tempdir.name - x = range(1, 5) - y = range(1001, 1005) - data = list(zip(x, y)) - rdd = self.sc.parallelize(x).zip(self.sc.parallelize(y)) - rdd.saveAsSequenceFile(basepath + "/reserialize/sequence") - result1 = sorted(self.sc.sequenceFile(basepath + "/reserialize/sequence").collect()) - self.assertEqual(result1, data) - - rdd.saveAsHadoopFile( - basepath + "/reserialize/hadoop", - "org.apache.hadoop.mapred.SequenceFileOutputFormat") - result2 = 
sorted(self.sc.sequenceFile(basepath + "/reserialize/hadoop").collect()) - self.assertEqual(result2, data) - - rdd.saveAsNewAPIHadoopFile( - basepath + "/reserialize/newhadoop", - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat") - result3 = sorted(self.sc.sequenceFile(basepath + "/reserialize/newhadoop").collect()) - self.assertEqual(result3, data) - - conf4 = { - "mapred.output.format.class": "org.apache.hadoop.mapred.SequenceFileOutputFormat", - "mapreduce.job.output.key.class": "org.apache.hadoop.io.IntWritable", - "mapreduce.job.output.value.class": "org.apache.hadoop.io.IntWritable", - "mapreduce.output.fileoutputformat.outputdir": basepath + "/reserialize/dataset"} - rdd.saveAsHadoopDataset(conf4) - result4 = sorted(self.sc.sequenceFile(basepath + "/reserialize/dataset").collect()) - self.assertEqual(result4, data) - - conf5 = {"mapreduce.job.outputformat.class": - "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", - "mapreduce.job.output.key.class": "org.apache.hadoop.io.IntWritable", - "mapreduce.job.output.value.class": "org.apache.hadoop.io.IntWritable", - "mapreduce.output.fileoutputformat.outputdir": basepath + "/reserialize/newdataset" - } - rdd.saveAsNewAPIHadoopDataset(conf5) - result5 = sorted(self.sc.sequenceFile(basepath + "/reserialize/newdataset").collect()) - self.assertEqual(result5, data) - - def test_malformed_RDD(self): - basepath = self.tempdir.name - # non-batch-serialized RDD[[(K, V)]] should be rejected - data = [[(1, "a")], [(2, "aa")], [(3, "aaa")]] - rdd = self.sc.parallelize(data, len(data)) - self.assertRaises(Exception, lambda: rdd.saveAsSequenceFile( - basepath + "/malformed/sequence")) - - -class DaemonTests(unittest.TestCase): - def connect(self, port): - from socket import socket, AF_INET, SOCK_STREAM - sock = socket(AF_INET, SOCK_STREAM) - sock.connect(('127.0.0.1', port)) - # send a split index of -1 to shutdown the worker - sock.send(b"\xFF\xFF\xFF\xFF") - sock.close() - return True - - def do_termination_test(self, terminator): - from subprocess import Popen, PIPE - from errno import ECONNREFUSED - - # start daemon - daemon_path = os.path.join(os.path.dirname(__file__), "daemon.py") - python_exec = sys.executable or os.environ.get("PYSPARK_PYTHON") - daemon = Popen([python_exec, daemon_path], stdin=PIPE, stdout=PIPE) - - # read the port number - port = read_int(daemon.stdout) - - # daemon should accept connections - self.assertTrue(self.connect(port)) - - # request shutdown - terminator(daemon) - time.sleep(1) - - # daemon should no longer accept connections - try: - self.connect(port) - except EnvironmentError as exception: - self.assertEqual(exception.errno, ECONNREFUSED) - else: - self.fail("Expected EnvironmentError to be raised") - - def test_termination_stdin(self): - """Ensure that daemon and workers terminate when stdin is closed.""" - self.do_termination_test(lambda daemon: daemon.stdin.close()) - - def test_termination_sigterm(self): - """Ensure that daemon and workers terminate on SIGTERM.""" - from signal import SIGTERM - self.do_termination_test(lambda daemon: os.kill(daemon.pid, SIGTERM)) - - -class WorkerTests(ReusedPySparkTestCase): - def test_cancel_task(self): - temp = tempfile.NamedTemporaryFile(delete=True) - temp.close() - path = temp.name - - def sleep(x): - import os - import time - with open(path, 'w') as f: - f.write("%d %d" % (os.getppid(), os.getpid())) - time.sleep(100) - - # start job in background thread - def run(): - try: - self.sc.parallelize(range(1), 1).foreach(sleep) - except 
Exception: - pass - import threading - t = threading.Thread(target=run) - t.daemon = True - t.start() - - daemon_pid, worker_pid = 0, 0 - while True: - if os.path.exists(path): - with open(path) as f: - data = f.read().split(' ') - daemon_pid, worker_pid = map(int, data) - break - time.sleep(0.1) - - # cancel jobs - self.sc.cancelAllJobs() - t.join() - - for i in range(50): - try: - os.kill(worker_pid, 0) - time.sleep(0.1) - except OSError: - break # worker was killed - else: - self.fail("worker has not been killed after 5 seconds") - - try: - os.kill(daemon_pid, 0) - except OSError: - self.fail("daemon had been killed") - - # run a normal job - rdd = self.sc.parallelize(xrange(100), 1) - self.assertEqual(100, rdd.map(str).count()) - - def test_after_exception(self): - def raise_exception(_): - raise Exception() - rdd = self.sc.parallelize(xrange(100), 1) - with QuietTest(self.sc): - self.assertRaises(Exception, lambda: rdd.foreach(raise_exception)) - self.assertEqual(100, rdd.map(str).count()) - - def test_after_jvm_exception(self): - tempFile = tempfile.NamedTemporaryFile(delete=False) - tempFile.write(b"Hello World!") - tempFile.close() - data = self.sc.textFile(tempFile.name, 1) - filtered_data = data.filter(lambda x: True) - self.assertEqual(1, filtered_data.count()) - os.unlink(tempFile.name) - with QuietTest(self.sc): - self.assertRaises(Exception, lambda: filtered_data.count()) - - rdd = self.sc.parallelize(xrange(100), 1) - self.assertEqual(100, rdd.map(str).count()) - - def test_accumulator_when_reuse_worker(self): - from pyspark.accumulators import INT_ACCUMULATOR_PARAM - acc1 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM) - self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc1.add(x)) - self.assertEqual(sum(range(100)), acc1.value) - - acc2 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM) - self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc2.add(x)) - self.assertEqual(sum(range(100)), acc2.value) - self.assertEqual(sum(range(100)), acc1.value) - - def test_reuse_worker_after_take(self): - rdd = self.sc.parallelize(xrange(100000), 1) - self.assertEqual(0, rdd.first()) - - def count(): - try: - rdd.count() - except Exception: - pass - - t = threading.Thread(target=count) - t.daemon = True - t.start() - t.join(5) - self.assertTrue(not t.isAlive()) - self.assertEqual(100000, rdd.count()) - - def test_with_different_versions_of_python(self): - rdd = self.sc.parallelize(range(10)) - rdd.count() - version = self.sc.pythonVer - self.sc.pythonVer = "2.0" - try: - with QuietTest(self.sc): - self.assertRaises(Py4JJavaError, lambda: rdd.count()) - finally: - self.sc.pythonVer = version - - -class SparkSubmitTests(unittest.TestCase): - - def setUp(self): - self.programDir = tempfile.mkdtemp() - tmp_dir = tempfile.gettempdir() - self.sparkSubmit = [ - os.path.join(os.environ.get("SPARK_HOME"), "bin", "spark-submit"), - "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), - "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), - ] - - def tearDown(self): - shutil.rmtree(self.programDir) - - def createTempFile(self, name, content, dir=None): - """ - Create a temp file with the given name and content and return its path. - Strips leading spaces from content up to the first '|' in each line. 
- """ - pattern = re.compile(r'^ *\|', re.MULTILINE) - content = re.sub(pattern, '', content.strip()) - if dir is None: - path = os.path.join(self.programDir, name) - else: - os.makedirs(os.path.join(self.programDir, dir)) - path = os.path.join(self.programDir, dir, name) - with open(path, "w") as f: - f.write(content) - return path - - def createFileInZip(self, name, content, ext=".zip", dir=None, zip_name=None): - """ - Create a zip archive containing a file with the given content and return its path. - Strips leading spaces from content up to the first '|' in each line. - """ - pattern = re.compile(r'^ *\|', re.MULTILINE) - content = re.sub(pattern, '', content.strip()) - if dir is None: - path = os.path.join(self.programDir, name + ext) - else: - path = os.path.join(self.programDir, dir, zip_name + ext) - zip = zipfile.ZipFile(path, 'w') - zip.writestr(name, content) - zip.close() - return path - - def create_spark_package(self, artifact_name): - group_id, artifact_id, version = artifact_name.split(":") - self.createTempFile("%s-%s.pom" % (artifact_id, version), (""" - | - | - | 4.0.0 - | %s - | %s - | %s - | - """ % (group_id, artifact_id, version)).lstrip(), - os.path.join(group_id, artifact_id, version)) - self.createFileInZip("%s.py" % artifact_id, """ - |def myfunc(x): - | return x + 1 - """, ".jar", os.path.join(group_id, artifact_id, version), - "%s-%s" % (artifact_id, version)) - - def test_single_script(self): - """Submit and test a single script file""" - script = self.createTempFile("test.py", """ - |from pyspark import SparkContext - | - |sc = SparkContext() - |print(sc.parallelize([1, 2, 3]).map(lambda x: x * 2).collect()) - """) - proc = subprocess.Popen(self.sparkSubmit + [script], stdout=subprocess.PIPE) - out, err = proc.communicate() - self.assertEqual(0, proc.returncode) - self.assertIn("[2, 4, 6]", out.decode('utf-8')) - - def test_script_with_local_functions(self): - """Submit and test a single script file calling a global function""" - script = self.createTempFile("test.py", """ - |from pyspark import SparkContext - | - |def foo(x): - | return x * 3 - | - |sc = SparkContext() - |print(sc.parallelize([1, 2, 3]).map(foo).collect()) - """) - proc = subprocess.Popen(self.sparkSubmit + [script], stdout=subprocess.PIPE) - out, err = proc.communicate() - self.assertEqual(0, proc.returncode) - self.assertIn("[3, 6, 9]", out.decode('utf-8')) - - def test_module_dependency(self): - """Submit and test a script with a dependency on another module""" - script = self.createTempFile("test.py", """ - |from pyspark import SparkContext - |from mylib import myfunc - | - |sc = SparkContext() - |print(sc.parallelize([1, 2, 3]).map(myfunc).collect()) - """) - zip = self.createFileInZip("mylib.py", """ - |def myfunc(x): - | return x + 1 - """) - proc = subprocess.Popen(self.sparkSubmit + ["--py-files", zip, script], - stdout=subprocess.PIPE) - out, err = proc.communicate() - self.assertEqual(0, proc.returncode) - self.assertIn("[2, 3, 4]", out.decode('utf-8')) - - def test_module_dependency_on_cluster(self): - """Submit and test a script with a dependency on another module on a cluster""" - script = self.createTempFile("test.py", """ - |from pyspark import SparkContext - |from mylib import myfunc - | - |sc = SparkContext() - |print(sc.parallelize([1, 2, 3]).map(myfunc).collect()) - """) - zip = self.createFileInZip("mylib.py", """ - |def myfunc(x): - | return x + 1 - """) - proc = subprocess.Popen(self.sparkSubmit + ["--py-files", zip, "--master", - "local-cluster[1,1,1024]", script], - 
stdout=subprocess.PIPE) - out, err = proc.communicate() - self.assertEqual(0, proc.returncode) - self.assertIn("[2, 3, 4]", out.decode('utf-8')) - - def test_package_dependency(self): - """Submit and test a script with a dependency on a Spark Package""" - script = self.createTempFile("test.py", """ - |from pyspark import SparkContext - |from mylib import myfunc - | - |sc = SparkContext() - |print(sc.parallelize([1, 2, 3]).map(myfunc).collect()) - """) - self.create_spark_package("a:mylib:0.1") - proc = subprocess.Popen( - self.sparkSubmit + ["--packages", "a:mylib:0.1", "--repositories", - "file:" + self.programDir, script], - stdout=subprocess.PIPE) - out, err = proc.communicate() - self.assertEqual(0, proc.returncode) - self.assertIn("[2, 3, 4]", out.decode('utf-8')) - - def test_package_dependency_on_cluster(self): - """Submit and test a script with a dependency on a Spark Package on a cluster""" - script = self.createTempFile("test.py", """ - |from pyspark import SparkContext - |from mylib import myfunc - | - |sc = SparkContext() - |print(sc.parallelize([1, 2, 3]).map(myfunc).collect()) - """) - self.create_spark_package("a:mylib:0.1") - proc = subprocess.Popen( - self.sparkSubmit + ["--packages", "a:mylib:0.1", "--repositories", - "file:" + self.programDir, "--master", "local-cluster[1,1,1024]", - script], - stdout=subprocess.PIPE) - out, err = proc.communicate() - self.assertEqual(0, proc.returncode) - self.assertIn("[2, 3, 4]", out.decode('utf-8')) - - def test_single_script_on_cluster(self): - """Submit and test a single script on a cluster""" - script = self.createTempFile("test.py", """ - |from pyspark import SparkContext - | - |def foo(x): - | return x * 2 - | - |sc = SparkContext() - |print(sc.parallelize([1, 2, 3]).map(foo).collect()) - """) - # this will fail if you have different spark.executor.memory - # in conf/spark-defaults.conf - proc = subprocess.Popen( - self.sparkSubmit + ["--master", "local-cluster[1,1,1024]", script], - stdout=subprocess.PIPE) - out, err = proc.communicate() - self.assertEqual(0, proc.returncode) - self.assertIn("[2, 4, 6]", out.decode('utf-8')) - - def test_user_configuration(self): - """Make sure user configuration is respected (SPARK-19307)""" - script = self.createTempFile("test.py", """ - |from pyspark import SparkConf, SparkContext - | - |conf = SparkConf().set("spark.test_config", "1") - |sc = SparkContext(conf = conf) - |try: - | if sc._conf.get("spark.test_config") != "1": - | raise Exception("Cannot find spark.test_config in SparkContext's conf.") - |finally: - | sc.stop() - """) - proc = subprocess.Popen( - self.sparkSubmit + ["--master", "local", script], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - out, err = proc.communicate() - self.assertEqual(0, proc.returncode, msg="Process failed with error:\n {0}".format(out)) - - -class ContextTests(unittest.TestCase): - - def test_failed_sparkcontext_creation(self): - # Regression test for SPARK-1550 - self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) - - def test_get_or_create(self): - with SparkContext.getOrCreate() as sc: - self.assertTrue(SparkContext.getOrCreate() is sc) - - def test_parallelize_eager_cleanup(self): - with SparkContext() as sc: - temp_files = os.listdir(sc._temp_dir) - rdd = sc.parallelize([0, 1, 2]) - post_parallalize_temp_files = os.listdir(sc._temp_dir) - self.assertEqual(temp_files, post_parallalize_temp_files) - - def test_set_conf(self): - # This is for an internal use case. 
When there is an existing SparkContext, - # SparkSession's builder needs to set configs into SparkContext's conf. - sc = SparkContext() - sc._conf.set("spark.test.SPARK16224", "SPARK16224") - self.assertEqual(sc._jsc.sc().conf().get("spark.test.SPARK16224"), "SPARK16224") - sc.stop() - - def test_stop(self): - sc = SparkContext() - self.assertNotEqual(SparkContext._active_spark_context, None) - sc.stop() - self.assertEqual(SparkContext._active_spark_context, None) - - def test_with(self): - with SparkContext() as sc: - self.assertNotEqual(SparkContext._active_spark_context, None) - self.assertEqual(SparkContext._active_spark_context, None) - - def test_with_exception(self): - try: - with SparkContext() as sc: - self.assertNotEqual(SparkContext._active_spark_context, None) - raise Exception() - except: - pass - self.assertEqual(SparkContext._active_spark_context, None) - - def test_with_stop(self): - with SparkContext() as sc: - self.assertNotEqual(SparkContext._active_spark_context, None) - sc.stop() - self.assertEqual(SparkContext._active_spark_context, None) - - def test_progress_api(self): - with SparkContext() as sc: - sc.setJobGroup('test_progress_api', '', True) - rdd = sc.parallelize(range(10)).map(lambda x: time.sleep(100)) - - def run(): - try: - rdd.count() - except Exception: - pass - t = threading.Thread(target=run) - t.daemon = True - t.start() - # wait for scheduler to start - time.sleep(1) - - tracker = sc.statusTracker() - jobIds = tracker.getJobIdsForGroup('test_progress_api') - self.assertEqual(1, len(jobIds)) - job = tracker.getJobInfo(jobIds[0]) - self.assertEqual(1, len(job.stageIds)) - stage = tracker.getStageInfo(job.stageIds[0]) - self.assertEqual(rdd.getNumPartitions(), stage.numTasks) - - sc.cancelAllJobs() - t.join() - # wait for event listener to update the status - time.sleep(1) - - job = tracker.getJobInfo(jobIds[0]) - self.assertEqual('FAILED', job.status) - self.assertEqual([], tracker.getActiveJobsIds()) - self.assertEqual([], tracker.getActiveStageIds()) - - sc.stop() - - def test_startTime(self): - with SparkContext() as sc: - self.assertGreater(sc.startTime, 0) - - def test_forbid_insecure_gateway(self): - # By default, we fail immediately if you try to create a SparkContext - # with an insecure gateway - gateway = _launch_gateway(insecure=True) - log4j = gateway.jvm.org.apache.log4j - old_level = log4j.LogManager.getRootLogger().getLevel() - try: - log4j.LogManager.getRootLogger().setLevel(log4j.Level.FATAL) - with self.assertRaises(Exception) as context: - SparkContext(gateway=gateway) - self.assertIn("insecure Py4j gateway", str(context.exception)) - self.assertIn("PYSPARK_ALLOW_INSECURE_GATEWAY", str(context.exception)) - self.assertIn("removed in Spark 3.0", str(context.exception)) - finally: - log4j.LogManager.getRootLogger().setLevel(old_level) - - def test_allow_insecure_gateway_with_conf(self): - with SparkContext._lock: - SparkContext._gateway = None - SparkContext._jvm = None - gateway = _launch_gateway(insecure=True) - try: - os.environ["PYSPARK_ALLOW_INSECURE_GATEWAY"] = "1" - with SparkContext(gateway=gateway) as sc: - a = sc.accumulator(1) - rdd = sc.parallelize([1, 2, 3]) - rdd.foreach(lambda x: a.add(x)) - self.assertEqual(7, a.value) - finally: - os.environ.pop("PYSPARK_ALLOW_INSECURE_GATEWAY", None) - - -class ConfTests(unittest.TestCase): - def test_memory_conf(self): - memoryList = ["1T", "1G", "1M", "1024K"] - for memory in memoryList: - sc = SparkContext(conf=SparkConf().set("spark.python.worker.memory", memory)) - l = 
list(range(1024)) - random.shuffle(l) - rdd = sc.parallelize(l, 4) - self.assertEqual(sorted(l), rdd.sortBy(lambda x: x).collect()) - sc.stop() - - -class KeywordOnlyTests(unittest.TestCase): - class Wrapped(object): - @keyword_only - def set(self, x=None, y=None): - if "x" in self._input_kwargs: - self._x = self._input_kwargs["x"] - if "y" in self._input_kwargs: - self._y = self._input_kwargs["y"] - return x, y - - def test_keywords(self): - w = self.Wrapped() - x, y = w.set(y=1) - self.assertEqual(y, 1) - self.assertEqual(y, w._y) - self.assertIsNone(x) - self.assertFalse(hasattr(w, "_x")) - - def test_non_keywords(self): - w = self.Wrapped() - self.assertRaises(TypeError, lambda: w.set(0, y=1)) - - def test_kwarg_ownership(self): - # test _input_kwargs is owned by each class instance and not a shared static variable - class Setter(object): - @keyword_only - def set(self, x=None, other=None, other_x=None): - if "other" in self._input_kwargs: - self._input_kwargs["other"].set(x=self._input_kwargs["other_x"]) - self._x = self._input_kwargs["x"] - - a = Setter() - b = Setter() - a.set(x=1, other=b, other_x=2) - self.assertEqual(a._x, 1) - self.assertEqual(b._x, 2) - - -class UtilTests(PySparkTestCase): - def test_py4j_exception_message(self): - from pyspark.util import _exception_message - - with self.assertRaises(Py4JJavaError) as context: - # This attempts java.lang.String(null) which throws an NPE. - self.sc._jvm.java.lang.String(None) - - self.assertTrue('NullPointerException' in _exception_message(context.exception)) - - def test_parsing_version_string(self): - from pyspark.util import VersionUtils - self.assertRaises(ValueError, lambda: VersionUtils.majorMinorVersion("abced")) - - -@unittest.skipIf(not _have_scipy, "SciPy not installed") -class SciPyTests(PySparkTestCase): - - """General PySpark tests that depend on scipy """ - - def test_serialize(self): - from scipy.special import gammaln - x = range(1, 5) - expected = list(map(gammaln, x)) - observed = self.sc.parallelize(x).map(gammaln).collect() - self.assertEqual(expected, observed) - - -@unittest.skipIf(not _have_numpy, "NumPy not installed") -class NumPyTests(PySparkTestCase): - - """General PySpark tests that depend on numpy """ - - def test_statcounter_array(self): - x = self.sc.parallelize([np.array([1.0, 1.0]), np.array([2.0, 2.0]), np.array([3.0, 3.0])]) - s = x.stats() - self.assertSequenceEqual([2.0, 2.0], s.mean().tolist()) - self.assertSequenceEqual([1.0, 1.0], s.min().tolist()) - self.assertSequenceEqual([3.0, 3.0], s.max().tolist()) - self.assertSequenceEqual([1.0, 1.0], s.sampleStdev().tolist()) - - stats_dict = s.asDict() - self.assertEqual(3, stats_dict['count']) - self.assertSequenceEqual([2.0, 2.0], stats_dict['mean'].tolist()) - self.assertSequenceEqual([1.0, 1.0], stats_dict['min'].tolist()) - self.assertSequenceEqual([3.0, 3.0], stats_dict['max'].tolist()) - self.assertSequenceEqual([6.0, 6.0], stats_dict['sum'].tolist()) - self.assertSequenceEqual([1.0, 1.0], stats_dict['stdev'].tolist()) - self.assertSequenceEqual([1.0, 1.0], stats_dict['variance'].tolist()) - - stats_sample_dict = s.asDict(sample=True) - self.assertEqual(3, stats_dict['count']) - self.assertSequenceEqual([2.0, 2.0], stats_sample_dict['mean'].tolist()) - self.assertSequenceEqual([1.0, 1.0], stats_sample_dict['min'].tolist()) - self.assertSequenceEqual([3.0, 3.0], stats_sample_dict['max'].tolist()) - self.assertSequenceEqual([6.0, 6.0], stats_sample_dict['sum'].tolist()) - self.assertSequenceEqual( - [0.816496580927726, 0.816496580927726], 
stats_sample_dict['stdev'].tolist()) - self.assertSequenceEqual( - [0.6666666666666666, 0.6666666666666666], stats_sample_dict['variance'].tolist()) - - -if __name__ == "__main__": - from pyspark.tests import * - if xmlrunner: - unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2) - else: - unittest.main(verbosity=2) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/traceback_utils.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/traceback_utils.py deleted file mode 100644 index bb8646d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/traceback_utils.py +++ /dev/null @@ -1,78 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from collections import namedtuple -import os -import traceback - - -CallSite = namedtuple("CallSite", "function file linenum") - - -def first_spark_call(): - """ - Return a CallSite representing the first Spark call in the current call stack. - """ - tb = traceback.extract_stack() - if len(tb) == 0: - return None - file, line, module, what = tb[len(tb) - 1] - sparkpath = os.path.dirname(file) - first_spark_frame = len(tb) - 1 - for i in range(0, len(tb)): - file, line, fun, what = tb[i] - if file.startswith(sparkpath): - first_spark_frame = i - break - if first_spark_frame == 0: - file, line, fun, what = tb[0] - return CallSite(function=fun, file=file, linenum=line) - sfile, sline, sfun, swhat = tb[first_spark_frame] - ufile, uline, ufun, uwhat = tb[first_spark_frame - 1] - return CallSite(function=sfun, file=ufile, linenum=uline) - - -class SCCallSiteSync(object): - """ - Helper for setting the spark context call site. - - Example usage: - from pyspark.context import SCCallSiteSync - with SCCallSiteSync() as css: - - """ - - _spark_stack_depth = 0 - - def __init__(self, sc): - call_site = first_spark_call() - if call_site is not None: - self._call_site = "%s at %s:%s" % ( - call_site.function, call_site.file, call_site.linenum) - else: - self._call_site = "Error! Could not extract traceback info" - self._context = sc - - def __enter__(self): - if SCCallSiteSync._spark_stack_depth == 0: - self._context._jsc.setCallSite(self._call_site) - SCCallSiteSync._spark_stack_depth += 1 - - def __exit__(self, type, value, tb): - SCCallSiteSync._spark_stack_depth -= 1 - if SCCallSiteSync._spark_stack_depth == 0: - self._context._jsc.setCallSite(None) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/util.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/util.py deleted file mode 100644 index f906f49..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/util.py +++ /dev/null @@ -1,113 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import re -import sys -import inspect -from py4j.protocol import Py4JJavaError - -__all__ = [] - - -def _exception_message(excp): - """Return the message from an exception as either a str or unicode object. Supports both - Python 2 and Python 3. - - >>> msg = "Exception message" - >>> excp = Exception(msg) - >>> msg == _exception_message(excp) - True - - >>> msg = u"unicöde" - >>> excp = Exception(msg) - >>> msg == _exception_message(excp) - True - """ - if isinstance(excp, Py4JJavaError): - # 'Py4JJavaError' doesn't contain the stack trace available on the Java side in 'message' - # attribute in Python 2. We should call 'str' function on this exception in general but - # 'Py4JJavaError' has an issue about addressing non-ascii strings. So, here we work - # around by the direct call, '__str__()'. Please see SPARK-23517. - return excp.__str__() - if hasattr(excp, "message"): - return excp.message - return str(excp) - - -def _get_argspec(f): - """ - Get argspec of a function. Supports both Python 2 and Python 3. - """ - if sys.version_info[0] < 3: - argspec = inspect.getargspec(f) - else: - # `getargspec` is deprecated since python3.0 (incompatible with function annotations). - # See SPARK-23569. - argspec = inspect.getfullargspec(f) - return argspec - - -class VersionUtils(object): - """ - Provides utility method to determine Spark versions with given input string. - """ - @staticmethod - def majorMinorVersion(sparkVersion): - """ - Given a Spark version string, return the (major version number, minor version number). - E.g., for 2.0.1-SNAPSHOT, return (2, 0). 
- - >>> sparkVersion = "2.4.0" - >>> VersionUtils.majorMinorVersion(sparkVersion) - (2, 4) - >>> sparkVersion = "2.3.0-SNAPSHOT" - >>> VersionUtils.majorMinorVersion(sparkVersion) - (2, 3) - - """ - m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion) - if m is not None: - return (int(m.group(1)), int(m.group(2))) - else: - raise ValueError("Spark tried to parse '%s' as a Spark" % sparkVersion + - " version string, but it could not find the major and minor" + - " version numbers.") - - -def fail_on_stopiteration(f): - """ - Wraps the input function to fail on 'StopIteration' by raising a 'RuntimeError' - prevents silent loss of data when 'f' is used in a for loop in Spark code - """ - def wrapper(*args, **kwargs): - try: - return f(*args, **kwargs) - except StopIteration as exc: - raise RuntimeError( - "Caught StopIteration thrown from user's code; failing the task", - exc - ) - - return wrapper - - -if __name__ == "__main__": - import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/version.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/version.py deleted file mode 100644 index 45ddab5..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__='2.4.3' diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/worker.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/worker.py deleted file mode 100644 index 953b468..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/pyspark/worker.py +++ /dev/null @@ -1,415 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Worker that receives input from Piped RDD. -""" -from __future__ import print_function -import os -import sys -import time -# 'resource' is a Unix specific module. 
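The worker soft-imports resource because that module only exists on Unix; further down in this same file, the resulting flag gates an RLIMIT_AS cap driven by PYSPARK_EXECUTOR_MEMORY_MB. A compact Python 3 sketch of that optional-import-plus-rlimit pattern, written independently of PySpark (the function name is illustrative, not part of the deleted code):

    import sys

    # resource is Unix-only: probe for it and degrade gracefully elsewhere.
    try:
        import resource
        has_resource_module = True
    except ImportError:
        has_resource_module = False

    def cap_address_space(limit_mb):
        """Best-effort cap on this process's address space, in MiB."""
        if limit_mb <= 0 or not has_resource_module:
            return
        soft, hard = resource.getrlimit(resource.RLIMIT_AS)
        new_limit = limit_mb * 1024 * 1024
        # Only ever tighten the limit; never attempt to raise it.
        if soft == resource.RLIM_INFINITY or new_limit < soft:
            try:
                resource.setrlimit(resource.RLIMIT_AS, (new_limit, new_limit))
            except (ValueError, OSError) as exc:
                # Some platforms refuse to change this limit; warn rather than fail.
                print("WARN: failed to set memory limit: %s" % exc, file=sys.stderr)
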
-has_resource_module = True -try: - import resource -except ImportError: - has_resource_module = False -import socket -import traceback - -from pyspark.accumulators import _accumulatorRegistry -from pyspark.broadcast import Broadcast, _broadcastRegistry -from pyspark.java_gateway import local_connect_and_auth -from pyspark.taskcontext import BarrierTaskContext, TaskContext -from pyspark.files import SparkFiles -from pyspark.rdd import PythonEvalType -from pyspark.serializers import write_with_length, write_int, read_long, read_bool, \ - write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, \ - BatchedSerializer, ArrowStreamPandasSerializer -from pyspark.sql.types import to_arrow_type -from pyspark.util import _get_argspec, fail_on_stopiteration -from pyspark import shuffle - -if sys.version >= '3': - basestring = str - -pickleSer = PickleSerializer() -utf8_deserializer = UTF8Deserializer() - - -def report_times(outfile, boot, init, finish): - write_int(SpecialLengths.TIMING_DATA, outfile) - write_long(int(1000 * boot), outfile) - write_long(int(1000 * init), outfile) - write_long(int(1000 * finish), outfile) - - -def add_path(path): - # worker can be used, so donot add path multiple times - if path not in sys.path: - # overwrite system packages - sys.path.insert(1, path) - - -def read_command(serializer, file): - command = serializer._read_with_length(file) - if isinstance(command, Broadcast): - command = serializer.loads(command.value) - return command - - -def chain(f, g): - """chain two functions together """ - return lambda *a: g(f(*a)) - - -def wrap_udf(f, return_type): - if return_type.needConversion(): - toInternal = return_type.toInternal - return lambda *a: toInternal(f(*a)) - else: - return lambda *a: f(*a) - - -def wrap_scalar_pandas_udf(f, return_type): - arrow_return_type = to_arrow_type(return_type) - - def verify_result_length(*a): - result = f(*a) - if not hasattr(result, "__len__"): - raise TypeError("Return type of the user-defined function should be " - "Pandas.Series, but is {}".format(type(result))) - if len(result) != len(a[0]): - raise RuntimeError("Result vector from pandas_udf was not the required length: " - "expected %d, got %d" % (len(a[0]), len(result))) - return result - - return lambda *a: (verify_result_length(*a), arrow_return_type) - - -def wrap_grouped_map_pandas_udf(f, return_type, argspec, runner_conf): - assign_cols_by_name = runner_conf.get( - "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true") - assign_cols_by_name = assign_cols_by_name.lower() == "true" - - def wrapped(key_series, value_series): - import pandas as pd - - if len(argspec.args) == 1: - result = f(pd.concat(value_series, axis=1)) - elif len(argspec.args) == 2: - key = tuple(s[0] for s in key_series) - result = f(key, pd.concat(value_series, axis=1)) - - if not isinstance(result, pd.DataFrame): - raise TypeError("Return type of the user-defined function should be " - "pandas.DataFrame, but is {}".format(type(result))) - if not len(result.columns) == len(return_type): - raise RuntimeError( - "Number of columns of the returned pandas.DataFrame " - "doesn't match specified schema. 
" - "Expected: {} Actual: {}".format(len(return_type), len(result.columns))) - - # Assign result columns by schema name if user labeled with strings, else use position - if assign_cols_by_name and any(isinstance(name, basestring) for name in result.columns): - return [(result[field.name], to_arrow_type(field.dataType)) for field in return_type] - else: - return [(result[result.columns[i]], to_arrow_type(field.dataType)) - for i, field in enumerate(return_type)] - - return wrapped - - -def wrap_grouped_agg_pandas_udf(f, return_type): - arrow_return_type = to_arrow_type(return_type) - - def wrapped(*series): - import pandas as pd - result = f(*series) - return pd.Series([result]) - - return lambda *a: (wrapped(*a), arrow_return_type) - - -def wrap_window_agg_pandas_udf(f, return_type): - # This is similar to grouped_agg_pandas_udf, the only difference - # is that window_agg_pandas_udf needs to repeat the return value - # to match window length, where grouped_agg_pandas_udf just returns - # the scalar value. - arrow_return_type = to_arrow_type(return_type) - - def wrapped(*series): - import pandas as pd - result = f(*series) - return pd.Series([result]).repeat(len(series[0])) - - return lambda *a: (wrapped(*a), arrow_return_type) - - -def read_single_udf(pickleSer, infile, eval_type, runner_conf): - num_arg = read_int(infile) - arg_offsets = [read_int(infile) for i in range(num_arg)] - row_func = None - for i in range(read_int(infile)): - f, return_type = read_command(pickleSer, infile) - if row_func is None: - row_func = f - else: - row_func = chain(row_func, f) - - # make sure StopIteration's raised in the user code are not ignored - # when they are processed in a for loop, raise them as RuntimeError's instead - func = fail_on_stopiteration(row_func) - - # the last returnType will be the return type of UDF - if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF: - return arg_offsets, wrap_scalar_pandas_udf(func, return_type) - elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: - argspec = _get_argspec(row_func) # signature was lost when wrapping it - return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type, argspec, runner_conf) - elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: - return arg_offsets, wrap_grouped_agg_pandas_udf(func, return_type) - elif eval_type == PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF: - return arg_offsets, wrap_window_agg_pandas_udf(func, return_type) - elif eval_type == PythonEvalType.SQL_BATCHED_UDF: - return arg_offsets, wrap_udf(func, return_type) - else: - raise ValueError("Unknown eval type: {}".format(eval_type)) - - -def read_udfs(pickleSer, infile, eval_type): - runner_conf = {} - - if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, - PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, - PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF, - PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF): - - # Load conf used for pandas_udf evaluation - num_conf = read_int(infile) - for i in range(num_conf): - k = utf8_deserializer.loads(infile) - v = utf8_deserializer.loads(infile) - runner_conf[k] = v - - # NOTE: if timezone is set here, that implies respectSessionTimeZone is True - timezone = runner_conf.get("spark.sql.session.timeZone", None) - ser = ArrowStreamPandasSerializer(timezone) - else: - ser = BatchedSerializer(PickleSerializer(), 100) - - num_udfs = read_int(infile) - udfs = {} - call_udf = [] - mapper_str = "" - if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: - # Create function like this: - # lambda a: f([a[0]], [a[0], a[1]]) - - # We 
assume there is only one UDF here because grouped map doesn't - # support combining multiple UDFs. - assert num_udfs == 1 - - # See FlatMapGroupsInPandasExec for how arg_offsets are used to - # distinguish between grouping attributes and data attributes - arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf) - udfs['f'] = udf - split_offset = arg_offsets[0] + 1 - arg0 = ["a[%d]" % o for o in arg_offsets[1: split_offset]] - arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]] - mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0), ", ".join(arg1)) - else: - # Create function like this: - # lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3])) - # In the special case of a single UDF this will return a single result rather - # than a tuple of results; this is the format that the JVM side expects. - for i in range(num_udfs): - arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf) - udfs['f%d' % i] = udf - args = ["a[%d]" % o for o in arg_offsets] - call_udf.append("f%d(%s)" % (i, ", ".join(args))) - mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) - - mapper = eval(mapper_str, udfs) - func = lambda _, it: map(mapper, it) - - # profiling is not supported for UDF - return func, None, ser, ser - - -def main(infile, outfile): - try: - boot_time = time.time() - split_index = read_int(infile) - if split_index == -1: # for unit tests - sys.exit(-1) - - version = utf8_deserializer.loads(infile) - if version != "%d.%d" % sys.version_info[:2]: - raise Exception(("Python in worker has different version %s than that in " + - "driver %s, PySpark cannot run with different minor versions." + - "Please check environment variables PYSPARK_PYTHON and " + - "PYSPARK_DRIVER_PYTHON are correctly set.") % - ("%d.%d" % sys.version_info[:2], version)) - - # read inputs only for a barrier task - isBarrier = read_bool(infile) - boundPort = read_int(infile) - secret = UTF8Deserializer().loads(infile) - - # set up memory limits - memory_limit_mb = int(os.environ.get('PYSPARK_EXECUTOR_MEMORY_MB', "-1")) - if memory_limit_mb > 0 and has_resource_module: - total_memory = resource.RLIMIT_AS - try: - (soft_limit, hard_limit) = resource.getrlimit(total_memory) - msg = "Current mem limits: {0} of max {1}\n".format(soft_limit, hard_limit) - print(msg, file=sys.stderr) - - # convert to bytes - new_limit = memory_limit_mb * 1024 * 1024 - - if soft_limit == resource.RLIM_INFINITY or new_limit < soft_limit: - msg = "Setting mem limits to {0} of max {1}\n".format(new_limit, new_limit) - print(msg, file=sys.stderr) - resource.setrlimit(total_memory, (new_limit, new_limit)) - - except (resource.error, OSError, ValueError) as e: - # not all systems support resource limits, so warn instead of failing - print("WARN: Failed to set memory limit: {0}\n".format(e), file=sys.stderr) - - # initialize global state - taskContext = None - if isBarrier: - taskContext = BarrierTaskContext._getOrCreate() - BarrierTaskContext._initialize(boundPort, secret) - else: - taskContext = TaskContext._getOrCreate() - # read inputs for TaskContext info - taskContext._stageId = read_int(infile) - taskContext._partitionId = read_int(infile) - taskContext._attemptNumber = read_int(infile) - taskContext._taskAttemptId = read_long(infile) - taskContext._localProperties = dict() - for i in range(read_int(infile)): - k = utf8_deserializer.loads(infile) - v = utf8_deserializer.loads(infile) - taskContext._localProperties[k] = v - - shuffle.MemoryBytesSpilled = 0 - shuffle.DiskBytesSpilled = 0 - 
_accumulatorRegistry.clear() - - # fetch name of workdir - spark_files_dir = utf8_deserializer.loads(infile) - SparkFiles._root_directory = spark_files_dir - SparkFiles._is_running_on_worker = True - - # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH - add_path(spark_files_dir) # *.py files that were added will be copied here - num_python_includes = read_int(infile) - for _ in range(num_python_includes): - filename = utf8_deserializer.loads(infile) - add_path(os.path.join(spark_files_dir, filename)) - if sys.version > '3': - import importlib - importlib.invalidate_caches() - - # fetch names and values of broadcast variables - needs_broadcast_decryption_server = read_bool(infile) - num_broadcast_variables = read_int(infile) - if needs_broadcast_decryption_server: - # read the decrypted data from a server in the jvm - port = read_int(infile) - auth_secret = utf8_deserializer.loads(infile) - (broadcast_sock_file, _) = local_connect_and_auth(port, auth_secret) - - for _ in range(num_broadcast_variables): - bid = read_long(infile) - if bid >= 0: - if needs_broadcast_decryption_server: - read_bid = read_long(broadcast_sock_file) - assert(read_bid == bid) - _broadcastRegistry[bid] = \ - Broadcast(sock_file=broadcast_sock_file) - else: - path = utf8_deserializer.loads(infile) - _broadcastRegistry[bid] = Broadcast(path=path) - - else: - bid = - bid - 1 - _broadcastRegistry.pop(bid) - - if needs_broadcast_decryption_server: - broadcast_sock_file.write(b'1') - broadcast_sock_file.close() - - _accumulatorRegistry.clear() - eval_type = read_int(infile) - if eval_type == PythonEvalType.NON_UDF: - func, profiler, deserializer, serializer = read_command(pickleSer, infile) - else: - func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type) - - init_time = time.time() - - def process(): - iterator = deserializer.load_stream(infile) - serializer.dump_stream(func(split_index, iterator), outfile) - - if profiler: - profiler.profile(process) - else: - process() - except Exception: - try: - write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile) - write_with_length(traceback.format_exc().encode("utf-8"), outfile) - except IOError: - # JVM close the socket - pass - except Exception: - # Write the error to stderr if it happened while serializing - print("PySpark worker failed with exception:", file=sys.stderr) - print(traceback.format_exc(), file=sys.stderr) - sys.exit(-1) - finish_time = time.time() - report_times(outfile, boot_time, init_time, finish_time) - write_long(shuffle.MemoryBytesSpilled, outfile) - write_long(shuffle.DiskBytesSpilled, outfile) - - # Mark the beginning of the accumulators section of the output - write_int(SpecialLengths.END_OF_DATA_SECTION, outfile) - write_int(len(_accumulatorRegistry), outfile) - for (aid, accum) in _accumulatorRegistry.items(): - pickleSer._write_with_length((aid, accum._value), outfile) - - # check end of stream - if read_int(infile) == SpecialLengths.END_OF_STREAM: - write_int(SpecialLengths.END_OF_STREAM, outfile) - else: - # write a different value to tell JVM to not reuse this worker - write_int(SpecialLengths.END_OF_DATA_SECTION, outfile) - sys.exit(-1) - - -if __name__ == '__main__': - # Read information about how to connect back to the JVM from the environment. 
- java_port = int(os.environ["PYTHON_WORKER_FACTORY_PORT"]) - auth_secret = os.environ["PYTHON_WORKER_FACTORY_SECRET"] - (sock_file, _) = local_connect_and_auth(java_port, auth_secret) - main(sock_file, sock_file) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/run-tests b/scripts/spark-2.4.3-bin-hadoop2.7/python/run-tests deleted file mode 100755 index 2494965..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/run-tests +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -FWDIR="$(cd "`dirname $0`"/..; pwd)" -cd "$FWDIR" - -exec python -u ./python/run-tests.py "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/run-tests-with-coverage b/scripts/spark-2.4.3-bin-hadoop2.7/python/run-tests-with-coverage deleted file mode 100755 index 6d74b56..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/run-tests-with-coverage +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -set -o pipefail -set -e - -# This variable indicates which coverage executable to run to combine coverages -# and generate HTMLs, for example, 'coverage3' in Python 3. -COV_EXEC="${COV_EXEC:-coverage}" -FWDIR="$(cd "`dirname $0`"; pwd)" -pushd "$FWDIR" > /dev/null - -# Ensure that coverage executable is installed. -if ! hash $COV_EXEC 2>/dev/null; then - echo "Missing coverage executable in your path, skipping PySpark coverage" - exit 1 -fi - -# Set up the directories for coverage results. -export COVERAGE_DIR="$FWDIR/test_coverage" -rm -fr "$COVERAGE_DIR/coverage_data" -rm -fr "$COVERAGE_DIR/htmlcov" -mkdir -p "$COVERAGE_DIR/coverage_data" - -# Current directory are added in the python path so that it doesn't refer our built -# pyspark zip library first. -export PYTHONPATH="$FWDIR:$PYTHONPATH" -# Also, our sitecustomize.py and coverage_daemon.py are included in the path. -export PYTHONPATH="$COVERAGE_DIR:$PYTHONPATH" - -# We use 'spark.python.daemon.module' configuration to insert the coverage supported workers. 
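The coverage hookup in this script leans on coverage.py's process-startup mechanism: the test_coverage directory is placed on PYTHONPATH so that its sitecustomize.py is imported by every Python process, and COVERAGE_PROCESS_START tells coverage.py which config file to load. A minimal sketch of such a sitecustomize.py, assuming only that coverage.py is installed (this is a generic illustration of the hook, not the deleted file's contents):

    # sitecustomize.py -- start coverage in every subprocess when
    # COVERAGE_PROCESS_START points at a coverage config file.
    try:
        import coverage
        coverage.process_startup()
    except ImportError:
        # coverage.py is not installed; run without measurement.
        pass
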
-export SPARK_CONF_DIR="$COVERAGE_DIR/conf" - -# This environment variable enables the coverage. -export COVERAGE_PROCESS_START="$FWDIR/.coveragerc" - -# If you'd like to run a specific unittest class, you could do such as -# SPARK_TESTING=1 ../bin/pyspark pyspark.sql.tests VectorizedUDFTests -./run-tests "$@" - -# Don't run coverage for the coverage command itself -unset COVERAGE_PROCESS_START - -# Coverage could generate empty coverage data files. Remove it to get rid of warnings when combining. -find $COVERAGE_DIR/coverage_data -size 0 -print0 | xargs -0 rm -echo "Combining collected coverage data under $COVERAGE_DIR/coverage_data" -$COV_EXEC combine -echo "Reporting the coverage data at $COVERAGE_DIR/coverage_data/coverage" -$COV_EXEC report --include "pyspark/*" -echo "Generating HTML files for PySpark coverage under $COVERAGE_DIR/htmlcov" -$COV_EXEC html --ignore-errors --include "pyspark/*" --directory "$COVERAGE_DIR/htmlcov" - -popd diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/run-tests.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/run-tests.py deleted file mode 100755 index 5059905..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/run-tests.py +++ /dev/null @@ -1,296 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function -import logging -from optparse import OptionParser -import os -import re -import shutil -import subprocess -import sys -import tempfile -from threading import Thread, Lock -import time -import uuid -if sys.version < '3': - import Queue -else: - import queue as Queue -from distutils.version import LooseVersion -from multiprocessing import Manager - - -# Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module -sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../dev/")) - - -from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings) -from sparktestsupport.shellutils import which, subprocess_check_output # noqa -from sparktestsupport.modules import all_modules, pyspark_sql # noqa - - -python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root') - - -def print_red(text): - print('\033[31m' + text + '\033[0m') - - -SKIPPED_TESTS = Manager().dict() -LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log") -FAILURE_REPORTING_LOCK = Lock() -LOGGER = logging.getLogger() - -# Find out where the assembly jars are located. 
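The assembly-jar lookup that follows uses Python's for/else, which is easy to misread: the else branch runs only when the loop finishes without hitting break, making it a natural "nothing matched" case. A small standalone sketch of the same idiom (all names are illustrative):

    import os

    def find_first_build_dir(candidates):
        # for/else: the else clause fires only if no break occurred,
        # i.e. none of the candidate directories exist.
        for path in candidates:
            if os.path.isdir(path):
                chosen = path
                break
        else:
            raise RuntimeError("no build directory found among %r" % (candidates,))
        return chosen

    # e.g. find_first_build_dir(["target/scala-2.11", "target/scala-2.12"])
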
-for scala in ["2.11", "2.12"]: - build_dir = os.path.join(SPARK_HOME, "assembly", "target", "scala-" + scala) - if os.path.isdir(build_dir): - SPARK_DIST_CLASSPATH = os.path.join(build_dir, "jars", "*") - break -else: - raise Exception("Cannot find assembly build directory, please build Spark first.") - - -def run_individual_python_test(target_dir, test_name, pyspark_python): - env = dict(os.environ) - env.update({ - 'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH, - 'SPARK_TESTING': '1', - 'SPARK_PREPEND_CLASSES': '1', - 'PYSPARK_PYTHON': which(pyspark_python), - 'PYSPARK_DRIVER_PYTHON': which(pyspark_python) - }) - - # Create a unique temp directory under 'target/' for each run. The TMPDIR variable is - # recognized by the tempfile module to override the default system temp directory. - tmp_dir = os.path.join(target_dir, str(uuid.uuid4())) - while os.path.isdir(tmp_dir): - tmp_dir = os.path.join(target_dir, str(uuid.uuid4())) - os.mkdir(tmp_dir) - env["TMPDIR"] = tmp_dir - - # Also override the JVM's temp directory by setting driver and executor options. - spark_args = [ - "--conf", "spark.driver.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), - "--conf", "spark.executor.extraJavaOptions=-Djava.io.tmpdir={0}".format(tmp_dir), - "pyspark-shell" - ] - env["PYSPARK_SUBMIT_ARGS"] = " ".join(spark_args) - - LOGGER.info("Starting test(%s): %s", pyspark_python, test_name) - start_time = time.time() - try: - per_test_output = tempfile.TemporaryFile() - retcode = subprocess.Popen( - [os.path.join(SPARK_HOME, "bin/pyspark"), test_name], - stderr=per_test_output, stdout=per_test_output, env=env).wait() - shutil.rmtree(tmp_dir, ignore_errors=True) - except: - LOGGER.exception("Got exception while running %s with %s", test_name, pyspark_python) - # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if - # this code is invoked from a thread other than the main thread. - os._exit(1) - duration = time.time() - start_time - # Exit on the first failure. - if retcode != 0: - try: - with FAILURE_REPORTING_LOCK: - with open(LOG_FILE, 'ab') as log_file: - per_test_output.seek(0) - log_file.writelines(per_test_output) - per_test_output.seek(0) - for line in per_test_output: - decoded_line = line.decode() - if not re.match('[0-9]+', decoded_line): - print(decoded_line, end='') - per_test_output.close() - except: - LOGGER.exception("Got an exception while trying to print failed test output") - finally: - print_red("\nHad test failures in %s with %s; see logs." % (test_name, pyspark_python)) - # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if - # this code is invoked from a thread other than the main thread. - os._exit(-1) - else: - skipped_counts = 0 - try: - per_test_output.seek(0) - # Here expects skipped test output from unittest when verbosity level is - # 2 (or --verbose option is enabled). - decoded_lines = map(lambda line: line.decode(), iter(per_test_output)) - skipped_tests = list(filter( - lambda line: re.search(r'test_.* \(pyspark\..*\) ... 
skipped ', line), - decoded_lines)) - skipped_counts = len(skipped_tests) - if skipped_counts > 0: - key = (pyspark_python, test_name) - SKIPPED_TESTS[key] = skipped_tests - per_test_output.close() - except: - import traceback - print_red("\nGot an exception while trying to store " - "skipped test output:\n%s" % traceback.format_exc()) - # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if - # this code is invoked from a thread other than the main thread. - os._exit(-1) - if skipped_counts != 0: - LOGGER.info( - "Finished test(%s): %s (%is) ... %s tests were skipped", pyspark_python, test_name, - duration, skipped_counts) - else: - LOGGER.info( - "Finished test(%s): %s (%is)", pyspark_python, test_name, duration) - - -def get_default_python_executables(): - python_execs = [x for x in ["python2.7", "python3.6", "pypy"] if which(x)] - if "python2.7" not in python_execs: - LOGGER.warning("Not testing against `python2.7` because it could not be found; falling" - " back to `python` instead") - python_execs.insert(0, "python") - return python_execs - - -def parse_opts(): - parser = OptionParser( - prog="run-tests" - ) - parser.add_option( - "--python-executables", type="string", default=','.join(get_default_python_executables()), - help="A comma-separated list of Python executables to test against (default: %default)" - ) - parser.add_option( - "--modules", type="string", - default=",".join(sorted(python_modules.keys())), - help="A comma-separated list of Python modules to test (default: %default)" - ) - parser.add_option( - "-p", "--parallelism", type="int", default=4, - help="The number of suites to test in parallel (default %default)" - ) - parser.add_option( - "--verbose", action="store_true", - help="Enable additional debug logging" - ) - - (opts, args) = parser.parse_args() - if args: - parser.error("Unsupported arguments: %s" % ' '.join(args)) - if opts.parallelism < 1: - parser.error("Parallelism cannot be less than 1") - return opts - - -def _check_coverage(python_exec): - # Make sure if coverage is installed. - try: - subprocess_check_output( - [python_exec, "-c", "import coverage"], - stderr=open(os.devnull, 'w')) - except: - print_red("Coverage is not installed in Python executable '%s' " - "but 'COVERAGE_PROCESS_START' environment variable is set, " - "exiting." % python_exec) - sys.exit(-1) - - -def main(): - opts = parse_opts() - if (opts.verbose): - log_level = logging.DEBUG - else: - log_level = logging.INFO - logging.basicConfig(stream=sys.stdout, level=log_level, format="%(message)s") - LOGGER.info("Running PySpark tests. Output is in %s", LOG_FILE) - if os.path.exists(LOG_FILE): - os.remove(LOG_FILE) - python_execs = opts.python_executables.split(',') - modules_to_test = [] - for module_name in opts.modules.split(','): - if module_name in python_modules: - modules_to_test.append(python_modules[module_name]) - else: - print("Error: unrecognized module '%s'. Supported modules: %s" % - (module_name, ", ".join(python_modules))) - sys.exit(-1) - LOGGER.info("Will test against the following Python executables: %s", python_execs) - LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test]) - - task_queue = Queue.PriorityQueue() - for python_exec in python_execs: - # Check if the python executable has coverage installed when 'COVERAGE_PROCESS_START' - # environmental variable is set. 
- if "COVERAGE_PROCESS_START" in os.environ: - _check_coverage(python_exec) - - python_implementation = subprocess_check_output( - [python_exec, "-c", "import platform; print(platform.python_implementation())"], - universal_newlines=True).strip() - LOGGER.debug("%s python_implementation is %s", python_exec, python_implementation) - LOGGER.debug("%s version is: %s", python_exec, subprocess_check_output( - [python_exec, "--version"], stderr=subprocess.STDOUT, universal_newlines=True).strip()) - for module in modules_to_test: - if python_implementation not in module.blacklisted_python_implementations: - for test_goal in module.python_test_goals: - if test_goal in ('pyspark.streaming.tests', 'pyspark.mllib.tests', - 'pyspark.tests', 'pyspark.sql.tests'): - priority = 0 - else: - priority = 100 - task_queue.put((priority, (python_exec, test_goal))) - - # Create the target directory before starting tasks to avoid races. - target_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'target')) - if not os.path.isdir(target_dir): - os.mkdir(target_dir) - - def process_queue(task_queue): - while True: - try: - (priority, (python_exec, test_goal)) = task_queue.get_nowait() - except Queue.Empty: - break - try: - run_individual_python_test(target_dir, test_goal, python_exec) - finally: - task_queue.task_done() - - start_time = time.time() - for _ in range(opts.parallelism): - worker = Thread(target=process_queue, args=(task_queue,)) - worker.daemon = True - worker.start() - try: - task_queue.join() - except (KeyboardInterrupt, SystemExit): - print_red("Exiting due to interrupt") - sys.exit(-1) - total_duration = time.time() - start_time - LOGGER.info("Tests passed in %i seconds", total_duration) - - for key, lines in sorted(SKIPPED_TESTS.items()): - pyspark_python, test_name = key - LOGGER.info("\nSkipped tests in %s with %s:" % (test_name, pyspark_python)) - for line in lines: - LOGGER.info(" %s" % line.rstrip()) - - -if __name__ == "__main__": - main() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/setup.cfg b/scripts/spark-2.4.3-bin-hadoop2.7/python/setup.cfg deleted file mode 100644 index d100b93..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/setup.cfg +++ /dev/null @@ -1,22 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -[bdist_wheel] -universal = 1 - -[metadata] -description-file = README.md diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/setup.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/setup.py deleted file mode 100644 index c447f2d..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/setup.py +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import glob -import os -import sys -from setuptools import setup, find_packages -from shutil import copyfile, copytree, rmtree - -if sys.version_info < (2, 7): - print("Python versions prior to 2.7 are not supported for pip installed PySpark.", - file=sys.stderr) - sys.exit(-1) - -try: - exec(open('pyspark/version.py').read()) -except IOError: - print("Failed to load PySpark version file for packaging. You must be in Spark's python dir.", - file=sys.stderr) - sys.exit(-1) -VERSION = __version__ # noqa -# A temporary path so we can access above the Python project root and fetch scripts and jars we need -TEMP_PATH = "deps" -SPARK_HOME = os.path.abspath("../") - -# Provide guidance about how to use setup.py -incorrect_invocation_message = """ -If you are installing pyspark from spark source, you must first build Spark and -run sdist. - - To build Spark with maven you can run: - ./build/mvn -DskipTests clean package - Building the source dist is done in the Python directory: - cd python - python setup.py sdist - pip install dist/*.tar.gz""" - -# Figure out where the jars are we need to package with PySpark. -JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/")) - -if len(JARS_PATH) == 1: - JARS_PATH = JARS_PATH[0] -elif (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1): - # Release mode puts the jars in a jars directory - JARS_PATH = os.path.join(SPARK_HOME, "jars") -elif len(JARS_PATH) > 1: - print("Assembly jars exist for multiple scalas ({0}), please cleanup assembly/target".format( - JARS_PATH), file=sys.stderr) - sys.exit(-1) -elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH): - print(incorrect_invocation_message, file=sys.stderr) - sys.exit(-1) - -EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python") -SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin") -DATA_PATH = os.path.join(SPARK_HOME, "data") -LICENSES_PATH = os.path.join(SPARK_HOME, "licenses") - -SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin") -JARS_TARGET = os.path.join(TEMP_PATH, "jars") -EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples") -DATA_TARGET = os.path.join(TEMP_PATH, "data") -LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses") - -# Check and see if we are under the spark path in which case we need to build the symlink farm. -# This is important because we only want to build the symlink farm while under Spark otherwise we -# want to use the symlink farm. And if the symlink farm exists under while under Spark (e.g. a -# partially built sdist) we should error and have the user sort it out. 
-in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or - (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1)) - - -def _supports_symlinks(): - """Check if the system supports symlinks (e.g. *nix) or not.""" - return getattr(os, "symlink", None) is not None - - -if (in_spark): - # Construct links for setup - try: - os.mkdir(TEMP_PATH) - except: - print("Temp path for symlink to parent already exists {0}".format(TEMP_PATH), - file=sys.stderr) - sys.exit(-1) - -# If you are changing the versions here, please also change ./python/pyspark/sql/utils.py and -# ./python/run-tests.py. In case of Arrow, you should also check ./pom.xml. -_minimum_pandas_version = "0.19.2" -_minimum_pyarrow_version = "0.8.0" - -try: - # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts - # find it where expected. The rest of the files aren't copied because they are accessed - # using Python imports instead which will be resolved correctly. - try: - os.makedirs("pyspark/python/pyspark") - except OSError: - # Don't worry if the directory already exists. - pass - copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") - - if (in_spark): - # Construct the symlink farm - this is necessary since we can't refer to the path above the - # package root and we need to copy the jars and scripts which are up above the python root. - if _supports_symlinks(): - os.symlink(JARS_PATH, JARS_TARGET) - os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) - os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) - os.symlink(DATA_PATH, DATA_TARGET) - os.symlink(LICENSES_PATH, LICENSES_TARGET) - else: - # For windows fall back to the slower copytree - copytree(JARS_PATH, JARS_TARGET) - copytree(SCRIPTS_PATH, SCRIPTS_TARGET) - copytree(EXAMPLES_PATH, EXAMPLES_TARGET) - copytree(DATA_PATH, DATA_TARGET) - copytree(LICENSES_PATH, LICENSES_TARGET) - else: - # If we are not inside of SPARK_HOME verify we have the required symlink farm - if not os.path.exists(JARS_TARGET): - print("To build packaging must be in the python directory under the SPARK_HOME.", - file=sys.stderr) - - if not os.path.isdir(SCRIPTS_TARGET): - print(incorrect_invocation_message, file=sys.stderr) - sys.exit(-1) - - # Scripts directive requires a list of each script path and does not take wild cards. - script_names = os.listdir(SCRIPTS_TARGET) - scripts = list(map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names)) - # We add find_spark_home.py to the bin directory we install so that pip installed PySpark - # will search for SPARK_HOME with Python. - scripts.append("pyspark/find_spark_home.py") - - # Parse the README markdown file into rst for PyPI - long_description = "!!!!! missing pandoc do not upload to PyPI !!!!" 
- try: - import pypandoc - long_description = pypandoc.convert('README.md', 'rst') - except ImportError: - print("Could not import pypandoc - required to package PySpark", file=sys.stderr) - except OSError: - print("Could not convert - pandoc is not installed", file=sys.stderr) - - setup( - name='pyspark', - version=VERSION, - description='Apache Spark Python API', - long_description=long_description, - author='Spark Developers', - author_email='dev@spark.apache.org', - url='https://github.com/apache/spark/tree/master/python', - packages=['pyspark', - 'pyspark.mllib', - 'pyspark.mllib.linalg', - 'pyspark.mllib.stat', - 'pyspark.ml', - 'pyspark.ml.linalg', - 'pyspark.ml.param', - 'pyspark.sql', - 'pyspark.streaming', - 'pyspark.bin', - 'pyspark.jars', - 'pyspark.python.pyspark', - 'pyspark.python.lib', - 'pyspark.data', - 'pyspark.licenses', - 'pyspark.examples.src.main.python'], - include_package_data=True, - package_dir={ - 'pyspark.jars': 'deps/jars', - 'pyspark.bin': 'deps/bin', - 'pyspark.python.lib': 'lib', - 'pyspark.data': 'deps/data', - 'pyspark.licenses': 'deps/licenses', - 'pyspark.examples.src.main.python': 'deps/examples', - }, - package_data={ - 'pyspark.jars': ['*.jar'], - 'pyspark.bin': ['*'], - 'pyspark.python.lib': ['*.zip'], - 'pyspark.data': ['*.txt', '*.data'], - 'pyspark.licenses': ['*.txt'], - 'pyspark.examples.src.main.python': ['*.py', '*/*.py']}, - scripts=scripts, - license='http://www.apache.org/licenses/LICENSE-2.0', - install_requires=['py4j==0.10.7'], - setup_requires=['pypandoc'], - extras_require={ - 'ml': ['numpy>=1.7'], - 'mllib': ['numpy>=1.7'], - 'sql': [ - 'pandas>=%s' % _minimum_pandas_version, - 'pyarrow>=%s' % _minimum_pyarrow_version, - ] - }, - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: Implementation :: CPython', - 'Programming Language :: Python :: Implementation :: PyPy'] - ) -finally: - # We only cleanup the symlink farm if we were in Spark, otherwise we are installing rather than - # packaging. - if (in_spark): - # Depending on cleaning up the symlink farm or copied version - if _supports_symlinks(): - os.remove(os.path.join(TEMP_PATH, "jars")) - os.remove(os.path.join(TEMP_PATH, "bin")) - os.remove(os.path.join(TEMP_PATH, "examples")) - os.remove(os.path.join(TEMP_PATH, "data")) - os.remove(os.path.join(TEMP_PATH, "licenses")) - else: - rmtree(os.path.join(TEMP_PATH, "jars")) - rmtree(os.path.join(TEMP_PATH, "bin")) - rmtree(os.path.join(TEMP_PATH, "examples")) - rmtree(os.path.join(TEMP_PATH, "data")) - rmtree(os.path.join(TEMP_PATH, "licenses")) - os.rmdir(TEMP_PATH) diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_coverage/conf/spark-defaults.conf b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_coverage/conf/spark-defaults.conf deleted file mode 100644 index bf44ea6..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_coverage/conf/spark-defaults.conf +++ /dev/null @@ -1,21 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This is used to generate PySpark coverage results. Seems there's no way to -# add a configuration when SPARK_TESTING environment variable is set because -# we will directly execute modules by python -m. -spark.python.daemon.module coverage_daemon diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_coverage/coverage_daemon.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_coverage/coverage_daemon.py deleted file mode 100644 index c87366a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_coverage/coverage_daemon.py +++ /dev/null @@ -1,45 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import imp - - -# This is a hack to always refer the main code rather than built zip. -main_code_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -daemon = imp.load_source("daemon", "%s/pyspark/daemon.py" % main_code_dir) - -if "COVERAGE_PROCESS_START" in os.environ: - worker = imp.load_source("worker", "%s/pyspark/worker.py" % main_code_dir) - - def _cov_wrapped(*args, **kwargs): - import coverage - cov = coverage.coverage( - config_file=os.environ["COVERAGE_PROCESS_START"]) - cov.start() - try: - worker.main(*args, **kwargs) - finally: - cov.stop() - cov.save() - daemon.worker_main = _cov_wrapped -else: - raise RuntimeError("COVERAGE_PROCESS_START environment variable is not set, exiting.") - - -if __name__ == '__main__': - daemon.manager() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_coverage/sitecustomize.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_coverage/sitecustomize.py deleted file mode 100644 index 630237a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_coverage/sitecustomize.py +++ /dev/null @@ -1,23 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Note that this 'sitecustomize' module is a built-in feature in Python. -# If this module is defined, it's executed when the Python session begins. -# `coverage.process_startup()` seeks if COVERAGE_PROCESS_START environment -# variable is set or not. If set, it starts to run the coverage. -import coverage -coverage.process_startup() diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/SimpleHTTPServer.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/SimpleHTTPServer.py deleted file mode 100644 index eddbd58..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/SimpleHTTPServer.py +++ /dev/null @@ -1,22 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Used to test override standard SimpleHTTPServer module. -""" - -__name__ = "My Server" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/hello/hello.txt b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/hello/hello.txt deleted file mode 100755 index 980a0d5..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/hello/hello.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/hello/sub_hello/sub_hello.txt b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/hello/sub_hello/sub_hello.txt deleted file mode 100644 index ce2d435..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/hello/sub_hello/sub_hello.txt +++ /dev/null @@ -1 +0,0 @@ -Sub Hello World! diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/ages.csv b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/ages.csv deleted file mode 100644 index 18991fe..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/ages.csv +++ /dev/null @@ -1,4 +0,0 @@ -Joe,20 -Tom,30 -Hyukjin,25 - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/ages_newlines.csv b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/ages_newlines.csv deleted file mode 100644 index d19f673..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/ages_newlines.csv +++ /dev/null @@ -1,6 +0,0 @@ -Joe,20,"Hi, -I am Jeo" -Tom,30,"My name is Tom" -Hyukjin,25,"I am Hyukjin - -I love Spark!" 
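Note on the packaging logic in the setup.py deleted further above: when run from inside a Spark checkout it assembles a temporary deps/ area by symlinking the jars, bin scripts, examples, data and licenses directories, and falls back to shutil.copytree on platforms without symlink support. A minimal, hypothetical sketch of that link-or-copy pattern (the src/dst paths below are placeholders, not the real Spark layout):

import os
from shutil import copytree

def link_or_copy(src, dst):
    # Prefer a symlink (cheap, shared) and fall back to a full copy, mirroring
    # the _supports_symlinks() check in the deleted setup.py.
    if getattr(os, "symlink", None) is not None:
        os.symlink(src, dst)
    else:
        copytree(src, dst)

link_or_copy("assembly/jars", "deps/jars")  # placeholder paths for illustration only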
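The coverage plumbing deleted just above (the test_coverage spark-defaults.conf, coverage_daemon.py and sitecustomize.py) leans on two standard hooks: Python imports any sitecustomize module it can find when an interpreter starts, and coverage.process_startup() only begins measuring when COVERAGE_PROCESS_START points at a coverage config file, so normal runs are unaffected. A self-contained sketch of that mechanism, assuming the coverage package is installed:

import os
import subprocess
import sys
import tempfile

# Scratch directory holding a sitecustomize.py and a minimal coverage config.
scratch = tempfile.mkdtemp()
with open(os.path.join(scratch, "sitecustomize.py"), "w") as f:
    f.write("import coverage\ncoverage.process_startup()\n")
with open(os.path.join(scratch, "coveragerc"), "w") as f:
    f.write("[run]\nbranch = True\n")

env = dict(
    os.environ,
    PYTHONPATH=scratch + os.pathsep + os.environ.get("PYTHONPATH", ""),
    COVERAGE_PROCESS_START=os.path.join(scratch, "coveragerc"),
)
# The child interpreter imports sitecustomize automatically, sees the env var,
# and records coverage data for whatever it executes.
subprocess.run([sys.executable, "-c", "print('child measured by coverage')"],
               env=env, check=True)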
diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/_SUCCESS b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/_SUCCESS deleted file mode 100755 index e69de29..0000000 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc deleted file mode 100644 index 834cf0b..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc deleted file mode 100755 index 4943801..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc deleted file mode 100644 index 693dcee..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc deleted file mode 100755 index 4cbb95a..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/_SUCCESS b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/_SUCCESS deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/_common_metadata b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/_common_metadata deleted file mode 100644 index 7ef2320..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/_common_metadata and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/_metadata b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/_metadata deleted file mode 100644 index 78a1ca7..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/_metadata and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/.part-r-00008.gz.parquet.crc b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/.part-r-00008.gz.parquet.crc 
deleted file mode 100644 index e93f42e..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/.part-r-00008.gz.parquet.crc and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/part-r-00008.gz.parquet b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/part-r-00008.gz.parquet deleted file mode 100644 index 461c382..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2014/month=9/day=1/part-r-00008.gz.parquet and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00002.gz.parquet.crc b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00002.gz.parquet.crc deleted file mode 100644 index b63c4d6..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00002.gz.parquet.crc and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00004.gz.parquet.crc b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00004.gz.parquet.crc deleted file mode 100644 index 5bc0ebd..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/.part-r-00004.gz.parquet.crc and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00002.gz.parquet b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00002.gz.parquet deleted file mode 100644 index 62a6391..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00002.gz.parquet and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00004.gz.parquet b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00004.gz.parquet deleted file mode 100644 index 67665a7..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=25/part-r-00004.gz.parquet and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/.part-r-00005.gz.parquet.crc b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/.part-r-00005.gz.parquet.crc deleted file mode 100644 index ae94a15..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/.part-r-00005.gz.parquet.crc and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/part-r-00005.gz.parquet b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/part-r-00005.gz.parquet deleted file mode 100644 index 6cb8538..0000000 Binary files 
a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=10/day=26/part-r-00005.gz.parquet and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/.part-r-00007.gz.parquet.crc b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/.part-r-00007.gz.parquet.crc deleted file mode 100644 index 58d9bb5..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/.part-r-00007.gz.parquet.crc and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/part-r-00007.gz.parquet b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/part-r-00007.gz.parquet deleted file mode 100644 index 9b00805..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/parquet_partitioned/year=2015/month=9/day=1/part-r-00007.gz.parquet and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people.json b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people.json deleted file mode 100644 index 50a859c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people.json +++ /dev/null @@ -1,3 +0,0 @@ -{"name":"Michael"} -{"name":"Andy", "age":30} -{"name":"Justin", "age":19} diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people1.json b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people1.json deleted file mode 100644 index 6d217da..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people1.json +++ /dev/null @@ -1,2 +0,0 @@ -{"name":"Jonathan", "aka": "John"} - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people_array.json b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people_array.json deleted file mode 100644 index c27c48f..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people_array.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - { - "name": "Michael" - }, - { - "name": "Andy", - "age": 30 - }, - { - "name": "Justin", - "age": 19 - } -] diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people_array_utf16le.json b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people_array_utf16le.json deleted file mode 100644 index 9c657fa..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/people_array_utf16le.json and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/streaming/text-test.txt b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/streaming/text-test.txt deleted file mode 100644 index ae1e76c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/streaming/text-test.txt +++ /dev/null @@ -1,2 +0,0 @@ -hello -this \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/text-test.txt b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/text-test.txt deleted file mode 100644 index ae1e76c..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/sql/text-test.txt +++ /dev/null @@ -1,2 +0,0 @@ -hello -this \ No newline at end of file diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/userlib-0.1.zip b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/userlib-0.1.zip deleted 
file mode 100644 index 496e134..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/userlib-0.1.zip and /dev/null differ diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/userlibrary.py b/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/userlibrary.py deleted file mode 100755 index 73fd26e..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/python/test_support/userlibrary.py +++ /dev/null @@ -1,26 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Used to test shipping of code depenencies with SparkContext.addPyFile(). -""" - - -class UserClass(object): - - def hello(self): - return "Hello World!" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/slaves.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/slaves.sh deleted file mode 100755 index c971aa3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/slaves.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Run a shell command on all slave hosts. -# -# Environment Variables -# -# SPARK_SLAVES File naming remote hosts. -# Default is ${SPARK_CONF_DIR}/slaves. -# SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf. -# SPARK_SLAVE_SLEEP Seconds to sleep between spawning remote commands. -# SPARK_SSH_OPTS Options passed to ssh when running remote commands. -## - -usage="Usage: slaves.sh [--config ] command..." - -# if no args specified, show usage -if [ $# -le 0 ]; then - echo $usage - exit 1 -fi - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" - -# If the slaves file is specified in the command line, -# then it takes precedence over the definition in -# spark-env.sh. Save it here. -if [ -f "$SPARK_SLAVES" ]; then - HOSTLIST=`cat "$SPARK_SLAVES"` -fi - -# Check if --config is passed as an argument. It is an optional parameter. -# Exit if the argument is not a directory. -if [ "$1" == "--config" ] -then - shift - conf_dir="$1" - if [ ! 
-d "$conf_dir" ] - then - echo "ERROR : $conf_dir is not a directory" - echo $usage - exit 1 - else - export SPARK_CONF_DIR="$conf_dir" - fi - shift -fi - -. "${SPARK_HOME}/bin/load-spark-env.sh" - -if [ "$HOSTLIST" = "" ]; then - if [ "$SPARK_SLAVES" = "" ]; then - if [ -f "${SPARK_CONF_DIR}/slaves" ]; then - HOSTLIST=`cat "${SPARK_CONF_DIR}/slaves"` - else - HOSTLIST=localhost - fi - else - HOSTLIST=`cat "${SPARK_SLAVES}"` - fi -fi - - - -# By default disable strict host key checking -if [ "$SPARK_SSH_OPTS" = "" ]; then - SPARK_SSH_OPTS="-o StrictHostKeyChecking=no" -fi - -for slave in `echo "$HOSTLIST"|sed "s/#.*$//;/^$/d"`; do - if [ -n "${SPARK_SSH_FOREGROUND}" ]; then - ssh $SPARK_SSH_OPTS "$slave" $"${@// /\\ }" \ - 2>&1 | sed "s/^/$slave: /" - else - ssh $SPARK_SSH_OPTS "$slave" $"${@// /\\ }" \ - 2>&1 | sed "s/^/$slave: /" & - fi - if [ "$SPARK_SLAVE_SLEEP" != "" ]; then - sleep $SPARK_SLAVE_SLEEP - fi -done - -wait diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/spark-config.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/spark-config.sh deleted file mode 100755 index bf3da18..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/spark-config.sh +++ /dev/null @@ -1,33 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# included in all the spark scripts with source command -# should not be executable directly -# also should not be passed any arguments, since we need original $* - -# symlink and absolute path should rely on SPARK_HOME to resolve -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -export SPARK_CONF_DIR="${SPARK_CONF_DIR:-"${SPARK_HOME}/conf"}" -# Add the PySpark classes to the PYTHONPATH: -if [ -z "${PYSPARK_PYTHONPATH_SET}" ]; then - export PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}" - export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:${PYTHONPATH}" - export PYSPARK_PYTHONPATH_SET=1 -fi diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/spark-daemon.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/spark-daemon.sh deleted file mode 100755 index 6de67e0..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/spark-daemon.sh +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Runs a Spark command as a daemon. -# -# Environment Variables -# -# SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf. -# SPARK_LOG_DIR Where log files are stored. ${SPARK_HOME}/logs by default. -# SPARK_MASTER host:path where spark code should be rsync'd from -# SPARK_PID_DIR The pid files are stored. /tmp by default. -# SPARK_IDENT_STRING A string representing this instance of spark. $USER by default -# SPARK_NICENESS The scheduling priority for daemons. Defaults to 0. -# SPARK_NO_DAEMONIZE If set, will run the proposed command in the foreground. It will not output a PID file. -## - -usage="Usage: spark-daemon.sh [--config ] (start|stop|submit|status) " - -# if no args specified, show usage -if [ $# -le 1 ]; then - echo $usage - exit 1 -fi - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" - -# get arguments - -# Check if --config is passed as an argument. It is an optional parameter. -# Exit if the argument is not a directory. - -if [ "$1" == "--config" ] -then - shift - conf_dir="$1" - if [ ! -d "$conf_dir" ] - then - echo "ERROR : $conf_dir is not a directory" - echo $usage - exit 1 - else - export SPARK_CONF_DIR="$conf_dir" - fi - shift -fi - -option=$1 -shift -command=$1 -shift -instance=$1 -shift - -spark_rotate_log () -{ - log=$1; - num=5; - if [ -n "$2" ]; then - num=$2 - fi - if [ -f "$log" ]; then # rotate logs - while [ $num -gt 1 ]; do - prev=`expr $num - 1` - [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num" - num=$prev - done - mv "$log" "$log.$num"; - fi -} - -. "${SPARK_HOME}/bin/load-spark-env.sh" - -if [ "$SPARK_IDENT_STRING" = "" ]; then - export SPARK_IDENT_STRING="$USER" -fi - - -export SPARK_PRINT_LAUNCH_COMMAND="1" - -# get log directory -if [ "$SPARK_LOG_DIR" = "" ]; then - export SPARK_LOG_DIR="${SPARK_HOME}/logs" -fi -mkdir -p "$SPARK_LOG_DIR" -touch "$SPARK_LOG_DIR"/.spark_test > /dev/null 2>&1 -TEST_LOG_DIR=$? -if [ "${TEST_LOG_DIR}" = "0" ]; then - rm -f "$SPARK_LOG_DIR"/.spark_test -else - chown "$SPARK_IDENT_STRING" "$SPARK_LOG_DIR" -fi - -if [ "$SPARK_PID_DIR" = "" ]; then - SPARK_PID_DIR=/tmp -fi - -# some variables -log="$SPARK_LOG_DIR/spark-$SPARK_IDENT_STRING-$command-$instance-$HOSTNAME.out" -pid="$SPARK_PID_DIR/spark-$SPARK_IDENT_STRING-$command-$instance.pid" - -# Set default scheduling priority -if [ "$SPARK_NICENESS" = "" ]; then - export SPARK_NICENESS=0 -fi - -execute_command() { - if [ -z ${SPARK_NO_DAEMONIZE+set} ]; then - nohup -- "$@" >> $log 2>&1 < /dev/null & - newpid="$!" - - echo "$newpid" > "$pid" - - # Poll for up to 5 seconds for the java process to start - for i in {1..10} - do - if [[ $(ps -p "$newpid" -o comm=) =~ "java" ]]; then - break - fi - sleep 0.5 - done - - sleep 2 - # Check if the process has died; in that case we'll tail the log so the user can see - if [[ ! 
$(ps -p "$newpid" -o comm=) =~ "java" ]]; then - echo "failed to launch: $@" - tail -10 "$log" | sed 's/^/ /' - echo "full log in $log" - fi - else - "$@" - fi -} - -run_command() { - mode="$1" - shift - - mkdir -p "$SPARK_PID_DIR" - - if [ -f "$pid" ]; then - TARGET_ID="$(cat "$pid")" - if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then - echo "$command running as process $TARGET_ID. Stop it first." - exit 1 - fi - fi - - if [ "$SPARK_MASTER" != "" ]; then - echo rsync from "$SPARK_MASTER" - rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}" - fi - - spark_rotate_log "$log" - echo "starting $command, logging to $log" - - case "$mode" in - (class) - execute_command nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-class "$command" "$@" - ;; - - (submit) - execute_command nice -n "$SPARK_NICENESS" bash "${SPARK_HOME}"/bin/spark-submit --class "$command" "$@" - ;; - - (*) - echo "unknown mode: $mode" - exit 1 - ;; - esac - -} - -case $option in - - (submit) - run_command submit "$@" - ;; - - (start) - run_command class "$@" - ;; - - (stop) - - if [ -f $pid ]; then - TARGET_ID="$(cat "$pid")" - if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then - echo "stopping $command" - kill "$TARGET_ID" && rm -f "$pid" - else - echo "no $command to stop" - fi - else - echo "no $command to stop" - fi - ;; - - (status) - - if [ -f $pid ]; then - TARGET_ID="$(cat "$pid")" - if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then - echo $command is running. - exit 0 - else - echo $pid file is present but $command not running - exit 1 - fi - else - echo $command not running. - exit 2 - fi - ;; - - (*) - echo $usage - exit 1 - ;; - -esac - - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/spark-daemons.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/spark-daemons.sh deleted file mode 100755 index dec2f44..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/spark-daemons.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Run a Spark command on all slave hosts. - -usage="Usage: spark-daemons.sh [--config ] [start|stop] command instance-number args..." - -# if no args specified, show usage -if [ $# -le 1 ]; then - echo $usage - exit 1 -fi - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. 
"${SPARK_HOME}/sbin/spark-config.sh" - -exec "${SPARK_HOME}/sbin/slaves.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/spark-daemon.sh" "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-all.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-all.sh deleted file mode 100755 index a5d30d2..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-all.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Start all spark daemons. -# Starts the master on this node. -# Starts a worker on each node specified in conf/slaves - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -# Load the Spark configuration -. "${SPARK_HOME}/sbin/spark-config.sh" - -# Start Master -"${SPARK_HOME}/sbin"/start-master.sh - -# Start Workers -"${SPARK_HOME}/sbin"/start-slaves.sh diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-history-server.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-history-server.sh deleted file mode 100755 index 38a43b9..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-history-server.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Starts the history server on the machine this script is executed on. -# -# Usage: start-history-server.sh -# -# Use the SPARK_HISTORY_OPTS environment variable to set history server configuration. -# - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" -. 
"${SPARK_HOME}/bin/load-spark-env.sh" - -exec "${SPARK_HOME}/sbin"/spark-daemon.sh start org.apache.spark.deploy.history.HistoryServer 1 "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-master.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-master.sh deleted file mode 100755 index 97ee321..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-master.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Starts the master on the machine this script is executed on. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -# NOTE: This exact class name is matched downstream by SparkSubmit. -# Any changes need to be reflected there. -CLASS="org.apache.spark.deploy.master.Master" - -if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then - echo "Usage: ./sbin/start-master.sh [options]" - pattern="Usage:" - pattern+="\|Using Spark's default log4j profile:" - pattern+="\|Registered signal handlers for" - - "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 - exit 1 -fi - -ORIGINAL_ARGS="$@" - -. "${SPARK_HOME}/sbin/spark-config.sh" - -. "${SPARK_HOME}/bin/load-spark-env.sh" - -if [ "$SPARK_MASTER_PORT" = "" ]; then - SPARK_MASTER_PORT=7077 -fi - -if [ "$SPARK_MASTER_HOST" = "" ]; then - case `uname` in - (SunOS) - SPARK_MASTER_HOST="`/usr/sbin/check-hostname | awk '{print $NF}'`" - ;; - (*) - SPARK_MASTER_HOST="`hostname -f`" - ;; - esac -fi - -if [ "$SPARK_MASTER_WEBUI_PORT" = "" ]; then - SPARK_MASTER_WEBUI_PORT=8080 -fi - -"${SPARK_HOME}/sbin"/spark-daemon.sh start $CLASS 1 \ - --host $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT \ - $ORIGINAL_ARGS diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-mesos-dispatcher.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-mesos-dispatcher.sh deleted file mode 100755 index ecaad7a..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-mesos-dispatcher.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# Starts the Mesos Cluster Dispatcher on the machine this script is executed on. -# The Mesos Cluster Dispatcher is responsible for launching the Mesos framework and -# Rest server to handle driver requests for Mesos cluster mode. -# Only one cluster dispatcher is needed per Mesos cluster. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" - -. "${SPARK_HOME}/bin/load-spark-env.sh" - -if [ "$SPARK_MESOS_DISPATCHER_PORT" = "" ]; then - SPARK_MESOS_DISPATCHER_PORT=7077 -fi - -if [ "$SPARK_MESOS_DISPATCHER_HOST" = "" ]; then - case `uname` in - (SunOS) - SPARK_MESOS_DISPATCHER_HOST="`/usr/sbin/check-hostname | awk '{print $NF}'`" - ;; - (*) - SPARK_MESOS_DISPATCHER_HOST="`hostname -f`" - ;; - esac -fi - -if [ "$SPARK_MESOS_DISPATCHER_NUM" = "" ]; then - SPARK_MESOS_DISPATCHER_NUM=1 -fi - -"${SPARK_HOME}/sbin"/spark-daemon.sh start org.apache.spark.deploy.mesos.MesosClusterDispatcher $SPARK_MESOS_DISPATCHER_NUM --host $SPARK_MESOS_DISPATCHER_HOST --port $SPARK_MESOS_DISPATCHER_PORT "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-mesos-shuffle-service.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-mesos-shuffle-service.sh deleted file mode 100755 index 1845845..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-mesos-shuffle-service.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Starts the Mesos external shuffle server on the machine this script is executed on. -# The Mesos external shuffle service detects when an application exits and automatically -# cleans up its shuffle files. -# -# Usage: start-mesos-shuffle-server.sh -# -# Use the SPARK_SHUFFLE_OPTS environment variable to set shuffle service configuration. -# - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" -. "${SPARK_HOME}/bin/load-spark-env.sh" - -exec "${SPARK_HOME}/sbin"/spark-daemon.sh start org.apache.spark.deploy.mesos.MesosExternalShuffleService 1 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-shuffle-service.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-shuffle-service.sh deleted file mode 100755 index 793e165..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-shuffle-service.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Starts the external shuffle server on the machine this script is executed on. -# -# Usage: start-shuffle-server.sh -# -# Use the SPARK_SHUFFLE_OPTS environment variable to set shuffle server configuration. -# - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" -. "${SPARK_HOME}/bin/load-spark-env.sh" - -exec "${SPARK_HOME}/sbin"/spark-daemon.sh start org.apache.spark.deploy.ExternalShuffleService 1 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-slave.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-slave.sh deleted file mode 100755 index 8c268b8..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-slave.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Starts a slave on the machine this script is executed on. -# -# Environment Variables -# -# SPARK_WORKER_INSTANCES The number of worker instances to run on this -# slave. Default is 1. -# SPARK_WORKER_PORT The base port number for the first worker. If set, -# subsequent workers will increment this number. If -# unset, Spark will find a valid port number, but -# with no guarantee of a predictable pattern. -# SPARK_WORKER_WEBUI_PORT The base port for the web interface of the first -# worker. Subsequent workers will increment this -# number. Default is 8081. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -# NOTE: This exact class name is matched downstream by SparkSubmit. -# Any changes need to be reflected there. -CLASS="org.apache.spark.deploy.worker.Worker" - -if [[ $# -lt 1 ]] || [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then - echo "Usage: ./sbin/start-slave.sh [options] " - pattern="Usage:" - pattern+="\|Using Spark's default log4j profile:" - pattern+="\|Registered signal handlers for" - - "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 - exit 1 -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" - -. 
"${SPARK_HOME}/bin/load-spark-env.sh" - -# First argument should be the master; we need to store it aside because we may -# need to insert arguments between it and the other arguments -MASTER=$1 -shift - -# Determine desired worker port -if [ "$SPARK_WORKER_WEBUI_PORT" = "" ]; then - SPARK_WORKER_WEBUI_PORT=8081 -fi - -# Start up the appropriate number of workers on this machine. -# quick local function to start a worker -function start_instance { - WORKER_NUM=$1 - shift - - if [ "$SPARK_WORKER_PORT" = "" ]; then - PORT_FLAG= - PORT_NUM= - else - PORT_FLAG="--port" - PORT_NUM=$(( $SPARK_WORKER_PORT + $WORKER_NUM - 1 )) - fi - WEBUI_PORT=$(( $SPARK_WORKER_WEBUI_PORT + $WORKER_NUM - 1 )) - - "${SPARK_HOME}/sbin"/spark-daemon.sh start $CLASS $WORKER_NUM \ - --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@" -} - -if [ "$SPARK_WORKER_INSTANCES" = "" ]; then - start_instance 1 "$@" -else - for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do - start_instance $(( 1 + $i )) "$@" - done -fi diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-slaves.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-slaves.sh deleted file mode 100755 index f5269df..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-slaves.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Starts a slave instance on each machine specified in the conf/slaves file. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" -. "${SPARK_HOME}/bin/load-spark-env.sh" - -# Find the port number for the master -if [ "$SPARK_MASTER_PORT" = "" ]; then - SPARK_MASTER_PORT=7077 -fi - -if [ "$SPARK_MASTER_HOST" = "" ]; then - case `uname` in - (SunOS) - SPARK_MASTER_HOST="`/usr/sbin/check-hostname | awk '{print $NF}'`" - ;; - (*) - SPARK_MASTER_HOST="`hostname -f`" - ;; - esac -fi - -# Launch the slaves -"${SPARK_HOME}/sbin/slaves.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/start-slave.sh" "spark://$SPARK_MASTER_HOST:$SPARK_MASTER_PORT" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-thriftserver.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-thriftserver.sh deleted file mode 100755 index f02f317..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/start-thriftserver.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# Shell script for starting the Spark SQL Thrift server - -# Enter posix mode for bash -set -o posix - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -# NOTE: This exact class name is matched downstream by SparkSubmit. -# Any changes need to be reflected there. -CLASS="org.apache.spark.sql.hive.thriftserver.HiveThriftServer2" - -function usage { - echo "Usage: ./sbin/start-thriftserver [options] [thrift server options]" - pattern="usage" - pattern+="\|Spark assembly has been built with Hive" - pattern+="\|NOTE: SPARK_PREPEND_CLASSES is set" - pattern+="\|Spark Command: " - pattern+="\|=======" - pattern+="\|--help" - - "${SPARK_HOME}"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2 - echo - echo "Thrift server options:" - "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 -} - -if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then - usage - exit 0 -fi - -export SUBMIT_USAGE_FUNCTION=usage - -exec "${SPARK_HOME}"/sbin/spark-daemon.sh submit $CLASS 1 --name "Thrift JDBC/ODBC Server" "$@" diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-all.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-all.sh deleted file mode 100755 index 4e476ca0..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-all.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Stop all spark daemons. -# Run this on the master node. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -# Load the Spark configuration -. "${SPARK_HOME}/sbin/spark-config.sh" - -# Stop the slaves, then the master -"${SPARK_HOME}/sbin"/stop-slaves.sh -"${SPARK_HOME}/sbin"/stop-master.sh - -if [ "$1" == "--wait" ] -then - printf "Waiting for workers to shut down..." - while true - do - running=`${SPARK_HOME}/sbin/slaves.sh ps -ef | grep -v grep | grep deploy.worker.Worker` - if [ -z "$running" ] - then - printf "\nAll workers successfully shut down.\n" - break - else - printf "." 
- sleep 10 - fi - done -fi diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-history-server.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-history-server.sh deleted file mode 100755 index 14e3af4..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-history-server.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Stops the history server on the machine this script is executed on. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -"${SPARK_HOME}/sbin/spark-daemon.sh" stop org.apache.spark.deploy.history.HistoryServer 1 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-master.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-master.sh deleted file mode 100755 index 14644ea..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-master.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Stops the master on the machine this script is executed on. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" - -"${SPARK_HOME}/sbin"/spark-daemon.sh stop org.apache.spark.deploy.master.Master 1 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-mesos-dispatcher.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-mesos-dispatcher.sh deleted file mode 100755 index b13e018..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-mesos-dispatcher.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Stop the Mesos Cluster dispatcher on the machine this script is executed on. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" - -if [ "$SPARK_MESOS_DISPATCHER_NUM" = "" ]; then - SPARK_MESOS_DISPATCHER_NUM=1 -fi - -"${SPARK_HOME}/sbin"/spark-daemon.sh stop org.apache.spark.deploy.mesos.MesosClusterDispatcher \ - $SPARK_MESOS_DISPATCHER_NUM - diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-mesos-shuffle-service.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-mesos-shuffle-service.sh deleted file mode 100755 index d23cad3..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-mesos-shuffle-service.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Stops the Mesos external shuffle service on the machine this script is executed on. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -"${SPARK_HOME}/sbin"/spark-daemon.sh stop org.apache.spark.deploy.mesos.MesosExternalShuffleService 1 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-shuffle-service.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-shuffle-service.sh deleted file mode 100755 index 50d69cf..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-shuffle-service.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Stops the external shuffle service on the machine this script is executed on. 
- -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -"${SPARK_HOME}/sbin"/spark-daemon.sh stop org.apache.spark.deploy.ExternalShuffleService 1 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-slave.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-slave.sh deleted file mode 100755 index 685bcf5..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-slave.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# A shell script to stop all workers on a single slave -# -# Environment variables -# -# SPARK_WORKER_INSTANCES The number of worker instances that should be -# running on this slave. Default is 1. - -# Usage: stop-slave.sh -# Stops all slaves on this worker machine - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" - -. "${SPARK_HOME}/bin/load-spark-env.sh" - -if [ "$SPARK_WORKER_INSTANCES" = "" ]; then - "${SPARK_HOME}/sbin"/spark-daemon.sh stop org.apache.spark.deploy.worker.Worker 1 -else - for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do - "${SPARK_HOME}/sbin"/spark-daemon.sh stop org.apache.spark.deploy.worker.Worker $(( $i + 1 )) - done -fi diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-slaves.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-slaves.sh deleted file mode 100755 index a57441b..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-slaves.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -. "${SPARK_HOME}/sbin/spark-config.sh" - -. 
"${SPARK_HOME}/bin/load-spark-env.sh" - -"${SPARK_HOME}/sbin/slaves.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin"/stop-slave.sh diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-thriftserver.sh b/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-thriftserver.sh deleted file mode 100755 index cf45058..0000000 --- a/scripts/spark-2.4.3-bin-hadoop2.7/sbin/stop-thriftserver.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Stops the thrift server on the machine this script is executed on. - -if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" -fi - -"${SPARK_HOME}/sbin"/spark-daemon.sh stop org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 1 diff --git a/scripts/spark-2.4.3-bin-hadoop2.7/yarn/spark-2.4.3-yarn-shuffle.jar b/scripts/spark-2.4.3-bin-hadoop2.7/yarn/spark-2.4.3-yarn-shuffle.jar deleted file mode 100644 index 7919686..0000000 Binary files a/scripts/spark-2.4.3-bin-hadoop2.7/yarn/spark-2.4.3-yarn-shuffle.jar and /dev/null differ diff --git a/scripts/submit.py b/scripts/submit.py index 915192d..73b3ea9 100755 --- a/scripts/submit.py +++ b/scripts/submit.py @@ -1,43 +1,88 @@ import sys import yaml import os - from optparse import OptionParser +import subprocess +from collections import deque + +# Load configuration from config.yml +config_path = os.path.join(sys.path[0], "config/config.yml") +with open(config_path, "r") as ymlfile: + cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) +# Parsing command-line options parser = OptionParser() -parser.add_option("-s", "--script", dest="script", - default='', help="Script to submit") -parser.add_option("-r", "--ring", dest="ring", - default='IT', help="RING name") -parser.add_option("-x", "--extra", dest="extra", - default='', help="Extra parameter") +parser.add_option("-s", "--script", dest="script", default="", help="Script to submit. 
Use 'shell' to run a Spark shell.")
+parser.add_option("-r", "--ring", dest="ring", default=cfg.get("ring","DATA"), help="RING name")
+parser.add_option("-x", "--extra", dest="extra", default="", help="Extra parameter")
 (options, args) = parser.parse_args()
-config_path = "%s/%s" % ( sys.path[0] ,"config/config.yml")
-with open(config_path, 'r') as ymlfile:
-    cfg = yaml.load(ymlfile)
+script_path = options.script
-script=options.script
-opt=options.ring
-opt2=options.extra
-localdir = "%s/tmp/" % "/var"
+ring = options.ring
+arg2 = options.extra
 total_cores = int(cfg["spark.executor.instances"]) * int(cfg["spark.executor.cores"])
-cmd = "./spark-2.4.3-bin-hadoop2.7/bin/spark-submit --master %s \
-    --driver-memory=10g \
-    --executor-memory=10g \
-    --total-executor-cores=%s \
-    --conf spark.executorEnv.SHELL=/bin/bash \
-    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
-    --conf spark.hadoop.fs.s3a.access.key=%s \
-    --conf spark.hadoop.fs.s3a.secret.key=%s \
-    --conf spark.worker.cleanup.enabled=true \
-    --conf spark.worker.cleanup.interval=60 \
-    --conf spark.worker.cleanup.appDataTtl=604800 \
-    --conf spark.hadoop.fs.s3a.endpoint=%s \
-    --conf spark.local.dir=%s \
-    --jars file:/root/spark/aws-java-sdk-1.7.4.jar,file:/root/spark/hadoop-aws-2.7.3.jar \
-    --driver-class-path=/root/spark/aws-java-sdk-1.7.4.jar:/root/spark/hadoop-aws-2.7.3.jar \
-    ./%s %s %s" % ( cfg["master"], total_cores, cfg["s3"]["access_key"] , cfg["s3"]["secret_key"] , cfg["s3"]["endpoint"] , localdir, script , opt, opt2 )
-
-os.system(cmd)
+spark_args = f"--master {cfg['master']} \
+    --conf spark.executor.instances={cfg['spark.executor.instances']} \
+    --conf spark.executor.cores={cfg['spark.executor.cores']} \
+    --conf spark.driver.host={cfg['spark.driver.bindAddress']} \
+    --conf spark.driver.bindAddress={cfg['spark.driver.bindAddress']} \
+    --conf spark.worker.cleanup.enabled=false \
+    --conf spark.worker.cleanup.interval=60 \
+    --conf spark.worker.cleanup.appDataTtl=604800 \
+    --conf spark.memory.offHeap.enabled={str(cfg['spark.memory.offHeap.enabled']).lower()} \
+    --conf spark.memory.offHeap.size={cfg['spark.memory.offHeap.size']} \
+    --total-executor-cores={total_cores} \
+    --executor-memory={cfg['spark.executor.memory']} \
+    --driver-memory={cfg['spark.driver.memory']} \
+    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
+    --conf spark.hadoop.fs.s3a.access.key={cfg['s3']['access_key']} \
+    --conf spark.hadoop.fs.s3a.secret.key={cfg['s3']['secret_key']} \
+    --conf spark.hadoop.fs.s3a.endpoint={cfg['s3']['endpoint']} \
+    --conf spark.hadoop.fs.s3a.buffer.dir={cfg['datadir']['container']}/s3a-buffer \
+    --conf spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled=true \
+    --conf spark.hadoop.fs.s3a.path.style.access=true \
+    --conf spark.local.dir={cfg['datadir']['container']} \
+    --jars file:/spark/jars/aws-java-sdk-bundle-1.12.770.jar,file:/spark/jars/hadoop-aws-3.3.4.jar,file:/spark/jars/spark-hadoop-cloud_2.13-3.5.2.jar \
+    --driver-class-path=/spark/jars/aws-java-sdk-bundle-1.12.770.jar:/spark/jars/hadoop-aws-3.3.4.jar:/spark/jars/spark-hadoop-cloud_2.13-3.5.2.jar \
+    --deploy-mode client"
+
+def tail(filename, n=10):
+    with open(filename) as f:
+        return deque(f, n)
+
+def run_script(worker_id, script_path, flag1, flag2):
+    cmd = f"spark-submit {spark_args} {script_path} {flag1} {flag2}"
+
+    # Define file paths for stdout and stderr
+    stdout_file = f"{cfg['logdir']['container']}/worker_{worker_id}_stdout.log"
+    stderr_file = f"{cfg['logdir']['container']}/worker_{worker_id}_stderr.log"
+
+    print(f"Running command: {cmd}")
+    with open(stdout_file, "w") as stdout_f, open(stderr_file, "w") as stderr_f:
+        process = subprocess.Popen(cmd, shell=True, stdout=stdout_f, stderr=stderr_f)
+        process.communicate()
+
+    if process.returncode == 0:
+        print(f"Worker {worker_id} executed {script_path} successfully. Output extract:\n")
+        for line in tail(stdout_file, 10):
+            print(line.strip()[:300])
+        print("\n")
+        return f"Worker {worker_id} executed {script_path} successfully. Output written to {stdout_file}."
+    else:
+        return f"Worker {worker_id} failed to execute {script_path}. Check {stderr_file} and {stdout_file} for errors."
+
+def run_shell():
+    cmd = f"pyspark --master {cfg['master']}"
+    process = subprocess.Popen(cmd, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=True)
+    process.wait()
+
+os.environ['AWS_JAVA_V1_DISABLE_DEPRECATION_ANNOUNCEMENT'] = 'true'
+os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:3.3.4" pyspark-shell'
+
+if script_path == "shell":
+    run_shell()
+else:
+    res = run_script(0, script_path, ring, arg2)
+    print(res)
diff --git a/scripts/test_spark.py b/scripts/test_spark.py
new file mode 100644
index 0000000..39a91ad
--- /dev/null
+++ b/scripts/test_spark.py
@@ -0,0 +1,28 @@
+from pyspark import SparkContext
+
+def calculate_sum_of_squares():
+    # Create a Spark context
+    sc = SparkContext.getOrCreate()
+
+    # Example data: numbers from 1 to 200,000,000,
+    # large enough to spread real work across the cluster
+    data = range(1, 200000001)
+
+    # Distribute the data across the workers
+    rdd = sc.parallelize(data, 100)
+
+    # Square each number
+    squares = rdd.map(lambda x: x ** 2)
+
+    # Sum the squares
+    sum_of_squares = squares.reduce(lambda a, b: a + b)
+
+    # Return the result
+    return sum_of_squares
+
+if __name__ == "__main__":
+    # Run the computation
+    result = calculate_sum_of_squares()
+
+    # Print the result
+    print(f"The sum of squares is: {result}")
diff --git a/spark_run.sh b/spark_run.sh
new file mode 100755
index 0000000..c0f8174
--- /dev/null
+++ b/spark_run.sh
@@ -0,0 +1,299 @@
+#!/bin/bash
+
+[ -z "$1" ] && echo "Usage: $0 <start|stop|status|exec|driver|env>" && exit 1
+
+# Version of the Spark image and tooling. Export it.
+export VERSION=3.5.2-12
+export IMAGE_NAME=registry.scality.com/spark/spark-container
+
+# Please change the IPs according to your architecture.
+# Without tuning entrypoint.sh, a worker cannot run on the same host as the master.
+master="10.160.169.162"
+workers="10.160.174.4 10.160.168.21 10.160.171.166 10.160.172.100 10.160.169.238"
+
+
+# Please change the directory where Spark workers will write temporary data
+
+datadir="/scality/ssd01/spark"
+
+# Directories for apps (scripts) and logs
+appsdir="/root/spark-apps"
+logsdir="/root/spark-logs"
+
+# Extra hosts you need containers to be able to resolve.
+# format: "ip1:host1 ip2:host2 ..."
+# Example: +# extrahosts="10.2.4.1:s3.scality.com +# 19.10.3.1:microsoft.com" + +extrahosts="" + +[ "${1}" = "env" ] && return 0 + +########### END of tunable variables + +spark_image_name="${IMAGE_NAME}" +spark_image_name_escaped=$(echo "${spark_image_name}" | sed "s#/#\\/#g") +spark_image_version="${VERSION}" + +spark_image_full="${spark_image_name}:${spark_image_version}" + +# Detecting IPs +local_ips=$(ip ad |grep inet |grep -Ev "127.0.0.1|::" |awk '{print $2}') + +local_master="" +if echo "$local_ips" |grep -Fqw $master ; then local_master=$master ; fi + +local_worker="" +for worker in $workers ; do + if echo "$local_ips" |grep -Fqw $worker ; then local_worker=$worker ; fi +done + +container_command="$(basename "$(type -p docker || type -p ctr)")" +test -z "$container_command" && echo "docker or CTR not found!" && exit 1 + +echo "Checking the Spark image with $container_command" +case $container_command + in + docker) version_check_result=$($container_command images --format '{{.Repository}}:{{.Tag}}' | grep "${spark_image_full}") + # extrahosts update + add_hosts="" + for i in $extrahosts ; do add_hosts+=" --add-host=$(echo $i)"; done + ;; + ctr) version_check_result=$($container_command images list | grep "${spark_image_full}" 2> /dev/null) + # hosts update + host_storage="" + count=01 + localname= + for i in $workers ; do + n=spark-worker-$(printf "%02d" ${count}) + ((count++)) + if echo "$local_ips" |grep -Fqw "$i" ; then n="${n} `hostname -s`" ; fi + test "$(grep -qw "$n" /etc/hosts ; echo $?)" == 0 && continue + host_storage+="$i $n\n" + done + for i in $extrahosts ; do + test "$(grep -qw "$i" /etc/hosts ; echo $?)" == 0 && continue + host_storage+="$(echo $i | sed 's#:# #') # Part of Spark extrahosts $i\n" + done + test -n "$host_storage" && echo -e "# Host file updated for Spark\n$host_storage" >> /etc/hosts + + # Manage when a role has changed in the same node + # spark-master should be known on all nodes + if ! grep -q "$master spark-master # set-by-spark_run.sh" /etc/hosts; + then + grep -q "spark-master # set-by-spark_run.sh" /etc/hosts && \ + sed -i '/spark-master # set-by-spark_run.sh/d' /etc/hosts + echo "$master spark-master # set-by-spark_run.sh" >> /etc/hosts + fi + ;; +esac + +if [ -n "${version_check_result}" ];then + echo "OK" +else + echo "Please install the image ${spark_image_name}:${spark_image_version}." + exit 1 +fi + +echo "Checking appsdir" + +if [ ! -f "${appsdir}/config/config.yml" ];then + + echo "appsdir ${appsdir} must be copied from the supervisor." 
+ exit 1 +fi + +case $1 +in + exec) + shift + case $container_command + in + docker) + + $container_command run --rm --net=host --name=EXEC \ + -v "${appsdir}:/opt/spark/apps:rw" \ + -v "${datadir}:/opt/spark/tmp:rw" \ + -v "${logsdir}:/opt/spark/spark-events:rw" \ + ${add_hosts} \ + "${spark_image_full}" \ + exec "$@" + ;; + ctr) + $container_command run --net-host --rm --mount="type=bind,src=${appsdir},dst=/opt/spark/apps,options=rbind:rw" \ + --mount="type=bind,src=${datadir},dst=/opt/spark/tmp,options=rbind:rw" \ + --mount="type=bind,src=${logsdir},dst=/opt/spark/spark-events,options=rbind:rw" \ + "${spark_image_full}" EXEC /opt/spark/entrypoint.sh exec "$@" + esac + ;; + driver) + case $container_command + in + docker) + $container_command run --rm --net=host --name=SPARK-DRIVER -ti \ + -v "${appsdir}:/opt/spark/apps:rw" \ + -v "${datadir}:/opt/spark/tmp:rw" \ + -v "${logsdir}:/opt/spark/spark-events:rw" \ + --add-host="spark-master:${master}" \ + --add-host="$(hostname -s):$(echo ${workers} | awk '{print $1}')" \ + ${add_hosts} \ + --workdir=/opt/spark/apps \ + "${spark_image_full}" \ + driver + ;; + ctr) + $container_command run --net-host --rm -cwd /opt/spark/apps --mount="type=bind,src=${appsdir},dst=/opt/spark/apps,options=rbind:rw" \ + --mount="type=bind,src=${datadir},dst=/opt/spark/tmp,options=rbind:rw" \ + --mount="type=bind,src=${logsdir},dst=/opt/spark/spark-events,options=rbind:rw" -t \ + "${spark_image_full}" SPARK-DRIVER /opt/spark/entrypoint.sh driver 2> /dev/null + esac + ;; + start) echo "Starting Spark node" + echo "Creating data dir ${datadir}" + mkdir -p "${datadir}/s3a-buffer" + + echo "Creating log dir" + mkdir -p "${logsdir}" + + case $container_command + in + docker) echo "Running $container_command" + + # hosts update + host_storage="${add_hosts}" + count=01 + for i in $workers ; do host_storage+=" --add-host=spark-worker-$(printf "%02d" ${count}):$i"; ((count++)) ; done + + if [ -n "$local_master" ] ; then + echo "Running master here" + $container_command run --rm -d --net=host --name spark-master \ + --env='SPARK_NO_DAEMONIZE=true' \ + --hostname=spark-master \ + --add-host=spark-master:$master \ + --volume "${appsdir}:/opt/spark/apps:rw" \ + --volume "${logsdir}:/opt/spark/spark-events:rw" \ + --volume "${datadir}:/opt/spark/tmp:rw" \ + ${host_storage} \ + ${spark_image_full} \ + master + fi + + if [ -n "$local_worker" ] ; then + echo "Running worker here" + $container_command run --rm -d --net=host --name spark-worker \ + --env='SPARK_NO_DAEMONIZE=true' \ + --hostname=spark-worker \ + --add-host=spark-master:$master \ + --add-host=spark-worker:"$local_worker" \ + --volume "${appsdir}:/opt/spark/apps:rw" \ + --volume "${logsdir}:/opt/spark/spark-events:rw" \ + --volume "${datadir}:/opt/spark/tmp:rw" \ + ${host_storage} \ + ${spark_image_full} \ + worker + fi + ;; + ctr) echo "Running $container_command" + + if [ -n "$local_master" ] ; then + echo "Running master here" + # Add the server's short hostname for master + echo "${master} $(hostname -s) # Added by spark_run.sh" >> /etc/hosts + # remove if exists, throw error if running + c=$($container_command c ls |grep -w spark-master 2> /dev/null) + echo "Checking if the container aleady exists" + test -n "$c" && \ + $container_command c rm spark-master 2> /dev/null + + # start + $container_command run -d --net-host \ + --env='SPARK_NO_DAEMONIZE=true' \ + --mount="type=bind,src=${appsdir},dst=/opt/spark/apps,options=rbind:rw" \ + --mount='type=bind,src='${logsdir}',dst=/opt/spark/spark-events,options=rbind:rw' 
\ + --mount='type=bind,src='${datadir}',dst=/opt/spark/tmp,options=rbind:rw' \ + ${spark_image_full} spark-master \ + ./entrypoint.sh master 2> /dev/null + + fi + + if [ -n "$local_worker" ] ; then + # Manage when a role has changed in the same node + # spark-worker is only a local name + if ! grep -q "$local_worker spark-worker # set-by-spark_run.sh" /etc/hosts; + then + grep -q "spark-worker # set-by-spark_run.sh" /etc/hosts && \ + sed -i '/spark-worker # set-by-spark_run.sh/d' /etc/hosts + echo "$local_worker spark-worker # set-by-spark_run.sh" >> /etc/hosts + fi + + echo "Running worker here" + # remove if exists, throw error if running + c=$($container_command c ls |grep -w spark-worker 2> /dev/null) + echo "Checking if the container aleady exists" + test -n "$c" && \ + $container_command c rm spark-worker 2> /dev/null + + $container_command run -d --net-host \ + --mount='type=bind,src='${appsdir}',dst=/opt/spark/apps,options=rbind:rw' \ + --mount='type=bind,src='${datadir}',dst=/opt/spark/tmp,options=rbind:rw' \ + --mount='type=bind,src='${logsdir}',dst=/opt/spark/spark-events,options=rbind:rw' \ + --mount='type=bind,src=/etc/scality/node/,dst=/etc/scality/node/,options=rbind:r' \ + --env='SPARK_NO_DAEMONIZE=true' \ + ${spark_image_full} spark-worker \ + ./entrypoint.sh worker 2> /dev/null + + else + grep -q "spark-worker # set-by-spark_run.sh" /etc/hosts && \ + sed -i '/spark-worker # set-by-spark_run.sh/d' /etc/hosts + fi + ;; + esac + ;; + stop) echo "Stopping Spark node" + case $container_command + in + docker) echo "Stopping using $container_command" + if [ -n "$local_master" ] ; then + echo "Stopping master here" + $container_command stop spark-master + fi + + if [ -n "$local_worker" ] ; then + echo "Stopping worker here" + $container_command stop spark-worker + fi + ;; + ctr) echo "Stopping using $container_command" + # Remember, ctr cannot run in daemon with rm argument like docker + + if [ -n "$local_master" ] ; then + echo "Stopping master here" + $container_command t kill --signal 9 -a spark-master + sleep 1 + $container_command c rm spark-master + fi + + if [ -n "$local_worker" ] ; then + echo "Stopping worker here" + $container_command t kill --signal 9 -a spark-worker + sleep 1 + $container_command c rm spark-worker + fi + ;; + esac + ;; + status) echo "Status of Spark node" + case $container_command + in + docker) $container_command ps |grep spark + ;; + ctr) $container_command c ls + $container_command t ls + ;; + esac + ;; + *) echo "Usage: $0 " + exit 1 + ;; +esac diff --git a/start-master.sh b/start-master.sh deleted file mode 100644 index 7267b9f..0000000 --- a/start-master.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh -/spark/bin/spark-class org.apache.spark.deploy.master.Master \ - --ip spark-master \ - --port 7077 \ - --webui-port 8080 diff --git a/start-worker.sh b/start-worker.sh deleted file mode 100644 index b79341d..0000000 --- a/start-worker.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -/spark/bin/spark-class org.apache.spark.deploy.worker.Worker \ - --webui-port 8080 spark://spark-master:7077 diff --git a/verifyBucketSproxydKeys_1.15.3.js.patch b/verifyBucketSproxydKeys_1.15.3.js.patch new file mode 100644 index 0000000..212f172 --- /dev/null +++ b/verifyBucketSproxydKeys_1.15.3.js.patch @@ -0,0 +1,18 @@ +--- nodejs/verifyBucketSproxydKeys.js 2025-06-26 17:22:56.295562337 +0200 ++++ nodejs/verifyBucketSproxydKeys.js.new 2025-06-26 17:23:52.550265817 +0200 +@@ -316,15 +316,6 @@ + return cb(); + }; + +- const dupInfo = 
findDuplicateSproxydKeys.insertVersion(objectUrl, locations.map(loc => loc.key)); +- if (dupInfo) { +- log.error('duplicate sproxyd key found', { +- objectUrl, +- objectUrl2: dupInfo.objectId, +- sproxydKey: dupInfo.key, +- }); +- dupKey = true; +- } + if (NO_MISSING_KEY_CHECK) { + if (VERBOSE) { + locations.forEach(loc => log.info('sproxyd key', { diff --git a/worker.sh b/worker.sh deleted file mode 100644 index 3c5c824..0000000 --- a/worker.sh +++ /dev/null @@ -1,4 +0,0 @@ -#docker run --rm -it --name spark-master --hostname spark-master \ -# -p 7077:7077 -p 8080:8080 spark /spark/bin/spark-class org.apache.spark.deploy.master.Master --ip `hostname` --port 7077 --webui-port 8080 - -docker run -d --rm -it --net=host --name spark-worker --hostname spark-worker --add-host spark-master:178.33.63.238 --add-host spark-worker:178.33.63.238 --add-host node1:178.33.63.238 spark-worker
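
Note: the reworked scripts/submit.py reads all of its settings from scripts/config/config.yml, which .gitignore keeps out of the repository. The sketch below is a small validator for the keys the script dereferences; the key list is inferred from the code in this diff, and a real config may carry additional fields (the "ring" key, for instance, is optional because submit.py falls back to "DATA").

import sys
import yaml

# Flat keys submit.py indexes directly on the loaded config
REQUIRED_KEYS = [
    "master",
    "spark.executor.instances",
    "spark.executor.cores",
    "spark.executor.memory",
    "spark.driver.memory",
    "spark.driver.bindAddress",
    "spark.memory.offHeap.enabled",
    "spark.memory.offHeap.size",
]

# Nested sections and the sub-keys submit.py expects inside them
REQUIRED_NESTED = {
    "s3": ["access_key", "secret_key", "endpoint"],
    "datadir": ["container"],
    "logdir": ["container"],
}

def check_config(path="config/config.yml"):
    with open(path, "r") as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    missing = [k for k in REQUIRED_KEYS if k not in cfg]
    for section, subkeys in REQUIRED_NESTED.items():
        for sub in subkeys:
            if sub not in cfg.get(section, {}):
                missing.append(f"{section}.{sub}")
    return missing

if __name__ == "__main__":
    missing = check_config(sys.argv[1] if len(sys.argv) > 1 else "config/config.yml")
    if missing:
        print("config.yml is missing keys:", ", ".join(missing))
        sys.exit(1)
    print("config.yml provides every key submit.py reads.")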
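
Note: scripts/test_spark.py computes the sum of squares of 1..200,000,000 across the cluster, so its output can be checked locally against the closed form n(n+1)(2n+1)/6. A quick cross-check:

def expected_sum_of_squares(n: int) -> int:
    # Closed form for 1^2 + 2^2 + ... + n^2
    return n * (n + 1) * (2 * n + 1) // 6

if __name__ == "__main__":
    n = 200_000_000
    # Prints 2666666686666666700000000, which the distributed job should match
    print(expected_sum_of_squares(n))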
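
Note: to exercise the S3A settings that submit.py passes (endpoint, credentials, path-style access), a small round-trip job along the lines below could be dropped into the apps directory and launched with submit.py. This is only a sketch: "test-bucket" and the output prefix are placeholders, and it assumes the credentials in config.yml can write to that bucket.

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("s3a-smoke-test").getOrCreate()

    # Write a tiny DataFrame to the RING through the S3A connector...
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    df.write.mode("overwrite").parquet("s3a://test-bucket/s3a-smoke-test/")

    # ...and read it back to confirm endpoint, credentials and
    # path-style access are wired correctly.
    print(spark.read.parquet("s3a://test-bucket/s3a-smoke-test/").count())
    spark.stop()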