diff --git a/.github/workflows/build-and-package.yaml b/.github/workflows/build-and-package.yaml
deleted file mode 100644
index 37add713..00000000
--- a/.github/workflows/build-and-package.yaml
+++ /dev/null
@@ -1,528 +0,0 @@
-name: BuildAndPackage
-
-on:
- push:
- tags:
- - "v*"
- branches: master
- pull_request:
- branches: master
- paths-ignore:
- - "*.md"
- - "*.markdown"
- - "mkdocs/**/*"
- tags-ignore:
- - "*"
-
-jobs:
- version_info:
- runs-on: ubuntu-18.04
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - id: vars
- run: |
- set -x -e
-
- echo "github event ref is ${{ github.ref }}"
-
- if [ "x${{ startsWith(github.ref, 'refs/tags/v') }}" == "xtrue" ]
- then
- echo "Trigger was a version tag - ${{ github.ref }}"
- echo ::set-output name=q_version::${GITHUB_REF#refs/tags/v}
- echo ::set-output name=is_release::true
- else
- # For testing version propagation inside the PR
- echo "Either branch of a non-version tag - setting version to 0.0.0"
- echo ::set-output name=q_version::0.0.0
- echo ::set-output name=is_release::false
- fi
-
- outputs:
- q_version: ${{ steps.vars.outputs.q_version }}
- is_release: ${{ steps.vars.outputs.is_release }}
-
- check_version_info:
- runs-on: ubuntu-18.04
- needs: version_info
- steps:
- - name: test q_version
- run: |
- set -e -x
-
- echo "outputs: ${{ toJson(needs.version_info) }}"
-
- create-man:
- runs-on: ubuntu-18.04
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Install Ruby
- uses: ruby/setup-ruby@v1
- with:
- ruby-version: '2.6'
- - name: Create man page
- run: |
- set -x -e
- gem install ronn
-
- ronn doc/USAGE.markdown
- # Must be gzipped, otherwise debian does not install it
- gzip doc/USAGE
- - name: Upload man page
- uses: actions/upload-artifact@v1.0.0
- with:
- name: q-man-page
- path: doc/USAGE.gz
-
- build-linux:
- runs-on: ubuntu-18.04
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Cache pyox
- uses: actions/cache@v2
- with:
- path: |
- ~/.cache/pyoxidizer
- key: ${{ runner.os }}-pyox
- - name: Install pyoxidizer
- run: |
- set -e -x
-
- sudo apt-get update
- sudo apt-get install -y zip sqlite3 rpm
-
- curl -o pyoxidizer.zip -L "https://github.com/indygreg/PyOxidizer/releases/download/pyoxidizer%2F0.17/pyoxidizer-0.17.0-linux_x86_64.zip"
- unzip pyoxidizer.zip
- chmod +x ./pyoxidizer
- - name: Create Q Executable - Linux
- run: |
- set -e -x
-
- ./pyoxidizer build --release
-
- export Q_EXECUTABLE=./build/x86_64-unknown-linux-gnu/release/install/q
- chmod 755 $Q_EXECUTABLE
-
- seq 1 100 | $Q_EXECUTABLE -c 1 "select sum(c1),count(*) from -" -S test.sqlite
-
- mkdir -p packages/linux/
- cp $Q_EXECUTABLE packages/linux/linux-q
- - name: Upload Linux Executable
- uses: actions/upload-artifact@v1.0.0
- with:
- name: linux-q
- path: packages/linux/linux-q
-
- test-linux:
- needs: build-linux
- runs-on: ubuntu-18.04
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Install Python for Testing
- uses: actions/setup-python@v2
- with:
- python-version: '3.8.12'
- architecture: 'x64'
- - name: Prepare Testing
- run: |
- set -e -x
-
- pip3 install -r test-requirements.txt
- - name: Download Linux Executable
- uses: actions/download-artifact@v2
- with:
- name: linux-q
- - name: Run Tests on Linux Executable
- run: |
- set -x -e
-
- find ./ -ls
-
- chmod 755 ./linux-q
-
- Q_EXECUTABLE=`pwd`/linux-q Q_SKIP_EXECUTABLE_VALIDATION=true ./run-tests.sh -v
-
- package-linux-deb:
- needs: [test-linux, create-man, version_info]
- runs-on: ubuntu-18.04
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Install Ruby
- uses: ruby/setup-ruby@v1
- with:
- ruby-version: '2.6'
- name: Download man page
- uses: actions/download-artifact@v2
- with:
- name: q-man-page
- - name: Download Linux Executable
- uses: actions/download-artifact@v2
- with:
- name: linux-q
- - name: Build DEB Package
- run: |
- set -e -x
-
- mkdir -p packages/linux/
-
- find ./ -ls
-
- chmod 755 ./linux-q
-
- export q_version=${{ needs.version_info.outputs.q_version }}
-
- gem install fpm
- cp dist/fpm-config ~/.fpm
- fpm -s dir -t deb --deb-use-file-permissions -p packages/linux/q-text-as-data-${q_version}-1.x86_64.deb --version ${q_version} ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz
- - name: Upload DEB Package
- uses: actions/upload-artifact@v1.0.0
- with:
- name: q-text-as-data-${{ needs.version_info.outputs.q_version }}-1.x86_64.deb
- path: packages/linux/q-text-as-data-${{ needs.version_info.outputs.q_version }}-1.x86_64.deb
-
- test-deb-packaging:
- runs-on: ubuntu-18.04
- needs: [package-linux-deb, version_info]
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Download DEB
- uses: actions/download-artifact@v2
- with:
- name: q-text-as-data-${{ needs.version_info.outputs.q_version }}-1.x86_64.deb
- - name: Install Python for Testing
- uses: actions/setup-python@v2
- with:
- python-version: '3.8.12'
- architecture: 'x64'
- - name: Prepare Testing
- run: |
- set -e -x
-
- pip3 install -r test-requirements.txt
- - name: Test DEB Package Installation
- run: ./dist/test-using-deb.sh ./q-text-as-data-${{ needs.version_info.outputs.q_version }}-1.x86_64.deb
-
- package-linux-rpm:
- needs: [test-linux, create-man, version_info]
- runs-on: ubuntu-18.04
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Install Ruby
- uses: ruby/setup-ruby@v1
- with:
- ruby-version: '2.6'
- - name: Download man page
- uses: actions/download-artifact@v2
- with:
- name: q-man-page
- - name: Download Linux Executable
- uses: actions/download-artifact@v2
- with:
- name: linux-q
- - name: Build RPM Package
- run: |
- set -e -x
-
- mkdir -p packages/linux
-
-
- chmod 755 ./linux-q
-
- export q_version=${{ needs.version_info.outputs.q_version }}
-
- gem install fpm
- cp dist/fpm-config ~/.fpm
- fpm -s dir -t rpm --rpm-use-file-permissions -p packages/linux/q-text-as-data-${q_version}.x86_64.rpm --version ${q_version} ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz
- - name: Upload RPM Package
- uses: actions/upload-artifact@v1.0.0
- with:
- name: q-text-as-data-${{ needs.version_info.outputs.q_version }}.x86_64.rpm
- path: packages/linux/q-text-as-data-${{ needs.version_info.outputs.q_version }}.x86_64.rpm
-
- test-rpm-packaging:
- runs-on: ubuntu-18.04
- needs: [package-linux-rpm, version_info]
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Download RPM
- uses: actions/download-artifact@v2
- with:
- name: q-text-as-data-${{ needs.version_info.outputs.q_version }}.x86_64.rpm
- - name: Retest using RPM
- run: ./dist/test-using-rpm.sh ./q-text-as-data-${{ needs.version_info.outputs.q_version }}.x86_64.rpm
-
- build-mac:
- runs-on: macos-11
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Cache pyox
- uses: actions/cache@v2
- with:
- path: |
- ~/.cache/pyoxidizer
- key: ${{ runner.os }}-pyox
- - name: Install pyoxidizer
- run: |
- set -e -x
-
- curl -o pyoxidizer.zip -L "https://github.com/indygreg/PyOxidizer/releases/download/pyoxidizer%2F0.17/pyoxidizer-0.17.0-macos-universal.zip"
- unzip pyoxidizer.zip
- mv macos-universal/pyoxidizer ./pyoxidizer
-
- chmod +x ./pyoxidizer
- - name: Create Q Executable - Mac
- run: |
- set -e -x
-
- ./pyoxidizer build --release
-
- export Q_EXECUTABLE=./build/x86_64-apple-darwin/release/install/q
- chmod 755 $Q_EXECUTABLE
-
- seq 1 100 | $Q_EXECUTABLE -c 1 "select sum(c1),count(*) from -" -S test.sqlite
-
- mkdir -p packages/macos/
- cp $Q_EXECUTABLE packages/macos/macos-q
- - name: Upload MacOS Executable
- uses: actions/upload-artifact@v1.0.0
- with:
- name: macos-q
- path: packages/macos/macos-q
-
- test-mac:
- needs: build-mac
- runs-on: macos-11
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Install Python for Testing
- uses: actions/setup-python@v2
- with:
- python-version: '3.8.12'
- architecture: 'x64'
- - name: Prepare Testing
- run: |
- set -e -x
-
- pip3 install wheel
-
- pip3 install -r test-requirements.txt
- - name: Download MacOS Executable
- uses: actions/download-artifact@v2
- with:
- name: macos-q
- - name: Run Tests on MacOS Executable
- run: |
- set -e -x
-
- chmod 755 ./macos-q
-
- Q_EXECUTABLE=`pwd`/macos-q Q_SKIP_EXECUTABLE_VALIDATION=true ./run-tests.sh -v
-
- not-package-mac:
- # create-man is not needed, as it's generated inside the brew formula independently
- needs: [test-mac]
- runs-on: macos-11
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Not Packaging Mac
- run: |
- echo "homebrew mac cannot be packaged from the source code itself, due to the package build process of homebrew. See https://github.com/harelba/homebrew-q"
-
- not-test-mac-packaging:
- needs: not-package-mac
- runs-on: macos-11
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Not Testing Mac Packaging
- run: |
- echo "homebrew mac packaging cannot be tested here, due to the package build process of homebrew. See https://github.com/harelba/homebrew-q"
-
- build-windows:
- runs-on: windows-latest
- needs: version_info
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Install MSVC build tools
- uses: ilammy/msvc-dev-cmd@v1
- - name: Install Python
- uses: actions/setup-python@v2
- with:
- python-version: '3.8.10'
- architecture: 'x64'
- - name: Install pyoxidizer
- shell: bash
- run: |
- set -x -e
-
- python3 -V
- pip3 -V
-
- pip3 install pyoxidizer
- - name: Create Q Executable - Windows
- shell: bash
- run: |
- set -e -x
-
- pyoxidizer build --release --var Q_VERSION ${{ needs.version_info.outputs.q_version }}
-
- export Q_EXECUTABLE=./build/x86_64-pc-windows-msvc/release/install/q
- chmod 755 $Q_EXECUTABLE
-
- seq 1 100 | $Q_EXECUTABLE -c 1 "select sum(c1),count(*) from -" -S test.sqlite
-
- mkdir -p packages/windows/
- cp $Q_EXECUTABLE packages/windows/win-q.exe
-
- find ./ -ls
- name: Upload Windows Executable
- uses: actions/upload-artifact@v1.0.0
- with:
- name: win-q.exe
- path: packages/windows/win-q.exe
-
- not-really-test-windows:
- needs: build-windows
- runs-on: windows-latest
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Install Python for Testing
- uses: actions/setup-python@v2
- with:
- python-version: '3.8'
- architecture: 'x64'
- - name: Download Windows Executable
- uses: actions/download-artifact@v2
- with:
- name: win-q.exe
- - name: Not-Really-Test Windows
- shell: bash
- continue-on-error: true
- run: |
- echo "Tests are not compatible with Windows (path separators, tmp folder names etc.). Only a sanity wil be tested"
-
- chmod +x ./win-q.exe
-
- seq 1 10000 | ./win-q.exe -c 1 "select sum(c1),count(*) from -" -S some-db.sqlite
-
- package-windows:
- needs: [create-man, not-really-test-windows, version_info]
- runs-on: windows-latest
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Install MSVC build tools
- uses: ilammy/msvc-dev-cmd@v1
- - name: Install Python
- uses: actions/setup-python@v2
- with:
- python-version: '3.8.10'
- architecture: 'x64'
- - name: Install pyoxidizer
- shell: bash
- run: |
- set -x -e
-
- python3 -V
- pip3 -V
-
- pip3 install pyoxidizer
- - name: Create Q MSI - Windows
- shell: bash
- run: |
- set -e -x
-
- pyoxidizer build --release msi_installer --var Q_VERSION ${{ needs.version_info.outputs.q_version }}
-
- export Q_MSI=./build/x86_64-pc-windows-msvc/release/msi_installer/q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi
- chmod 755 $Q_MSI
-
- mkdir -p packages/windows/
- cp $Q_MSI packages/windows/q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi
-
- - name: Upload Windows MSI
- uses: actions/upload-artifact@v1.0.0
- with:
- name: q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi
- path: packages/windows/q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi
-
- test-windows-packaging:
- needs: [package-windows, version_info]
- runs-on: windows-latest
- steps:
- - name: Checkout
- uses: actions/checkout@v2
- - name: Download Windows Package
- uses: actions/download-artifact@v2
- with:
- name: q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi
- - name: Test Install of MSI
- continue-on-error: true
- shell: powershell
- run: |
- $process = Start-Process msiexec.exe -ArgumentList "/i q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi -l* msi-install.log /norestart /quiet" -PassThru -Wait
- $process.ExitCode
- gc msi-install.log
-
- exit $process.ExitCode
- - name: Test Uninstall of MSI
- continue-on-error: true
- shell: powershell
- run: |
- $process = Start-Process msiexec.exe -ArgumentList "/u q-text-as-data-${{ needs.version_info.outputs.q_version }}.msi /norestart /quiet" -PassThru -Wait
- $process.ExitCode
- exit $process.ExitCode
-
- perform-prerelease:
- # We'd like artifacts to be uploaded regardless of tests succeeded or not,
- # this is why the dependency here is not on test-X-packaging jobs
- needs: [package-linux-deb, package-linux-rpm, not-package-mac, package-windows, version_info]
- runs-on: ubuntu-latest
- if: needs.version_info.outputs.is_release == 'false'
- steps:
- - name: Download All Artifacts
- uses: actions/download-artifact@v2
- with:
- path: artifacts/
- - name: Timestamp pre-release
- run: |
- set -e -x
-
- echo "Workflow finished at $(date)" >> artifacts/workflow-finish-time.txt
- - name: Create pre-release
- uses: "marvinpinto/action-automatic-releases@v1.2.1"
- with:
- repo_token: "${{ secrets.GITHUB_TOKEN }}"
- automatic_release_tag: "latest"
- prerelease: true
- title: "Next Release Development Build"
- files: |
- artifacts/**/*
-
- perform-release:
- needs: [not-test-mac-packaging, test-deb-packaging, test-rpm-packaging, test-windows-packaging, version_info]
- runs-on: ubuntu-latest
- if: needs.version_info.outputs.is_release == 'true'
- steps:
- - name: Download All Artifacts
- uses: actions/download-artifact@v2
- with:
- path: artifacts/
- - uses: "marvinpinto/action-automatic-releases@v1.2.1"
- with:
- repo_token: "${{ secrets.GITHUB_TOKEN }}"
- prerelease: false
- files: |
- artifacts/**/*
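The deleted `version_info` job above derives `q_version` and `is_release` via the `::set-output` workflow command, which GitHub has since deprecated in favor of appending to the `$GITHUB_OUTPUT` file. A minimal sketch of the same step in the newer syntax (same step id and output names, logic unchanged from the deleted step):

```yaml
- id: vars
  run: |
    set -x -e
    if [[ "${GITHUB_REF}" == refs/tags/v* ]]; then
      # Version tag: strip the refs/tags/v prefix to get the version number
      echo "q_version=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
      echo "is_release=true" >> "$GITHUB_OUTPUT"
    else
      # Branch or non-version tag: fixed 0.0.0 for testing version propagation
      echo "q_version=0.0.0" >> "$GITHUB_OUTPUT"
      echo "is_release=false" >> "$GITHUB_OUTPUT"
    fi
```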
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
new file mode 100644
index 00000000..91060ead
--- /dev/null
+++ b/.github/workflows/docs.yaml
@@ -0,0 +1,37 @@
+name: Documentation
+
+on:
+ push:
+ branches: [ master ]
+ paths:
+ - "mkdocs/**/*"
+ - "doc/**/*"
+ - "*.md"
+ - "*.markdown"
+ release:
+ types: [published]
+
+jobs:
+ deploy-docs:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.9'
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install mkdocs mkdocs-material
+ - name: Build man page
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y ruby
+ gem install ronn
+ ronn doc/USAGE.markdown
+ gzip -c doc/USAGE > doc/USAGE.gz
+ - name: Deploy documentation
+ run: |
+ mkdocs build --config-file mkdocs/mkdocs.yml
+ mkdocs gh-deploy --force --config-file mkdocs/mkdocs.yml
\ No newline at end of file
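Note that `mkdocs gh-deploy` builds the site itself before force-pushing it to the `gh-pages` branch, so the preceding `mkdocs build` acts as a separate validation pass. One assumption worth making explicit: pushing `gh-pages` with the default `GITHUB_TOKEN` requires write access to repository contents. If the repository restricts default token permissions, the job would also need a block like the following (a sketch, not present in the diff):

```yaml
permissions:
  contents: write
```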
diff --git a/.github/workflows/publish-package.yaml b/.github/workflows/publish-package.yaml
new file mode 100644
index 00000000..061eb86a
--- /dev/null
+++ b/.github/workflows/publish-package.yaml
@@ -0,0 +1,33 @@
+name: Publish Python Package
+
+on:
+ release:
+ types: [created]
+
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.9'
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install build twine
+ - name: Update version number
+ run: |
+ VERSION=${GITHUB_REF#refs/tags/v}
+ echo "Setting version to $VERSION"
+ sed -i "s/q_version = .*/q_version = '$VERSION'/" qtextasdata/__init__.py
+ cat qtextasdata/__init__.py
+ - name: Build and publish
+ env:
+ TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+ TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+ run: |
+ python -m build
+ twine check dist/*
+ twine upload dist/*
\ No newline at end of file
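The `sed` substitution in the `Update version number` step assumes `qtextasdata/__init__.py` contains an assignment of the form `q_version = '...'`; any line matching `q_version = ` is rewritten wholesale. A self-contained sketch of the transformation, using a throwaway file and a hypothetical tag value (in the workflow the real value comes from `${GITHUB_REF#refs/tags/v}`):

```yaml
- name: Illustrate the version rewrite (sketch only)
  run: |
    # Hypothetical stand-in for qtextasdata/__init__.py
    printf "q_version = '0.0.0'\n" > /tmp/example_init.py
    VERSION=4.1.2  # hypothetical; normally ${GITHUB_REF#refs/tags/v}
    sed -i "s/q_version = .*/q_version = '$VERSION'/" /tmp/example_init.py
    cat /tmp/example_init.py  # prints: q_version = '4.1.2'
```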
diff --git a/.github/workflows/python-package.yaml b/.github/workflows/python-package.yaml
new file mode 100644
index 00000000..b4b01696
--- /dev/null
+++ b/.github/workflows/python-package.yaml
@@ -0,0 +1,55 @@
+name: Python Package
+
+on:
+ push:
+ branches: [ master ]
+ paths-ignore:
+ - "*.md"
+ - "*.markdown"
+ - "mkdocs/**/*"
+ pull_request:
+ branches: [ master ]
+ paths-ignore:
+ - "*.md"
+ - "*.markdown"
+ - "mkdocs/**/*"
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install pytest pytest-cov
+ if [ -f test-requirements.txt ]; then pip install -r test-requirements.txt; fi
+ pip install -e .
+ - name: Test with pytest
+ run: |
+ pytest -v --cov=qtextasdata
+
+ lint:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.9"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install flake8
+ - name: Lint with flake8
+ run: |
+ flake8 qtextasdata --count --select=E9,F63,F7,F82 --show-source --statistics
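The flake8 invocation gates only on fatal findings: the E9/F63/F7/F82 classes cover syntax errors, invalid comparisons, and undefined names. A common companion is a second, non-blocking pass that reports full style statistics without failing the job; a sketch (standard flake8 options, thresholds chosen arbitrarily here):

```yaml
- name: Full lint report (non-blocking)
  run: |
    # --exit-zero makes this informational: style findings never fail the job
    flake8 qtextasdata --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
```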
diff --git a/.gitignore b/.gitignore
index eb3a2d28..7a560572 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,5 @@ htmlcov/
.coverage
.DS_Store
*.egg
+*.egg-info/
+.vscode/
diff --git a/.python-version b/.python-version
new file mode 100644
index 00000000..588c03e1
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+refactor-q
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..9d992328
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,27 @@
+# Changelog
+
+All notable changes to q (qtextasdata) will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+- Version bumping script (`bump-version.py`) for easier version management
+
+## [4.0.0] - (release date)
+
+Initial versioned release in the new format.
+
+### Added
+- (List major features added in 4.0.0)
+
+### Changed
+- (List major changes in 4.0.0)
+
+### Fixed
+- (List major fixes in 4.0.0)
+
+[Unreleased]: https://github.com/harelba/q/compare/v4.0.0...HEAD
+[4.0.0]: https://github.com/harelba/q/releases/tag/v4.0.0
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 9cecc1d4..261eeb9e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,674 +1,201 @@
- GNU GENERAL PUBLIC LICENSE
- Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
- Preamble
-
- The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
- The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works. By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users. We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors. You can apply it to
-your programs, too.
-
- When we speak of free software, we are referring to freedom, not
-price. Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
- To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights. Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-
- For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received. You must make sure that they, too, receive
-or can get the source code. And you must show them these terms so they
-know their rights.
-
- Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-
- For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software. For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
- Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so. This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software. The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable. Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products. If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
- Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary. To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-
- The precise terms and conditions for copying, distribution and
-modification follow.
-
- TERMS AND CONDITIONS
-
- 0. Definitions.
-
- "This License" refers to version 3 of the GNU General Public License.
-
- "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
- "The Program" refers to any copyrightable work licensed under this
-License. Each licensee is addressed as "you". "Licensees" and
-"recipients" may be individuals or organizations.
-
- To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy. The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
- A "covered work" means either the unmodified Program or a work based
-on the Program.
-
- To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy. Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
- To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies. Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
- An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License. If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
- 1. Source Code.
-
- The "source code" for a work means the preferred form of the work
-for making modifications to it. "Object code" means any non-source
-form of a work.
-
- A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
- The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form. A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
- The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities. However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work. For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
- The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
- The Corresponding Source for a work in source code form is that
-same work.
-
- 2. Basic Permissions.
-
- All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met. This License explicitly affirms your unlimited
-permission to run the unmodified Program. The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work. This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
- You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force. You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright. Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
- Conveying under any other circumstances is permitted solely under
-the conditions stated below. Sublicensing is not allowed; section 10
-makes it unnecessary.
-
- 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
- No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
- When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
- 4. Conveying Verbatim Copies.
-
- You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
- You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
- 5. Conveying Modified Source Versions.
-
- You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
- a) The work must carry prominent notices stating that you modified
- it, and giving a relevant date.
-
- b) The work must carry prominent notices stating that it is
- released under this License and any conditions added under section
- 7. This requirement modifies the requirement in section 4 to
- "keep intact all notices".
-
- c) You must license the entire work, as a whole, under this
- License to anyone who comes into possession of a copy. This
- License will therefore apply, along with any applicable section 7
- additional terms, to the whole of the work, and all its parts,
- regardless of how they are packaged. This License gives no
- permission to license the work in any other way, but it does not
- invalidate such permission if you have separately received it.
-
- d) If the work has interactive user interfaces, each must display
- Appropriate Legal Notices; however, if the Program has interactive
- interfaces that do not display Appropriate Legal Notices, your
- work need not make them do so.
-
- A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit. Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
- 6. Conveying Non-Source Forms.
-
- You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
- a) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by the
- Corresponding Source fixed on a durable physical medium
- customarily used for software interchange.
-
- b) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by a
- written offer, valid for at least three years and valid for as
- long as you offer spare parts or customer support for that product
- model, to give anyone who possesses the object code either (1) a
- copy of the Corresponding Source for all the software in the
- product that is covered by this License, on a durable physical
- medium customarily used for software interchange, for a price no
- more than your reasonable cost of physically performing this
- conveying of source, or (2) access to copy the
- Corresponding Source from a network server at no charge.
-
- c) Convey individual copies of the object code with a copy of the
- written offer to provide the Corresponding Source. This
- alternative is allowed only occasionally and noncommercially, and
- only if you received the object code with such an offer, in accord
- with subsection 6b.
-
- d) Convey the object code by offering access from a designated
- place (gratis or for a charge), and offer equivalent access to the
- Corresponding Source in the same way through the same place at no
- further charge. You need not require recipients to copy the
- Corresponding Source along with the object code. If the place to
- copy the object code is a network server, the Corresponding Source
- may be on a different server (operated by you or a third party)
- that supports equivalent copying facilities, provided you maintain
- clear directions next to the object code saying where to find the
- Corresponding Source. Regardless of what server hosts the
- Corresponding Source, you remain obligated to ensure that it is
- available for as long as needed to satisfy these requirements.
-
- e) Convey the object code using peer-to-peer transmission, provided
- you inform other peers where the object code and Corresponding
- Source of the work are being offered to the general public at no
- charge under subsection 6d.
-
- A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
- A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling. In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage. For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product. A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
- "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source. The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
- If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information. But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
- The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed. Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
- Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
- 7. Additional Terms.
-
- "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law. If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
- When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it. (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.) You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
- Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
- a) Disclaiming warranty or limiting liability differently from the
- terms of sections 15 and 16 of this License; or
-
- b) Requiring preservation of specified reasonable legal notices or
- author attributions in that material or in the Appropriate Legal
- Notices displayed by works containing it; or
-
- c) Prohibiting misrepresentation of the origin of that material, or
- requiring that modified versions of such material be marked in
- reasonable ways as different from the original version; or
-
- d) Limiting the use for publicity purposes of names of licensors or
- authors of the material; or
-
- e) Declining to grant rights under trademark law for use of some
- trade names, trademarks, or service marks; or
-
- f) Requiring indemnification of licensors and authors of that
- material by anyone who conveys the material (or modified versions of
- it) with contractual assumptions of liability to the recipient, for
- any liability that these contractual assumptions directly impose on
- those licensors and authors.
-
- All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10. If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term. If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
- If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
- Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
- 8. Termination.
-
- You may not propagate or modify a covered work except as expressly
-provided under this License. Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
- However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
- Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
- Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License. If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
- 9. Acceptance Not Required for Having Copies.
-
- You are not required to accept this License in order to receive or
-run a copy of the Program. Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance. However,
-nothing other than this License grants you permission to propagate or
-modify any covered work. These actions infringe copyright if you do
-not accept this License. Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
- 10. Automatic Licensing of Downstream Recipients.
-
- Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License. You are not responsible
-for enforcing compliance by third parties with this License.
-
- An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations. If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
- You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License. For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
- 11. Patents.
-
- A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based. The
-work thus licensed is called the contributor's "contributor version".
-
- A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version. For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
- Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
- In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement). To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
- If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients. "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
- If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
- A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License. You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
- Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
- 12. No Surrender of Others' Freedom.
-
- If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License. If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all. For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
- 13. Use with the GNU Affero General Public License.
-
- Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work. The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
- 14. Revised Versions of this License.
-
- The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time. Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
- Each version is given a distinguishing version number. If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation. If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
- If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
- Later license versions may give you additional or different
-permissions. However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
- 15. Disclaimer of Warranty.
-
- THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
- 16. Limitation of Liability.
-
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
- 17. Interpretation of Sections 15 and 16.
-
- If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
- END OF TERMS AND CONDITIONS
-
- How to Apply These Terms to Your New Programs
-
- If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
- To do so, attach the following notices to the program. It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
- {one line to give the program's name and a brief idea of what it does.}
- Copyright (C) {year} {name of author}
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
- If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
- {project} Copyright (C) {year} {fullname}
- This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
- This is free software, and you are welcome to redistribute it
- under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License. Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
- You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<https://www.gnu.org/licenses/>.
-
- The GNU General Public License does not permit incorporating your program
-into proprietary programs. If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library. If this is what you want to do, use the GNU Lesser General
-Public License instead of this License. But first, please read
-<https://www.gnu.org/licenses/why-not-lgpl.html>.
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.markdown b/README.markdown
index 4d5edd5a..682799a2 100644
--- a/README.markdown
+++ b/README.markdown
@@ -45,6 +45,17 @@ Instructions for all OSs are [here](https://harelba.github.io/q/#installation).
The previous version `2.0.19` can still be downloaded from [here](https://github.com/harelba/q/releases/tag/2.0.19)
+## Version Management
+
+q uses semantic versioning (MAJOR.MINOR.PATCH):
+- MAJOR: Incompatible API changes
+- MINOR: New functionality (backwards compatible)
+- PATCH: Bug fixes (backwards compatible)
+
+For version management:
+- Check the [CHANGELOG.md](CHANGELOG.md) for details about each release
+- Use `./bump-version.py` to bump versions (run `./bump-version.py --help` for usage information)
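+
+For example, a patch-release flow might look like this (a hypothetical sketch - `bump-version.py`'s actual arguments may differ, so check `--help` first):
+
+```bash
+# Hypothetical invocation: assumes the script accepts the new version as an argument
+./bump-version.py 3.1.7
+```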
+
## Contact
Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course.
diff --git a/bin/.qrc b/bin/.qrc
deleted file mode 100644
index 513452ce..00000000
--- a/bin/.qrc
+++ /dev/null
@@ -1,25 +0,0 @@
-#
-# q options ini file. Put either in your home folder as .qrc or in the working directory
-# (both will be merged in that order)
-#
-# All options should reside in an [options] section
-#
-# Available options:
-# * delimiter - escaped string (e.g. use \t for tab or \x20 for space)
-# * output_delimiter - escaped string (e.g. use \t for tab or \x20 for space)
-# * gzipped - boolean True or False
-# * beautify - boolean True or False
-# * skip_header - boolean True or False - whether to skip the header line of the input
-# * formatting - regular string - post-query formatting - see docs for details
-# * encoding - regular string - required encoding.
-#
-# All options have a matching command line option. See --help for details on defaults
-
-[options]
-#delimiter: \t
-#output_delimiter: \t
-#gzipped: False
-#beautify: True
-#skip_header: False
-#formatting: 1=%4.3f,2=%4.3f
-#encoding: UTF-8
diff --git a/bin/__init__.py b/bin/__init__.py
deleted file mode 100755
index cf529d79..00000000
--- a/bin/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/usr/bin/env python
-
diff --git a/bin/q.bat b/bin/q.bat
deleted file mode 100644
index b832ab29..00000000
--- a/bin/q.bat
+++ /dev/null
@@ -1,5 +0,0 @@
-@echo off
-
-setlocal
-if exist "%~dp0..\python.exe" ( "%~dp0..\python" "%~dp0q" %* ) else ( python "%~dp0q" %* )
-endlocal
diff --git a/bin/q.py b/bin/q.py
deleted file mode 100755
index 2a2f6771..00000000
--- a/bin/q.py
+++ /dev/null
@@ -1,3640 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-# Copyright (C) 2012-2021 Harel Ben-Attia
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details (doc/LICENSE contains
-# a copy of it)
-#
-#
-# Name : q (With respect to The Q Continuum)
-# Author : Harel Ben-Attia - harelba@gmail.com, harelba @ github, @harelba on twitter
-#
-#
-# q allows performing SQL-like statements on tabular text data.
-#
-# Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line.
-#
-# Full Documentation and details in https://harelba.github.io/q/
-#
-# Run with --help for command line details
-#
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import OrderedDict
-from sqlite3.dbapi2 import OperationalError
-from uuid import uuid4
-
-q_version = '3.1.6'
-
-#__all__ = [ 'QTextAsData' ]
-
-import os
-import sys
-import sqlite3
-import glob
-from argparse import ArgumentParser
-import codecs
-import locale
-import time
-import re
-from six.moves import configparser, range, filter
-import traceback
-import csv
-import uuid
-import math
-import six
-import io
-import json
-import datetime
-import hashlib
-
-if six.PY2:
- assert False, 'Python 2 is no longer supported by q'
-
-long = int
-unicode = six.text_type
-
-DEBUG = bool(os.environ.get('Q_DEBUG', None)) or '-V' in sys.argv
-SQL_DEBUG = False
-
-if DEBUG:
- def xprint(*args,**kwargs):
- print(datetime.datetime.utcnow().isoformat()," DEBUG ",*args,file=sys.stderr,**kwargs)
-
- def iprint(*args,**kwargs):
- print(datetime.datetime.utcnow().isoformat()," INFO ",*args,file=sys.stderr,**kwargs)
-
- def sqlprint(*args,**kwargs):
- pass
-else:
- def xprint(*args,**kwargs): pass
- def iprint(*args,**kwargs): pass
- def sqlprint(*args,**kwargs): pass
-
-if SQL_DEBUG:
- def sqlprint(*args,**kwargs):
- print(datetime.datetime.utcnow().isoformat(), " SQL ", *args, file=sys.stderr, **kwargs)
-
-
-def get_stdout_encoding(encoding_override=None):
- if encoding_override is not None and encoding_override != 'none':
- return encoding_override
-
- if sys.stdout.isatty():
- return sys.stdout.encoding
- else:
- return locale.getpreferredencoding()
-
-SHOW_SQL = False
-
-sha_algorithms = {
- 1 : hashlib.sha1,
- 224: hashlib.sha224,
- 256: hashlib.sha256,
- 384: hashlib.sha384,
- 512: hashlib.sha512
-}
-
-def sha(data,algorithm,encoding):
- try:
- f = sha_algorithms[algorithm]
- return f(six.text_type(data).encode(encoding)).hexdigest()
- except Exception as e:
- print(e)
-
-# For backward compatibility only (doesn't handle encoding well enough)
-def sha1(data):
- return hashlib.sha1(six.text_type(data).encode('utf-8')).hexdigest()
-
-# TODO Add caching of compiled regexps - Will be added after benchmarking capability is baked in
-def regexp(regular_expression, data):
- if data is not None:
- if not isinstance(data, str) and not isinstance(data, unicode):
- data = str(data)
- return re.search(regular_expression, data) is not None
- else:
- return False
-
-def regexp_extract(regular_expression, data,group_number):
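- # Note that group_number indexes m.groups(), so it is zero-based -
- # group_number 0 returns the content of the first capture group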
- if data is not None:
- if not isinstance(data, str) and not isinstance(data, unicode):
- data = str(data)
- m = re.search(regular_expression, data)
- if m is not None:
- return m.groups()[group_number]
- else:
- return False
-
-def md5(data,encoding):
- m = hashlib.md5()
- m.update(six.text_type(data).encode(encoding))
- return m.hexdigest()
-
-def sqrt(data):
- return math.sqrt(data)
-
-def power(data,p):
- return data**p
-
-def file_ext(data):
- if data is None:
- return None
-
- return os.path.splitext(data)[1]
-
-def file_folder(data):
- if data is None:
- return None
- return os.path.split(data)[0]
-
-def file_basename(data):
- if data is None:
- return None
- return os.path.split(data)[1]
-
-def file_basename_no_ext(data):
- if data is None:
- return None
-
- return os.path.split(os.path.splitext(data)[0])[-1]
-
-def percentile(l, p):
- # TODO Alpha implementation, need to provide multiple interpolation methods, and add tests
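- # Linear interpolation between the two closest ranks: for a sorted list l of
- # size n, the rank k = p*(n-1) falls between indices f=floor(k) and c=ceil(k),
- # and the result is the weighted average (c-k)*l[f] + (k-f)*l[c]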
- if not l:
- return None
- k = p*(len(l) - 1)
- f = math.floor(k)
- c = math.ceil(k)
- if c == f:
- return l[int(k)]
- return (c-k) * l[int(f)] + (k-f) * l[int(c)]
-
-# TODO Streaming Percentile to prevent memory consumption blowup for large datasets
-class StrictPercentile(object):
- def __init__(self):
- self.values = []
- self.p = None
-
- def step(self,value,p):
- if self.p is None:
- self.p = p
- self.values.append(value)
-
- def finalize(self):
- if len(self.values) == 0 or (self.p < 0 or self.p > 1):
- return None
- else:
- return percentile(sorted(self.values),self.p)
-
-class StdevPopulation(object):
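- # Welford's online algorithm: M holds the running mean, S the running sum of
- # squared deviations from the mean, and k the count of values seen. The
- # population variance is S/k, so finalize() returns sqrt(S/k)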
- def __init__(self):
- self.M = 0.0
- self.S = 0.0
- self.k = 0
-
- def step(self, value):
- try:
- # Ignore nulls
- if value is None:
- return
- val = float(value) # non-numeric values raise ValueError, converted to an error below
- tM = self.M
- self.k += 1
- self.M += ((val - tM) / self.k)
- self.S += ((val - tM) * (val - self.M))
- except ValueError:
- # TODO propagate udf errors to console
- raise Exception("Data is not numeric when calculating stddev (%s)" % value)
-
- def finalize(self):
- if self.k <= 1: # avoid division by zero
- return None
- else:
- return math.sqrt(self.S / (self.k))
-
-class StdevSample(object):
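- # Same Welford recurrence as StdevPopulation, except that finalize() applies
- # Bessel's correction - dividing by k-1 instead of k - to get the unbiased
- # sample variance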
- def __init__(self):
- self.M = 0.0
- self.S = 0.0
- self.k = 0
-
- def step(self, value):
- try:
- # Ignore nulls
- if value is None:
- return
- val = float(value) # non-numeric values raise ValueError, converted to an error below
- tM = self.M
- self.k += 1
- self.M += ((val - tM) / self.k)
- self.S += ((val - tM) * (val - self.M))
- except ValueError:
- # TODO propagate udf errors to console
- raise Exception("Data is not numeric when calculating stddev (%s)" % value)
-
- def finalize(self):
- if self.k <= 1: # avoid division by zero
- return None
- else:
- return math.sqrt(self.S / (self.k-1))
-
-class FunctionType(object):
- REGULAR = 1
- AGG = 2
-
-class UserFunctionDef(object):
- def __init__(self,func_type,name,usage,description,func_or_obj,param_count):
- self.func_type = func_type
- self.name = name
- self.usage = usage
- self.description = description
- self.func_or_obj = func_or_obj
- self.param_count = param_count
-
-user_functions = [
- UserFunctionDef(FunctionType.REGULAR,
- "regexp","regexp(<regexp>,<expr>) = <1|0>",
- "Find regexp in string expression. Returns 1 if found or 0 if not",
- regexp,
- 2),
- UserFunctionDef(FunctionType.REGULAR,
- "regexp_extract","regexp_extract(<regexp>,<expr>,group_number) = <matched-group-content>",
- "Get regexp capture group content",
- regexp_extract,
- 3),
- UserFunctionDef(FunctionType.REGULAR,
- "sha","sha(<expr>,<algorithm>,<encoding>) = <hex-string-of-sha>",
- "Calculate sha of some expression. Algorithm can be one of 1,224,256,384,512. For now encoding must be manually provided. Will use the input encoding automatically in the future.",
- sha,
- 3),
- UserFunctionDef(FunctionType.REGULAR,
- "sha1","sha1(<expr>) = <hex-string-of-sha>",
- "Exists for backward compatibility only, since it doesn't handle encoding properly. Calculates sha1 of some expression",
- sha1,
- 1),
- UserFunctionDef(FunctionType.REGULAR,
- "md5","md5(<expr>,<encoding>) = <hex-string-of-md5>",
- "Calculate md5 of expression. Returns a hex-string of the result. Currently requires to manually provide the encoding of the data. Will be taken automatically from the input encoding in the future.",
- md5,
- 2),
- UserFunctionDef(FunctionType.REGULAR,
- "sqrt","sqrt(<expr>) = <square-root>",
- "Calculate the square root of the expression",
- sqrt,
- 1),
- UserFunctionDef(FunctionType.REGULAR,
- "power","power(<expr1>,<expr2>) = <result>",
- "Raise expr1 to the power of expr2",
- power,
- 2),
- UserFunctionDef(FunctionType.REGULAR,
- "file_ext","file_ext(<expr>) = <file-extension>",
- "Get the extension of a filename",
- file_ext,
- 1),
- UserFunctionDef(FunctionType.REGULAR,
- "file_folder","file_folder(<expr>) = <folder-part>",
- "Get the folder part of a filename",
- file_folder,
- 1),
- UserFunctionDef(FunctionType.REGULAR,
- "file_basename","file_basename(<expr>) = <basename>",
- "Get the basename of a filename, including extension if any",
- file_basename,
- 1),
- UserFunctionDef(FunctionType.REGULAR,
- "file_basename_no_ext","file_basename_no_ext(<expr>) = <basename-without-extension>",
- "Get the basename of a filename, without the extension if there is one",
- file_basename_no_ext,
- 1),
- UserFunctionDef(FunctionType.AGG,
- "percentile","percentile(<expr>,<percentile-in-range-0-to-1>) = <percentile-value>",
- "Calculate the strict percentile of a set of values.",
- StrictPercentile,
- 2),
- UserFunctionDef(FunctionType.AGG,
- "stddev_pop","stddev_pop(<expr>) = <stddev-value>",
- "Calculate the population standard deviation of a set of values",
- StdevPopulation,
- 1),
- UserFunctionDef(FunctionType.AGG,
- "stddev_sample","stddev_sample(<expr>) = <stddev-value>",
- "Calculate the sample standard deviation of a set of values",
- StdevSample,
- 1)
-]
-
-def print_user_functions():
- for udf in user_functions:
- print("Function: %s" % udf.name)
- print(" Usage: %s" % udf.usage)
- print(" Description: %s" % udf.description)
-
-class Sqlite3DBResults(object):
- def __init__(self,query_column_names,results):
- self.query_column_names = query_column_names
- self.results = results
-
- def __str__(self):
- return "Sqlite3DBResults" % (len(self.results),str(self.query_column_names))
- __repr__ = __str__
-
-def get_sqlite_type_affinity(sqlite_type):
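- # Follows the column affinity rules from the sqlite documentation
- # (https://www.sqlite.org/datatype3.html): substring matching on the declared
- # type name, with NUMERIC as the fallback affinity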
- sqlite_type = sqlite_type.upper()
- if 'INT' in sqlite_type:
- return 'INTEGER'
- elif 'CHAR' in sqlite_type or 'TEXT' in sqlite_type or 'CLOB' in sqlite_type:
- return 'TEXT'
- elif 'BLOB' in sqlite_type:
- return 'BLOB'
- elif 'REAL' in sqlite_type or 'FLOA' in sqlite_type or 'DOUB' in sqlite_type:
- return 'REAL'
- else:
- return 'NUMERIC'
-
-def sqlite_type_to_python_type(sqlite_type):
- SQLITE_AFFINITY_TO_PYTHON_TYPE_NAMES = {
- 'INTEGER': long,
- 'TEXT': unicode,
- 'BLOB': bytes,
- 'REAL': float,
- 'NUMERIC': float
- }
- return SQLITE_AFFINITY_TO_PYTHON_TYPE_NAMES[get_sqlite_type_affinity(sqlite_type)]
-
-
-class Sqlite3DB(object):
- # TODO Add metadata table with qsql file version
-
- QCATALOG_TABLE_NAME = '_qcatalog'
- NUMERIC_COLUMN_TYPES = {int, long, float}
- PYTHON_TO_SQLITE_TYPE_NAMES = { str: 'TEXT', int: 'INT', long : 'INT' , float: 'REAL', None: 'TEXT' }
-
-
- def __str__(self):
- return "Sqlite3DB" % self.sqlite_db_url
- __repr__ = __str__
-
- def __init__(self, db_id, sqlite_db_url, sqlite_db_filename, create_qcatalog, show_sql=SHOW_SQL):
- self.show_sql = show_sql
- self.create_qcatalog = create_qcatalog
-
- self.db_id = db_id
- # TODO Is this needed anymore?
- self.sqlite_db_filename = sqlite_db_filename
- self.sqlite_db_url = sqlite_db_url
- self.conn = sqlite3.connect(self.sqlite_db_url, uri=True)
- self.last_temp_table_id = 10000
- self.cursor = self.conn.cursor()
- self.add_user_functions()
-
- if create_qcatalog:
- self.create_qcatalog_table()
- else:
- xprint('Not creating qcatalog for db_id %s' % db_id)
-
- def retrieve_all_table_names(self):
- return [x[0] for x in self.execute_and_fetch("select tbl_name from sqlite_master where type='table'").results]
-
- def get_sqlite_table_info(self,table_name):
- return self.execute_and_fetch('PRAGMA table_info(%s)' % table_name).results
-
- def get_sqlite_database_list(self):
- return self.execute_and_fetch('pragma database_list').results
-
- def find_new_table_name(self,planned_table_name):
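- # Try the planned name as-is first, then numbered variants (<name>_2,
- # <name>_3, ...) until a name that is free in this db is found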
- existing_table_names = self.retrieve_all_table_names()
-
- possible_indices = range(1,1000)
-
- for index in possible_indices:
- if index == 1:
- suffix = ''
- else:
- suffix = '_%s' % index
-
- table_name_attempt = '%s%s' % (planned_table_name,suffix)
-
- if table_name_attempt not in existing_table_names:
- xprint("Found free table name %s in db %s for planned table name %s" % (table_name_attempt,self.db_id,planned_table_name))
- return table_name_attempt
-
- # TODO Add test for this
- raise Exception('Cannot find free table name in db %s for planned table name %s' % (self.db_id,planned_table_name))
-
- def create_qcatalog_table(self):
- if not self.qcatalog_table_exists():
- xprint("qcatalog table does not exist. Creating it")
- r = self.conn.execute("""CREATE TABLE %s (
- qcatalog_entry_id text not null primary key,
- content_signature_key text,
- temp_table_name text,
- content_signature text,
- creation_time text,
- source_type text,
- source text)""" % self.QCATALOG_TABLE_NAME).fetchall()
- else:
- xprint("qcatalog table already exists. No need to create it")
-
- def qcatalog_table_exists(self):
- return sqlite_table_exists(self.conn,self.QCATALOG_TABLE_NAME)
-
- def calculate_content_signature_key(self,content_signature):
- assert type(content_signature) == OrderedDict
- pp = json.dumps(content_signature,sort_keys=True)
- xprint("Calculating content signature for:",pp,six.b(pp))
- return hashlib.sha1(six.b(pp)).hexdigest()
-
- def add_to_qcatalog_table(self, temp_table_name, content_signature, creation_time,source_type, source):
- assert source is not None
- assert source_type is not None
- content_signature_key = self.calculate_content_signature_key(content_signature)
- xprint("db_id: %s Adding to qcatalog table: %s. Calculated signature key %s" % (self.db_id, temp_table_name,content_signature_key))
- r = self.execute_and_fetch(
- 'INSERT INTO %s (qcatalog_entry_id,content_signature_key, temp_table_name,content_signature,creation_time,source_type,source) VALUES (?,?,?,?,?,?,?)' % self.QCATALOG_TABLE_NAME,
- (str(uuid4()),content_signature_key,temp_table_name,json.dumps(content_signature),creation_time,source_type,source))
- # Ensure transaction is completed
- self.conn.commit()
-
- def get_from_qcatalog(self, content_signature):
- content_signature_key = self.calculate_content_signature_key(content_signature)
- xprint("Finding table in db_id %s that matches content signature key %s" % (self.db_id,content_signature_key))
-
- field_names = ["content_signature_key", "temp_table_name", "content_signature", "creation_time","source_type","source","qcatalog_entry_id"]
-
- q = "SELECT %s FROM %s where content_signature_key = ?" % (",".join(field_names),self.QCATALOG_TABLE_NAME)
- r = self.execute_and_fetch(q,(content_signature_key,))
-
- if r is None:
- return None
-
- if len(r.results) == 0:
- return None
-
- if len(r.results) > 1:
- raise Exception("Bug - Exactly one result should have been provided: %s" % str(r.results))
-
- d = dict(zip(field_names,r.results[0]))
- return d
-
- def get_from_qcatalog_using_table_name(self, temp_table_name):
- xprint("getting from qcatalog using table name")
-
- field_names = ["content_signature", "temp_table_name","creation_time","source_type","source","content_signature_key","qcatalog_entry_id"]
-
- q = "SELECT %s FROM %s where temp_table_name = ?" % (",".join(field_names),self.QCATALOG_TABLE_NAME)
- xprint("Query from qcatalog %s params %s" % (q,str(temp_table_name,)))
- r = self.execute_and_fetch(q,(temp_table_name,))
- xprint("results: ",r.results)
-
- if r is None:
- return None
-
- if len(r.results) == 0:
- return None
-
- if len(r.results) > 1:
- raise Exception("Bug - Exactly one result should have been provided: %s" % str(r.results))
-
- d = dict(zip(field_names,r.results[0]))
- # content_signature should be the first in the list of field_names
- cs = OrderedDict(json.loads(r.results[0][0]))
- if self.calculate_content_signature_key(cs) != d['content_signature_key']:
- raise Exception('Table contains an invalid entry - content signature key is not matching the actual content signature')
- return d
-
- def get_all_from_qcatalog(self):
- xprint("getting from qcatalog using table name")
-
- field_names = ["temp_table_name", "content_signature", "creation_time","source_type","source","qcatalog_entry_id"]
-
- q = "SELECT %s FROM %s" % (",".join(field_names),self.QCATALOG_TABLE_NAME)
- xprint("Query from qcatalog %s" % q)
- r = self.execute_and_fetch(q)
-
- if r is None:
- return None
-
- def convert(res):
- d = dict(zip(field_names, res))
- cs = OrderedDict(json.loads(res[1]))
- d['content_signature_key'] = self.calculate_content_signature_key(cs)
- return d
-
- rr = [convert(r) for r in r.results]
-
- return rr
-
- def done(self):
- xprint("Closing database %s" % self.db_id)
- try:
- self.conn.commit()
- self.conn.close()
- xprint("Database %s closed" % self.db_id)
- except Exception as e:
- xprint("Could not close database %s" % self.db_id)
- raise
-
- def add_user_functions(self):
- for udf in user_functions:
- if type(udf.func_or_obj) == type(object):
- self.conn.create_aggregate(udf.name,udf.param_count,udf.func_or_obj)
- elif type(udf.func_or_obj) == type(md5):
- self.conn.create_function(udf.name,udf.param_count,udf.func_or_obj)
- else:
- raise Exception("Invalid user function definition %s" % str(udf))
-
- def is_numeric_type(self, column_type):
- return column_type in Sqlite3DB.NUMERIC_COLUMN_TYPES
-
- def update_many(self, sql, params):
- try:
- sqlprint(sql, " params: " + str(params))
- self.cursor.executemany(sql, params)
- _ = self.cursor.fetchall()
- finally:
- pass # cursor.close()
-
- def execute_and_fetch(self, q,params = None):
- try:
- try:
- if self.show_sql:
- print(repr(q))
- if params is None:
- r = self.cursor.execute(q)
- else:
- r = self.cursor.execute(q,params)
- if self.cursor.description is not None:
- # we decode the column names, so they can be encoded to any output format later on
- query_column_names = [c[0] for c in self.cursor.description]
- else:
- query_column_names = None
- result = self.cursor.fetchall()
- finally:
- pass # cursor.close()
- except OperationalError as e:
- raise SqliteOperationalErrorException("Failed executing sqlite query %s with params %s . error: %s" % (q,params,str(e)),e)
- return Sqlite3DBResults(query_column_names,result)
-
- def _get_as_list_str(self, l):
- return ",".join(['"%s"' % x.replace('"', '""') for x in l])
-
- def generate_insert_row(self, table_name, column_names):
- col_names_str = self._get_as_list_str(column_names)
- question_marks = ", ".join(["?" for i in range(0, len(column_names))])
- return 'INSERT INTO %s (%s) VALUES (%s)' % (table_name, col_names_str, question_marks)
-
- # Column names are passed in as a list so that column order is preserved in
- # the generated CREATE TABLE statement
- def generate_create_table(self, table_name, column_names, column_dict):
- # Convert dict from python types to db types
- column_name_to_db_type = dict(
- (n, Sqlite3DB.PYTHON_TO_SQLITE_TYPE_NAMES[t]) for n, t in six.iteritems(column_dict))
- column_defs = ','.join(['"%s" %s' % (
- n.replace('"', '""'), column_name_to_db_type[n]) for n in column_names])
- return 'CREATE TABLE %s (%s)' % (table_name, column_defs)
-
- def generate_temp_table_name(self):
- # WTF - From my own past mutable-self
- self.last_temp_table_id += 1
- tn = "temp_table_%s" % self.last_temp_table_id
- return tn
-
- def generate_drop_table(self, table_name):
- return "DROP TABLE %s" % table_name
-
- def drop_table(self, table_name):
- return self.execute_and_fetch(self.generate_drop_table(table_name))
-
- def attach_and_copy_table(self, from_db, relevant_table,stop_after_analysis):
- xprint("Attaching %s into db %s and copying table %s into it" % (from_db,self,relevant_table))
- temp_db_id = 'temp_db_id'
- q = "attach '%s' as %s" % (from_db.sqlite_db_url,temp_db_id)
- xprint("Attach query: %s" % q)
- c = self.execute_and_fetch(q)
-
- new_temp_table_name = 'temp_table_%s' % (self.last_temp_table_id + 1)
- fully_qualified_table_name = '%s.%s' % (temp_db_id,relevant_table)
-
- if stop_after_analysis:
- limit = ' limit 100'
- else:
- limit = ''
-
- copy_query = 'create table %s as select * from %s %s' % (new_temp_table_name,fully_qualified_table_name,limit)
- copy_results = self.execute_and_fetch(copy_query)
- xprint("Copied %s.%s into %s in db_id %s. Results %s" % (temp_db_id,relevant_table,new_temp_table_name,self.db_id,copy_results))
- self.last_temp_table_id += 1
-
- xprint("Copied table into %s. Detaching db that was attached temporarily" % self.db_id)
-
- q = "detach database %s" % temp_db_id
- xprint("detach query: %s" % q)
- c = self.execute_and_fetch(q)
- xprint(c)
- return new_temp_table_name
-
-
-class CouldNotConvertStringToNumericValueException(Exception):
-
- def __init__(self, msg):
- self.msg = msg
-
- def __str__(self):
- return repr(self.msg)
-
-class SqliteOperationalErrorException(Exception):
-
- def __init__(self, msg,original_error):
- self.msg = msg
- self.original_error = original_error
-
- def __str__(self):
- return repr(self.msg) + "//" + repr(self.original_error)
-
-class IncorrectDefaultValueException(Exception):
-
- def __init__(self, option_type,option,actual_value):
- self.option_type = option_type
- self.option = option
- self.actual_value = actual_value
-
- def __str__(self):
- return repr(self)
-
-class NonExistentTableNameInQsql(Exception):
-
- def __init__(self, qsql_filename,table_name,existing_table_names):
- self.qsql_filename = qsql_filename
- self.table_name = table_name
- self.existing_table_names = existing_table_names
-
-class NonExistentTableNameInSqlite(Exception):
-
- def __init__(self, qsql_filename,table_name,existing_table_names):
- self.qsql_filename = qsql_filename
- self.table_name = table_name
- self.existing_table_names = existing_table_names
-
-class TooManyTablesInQsqlException(Exception):
-
- def __init__(self, qsql_filename,existing_table_names):
- self.qsql_filename = qsql_filename
- self.existing_table_names = existing_table_names
-
-class NoTableInQsqlExcption(Exception):
-
- def __init__(self, qsql_filename):
- self.qsql_filename = qsql_filename
-
-class TooManyTablesInSqliteException(Exception):
-
- def __init__(self, qsql_filename,existing_table_names):
- self.qsql_filename = qsql_filename
- self.existing_table_names = existing_table_names
-
-class NoTablesInSqliteException(Exception):
-
- def __init__(self, sqlite_filename):
- self.sqlite_filename = sqlite_filename
-
-class ColumnMaxLengthLimitExceededException(Exception):
-
- def __init__(self, msg):
- self.msg = msg
-
- def __str__(self):
- return repr(self.msg)
-
-class CouldNotParseInputException(Exception):
-
- def __init__(self, msg):
- self.msg = msg
-
- def __str__(self):
- return repr(self.msg)
-
-class BadHeaderException(Exception):
-
- def __init__(self, msg):
- self.msg = msg
-
- def __str__(self):
- return repr(self.msg)
-
-class EncodedQueryException(Exception):
-
- def __init__(self, msg):
- self.msg = msg
-
- def __str__(self):
- return repr(self.msg)
-
-
-class CannotUnzipDataStreamException(Exception):
-
- def __init__(self):
- pass
-
-class UniversalNewlinesExistException(Exception):
-
- def __init__(self):
- pass
-
-class EmptyDataException(Exception):
-
- def __init__(self):
- pass
-
-class MissingHeaderException(Exception):
-
- def __init__(self,msg):
- self.msg = msg
-
-class InvalidQueryException(Exception):
-
- def __init__(self,msg):
- self.msg = msg
-
-class TooManyAttachedDatabasesException(Exception):
-
- def __init__(self,msg):
- self.msg = msg
-
-class FileNotFoundException(Exception):
-
- def __init__(self, msg):
- self.msg = msg
-
- def __str__(self):
- return repr(self.msg)
-
-class UnknownFileTypeException(Exception):
-
- def __init__(self, msg):
- self.msg = msg
-
- def __str__(self):
- return repr(self.msg)
-
-
-class ColumnCountMismatchException(Exception):
-
- def __init__(self, msg):
- self.msg = msg
-
-class ContentSignatureNotFoundException(Exception):
-
- def __init__(self, msg):
- self.msg = msg
-
-class StrictModeColumnCountMismatchException(Exception):
-
- def __init__(self,atomic_fn, expected_col_count,actual_col_count,lines_read):
- self.atomic_fn = atomic_fn
- self.expected_col_count = expected_col_count
- self.actual_col_count = actual_col_count
- self.lines_read = lines_read
-
-class FluffyModeColumnCountMismatchException(Exception):
-
- def __init__(self,atomic_fn, expected_col_count,actual_col_count,lines_read):
- self.atomic_fn = atomic_fn
- self.expected_col_count = expected_col_count
- self.actual_col_count = actual_col_count
- self.lines_read = lines_read
-
-class ContentSignatureDiffersException(Exception):
-
- def __init__(self,original_filename, other_filename, filenames_str,key,source_value,signature_value):
- self.original_filename = original_filename
- self.other_filename = other_filename
- self.filenames_str = filenames_str
- self.key = key
- self.source_value = source_value
- self.signature_value = signature_value
-
-
-class ContentSignatureDataDiffersException(Exception):
-
- def __init__(self,msg):
- self.msg = msg
-
-
-class InvalidQSqliteFileException(Exception):
-
- def __init__(self,msg):
- self.msg = msg
-
-
-class MaximumSourceFilesExceededException(Exception):
-
- def __init__(self,msg):
- self.msg = msg
-
-
-
-# Simplistic Sql "parsing" class... We'll eventually require a real SQL parser which will provide us with a parse tree
-#
-# A "qtable" is a filename which behaves like an SQL table...
-class Sql(object):
-
- def __init__(self, sql, data_streams):
- # Currently supports only standard SELECT statements
-
- # Holds original SQL
- self.sql = sql
- # Holds sql parts
- self.sql_parts = sql.split()
- self.data_streams = data_streams
-
- self.qtable_metadata_dict = OrderedDict()
-
- # Set of qtable names
- self.qtable_names = []
- # Dict from qtable names to their positions in sql_parts. Value here is a *list* of positions,
- # since it is possible that the same qtable_name (file) is referenced in multiple positions
- # and we don't want the database table to be recreated for each
- # reference
- self.qtable_name_positions = {}
- # Dict from qtable names to their effective (actual database) table
- # names
- self.qtable_name_effective_table_names = {}
-
- self.query_column_names = None
-
- # Go over all sql parts
- idx = 0
- while idx < len(self.sql_parts):
- # Get the part string
- part = self.sql_parts[idx]
- # If it's a FROM or a JOIN
- if part.upper() in ['FROM', 'JOIN']:
- # and there is nothing after it,
- if idx == len(self.sql_parts) - 1:
- # Just fail
- raise InvalidQueryException(
- 'FROM/JOIN is missing a table name after it')
-
- qtable_name = self.sql_parts[idx + 1]
- # Otherwise, the next part contains the qtable name. In most cases the next part will be only the qtable name.
- # We handle one special case here, where this is a subquery as a column: "SELECT (SELECT ... FROM qtable),100 FROM ...".
- # In that case, there will be an ending parenthesis as part of the name, and we want to handle this case gracefully.
- # This is obviously a hack of a hack :) Just until we have
- # complete parsing capabilities
- if ')' in qtable_name:
- leftover = qtable_name[qtable_name.index(')'):]
- self.sql_parts.insert(idx + 2, leftover)
- qtable_name = qtable_name[:qtable_name.index(')')]
- self.sql_parts[idx + 1] = qtable_name
-
- if qtable_name[0] != '(':
- normalized_qtable_name = self.normalize_qtable_name(qtable_name)
- xprint("Normalized qtable name for %s is %s" % (qtable_name,normalized_qtable_name))
- self.qtable_names += [normalized_qtable_name]
-
- if normalized_qtable_name not in self.qtable_name_positions.keys():
- self.qtable_name_positions[normalized_qtable_name] = []
-
- self.qtable_name_positions[normalized_qtable_name].append(idx + 1)
- self.sql_parts[idx + 1] = normalized_qtable_name
- idx += 2
- else:
- idx += 1
- else:
- idx += 1
- xprint("Final sql parts: %s" % self.sql_parts)
-
- def normalize_qtable_name(self,qtable_name):
- if self.data_streams.is_data_stream(qtable_name):
- return qtable_name
-
- if ':::' in qtable_name:
- qsql_filename, table_name = qtable_name.split(":::", 1)
- return '%s:::%s' % (os.path.realpath(os.path.abspath(qsql_filename)),table_name)
- else:
- return os.path.realpath(os.path.abspath(qtable_name))
-
- def set_effective_table_name(self, qtable_name, effective_table_name):
- if qtable_name in self.qtable_name_effective_table_names.keys():
- if self.qtable_name_effective_table_names[qtable_name] != effective_table_name:
- raise Exception(
- "Already set effective table name for qtable %s. Trying to change the effective table name from %s to %s" %
- (qtable_name,self.qtable_name_effective_table_names[qtable_name],effective_table_name))
-
- xprint("Setting effective table name for %s - effective table name is set to %s" % (qtable_name,effective_table_name))
- self.qtable_name_effective_table_names[
- qtable_name] = effective_table_name
-
- def get_effective_sql(self,table_name_mapping=None):
- if len([qtn for qtn in self.qtable_names if self.qtable_name_effective_table_names.get(qtn) is None]) != 0:
- assert False, 'There are qtables without effective tables'
-
- effective_sql = [x for x in self.sql_parts]
-
- xprint("Effective table names",self.qtable_name_effective_table_names)
- for qtable_name, positions in six.iteritems(self.qtable_name_positions):
- xprint("Positions for qtable name %s are %s" % (qtable_name,positions))
- for pos in positions:
- if table_name_mapping is not None:
- x = self.qtable_name_effective_table_names[qtable_name]
- effective_sql[pos] = table_name_mapping[x]
- else:
- effective_sql[pos] = self.qtable_name_effective_table_names[qtable_name]
-
- return " ".join(effective_sql)
-
- def get_qtable_name_effective_table_names(self):
- return self.qtable_name_effective_table_names
-
- def execute_and_fetch(self, db):
- x = self.get_effective_sql()
- xprint("Final query: %s" % x)
- db_results_obj = db.execute_and_fetch(x)
- return db_results_obj
-
- def materialize_using(self,loaded_table_structures_dict):
- xprint("Materializing sql object: %s" % str(self.qtable_names))
- xprint("loaded table structures dict %s" % loaded_table_structures_dict)
- for qtable_name in self.qtable_names:
- table_structure = loaded_table_structures_dict[qtable_name]
-
- table_name_in_disk_db = table_structure.get_table_name_for_querying()
-
- effective_table_name = '%s.%s' % (table_structure.db_id, table_name_in_disk_db)
-
- # for a single file - no need to create a union, just use the table name
- self.set_effective_table_name(qtable_name, effective_table_name)
- xprint("Materialized filename %s to effective table name %s" % (qtable_name,effective_table_name))
-
-
-class TableColumnInferer(object):
-
- def __init__(self, input_params):
- self.inferred = False
- self.mode = input_params.parsing_mode
- self.rows = []
- self.skip_header = input_params.skip_header
- self.header_row = None
- self.header_row_filename = None
- self.expected_column_count = input_params.expected_column_count
- self.input_delimiter = input_params.delimiter
- self.disable_column_type_detection = input_params.disable_column_type_detection
-
- def _generate_content_signature(self):
- return OrderedDict({
- "inferred": self.inferred,
- "mode": self.mode,
- "rows": "\n".join([",".join(x) for x in self.rows]),
- "skip_header": self.skip_header,
- "header_row": self.header_row,
- "expected_column_count": self.expected_column_count,
- "input_delimiter": self.input_delimiter,
- "disable_column_type_detection": self.disable_column_type_detection
- })
-
- def analyze(self, filename, col_vals):
- if self.inferred:
- assert False, "Already inferred columns"
-
- if self.skip_header and self.header_row is None:
- self.header_row = col_vals
- self.header_row_filename = filename
- else:
- self.rows.append(col_vals)
-
- if len(self.rows) < 100:
- return False
-
- self.do_analysis()
- return True
-
- def force_analysis(self):
- # This method is called whenever there is no more data, and an analysis needs
- # to be performed immediately, regardless of the amount of sample data that has
- # been collected
- self.do_analysis()
-
- def determine_type_of_value(self, value):
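- # Inference ladder: try int first, then float, falling back to str. Empty
- # or missing values return None so they don't affect type detection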
- if self.disable_column_type_detection:
- return str
-
- if value is not None:
- value = value.strip()
- if value == '' or value is None:
- return None
-
- try:
- i = int(value)
- if type(i) == long:
- return long
- else:
- return int
- except:
- pass
-
- try:
- f = float(value)
- return float
- except:
- pass
-
- return str
-
- def determine_type_of_value_list(self, value_list):
- type_list = [self.determine_type_of_value(v) for v in value_list]
- all_types = set(type_list)
- if len(set(type_list)) == 1:
- # all the sample lines are of the same type
- return type_list[0]
- else:
- # check for the number of types without nulls,
- type_list_without_nulls = list(filter(
- lambda x: x is not None, type_list))
- # If all the sample lines are of the same type,
- if len(set(type_list_without_nulls)) == 1:
- # return it
- return type_list_without_nulls[0]
- else:
- # If there are only two types, one float and one int, then choose a float type
- if len(set(type_list_without_nulls)) == 2 and float in type_list_without_nulls and int in type_list_without_nulls:
- return float
- return str
-
- def do_analysis(self):
- if self.mode == 'strict':
- self._do_strict_analysis()
- elif self.mode in ['relaxed']:
- self._do_relaxed_analysis()
- else:
- raise Exception('Unknown parsing mode %s' % self.mode)
-
- if self.column_count == 1 and self.expected_column_count != 1 and self.expected_column_count is not None:
- print(f"Warning: column count is one (expected column count is {self.expected_column_count} - did you provide the correct delimiter?", file=sys.stderr)
-
- self.infer_column_types()
- self.infer_column_names()
- self.inferred = True
-
- def validate_column_names(self, value_list):
- column_name_errors = []
- for v in value_list:
- if v is None:
- # we allow column names to be None, in relaxed mode it'll be filled with default names.
- # RLRL
- continue
- if ',' in v:
- column_name_errors.append(
- (v, "Column name cannot contain commas"))
- continue
- if self.input_delimiter in v:
- column_name_errors.append(
- (v, "Column name cannot contain the input delimiter. Please make sure you've set the correct delimiter"))
- continue
- if '\n' in v:
- column_name_errors.append(
- (v, "Column name cannot contain newline"))
- continue
- if v != v.strip():
- column_name_errors.append(
- (v, "Column name contains leading/trailing spaces"))
- continue
- try:
- v.encode("utf-8", "strict").decode("utf-8")
- except:
- column_name_errors.append(
- (v, "Column name must be UTF-8 Compatible"))
- continue
- # We're checking for column duplication for each field in order to be able to still provide it along with other errors
- if len(list(filter(lambda x: x == v,value_list))) > 1:
- entry = (v, "Column name is duplicated")
- # Don't duplicate the error report itself
- if entry not in column_name_errors:
- column_name_errors.append(entry)
- continue
- nul_index = v.find("\x00")
- if nul_index >= 0:
- column_name_errors.append(
- (v, "Column name cannot contain NUL"))
- continue
- t = self.determine_type_of_value(v)
- if t != str:
- column_name_errors.append((v, "Column name must be a string"))
- return column_name_errors
-
- def infer_column_names(self):
- if self.header_row is not None:
- column_name_errors = self.validate_column_names(self.header_row)
- if len(column_name_errors) > 0:
- raise BadHeaderException("Header must contain only strings and not numbers or empty strings: '%s'\n%s" % (
- ",".join(self.header_row), "\n".join(["'%s': %s" % (x, y) for x, y in column_name_errors])))
-
- # use header row in order to name columns
- if len(self.header_row) < self.column_count:
- if self.mode == 'strict':
- raise ColumnCountMismatchException("Strict mode. Header row contains less columns than expected column count(%s vs %s)" % (
- len(self.header_row), self.column_count))
- elif self.mode in ['relaxed']:
- # in relaxed mode, add columns to fill the missing ones
- self.header_row = self.header_row + \
- ['c%s' % (x + len(self.header_row) + 1)
- for x in range(self.column_count - len(self.header_row))]
- elif len(self.header_row) > self.column_count:
- if self.mode == 'strict':
- raise ColumnCountMismatchException("Strict mode. Header row contains more columns than expected column count (%s vs %s)" % (
- len(self.header_row), self.column_count))
- elif self.mode in ['relaxed']:
- # In relaxed mode, just cut the extra column names
- self.header_row = self.header_row[:self.column_count]
- self.column_names = self.header_row
- else:
- # Column names are cX starting from 1
- self.column_names = ['c%s' % (i + 1)
- for i in range(self.column_count)]
-
- def _do_relaxed_analysis(self):
- column_count_list = [len(col_vals) for col_vals in self.rows]
-
- if len(self.rows) == 0:
- if self.header_row is None:
- self.column_count = 0
- else:
- self.column_count = len(self.header_row)
- else:
- if self.expected_column_count is not None:
- self.column_count = self.expected_column_count
- else:
- # If not specified, we'll take the largest row in the sample rows
- self.column_count = max(column_count_list)
-
- def get_column_count_summary(self, column_count_list):
- counts = {}
- for column_count in column_count_list:
- counts[column_count] = counts.get(column_count, 0) + 1
- return six.u(", ").join([six.u("{} rows with {} columns".format(v, k)) for k, v in six.iteritems(counts)])
-
- def _do_strict_analysis(self):
- column_count_list = [len(col_vals) for col_vals in self.rows]
-
- if len(set(column_count_list)) != 1:
- raise ColumnCountMismatchException('Strict mode. Column count is expected to be identical across all rows. Multiple column counts exist in the first part of the file. Check your delimiter, or switch to relaxed mode. Details: %s' % (
- self.get_column_count_summary(column_count_list)))
-
- self.column_count = len(self.rows[0])
-
- if self.expected_column_count is not None and self.column_count != self.expected_column_count:
- raise ColumnCountMismatchException('Strict mode. Column count is expected to be %s but is %s' % (
- self.expected_column_count, self.column_count))
-
- self.infer_column_types()
-
- def infer_column_types(self):
- assert self.column_count > -1
- self.column_types = []
- self.column_types2 = []
- for column_number in range(self.column_count):
- column_value_list = [
- row[column_number] if column_number < len(row) else None for row in self.rows]
- column_type = self.determine_type_of_value_list(column_value_list)
- self.column_types.append(column_type)
-
- column_value_list2 = [row[column_number] if column_number < len(
- row) else None for row in self.rows[1:]]
- column_type2 = self.determine_type_of_value_list(
- column_value_list2)
- self.column_types2.append(column_type2)
-
- comparison = map(
- lambda x: x[0] == x[1], zip(self.column_types, self.column_types2))
- if False in comparison and not self.skip_header:
- number_of_column_types = len(set(self.column_types))
- if number_of_column_types == 1 and list(set(self.column_types))[0] == str:
- print('Warning - There seems to be a header line in the file, but -H has not been specified. All fields will be detected as text fields, and the header line will appear as part of the data', file=sys.stderr)
-
- def get_column_dict(self):
- return OrderedDict(zip(self.column_names, self.column_types))
-
- def get_column_count(self):
- return self.column_count
-
- def get_column_names(self):
- return self.column_names
-
- def get_column_types(self):
- return self.column_types
-
-
-def py3_encoded_csv_reader(encoding, f, dialect,row_data_only=False,**kwargs):
- try:
- xprint("f is %s" % str(f))
- xprint("dialect is %s" % dialect)
- csv_reader = csv.reader(f, dialect, **kwargs)
-
- if row_data_only:
- for row in csv_reader:
- yield row
- else:
- for row in csv_reader:
- yield (f.filename(),f.isfirstline(),row)
-
- except UnicodeDecodeError as e1:
- raise CouldNotParseInputException(e1)
- except ValueError as e:
- # TODO Add test for this
- if str(e) is not None and str(e).startswith('could not convert string to'):
- raise CouldNotConvertStringToNumericValueException(str(e))
- else:
- raise CouldNotParseInputException(str(e))
- except Exception as e:
- if str(e).startswith("field larger than field limit"):
- raise ColumnMaxLengthLimitExceededException(str(e))
- elif 'universal-newline' in str(e):
- raise UniversalNewlinesExistException()
- else:
- raise
-
-encoded_csv_reader = py3_encoded_csv_reader
-
-def normalized_filename(filename):
- return filename
-
-class TableCreatorState(object):
- INITIALIZED = 'INITIALIZED'
- ANALYZED = 'ANALYZED'
- FULLY_READ = 'FULLY_READ'
-
-class MaterializedStateType(object):
- UNKNOWN = 'unknown'
- DELIMITED_FILE = 'delimited-file'
- QSQL_FILE = 'qsql-file'
- SQLITE_FILE = 'sqlite-file'
- DATA_STREAM = 'data-stream'
-
-class TableSourceType(object):
- DELIMITED_FILE = 'file'
- DELIMITED_FILE_WITH_UNUSED_QSQL = 'file-with-unused-qsql'
- QSQL_FILE = 'qsql-file'
- QSQL_FILE_WITH_ORIGINAL = 'qsql-file-with-original'
- SQLITE_FILE = 'sqlite-file'
- DATA_STREAM = 'data-stream'
-
-def skip_BOM(f):
- try:
- BOM = f.buffer.read(3)
-
- if BOM != six.b('\xef\xbb\xbf'):
- # TODO Add test for this (propagates to try:except)
- raise Exception('Value of BOM is not as expected - Value is "%s"' % str(BOM))
- except Exception as e:
- # TODO Add a test for this
- raise Exception('Tried to skip BOM for "utf-8-sig" encoding and failed. Error message is ' + str(e))
-
-def detect_qtable_name_source_info(qtable_name,data_streams,read_caching_enabled):
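- # Resolution order: a registered data stream wins; then "file:::table"
- # references into qsql/sqlite files; then a bare qsql/sqlite file; otherwise a
- # delimited file, preferring a matching .qsql cache when read caching is enabled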
- data_stream = data_streams.get_for_filename(qtable_name)
- xprint("Found data stream %s" % data_stream)
-
- if data_stream is not None:
- return MaterializedStateType.DATA_STREAM, TableSourceType.DATA_STREAM,(data_stream,)
-
- if ':::' in qtable_name:
- qsql_filename, table_name = qtable_name.split(":::", 1)
- if not os.path.exists(qsql_filename):
- raise FileNotFoundException("Could not find file %s" % qsql_filename)
-
- if is_qsql_file(qsql_filename):
- return MaterializedStateType.QSQL_FILE, TableSourceType.QSQL_FILE, (qsql_filename, table_name,)
- if is_sqlite_file(qsql_filename):
- return MaterializedStateType.SQLITE_FILE, TableSourceType.SQLITE_FILE, (qsql_filename, table_name,)
- raise UnknownFileTypeException("Cannot detect the type of table %s" % qtable_name)
- else:
- if is_qsql_file(qtable_name):
- return MaterializedStateType.QSQL_FILE, TableSourceType.QSQL_FILE, (qtable_name, None)
- if is_sqlite_file(qtable_name):
- return MaterializedStateType.SQLITE_FILE, TableSourceType.SQLITE_FILE, (qtable_name, None)
- matching_qsql_file_candidate = qtable_name + '.qsql'
-
- table_source_type = TableSourceType.DELIMITED_FILE
- if is_qsql_file(matching_qsql_file_candidate):
- if read_caching_enabled:
- xprint("Found matching qsql file for original file %s (matching file %s) and read caching is enabled. Using it" % (qtable_name,matching_qsql_file_candidate))
- return MaterializedStateType.QSQL_FILE, TableSourceType.QSQL_FILE_WITH_ORIGINAL, (matching_qsql_file_candidate, None)
- else:
- xprint("Found matching qsql file for original file %s (matching file %s), but read caching is disabled. Not using it" % (qtable_name,matching_qsql_file_candidate))
- table_source_type = TableSourceType.DELIMITED_FILE_WITH_UNUSED_QSQL
-
-
- return MaterializedStateType.DELIMITED_FILE,table_source_type ,(qtable_name, None)
-
-
-def is_sqlite_file(filename):
- if not os.path.exists(filename):
- return False
-
- f = open(filename,'rb')
- magic = f.read(16)
- f.close()
- return magic == six.b("SQLite format 3\x00")
-
-def sqlite_table_exists(cursor,table_name):
- results = cursor.execute("select count(*) from sqlite_master where type='table' and tbl_name == '%s'" % table_name).fetchall()
- return results[0][0] == 1
-
-def is_qsql_file(filename):
- if not is_sqlite_file(filename):
- return False
-
- db = Sqlite3DB('check_qsql_db',filename,filename,create_qcatalog=False)
- qcatalog_exists = db.qcatalog_table_exists()
- db.done()
- return qcatalog_exists
-
-def normalize_filename_to_table_name(filename):
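- # Derives a valid sqlite table name from a filename: prefixes names that
- # start with a digit, strips known db-file extensions, and replaces characters
- # that are problematic in table names with readable placeholders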
- xprint("Normalizing filename %s" % filename)
- if filename[0].isdigit():
- xprint("Filename starts with a digit, adding prefix")
- filename = 't_%s' % filename
- if filename.lower().endswith(".qsql"):
- filename = filename[:-5]
- elif filename.lower().endswith('.sqlite'):
- filename = filename[:-7]
- elif filename.lower().endswith('.sqlite3'):
- filename = filename[:-8]
- return filename.replace("-","_dash_").replace(".","_dot_").replace('?','_qm_').replace("/","_slash_").replace("\\","_backslash_").replace(":","_colon_").replace(" ","_space_").replace("+","_plus_")
-
-def validate_content_signature(original_filename, source_signature,other_filename, content_signature,scope=None,dump=False):
- if dump:
- xprint("Comparing: source value: %s target value: %s" % (source_signature,content_signature))
-
- s = "%s vs %s:" % (original_filename,other_filename)
- if scope is None:
- scope = []
- for k in source_signature:
- if type(source_signature[k]) == OrderedDict:
- validate_content_signature(original_filename, source_signature[k],other_filename, content_signature[k],scope + [k])
- else:
- if k not in content_signature:
- raise ContentSignatureDataDiffersException("%s Content Signatures differ. %s is missing from content signature" % (s,k))
- if source_signature[k] != content_signature[k]:
- if k == 'rows':
- raise ContentSignatureDataDiffersException("%s Content Signatures differ at %s.%s (actual analysis data differs)" % (s,".".join(scope),k))
- else:
- raise ContentSignatureDiffersException(original_filename, other_filename, original_filename,".".join(scope + [k]),source_signature[k],content_signature[k])
-
-class DelimitedFileReader(object):
- def __init__(self,atomic_fns, input_params, dialect, f = None,external_f_name = None):
- if f is not None:
- assert len(atomic_fns) == 0
-
- self.atomic_fns = atomic_fns
- self.input_params = input_params
- self.dialect = dialect
-
- self.f = f
- self.lines_read = 0
- self.file_number = -1
-
- self.skipped_bom = False
-
- self.is_open = f is not None
-
- self.external_f = f is not None
- self.external_f_name = external_f_name
-
- def get_lines_read(self):
- return self.lines_read
-
- def get_size_hash(self):
- if self.atomic_fns is None or len(self.atomic_fns) == 0:
- return "data-stream-size"
- else:
- return ",".join(map(str,[os.stat(atomic_fn).st_size for atomic_fn in self.atomic_fns]))
-
- def get_last_modification_time_hash(self):
- if self.atomic_fns is None or len(self.atomic_fns) == 0:
- return "data stream-lmt"
- else:
- x = ",".join(map(lambda x: ':%s:' % x,[os.stat(x).st_mtime_ns for x in self.atomic_fns]))
- res = hashlib.sha1(six.b(x)).hexdigest() + '///' + x
- xprint("Hash of last modification time is %s" % res)
- return res
-
- def open_file(self):
- if self.external_f:
- xprint("External f has been provided. No need to open the file")
- return
-
- # TODO Support universal newlines for gzipped and stdin data as well
-
- xprint("XX Opening file %s" % ",".join(self.atomic_fns))
- import fileinput
-
- def q_openhook(filename, mode):
- if self.input_params.gzipped_input or filename.endswith('.gz'):
- import gzip
- f = gzip.open(filename,mode='rt',encoding=self.input_params.input_encoding)
- else:
- if six.PY3:
- if self.input_params.with_universal_newlines:
- # Mode 'U' is deprecated (and removed in Python 3.11); newline=None already enables universal newlines
- f = io.open(filename, 'r', newline=None, encoding=self.input_params.input_encoding)
- else:
- f = io.open(filename, 'r', newline=None, encoding=self.input_params.input_encoding)
- else:
- if self.input_params.with_universal_newlines:
- file_opening_mode = 'rbU'
- else:
- file_opening_mode = 'rb'
- f = open(filename, file_opening_mode)
-
- if self.input_params.input_encoding == 'utf-8-sig' and not self.skipped_bom:
- skip_BOM(f)
-
- return f
-
- f = fileinput.input(self.atomic_fns,mode='rb',openhook=q_openhook)
-
- self.f = f
- self.is_open = True
- xprint("Actually opened file %s" % self.f)
- return f
-
- def close_file(self):
- if not self.is_open:
- # TODO Convert to assertion
- raise Exception("Bug - file should already be open: %s" % ",".join(self.atomic_fns))
-
- self.f.close()
- xprint("XX Closed file %s" % ",".join(self.atomic_fns))
-
- def generate_rows(self):
- csv_reader = encoded_csv_reader(self.input_params.input_encoding, self.f, dialect=self.dialect,row_data_only=self.external_f)
- try:
- # TODO Some order with regard to separating data-streams for actual files
- if self.external_f:
- for col_vals in csv_reader:
- self.lines_read += 1
- # lines_read has just been incremented, so the first row is signaled by lines_read == 1
- yield self.external_f_name,0, self.lines_read == 1, col_vals
- else:
- for file_name,is_first_line,col_vals in csv_reader:
- if is_first_line:
- self.file_number = self.file_number + 1
- self.lines_read += 1
- yield file_name,self.file_number,is_first_line,col_vals
- except ColumnMaxLengthLimitExceededException as e:
- msg = "Column length is larger than the maximum. Offending file is '%s' - Line is %s, counting from 1 (encoding %s). The line number is the raw line number of the file, ignoring whether there's a header or not" % (",".join(self.atomic_fns),self.lines_read + 1,self.input_params.input_encoding)
- raise ColumnMaxLengthLimitExceededException(msg)
- except UniversalNewlinesExistException as e2:
- # No need to translate the exception, but we want it to be explicitly defined here for clarity
- raise UniversalNewlinesExistException()
-
-class MaterializedState(object):
- def __init__(self, table_source_type,qtable_name, engine_id):
- xprint("Creating new MS: %s %s" % (id(self), qtable_name))
-
- self.table_source_type = table_source_type
-
- self.qtable_name = qtable_name
- self.engine_id = engine_id
-
- self.db_to_use = None
- self.db_id = None
-
- self.source_type = None
- self.source = None
-
- self.mfs_structure = None
-
- self.start_time = None
- self.end_time = None
- self.duration = None
-
- self.effective_table_name = None
-
-
- def get_materialized_state_type(self):
- return MaterializedStateType.UNKNOWN
-
- def get_planned_table_name(self):
- assert False, 'not implemented'
-
- def autodetect_table_name(self):
- xprint("Autodetecting table name. db_to_use=%s" % self.db_to_use)
- existing_table_names = self.db_to_use.retrieve_all_table_names()
- xprint("Existing table names: %s" % existing_table_names)
-
- possible_indices = range(1,1000)
-
- for index in possible_indices:
- if index == 1:
- suffix = ''
- else:
- suffix = '_%s' % index
-
- table_name_attempt = '%s%s' % (self.get_planned_table_name(),suffix)
- xprint("Table name attempt: index=%s name=%s" % (index,table_name_attempt))
-
- if table_name_attempt not in existing_table_names:
- xprint("Found free table name %s for source type %s source %s" % (table_name_attempt,self.source_type,self.source))
- return table_name_attempt
-
- raise Exception('Cannot find free table name for source type %s source %s' % (self.source_type,self.source))
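-
- # Illustrative example (not part of the original source): if the planned table name is
- # 'my_table' and both 'my_table' and 'my_table_2' already exist in the db, the loop above
- # returns 'my_table_3'.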
-
- def initialize(self):
- self.start_time = time.time()
-
- def finalize(self):
- self.end_time = time.time()
- self.duration = self.end_time - self.start_time
-
- def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
- assert False, 'not implemented'
-
- def make_data_available(self,stop_after_analysis):
- assert False, 'not implemented'
-
-class MaterializedDelimitedFileState(MaterializedState):
- def __init__(self, table_source_type,qtable_name, input_params, dialect_id,engine_id,target_table_name=None):
- super().__init__(table_source_type,qtable_name,engine_id)
-
- self.input_params = input_params
- self.dialect_id = dialect_id
- self.target_table_name = target_table_name
-
- self.content_signature = None
-
- self.atomic_fns = None
-
- self.can_store_as_cached = None
-
- def get_materialized_state_type(self):
- return MaterializedStateType.DELIMITED_FILE
-
- def initialize(self):
- super(MaterializedDelimitedFileState, self).initialize()
-
- self.atomic_fns = self.materialize_file_list(self.qtable_name)
- self.delimited_file_reader = DelimitedFileReader(self.atomic_fns,self.input_params,self.dialect_id)
-
- self.source_type = self.table_source_type
- self.source = ",".join(self.atomic_fns)
-
- return
-
- def materialize_file_list(self,qtable_name):
- materialized_file_list = []
-
- unfound_files = []
- # First check whether the file exists as-is, without globbing, so that an existing filename containing glob characters is used literally
- if os.path.exists(qtable_name):
- # If it exists, then just use it
- found_files = [qtable_name]
- else:
- # If not, then try with globs (and sort for predictability)
- found_files = list(sorted(glob.glob(qtable_name)))
- # If no files
- if len(found_files) == 0:
- unfound_files += [qtable_name]
- materialized_file_list += found_files
-
- # If any of the requested files could not be found, fail explicitly
- if len(unfound_files) == 1:
- raise FileNotFoundException(
- "No files matching '%s' have been found" % unfound_files[0])
- elif len(unfound_files) > 1:
- # TODO Add test for this
- raise FileNotFoundException(
- "The following files have not been found for table %s: %s" % (qtable_name,",".join(unfound_files)))
-
- # Filter out qsql files that matched the glob, since they are caches of the original delimited files
- filtered_file_list = list(filter(lambda x: not x.endswith('.qsql'),materialized_file_list))
- xprint("Filtered qsql files from glob search. Original file count: %s new file count: %s" % (len(materialized_file_list),len(filtered_file_list)))
-
- l = len(filtered_file_list)
- # If this proves to be a problem for users in terms of usability, then we'll just materialize the files
- # into the adhoc db, as with the db attach limit of sqlite
- if l > 500:
- msg = "Maximum source files for table must be 500. Table is name is %s Number of actual files is %s" % (qtable_name,l)
- raise MaximumSourceFilesExceededException(msg)
-
- absolute_path_list = [os.path.abspath(x) for x in filtered_file_list]
- return absolute_path_list
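-
- # Illustrative example (not part of the original source): for a hypothetical pattern 'data/*'
- # matching 'data/b.csv', 'data/a.csv' and a leftover cache 'data/a.csv.qsql', the method
- # globs, sorts, drops the .qsql file and returns ['/abs/data/a.csv', '/abs/data/b.csv'].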
-
- def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
- if forced_db_to_use is not None:
- self.db_id = forced_db_to_use.db_id
- self.db_to_use = forced_db_to_use
- self.can_store_as_cached = False
- assert self.target_table_name is None
- self.target_table_name = self.autodetect_table_name()
- return
-
- self.can_store_as_cached = True
-
- self.db_id = '%s' % self._generate_db_name(self.atomic_fns[0])
- xprint("Database id is %s" % self.db_id)
- self.db_to_use = Sqlite3DB(self.db_id, 'file:%s?mode=memory&cache=shared' % self.db_id, 'memory<%s>' % self.db_id,create_qcatalog=True)
-
- if self.target_table_name is None:
- self.target_table_name = self.autodetect_table_name()
-
-
- def __analyze_delimited_file(self,database_info):
- xprint("Analyzing delimited file")
- if self.target_table_name is not None:
- target_sqlite_table_name = self.target_table_name
- else:
- assert False
-
- xprint("Target sqlite table name is %s" % target_sqlite_table_name)
- # Create the matching database table and populate it
- table_creator = TableCreator(self.qtable_name, self.delimited_file_reader,self.input_params, sqlite_db=database_info.sqlite_db,
- target_sqlite_table_name=target_sqlite_table_name)
- table_creator.perform_analyze(self.dialect_id)
- xprint("after perform_analyze")
- self.content_signature = table_creator._generate_content_signature()
-
- now = datetime.datetime.utcnow().isoformat()
-
- database_info.sqlite_db.add_to_qcatalog_table(target_sqlite_table_name,
- self.content_signature,
- now,
- self.source_type,
- self.source)
- return table_creator
-
- def _generate_disk_db_filename(self, filenames_str):
- fn = '%s.qsql' % (os.path.abspath(filenames_str).replace("+","__"))
- return fn
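-
- # Illustrative example (not part of the original source): a source file '/some/dir/my+data.csv'
- # yields the cache filename '/some/dir/my__data.csv.qsql' ('+' characters are replaced with '__').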
-
-
- def _get_should_read_from_cache(self, disk_db_filename):
- disk_db_file_exists = os.path.exists(disk_db_filename)
-
- should_read_from_cache = self.input_params.read_caching and disk_db_file_exists
-
- return should_read_from_cache
-
- def calculate_should_read_from_cache(self):
- # TODO cache filename is chosen according to first filename only, which makes multi-file (glob) caching difficult
- # cache writing is blocked for now in these cases. Will be added in the future (see save_cache_to_disk_if_needed)
- disk_db_filename = self._generate_disk_db_filename(self.atomic_fns[0])
- should_read_from_cache = self._get_should_read_from_cache(disk_db_filename)
- xprint("should read from cache %s" % should_read_from_cache)
- return disk_db_filename,should_read_from_cache
-
- def get_planned_table_name(self):
- return normalize_filename_to_table_name(os.path.basename(self.atomic_fns[0]))
-
- def make_data_available(self,stop_after_analysis):
- xprint("In make_data_available. db_id %s db_to_use %s" % (self.db_id,self.db_to_use))
- assert self.db_id is not None
-
- disk_db_filename, should_read_from_cache = self.calculate_should_read_from_cache()
- xprint("disk_db_filename=%s should_read_from_cache=%s" % (disk_db_filename,should_read_from_cache))
-
- database_info = DatabaseInfo(self.db_id,self.db_to_use, needs_closing=True)
- xprint("db %s (%s) has been added to the database list" % (self.db_id, self.db_to_use))
-
- self.delimited_file_reader.open_file()
-
- table_creator = self.__analyze_delimited_file(database_info)
-
- self.mfs_structure = MaterializedStateTableStructure(self.qtable_name, self.atomic_fns, self.db_id,
- table_creator.column_inferer.get_column_names(),
- table_creator.column_inferer.get_column_types(),
- None,
- self.target_table_name,
- self.source_type,
- self.source,
- self.get_planned_table_name())
-
- content_signature = table_creator.content_signature
- content_signature_key = self.db_to_use.calculate_content_signature_key(content_signature)
- xprint("table creator signature key: %s" % content_signature_key)
-
- relevant_table = self.db_to_use.get_from_qcatalog(content_signature)['temp_table_name']
-
- if not stop_after_analysis:
- table_creator.perform_read_fully(self.dialect_id)
-
- self.save_cache_to_disk_if_needed(disk_db_filename, table_creator)
-
-
- self.delimited_file_reader.close_file()
-
- return database_info, relevant_table
-
- def save_cache_to_disk_if_needed(self, disk_db_filename, table_creator):
- if len(self.atomic_fns) > 1:
- xprint("Cannot save cache for multi-files for now, deciding auto-naming for cache is challenging. Will be added in the future.")
- return
-
- effective_write_caching = self.input_params.write_caching
- if effective_write_caching:
- if self.can_store_as_cached:
- assert self.table_source_type != TableSourceType.DELIMITED_FILE_WITH_UNUSED_QSQL
- xprint("Going to write file cache for %s. Disk filename is %s" % (",".join(self.atomic_fns), disk_db_filename))
- self._store_qsql(table_creator.sqlite_db, disk_db_filename)
- else:
- xprint("Database has been provided externally. Skipping storing a cached version of the data")
-
- def _store_qsql(self, source_sqlite_db, disk_db_filename):
- xprint("Storing data as disk db")
- disk_db_conn = sqlite3.connect(disk_db_filename)
- with disk_db_conn:
- source_sqlite_db.conn.backup(disk_db_conn)
- xprint("Written db to disk: disk db filename %s" % (disk_db_filename))
- disk_db_conn.close()
-
- def _generate_db_name(self, qtable_name):
- return 'e_%s_fn_%s' % (self.engine_id,normalize_filename_to_table_name(qtable_name))
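-
- # Illustrative example (not part of the original source): with a hypothetical engine id 'abcd'
- # and source file 'orders.csv', the generated db id is 'e_abcd_fn_orders_dot_csv'.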
-
-
-class MaterialiedDataStreamState(MaterializedDelimitedFileState):
- def __init__(self, table_source_type, qtable_name, input_params, dialect_id, engine_id, data_stream, stream_target_db): ## should pass adhoc_db
- assert data_stream is not None
-
- super().__init__(table_source_type, qtable_name, input_params, dialect_id, engine_id,target_table_name=None)
-
- self.data_stream = data_stream
-
- self.stream_target_db = stream_target_db
-
- self.target_table_name = None
-
- def get_planned_table_name(self):
- return 'data_stream_%s' % (normalize_filename_to_table_name(self.source))
-
- def get_materialized_state_type(self):
- return MaterializedStateType.DATA_STREAM
-
- def initialize(self):
- self.start_time = time.time()
- if self.input_params.gzipped_input:
- raise CannotUnzipDataStreamException()
-
- self.source_type = self.table_source_type
- self.source = self.data_stream.stream_id
-
- self.delimited_file_reader = DelimitedFileReader([], self.input_params, self.dialect_id, f=self.data_stream.stream,external_f_name=self.source)
-
- def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
- assert forced_db_to_use is None
-
- self.db_id = self.stream_target_db.db_id
- self.db_to_use = self.stream_target_db
-
- self.target_table_name = self.autodetect_table_name()
-
- return
-
- def calculate_should_read_from_cache(self):
- # No disk_db_filename, and no reading from cache when reading a datastream
- return None, False
-
- def finalize(self):
- super(MaterialiedDataStreamState, self).finalize()
-
- def save_cache_to_disk_if_needed(self, disk_db_filename, table_creator):
- xprint("Saving to cache is disabled for data streams")
- return
-
-
-class MaterializedSqliteState(MaterializedState):
- def __init__(self,table_source_type,qtable_name,sqlite_filename,table_name, engine_id):
- super(MaterializedSqliteState, self).__init__(table_source_type,qtable_name,engine_id)
- self.sqlite_filename = sqlite_filename
- self.table_name = table_name
-
- self.table_name_autodetected = None
-
- def initialize(self):
- super(MaterializedSqliteState, self).initialize()
-
- self.table_name_autodetected = False
- if self.table_name is None:
- self.table_name = self.autodetect_table_name()
- self.table_name_autodetected = True
- return
-
- self.validate_table_name()
-
- def get_planned_table_name(self):
- if self.table_name_autodetected:
- return normalize_filename_to_table_name(os.path.basename(self.qtable_name))
- else:
- return self.table_name
-
-
- def autodetect_table_name(self):
- db = Sqlite3DB('temp_db','file:%s?immutable=1' % self.sqlite_filename,self.sqlite_filename,create_qcatalog=False)
- try:
- table_names = list(sorted(db.retrieve_all_table_names()))
- if len(table_names) == 1:
- return table_names[0]
- elif len(table_names) == 0:
- raise NoTablesInSqliteException(self.sqlite_filename)
- else:
- raise TooManyTablesInSqliteException(self.sqlite_filename,table_names)
- finally:
- db.done()
-
- def validate_table_name(self):
- db = Sqlite3DB('temp_db', 'file:%s?immutable=1' % self.sqlite_filename, self.sqlite_filename,
- create_qcatalog=False)
- try:
- table_names = list(db.retrieve_all_table_names())
- if self.table_name.lower() not in map(lambda x:x.lower(),table_names):
- raise NonExistentTableNameInSqlite(self.sqlite_filename, self.table_name, table_names)
- finally:
- db.done()
-
- def finalize(self):
- super(MaterializedSqliteState, self).finalize()
-
- def get_materialized_state_type(self):
- return MaterializedStateType.SQLITE_FILE
-
- def _generate_qsql_only_db_name__temp(self, filenames_str):
- return 'e_%s_fn_%s' % (self.engine_id,hashlib.sha1(six.b(filenames_str)).hexdigest())
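-
- # Illustrative note (not part of the original source): unlike the delimited-file variant,
- # the db id here is derived from a sha1 of the filename, e.g. roughly
- # 'e_abcd_fn_' + hashlib.sha1(six.b('my.sqlite')).hexdigest() for a hypothetical engine id 'abcd'.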
-
- def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
- self.source = self.sqlite_filename
- self.source_type = self.table_source_type
-
- self.db_id = '%s' % self._generate_qsql_only_db_name__temp(self.qtable_name)
-
- x = 'file:%s?immutable=1' % self.sqlite_filename
- self.db_to_use = Sqlite3DB(self.db_id, x, self.sqlite_filename,create_qcatalog=False)
-
- if forced_db_to_use:
- xprint("Forced sqlite db_to_use %s" % forced_db_to_use)
- new_table_name = forced_db_to_use.attach_and_copy_table(self.db_to_use,self.table_name,stop_after_analysis)
- self.table_name = new_table_name
- self.db_id = forced_db_to_use.db_id
- self.db_to_use = forced_db_to_use
-
- return
-
- def make_data_available(self,stop_after_analysis):
- xprint("db %s (%s) has been added to the database list" % (self.db_id, self.db_to_use))
-
- database_info,relevant_table = DatabaseInfo(self.db_id,self.db_to_use, needs_closing=True), self.table_name
-
- column_names, column_types, sqlite_column_types = self._extract_information()
-
- self.mfs_structure = MaterializedStateTableStructure(self.qtable_name, [self.qtable_name], self.db_id,
- column_names, column_types, sqlite_column_types,
- self.table_name,
- self.source_type,self.source,
- self.get_planned_table_name())
- return database_info, relevant_table
-
- def _extract_information(self):
- table_list = self.db_to_use.retrieve_all_table_names()
- if len(table_list) == 1:
- # retrieve_all_table_names() returns plain table names (see autodetect_table_name above),
- # so the single entry is the name itself
- table_name = table_list[0]
- xprint("Only one table in sqlite database, choosing it: %s" % table_name)
- else:
- # self.table_name has either been autodetected, or validated as an existing table up the stack
- table_name = self.table_name
- xprint("Multiple tables in sqlite file. Using provided table name %s" % self.table_name)
-
- table_info = self.db_to_use.get_sqlite_table_info(table_name)
- xprint('Table info is %s' % table_info)
- column_names = list(map(lambda x: x[1], table_info))
- sqlite_column_types = list(map(lambda x: x[2].lower(),table_info))
- column_types = list(map(lambda x: sqlite_type_to_python_type(x[2]), table_info))
- xprint("Column names and types for table %s: %s" % (table_name, list(zip(column_names, zip(sqlite_column_types,column_types)))))
- self.content_signature = OrderedDict()
-
- return column_names, column_types, sqlite_column_types
-
-
-class MaterializedQsqlState(MaterializedState):
- def __init__(self,table_source_type,qtable_name,qsql_filename,table_name, engine_id,input_params,dialect_id):
- super(MaterializedQsqlState, self).__init__(table_source_type,qtable_name,engine_id)
- self.qsql_filename = qsql_filename
- self.table_name = table_name
-
- # These are for cases where the qsql file is just a cache and the original is still there, used for content
- # validation
- self.input_params = input_params
- self.dialect_id = dialect_id
-
- self.table_name_autodetected = None
-
- def initialize(self):
- super(MaterializedQsqlState, self).initialize()
-
- self.table_name_autodetected = False
- if self.table_name is None:
- self.table_name = self.autodetect_table_name()
- self.table_name_autodetected = True
- return
-
- self.validate_table_name()
-
- def get_planned_table_name(self):
- if self.table_name_autodetected:
- return normalize_filename_to_table_name(os.path.basename(self.qtable_name))
- else:
- return self.table_name
-
-
- def autodetect_table_name(self):
- db = Sqlite3DB('temp_db','file:%s?immutable=1' % self.qsql_filename,self.qsql_filename,create_qcatalog=False)
- assert db.qcatalog_table_exists()
- try:
- qcatalog_entries = db.get_all_from_qcatalog()
- if len(qcatalog_entries) == 0:
- raise NoTableInQsqlExcption(self.qsql_filename)
- elif len(qcatalog_entries) == 1:
- return qcatalog_entries[0]['temp_table_name']
- else:
- # TODO Add a test for this
- table_names = list(sorted([x['temp_table_name'] for x in qcatalog_entries]))
- raise TooManyTablesInQsqlException(self.qsql_filename,table_names)
- finally:
- db.done()
-
- def validate_table_name(self):
- db = Sqlite3DB('temp_db', 'file:%s?immutable=1' % self.qsql_filename, self.qsql_filename,
- create_qcatalog=False)
- assert db.qcatalog_table_exists()
- try:
- entry = db.get_from_qcatalog_using_table_name(self.table_name)
- if entry is None:
- qcatalog_entries = db.get_all_from_qcatalog()
- table_names = list(sorted([x['temp_table_name'] for x in qcatalog_entries]))
- raise NonExistentTableNameInQsql(self.qsql_filename,self.table_name,table_names)
- finally:
- db.done()
-
- def finalize(self):
- super(MaterializedQsqlState, self).finalize()
-
- def get_materialized_state_type(self):
- return MaterializedStateType.QSQL_FILE
-
- def _generate_qsql_only_db_name__temp(self, filenames_str):
- return 'e_%s_fn_%s' % (self.engine_id,hashlib.sha1(six.b(filenames_str)).hexdigest())
-
- def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
- self.source = self.qsql_filename
- self.source_type = self.table_source_type
-
- self.db_id = '%s' % self._generate_qsql_only_db_name__temp(self.qtable_name)
-
- x = 'file:%s?immutable=1' % self.qsql_filename
- self.db_to_use = Sqlite3DB(self.db_id, x, self.qsql_filename,create_qcatalog=False)
-
- if forced_db_to_use:
- xprint("Forced qsql to use forced_db: %s" % forced_db_to_use)
-
- # TODO RLRL Move query to Sqlite3DB
- all_table_names = [(x[0],x[1]) for x in self.db_to_use.execute_and_fetch("select content_signature_key,temp_table_name from %s" % self.db_to_use.QCATALOG_TABLE_NAME).results]
- csk,t = list(filter(lambda x: x[1] == self.table_name,all_table_names))[0]
- xprint("Copying table %s from db_id %s" % (t,self.db_id))
- d = self.db_to_use.get_from_qcatalog_using_table_name(t)
-
- new_table_name = forced_db_to_use.attach_and_copy_table(self.db_to_use,self.table_name,stop_after_analysis)
-
- xprint("CS",d['content_signature'])
- cs = OrderedDict(json.loads(d['content_signature']))
- forced_db_to_use.add_to_qcatalog_table(new_table_name, cs, d['creation_time'],
- d['source_type'], d['source'])
-
- self.table_name = new_table_name
- self.db_id = forced_db_to_use.db_id
- self.db_to_use = forced_db_to_use
-
- return
-
- def make_data_available(self,stop_after_analysis):
- xprint("db %s (%s) has been added to the database list" % (self.db_id, self.db_to_use))
-
- database_info,relevant_table = self._read_table_from_cache(stop_after_analysis)
-
- column_names, column_types, sqlite_column_types = self._extract_information()
-
- self.mfs_structure = MaterializedStateTableStructure(self.qtable_name, [self.qtable_name], self.db_id,
- column_names, column_types, sqlite_column_types,
- self.table_name,
- self.source_type,self.source,
- self.get_planned_table_name())
- return database_info, relevant_table
-
- def _extract_information(self):
- assert self.db_to_use.qcatalog_table_exists()
- table_info = self.db_to_use.get_sqlite_table_info(self.table_name)
- xprint('table_name=%s Table info is %s' % (self.table_name,table_info))
-
- x = self.db_to_use.get_from_qcatalog_using_table_name(self.table_name)
-
- column_names = list(map(lambda x: x[1], table_info))
- sqlite_column_types = list(map(lambda x: x[2].lower(),table_info))
- column_types = list(map(lambda x: sqlite_type_to_python_type(x[2]), table_info))
- self.content_signature = OrderedDict(
- **json.loads(x['content_signature']))
- xprint('Inferred column names and types from qsql: %s' % list(zip(column_names, zip(sqlite_column_types,column_types))))
-
- return column_names, column_types, sqlite_column_types
-
- def _backing_original_file_exists(self):
- # True when the qsql file sits right next to its original source file (original filename + '.qsql')
- return ('%s.qsql' % self.qtable_name) == self.qsql_filename
-
- def _read_table_from_cache(self, stop_after_analysis):
- if self._backing_original_file_exists():
- xprint("Found a matching source file for qsql file with qtable name %s. Checking content signature by creating a temp MFDS + analysis" % self.qtable_name)
- mdfs = MaterializedDelimitedFileState(TableSourceType.DELIMITED_FILE,self.qtable_name,self.input_params,self.dialect_id,self.engine_id,target_table_name=None)
- mdfs.initialize()
- mdfs.choose_db_to_use(forced_db_to_use=None,stop_after_analysis=stop_after_analysis)
- _,_ = mdfs.make_data_available(stop_after_analysis=True)
-
- original_file_content_signature = mdfs.content_signature
- original_file_content_signature_key = self.db_to_use.calculate_content_signature_key(original_file_content_signature)
-
- qcatalog_entry = self.db_to_use.get_from_qcatalog_using_table_name(self.table_name)
-
- if qcatalog_entry is None:
- raise Exception('missing content signature!')
-
- xprint("Actual Signature Key: %s Expected Signature Key: %s" % (qcatalog_entry['content_signature_key'],original_file_content_signature_key))
- actual_content_signature = json.loads(qcatalog_entry['content_signature'])
-
- xprint("Validating content signatures: original %s vs qsql %s" % (original_file_content_signature,actual_content_signature))
- validate_content_signature(self.qtable_name, original_file_content_signature, self.qsql_filename, actual_content_signature,dump=True)
- mdfs.finalize()
- return DatabaseInfo(self.db_id,self.db_to_use, needs_closing=True), self.table_name
-
-
-class MaterializedStateTableStructure(object):
- def __init__(self,qtable_name, atomic_fns, db_id, column_names, python_column_types, sqlite_column_types, table_name_for_querying,source_type,source,planned_table_name):
- self.qtable_name = qtable_name
- self.atomic_fns = atomic_fns
- self.db_id = db_id
- self.column_names = column_names
- self.python_column_types = python_column_types
- self.table_name_for_querying = table_name_for_querying
- self.source_type = source_type
- self.source = source
- self.planned_table_name = planned_table_name
-
- if sqlite_column_types is not None:
- self.sqlite_column_types = sqlite_column_types
- else:
- self.sqlite_column_types = [Sqlite3DB.PYTHON_TO_SQLITE_TYPE_NAMES[t].lower() for t in python_column_types]
-
- def get_table_name_for_querying(self):
- return self.table_name_for_querying
-
- def __str__(self):
- return "MaterializedStateTableStructure<%s>" % self.__dict__
- __repr__ = __str__
-
-class TableCreator(object):
- def __str__(self):
- return "TableCreator<%s>" % str(self)
- __repr__ = __str__
-
- def __init__(self, qtable_name, delimited_file_reader,input_params,sqlite_db=None,target_sqlite_table_name=None):
-
- self.qtable_name = qtable_name
- self.delimited_file_reader = delimited_file_reader
-
- self.db_id = sqlite_db.db_id
-
- self.sqlite_db = sqlite_db
- self.target_sqlite_table_name = target_sqlite_table_name
-
- self.skip_header = input_params.skip_header
- self.gzipped = input_params.gzipped_input
- self.table_created = False
-
- self.encoding = input_params.input_encoding
- self.mode = input_params.parsing_mode
- self.expected_column_count = input_params.expected_column_count
- self.input_delimiter = input_params.delimiter
- self.with_universal_newlines = input_params.with_universal_newlines
-
- self.column_inferer = TableColumnInferer(input_params)
-
- self.pre_creation_rows = []
- self.buffered_inserts = []
- self.effective_column_names = None
-
- # Column type indices for columns that contain numeric types. Lazily initialized
- # so column inferer can do its work before this information is needed
- self.numeric_column_indices = None
-
- self.state = TableCreatorState.INITIALIZED
-
- self.content_signature = None
-
- def _generate_content_signature(self):
- if self.state != TableCreatorState.ANALYZED:
- # TODO Change to assertion
- raise Exception('Bug - Wrong state %s. Table needs to be analyzed before a content signature can be calculated' % self.state)
-
- size = self.delimited_file_reader.get_size_hash()
- last_modification_time = self.delimited_file_reader.get_last_modification_time_hash()
-
- m = OrderedDict({
- "_signature_version": "v1",
- "skip_header": self.skip_header,
- "gzipped": self.gzipped,
- "with_universal_newlines": self.with_universal_newlines,
- "encoding": self.encoding,
- "mode": self.mode,
- "expected_column_count": self.expected_column_count,
- "input_delimiter": self.input_delimiter,
- "inferer": self.column_inferer._generate_content_signature(),
- "original_file_size": size,
- "last_modification_time": last_modification_time
- })
-
- return m
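-
- # Illustrative example (not part of the original source) of a generated signature, with
- # hypothetical values:
- #   OrderedDict([('_signature_version', 'v1'), ('skip_header', True), ('gzipped', False),
- #                ('with_universal_newlines', False), ('encoding', 'UTF-8'), ('mode', 'relaxed'),
- #                ('expected_column_count', None), ('input_delimiter', ','),
- #                ('inferer', OrderedDict(...)), ('original_file_size', '1024'),
- #                ('last_modification_time', '<sha1-hex>///:1634567890123456789:')])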
-
- def validate_extra_header_if_needed(self, file_number, filename,col_vals):
- xprint("HHX validate",file_number,filename,col_vals)
- if not self.skip_header:
- xprint("No need to validate header")
- return False
-
- if file_number == 0:
- xprint("First file, no need to validate extra header")
- return False
-
- header_already_exists = self.column_inferer.header_row is not None
-
- if header_already_exists:
- xprint("Validating extra header")
- if tuple(self.column_inferer.header_row) != tuple(col_vals):
- raise BadHeaderException("Extra header '{}' in file '{}' mismatches original header '{}' from file '{}'. Table name is '{}'".format(
- ",".join(col_vals),filename,
- ",".join(self.column_inferer.header_row),
- self.column_inferer.header_row_filename,
- self.qtable_name))
- xprint("header already exists: %s" % self.column_inferer.header_row)
- else:
- xprint("Header doesn't already exist")
-
- return header_already_exists
-
- def _populate(self,dialect,stop_after_analysis=False):
- total_data_lines_read = 0
- try:
- try:
- for file_name,file_number,is_first_line,col_vals in self.delimited_file_reader.generate_rows():
- if is_first_line:
- if self.validate_extra_header_if_needed(file_number,file_name,col_vals):
- continue
- self._insert_row(file_name, col_vals)
- if stop_after_analysis:
- if self.column_inferer.inferred:
- xprint("Stopping after analysis")
- return
- if self.delimited_file_reader.get_lines_read() == 0 and self.skip_header:
- raise MissingHeaderException("Header line is expected but missing in file %s" % ",".join(self.delimited_file_reader.atomic_fns))
-
- total_data_lines_read += self.delimited_file_reader.lines_read - (1 if self.skip_header else 0)
- xprint("Total Data lines read %s" % total_data_lines_read)
- except StrictModeColumnCountMismatchException as e:
- raise ColumnCountMismatchException(
- 'Strict mode - Expected %s columns instead of %s columns in file %s row %s. Either use relaxed modes or check your delimiter' % (
- e.expected_col_count, e.actual_col_count, normalized_filename(e.atomic_fn), e.lines_read))
- except FluffyModeColumnCountMismatchException as e:
- raise ColumnCountMismatchException(
- 'Deprecated fluffy mode - Too many columns in file %s row %s (%s fields instead of %s fields). Consider moving to either relaxed or strict mode' % (
- normalized_filename(e.atomic_fn), e.lines_read, e.actual_col_count, e.expected_col_count))
- finally:
- self._flush_inserts()
-
- if not self.table_created:
- self.column_inferer.force_analysis()
- self._do_create_table(self.qtable_name)
-
- self.sqlite_db.conn.commit()
-
- def perform_analyze(self, dialect):
- xprint("Analyzing... %s" % dialect)
- if self.state == TableCreatorState.INITIALIZED:
- self._populate(dialect,stop_after_analysis=True)
- self.state = TableCreatorState.ANALYZED
-
- self.content_signature = self._generate_content_signature()
- content_signature_key = self.sqlite_db.calculate_content_signature_key(self.content_signature)
- xprint("Setting content signature after analysis: %s" % content_signature_key)
- else:
- # TODO Convert to assertion
- raise Exception('Bug - Wrong state %s' % self.state)
-
- def perform_read_fully(self, dialect):
- if self.state == TableCreatorState.ANALYZED:
- self._populate(dialect,stop_after_analysis=False)
- self.state = TableCreatorState.FULLY_READ
- else:
- # TODO Convert to assertion
- raise Exception('Bug - Wrong state %s' % self.state)
-
- def _flush_pre_creation_rows(self, filename):
- for i, col_vals in enumerate(self.pre_creation_rows):
- if self.skip_header and i == 0:
- # skip header line
- continue
- self._insert_row(filename, col_vals)
- self._flush_inserts()
- self.pre_creation_rows = []
-
- def _insert_row(self, filename, col_vals):
- # If table has not been created yet
- if not self.table_created:
- # Try to create it along with another "example" line of data
- self.try_to_create_table(filename, col_vals)
-
- # If the table is still not created, then we don't have enough data, just
- # store the data and return
- if not self.table_created:
- self.pre_creation_rows.append(col_vals)
- return
-
-
- # The table already exists, so we can just add a new row
- self._insert_row_i(col_vals)
-
- def initialize_numeric_column_indices_if_needed(self):
- # Lazy initialization of numeric column indices
- if self.numeric_column_indices is None:
- column_types = self.column_inferer.get_column_types()
- self.numeric_column_indices = [idx for idx, column_type in enumerate(
- column_types) if self.sqlite_db.is_numeric_type(column_type)]
-
- def nullify_values_if_needed(self, col_vals):
- new_vals = col_vals[:]
- col_count = len(col_vals)
- for i in self.numeric_column_indices:
- if i >= col_count:
- continue
- v = col_vals[i]
- if v == '':
- new_vals[i] = None
- return new_vals
-
- def normalize_col_vals(self, col_vals):
- # Make sure that numeric column indices are initialized
- self.initialize_numeric_column_indices_if_needed()
-
- col_vals = self.nullify_values_if_needed(col_vals)
-
- expected_col_count = self.column_inferer.get_column_count()
- actual_col_count = len(col_vals)
- if self.mode == 'strict':
- if actual_col_count != expected_col_count:
- raise StrictModeColumnCountMismatchException(",".join(self.delimited_file_reader.atomic_fns), expected_col_count,actual_col_count,self.delimited_file_reader.get_lines_read())
- return col_vals
-
- # In all non-strict modes, pad missing columns with None values
-
- if actual_col_count < expected_col_count:
- col_vals = col_vals + \
- [None for x in range(expected_col_count - actual_col_count)]
-
- # in relaxed mode, we merge all extra columns to the last column value
- if self.mode == 'relaxed':
- if actual_col_count > expected_col_count:
- xxx = col_vals[:expected_col_count - 1] + \
- [self.input_delimiter.join([v if v is not None else '' for v in
- col_vals[expected_col_count - 1:]])]
- return xxx
- else:
- return col_vals
-
- assert False, "Unidentified parsing mode %s" % self.mode
-
- def _insert_row_i(self, col_vals):
- col_vals = self.normalize_col_vals(col_vals)
-
- if self.effective_column_names is None:
- self.effective_column_names = self.column_inferer.column_names[:len(col_vals)]
-
- if len(self.effective_column_names) > 0:
- self.buffered_inserts.append(col_vals)
- else:
- self.buffered_inserts.append([""])
-
- if len(self.buffered_inserts) < 5000:
- return
- self._flush_inserts()
-
- def _flush_inserts(self):
- # If the table is still not created, then we don't have enough data
- if not self.table_created:
- return
-
- if len(self.buffered_inserts) > 0:
- insert_row_stmt = self.sqlite_db.generate_insert_row(
- self.target_sqlite_table_name, self.effective_column_names)
-
- self.sqlite_db.update_many(insert_row_stmt, self.buffered_inserts)
- self.buffered_inserts = []
-
- def try_to_create_table(self, filename, col_vals):
- if self.table_created:
- # TODO Convert to assertion
- raise Exception('Table is already created')
-
- # Add that line to the column inferer
- result = self.column_inferer.analyze(filename, col_vals)
- # If the inferer gathered enough data, create the table
- if result:
- self._do_create_table(filename)
- else:
- pass # We don't have enough information for creating the table yet
-
- def _do_create_table(self,filename):
- # Get the column definition dict from the inferer
- column_dict = self.column_inferer.get_column_dict()
-
- # Guard against empty tables (instead of preventing the creation, just create with a dummy column)
- if len(column_dict) == 0:
- column_dict = { 'dummy_column_for_empty_tables' : str }
- ordered_column_names = [ 'dummy_column_for_empty_tables' ]
- else:
- ordered_column_names = self.column_inferer.get_column_names()
-
- # Create the CREATE TABLE statement
- create_table_stmt = self.sqlite_db.generate_create_table(
- self.target_sqlite_table_name, ordered_column_names, column_dict)
- # And create the table itself
- self.sqlite_db.execute_and_fetch(create_table_stmt)
- # Mark the table as created
- self.table_created = True
- self._flush_pre_creation_rows(filename)
-
-
-def determine_max_col_lengths(m,output_field_quoting_func,output_delimiter):
- if len(m) == 0:
- return []
- max_lengths = [0 for x in range(0, len(m[0]))]
- for row_index in range(0, len(m)):
- for col_index in range(0, len(m[0])):
- # TODO Optimize this
- new_len = len("{}".format(output_field_quoting_func(output_delimiter,m[row_index][col_index])))
- if new_len > max_lengths[col_index]:
- max_lengths[col_index] = new_len
- return max_lengths
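-
-# Illustrative example (not part of the original source): for m = [['a','bbb'],['cc','d']] and a
-# hypothetical pass-through quoting function (lambda delim,v: v), the result is [2, 3] - the
-# length of the longest formatted value in each column.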
-
-def print_credentials():
- print("q version %s" % q_version, file=sys.stderr)
- print("Python: %s" % " // ".join([str(x).strip() for x in sys.version.split("\n")]), file=sys.stderr)
- print("Copyright (C) 2012-2021 Harel Ben-Attia (harelba@gmail.com, @harelba on twitter)", file=sys.stderr)
- print("https://harelba.github.io/q/", file=sys.stderr)
- print(file=sys.stderr)
-
-class QWarning(object):
- def __init__(self,exception,msg):
- self.exception = exception
- self.msg = msg
-
-class QError(object):
- def __init__(self,exception,msg,errorcode):
- self.exception = exception
- self.msg = msg
- self.errorcode = errorcode
- self.traceback = traceback.format_exc()
-
- def __str__(self):
- return "QError" % (self.errorcode,self.msg,self.exception,str(self.traceback))
- __repr__ = __str__
-
-class QMetadata(object):
- def __init__(self,table_structures={},new_table_structures={},output_column_name_list=None):
- self.table_structures = table_structures
- self.new_table_structures = new_table_structures
- self.output_column_name_list = output_column_name_list
-
- def __str__(self):
- return "QMetadata<%s" % (self.__dict__)
- __repr__ = __str__
-
-class QOutput(object):
- def __init__(self,data=None,metadata=None,warnings=[],error=None):
- self.data = data
- self.metadata = metadata
-
- self.warnings = warnings
- self.error = error
- if error is None:
- self.status = 'ok'
- else:
- self.status = 'error'
-
- def __str__(self):
- s = []
- s.append('status=%s' % self.status)
- if self.error is not None:
- s.append("error=%s" % self.error.msg)
- if len(self.warnings) > 0:
- s.append("warning_count=%s" % len(self.warnings))
- if self.data is not None:
- s.append("row_count=%s" % len(self.data))
- else:
- s.append("row_count=None")
- if self.metadata is not None:
- s.append("metadata=<%s>" % self.metadata)
- else:
- s.append("metadata=None")
- return "QOutput<%s>" % ",".join(s)
- __repr__ = __str__
-
-class QInputParams(object):
- def __init__(self,skip_header=False,
- delimiter=' ',input_encoding='UTF-8',gzipped_input=False,with_universal_newlines=False,parsing_mode='relaxed',
- expected_column_count=None,keep_leading_whitespace_in_values=False,
- disable_double_double_quoting=False,disable_escaped_double_quoting=False,
- disable_column_type_detection=False,
- input_quoting_mode='minimal',stdin_file=None,stdin_filename='-',
- max_column_length_limit=131072,
- read_caching=False,
- write_caching=False,
- max_attached_sqlite_databases = 10):
- self.skip_header = skip_header
- self.delimiter = delimiter
- self.input_encoding = input_encoding
- self.gzipped_input = gzipped_input
- self.with_universal_newlines = with_universal_newlines
- self.parsing_mode = parsing_mode
- self.expected_column_count = expected_column_count
- self.keep_leading_whitespace_in_values = keep_leading_whitespace_in_values
- self.disable_double_double_quoting = disable_double_double_quoting
- self.disable_escaped_double_quoting = disable_escaped_double_quoting
- self.input_quoting_mode = input_quoting_mode
- self.disable_column_type_detection = disable_column_type_detection
- self.max_column_length_limit = max_column_length_limit
- self.read_caching = read_caching
- self.write_caching = write_caching
- self.max_attached_sqlite_databases = max_attached_sqlite_databases
-
- def merged_with(self,input_params):
- params = QInputParams(**self.__dict__)
- if input_params is not None:
- params.__dict__.update(**input_params.__dict__)
- return params
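-
- # Illustrative note (not part of the original source): the override copies the entire
- # __dict__ of input_params, so e.g.
- #   QInputParams(skip_header=True).merged_with(QInputParams(delimiter='\t'))
- # yields delimiter='\t' but also skip_header=False, since the overriding instance carries
- # its own defaults for every field.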
-
- def __str__(self):
- return "QInputParams<%s>" % str(self.__dict__)
-
- def __repr__(self):
- return "QInputParams(...)"
-
-class DataStream(object):
- # TODO Can stream-id be removed?
- def __init__(self,stream_id,filename,stream):
- self.stream_id = stream_id
- self.filename = filename
- self.stream = stream
-
- def __str__(self):
- return "QDataStream" % (self.stream_id,self.filename,self.stream)
- __repr__ = __str__
-
-
-class DataStreams(object):
- def __init__(self, data_streams_dict):
- assert type(data_streams_dict) == dict
- self.validate(data_streams_dict)
- self.data_streams_dict = data_streams_dict
-
- def validate(self,d):
- for k in d:
- v = d[k]
- if type(k) != str or type(v) != DataStream:
- raise Exception('Bug - Invalid dict: %s' % str(d))
-
- def get_for_filename(self, filename):
- xprint("Data streams dict is %s. Trying to find %s" % (self.data_streams_dict,filename))
- x = self.data_streams_dict.get(filename)
- return x
-
- def is_data_stream(self,filename):
- return filename in self.data_streams_dict
-
-class DatabaseInfo(object):
- def __init__(self,db_id,sqlite_db,needs_closing):
- self.db_id = db_id
- self.sqlite_db = sqlite_db
- self.needs_closing = needs_closing
-
- def __str__(self):
- return "DatabaseInfo" % (self.sqlite_db,self.needs_closing)
- __repr__ = __str__
-
-class QTextAsData(object):
- def __init__(self,default_input_params=QInputParams(),data_streams_dict=None):
- self.engine_id = str(uuid.uuid4()).replace("-","_")
-
- self.default_input_params = default_input_params
- xprint("Default input params: %s" % self.default_input_params)
-
- self.loaded_table_structures_dict = OrderedDict()
- self.databases = OrderedDict()
-
- if data_streams_dict is not None:
- self.data_streams = DataStreams(data_streams_dict)
- else:
- self.data_streams = DataStreams({})
-
- # Create DB object
- self.query_level_db_id = 'query_e_%s' % self.engine_id
- self.query_level_db = Sqlite3DB(self.query_level_db_id,
- 'file:%s?mode=memory&cache=shared' % self.query_level_db_id,'',create_qcatalog=True)
- self.adhoc_db_id = 'adhoc_e_%s' % self.engine_id
- self.adhoc_db_name = 'file:%s?mode=memory&cache=shared' % self.adhoc_db_id
- self.adhoc_db = Sqlite3DB(self.adhoc_db_id,self.adhoc_db_name,'',create_qcatalog=True)
- self.query_level_db.conn.execute("attach '%s' as %s" % (self.adhoc_db_name,self.adhoc_db_id))
-
- self.add_db_to_database_list(DatabaseInfo(self.query_level_db_id,self.query_level_db,needs_closing=True))
- self.add_db_to_database_list(DatabaseInfo(self.adhoc_db_id,self.adhoc_db,needs_closing=True))
-
- def done(self):
- xprint("Inside done: Database list is %s" % self.databases)
- for db_id in reversed(self.databases.keys()):
- database_info = self.databases[db_id]
- if database_info.needs_closing:
- xprint("Gonna close database %s - %s" % (db_id,self.databases[db_id]))
- self.databases[db_id].sqlite_db.done()
- xprint("Database %s has been closed" % db_id)
- else:
- xprint("No need to close database %s" % db_id)
- xprint("Closed all databases")
-
- input_quoting_modes = { 'minimal' : csv.QUOTE_MINIMAL,
- 'all' : csv.QUOTE_ALL,
- # nonnumeric is not supported for input quoting modes, since we determine the data types
- # ourselves instead of letting the csv module try to identify the types
- 'none' : csv.QUOTE_NONE }
-
- def determine_proper_dialect(self,input_params):
-
- input_quoting_mode_csv_numeral = QTextAsData.input_quoting_modes[input_params.input_quoting_mode]
-
- if input_params.keep_leading_whitespace_in_values:
- skip_initial_space = False
- else:
- skip_initial_space = True
-
- dialect = {'skipinitialspace': skip_initial_space,
- 'delimiter': input_params.delimiter, 'quotechar': '"' }
- dialect['quoting'] = input_quoting_mode_csv_numeral
- dialect['doublequote'] = input_params.disable_double_double_quoting
-
- if input_params.disable_escaped_double_quoting:
- dialect['escapechar'] = '\\'
-
- return dialect
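-
- # Illustrative example (not part of the original source): for the QInputParams defaults
- # above (space delimiter, minimal quoting, leading whitespace not kept), the resulting
- # dialect dict is:
- #   {'skipinitialspace': True, 'delimiter': ' ', 'quotechar': '"',
- #    'quoting': csv.QUOTE_MINIMAL, 'doublequote': False}
- # with 'escapechar' added only when escaped double-quoting is disabled.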
-
- def get_dialect_id(self,filename):
- return 'q_dialect_%s' % filename
-
- def _open_files_and_get_mfss(self,qtable_name,input_params,dialect):
- materialized_file_dict = OrderedDict()
-
- materialized_state_type,table_source_type,source_info = detect_qtable_name_source_info(qtable_name,self.data_streams,read_caching_enabled=input_params.read_caching)
- xprint("Detected source type %s source info %s" % (materialized_state_type,source_info))
-
- if materialized_state_type == MaterializedStateType.DATA_STREAM:
- (data_stream,) = source_info
- ms = MaterialiedDataStreamState(table_source_type,qtable_name,input_params,dialect,self.engine_id,data_stream,stream_target_db=self.adhoc_db)
- effective_qtable_name = data_stream.stream_id
- elif materialized_state_type == MaterializedStateType.QSQL_FILE:
- (qsql_filename,table_name) = source_info
- ms = MaterializedQsqlState(table_source_type,qtable_name, qsql_filename=qsql_filename, table_name=table_name,
- engine_id=self.engine_id, input_params=input_params, dialect_id=dialect)
- effective_qtable_name = '%s:::%s' % (qsql_filename, table_name)
- elif materialized_state_type == MaterializedStateType.SQLITE_FILE:
- (sqlite_filename,table_name) = source_info
- ms = MaterializedSqliteState(table_source_type,qtable_name, sqlite_filename=sqlite_filename, table_name=table_name,
- engine_id=self.engine_id)
- effective_qtable_name = '%s:::%s' % (sqlite_filename, table_name)
- elif materialized_state_type == MaterializedStateType.DELIMITED_FILE:
- (source_qtable_name,_) = source_info
- ms = MaterializedDelimitedFileState(table_source_type,source_qtable_name, input_params, dialect, self.engine_id)
- effective_qtable_name = source_qtable_name
- else:
- assert False, "Unknown file type for qtable %s should have exited with an exception" % (qtable_name)
-
- assert effective_qtable_name not in materialized_file_dict
- materialized_file_dict[effective_qtable_name] = ms
-
- xprint("MS dict: %s" % str(materialized_file_dict))
-
- return list([item for item in materialized_file_dict.values()])
-
- def _load_mfs(self,mfs,input_params,dialect_id,stop_after_analysis):
- xprint("Loading MFS:", mfs)
-
- materialized_state_type = mfs.get_materialized_state_type()
- xprint("Detected materialized state type for %s: %s" % (mfs.qtable_name,materialized_state_type))
-
- mfs.initialize()
-
- if not materialized_state_type in [MaterializedStateType.DATA_STREAM]:
- if stop_after_analysis or self.should_copy_instead_of_attach(input_params):
- xprint("Should copy instead of attaching. Forcing db to use to adhoc db")
- forced_db_to_use = self.adhoc_db
- else:
- forced_db_to_use = None
- else:
- forced_db_to_use = None
-
- mfs.choose_db_to_use(forced_db_to_use,stop_after_analysis)
- xprint("Chosen db to use: source %s source_type %s db_id %s db_to_use %s" % (mfs.source,mfs.source_type,mfs.db_id,mfs.db_to_use))
-
- database_info,relevant_table = mfs.make_data_available(stop_after_analysis)
-
- if not self.is_adhoc_db(mfs.db_to_use) and not self.should_copy_instead_of_attach(input_params):
- if not self.already_attached_to_query_level_db(mfs.db_to_use):
- self.attach_to_db(mfs.db_to_use, self.query_level_db)
- self.add_db_to_database_list(database_info)
- else:
- xprint("DB %s is already attached to query level db. No need to attach it again.")
-
- mfs.finalize()
-
- xprint("MFS Loaded")
-
- return mfs.source,mfs.source_type
-
- def add_db_to_database_list(self,database_info):
- db_id = database_info.db_id
- assert db_id is not None
- assert database_info.sqlite_db is not None
- if db_id in self.databases:
- # TODO Convert to assertion
- if id(database_info.sqlite_db) != id(self.databases[db_id].sqlite_db):
- raise Exception('Bug - database already in database list: db_id %s: old %s new %s' % (db_id,self.databases[db_id],database_info))
- else:
- return
- self.databases[db_id] = database_info
-
- def is_adhoc_db(self,db_to_use):
- return db_to_use.db_id == self.adhoc_db_id
-
- def should_copy_instead_of_attach(self,input_params):
- attached_database_count = len(self.query_level_db.get_sqlite_database_list())
- x = attached_database_count >= input_params.max_attached_sqlite_databases
- xprint("should_copy_instead_of_attach: attached_database_count=%s should_copy=%s" % (attached_database_count,x))
- return x
-
- def _load_data(self,qtable_name,input_params=QInputParams(),stop_after_analysis=False):
- xprint("Attempting to load data for materialized file names %s" % qtable_name)
-
- q_dialect = self.determine_proper_dialect(input_params)
- xprint("Dialect is %s" % q_dialect)
- dialect_id = self.get_dialect_id(qtable_name)
- csv.register_dialect(dialect_id, **q_dialect)
-
- xprint("qtable metadata for loading is %s" % qtable_name)
- mfss = self._open_files_and_get_mfss(qtable_name,
- input_params,
- dialect_id)
- assert len(mfss) == 1, "a single materialized state should now encapsulate an entire table"
- mfs = mfss[0]
-
- xprint("MFS to load: %s" % mfs)
-
- if qtable_name in self.loaded_table_structures_dict.keys():
- xprint("Atomic filename %s found. no need to load" % qtable_name)
- return None
-
- xprint("qtable %s not found - loading" % qtable_name)
-
-
- self._load_mfs(mfs, input_params, dialect_id, stop_after_analysis)
- xprint("Loaded: source-type %s source %s mfs_structure %s" % (mfs.source_type, mfs.source, mfs.mfs_structure))
-
- assert qtable_name not in self.loaded_table_structures_dict, "loaded_table_structures_dict has been changed to have a non-list value"
- self.loaded_table_structures_dict[qtable_name] = mfs.mfs_structure
-
- return mfs.mfs_structure
-
- def already_attached_to_query_level_db(self,db_to_attach):
- attached_dbs = list(map(lambda x:x[1],self.query_level_db.get_sqlite_database_list()))
- return db_to_attach.db_id in attached_dbs
-
- def attach_to_db(self, target_db, source_db):
- q = "attach '%s' as %s" % (target_db.sqlite_db_url,target_db.db_id)
- xprint("Attach query: %s" % q)
- try:
- c = source_db.execute_and_fetch(q)
- except SqliteOperationalErrorException as e:
- if 'too many attached databases' in str(e):
- raise TooManyAttachedDatabasesException('There are too many attached databases. Use a proper --max-attached-sqlite-databases parameter which is below the maximum. Original error: %s' % str(e))
- except Exception as e1:
- raise
-
- def detach_from_db(self, target_db, source_db):
- q = "detach %s" % (target_db.db_id)
- xprint("Detach query: %s" % q)
- try:
- c = source_db.execute_and_fetch(q)
- except Exception as e1:
- raise
-
- def load_data(self,filename,input_params=QInputParams(),stop_after_analysis=False):
- return self._load_data(filename,input_params,stop_after_analysis=stop_after_analysis)
-
- def _ensure_data_is_loaded_for_sql(self,sql_object,input_params,data_streams=None,stop_after_analysis=False):
- xprint("Ensuring Data load")
- new_table_structures = OrderedDict()
-
- # For each "table name"
- for qtable_name in sql_object.qtable_names:
- tss = self._load_data(qtable_name,input_params,stop_after_analysis=stop_after_analysis)
- if tss is not None:
- xprint("New Table Structures:",new_table_structures)
- assert qtable_name not in new_table_structures, "new_table_structures was changed not to contain a list as a value"
- new_table_structures[qtable_name] = tss
-
- return new_table_structures
-
- def materialize_query_level_db(self,save_db_to_disk_filename,sql_object):
- # TODO More robust creation - Create the file in a separate folder and move it to the target location only after success
-
- materialized_db = Sqlite3DB("materialized","file:%s" % save_db_to_disk_filename,save_db_to_disk_filename,create_qcatalog=False)
- table_name_mapping = OrderedDict()
-
- # For each table in the query
- effective_table_names = sql_object.get_qtable_name_effective_table_names()
-
- for i, qtable_name in enumerate(effective_table_names):
- # table name, in the format db_id.table_name
- effective_table_name_for_qtable_name = effective_table_names[qtable_name]
-
- source_db_id, actual_table_name_in_db = effective_table_name_for_qtable_name.split(".", 1)
- # The DatabaseInfo instance for this db
- source_database = self.databases[source_db_id]
- if source_db_id != self.query_level_db_id:
- self.attach_to_db(source_database.sqlite_db,materialized_db)
-
- ts = self.loaded_table_structures_dict[qtable_name]
- proposed_new_table_name = ts.planned_table_name
- xprint("Proposed table name is %s" % proposed_new_table_name)
-
- new_table_name = materialized_db.find_new_table_name(proposed_new_table_name)
-
- xprint("Materializing",source_db_id,actual_table_name_in_db,"as",new_table_name)
- # Copy the table into the materialized database
- xx = materialized_db.execute_and_fetch('CREATE TABLE %s AS SELECT * FROM %s' % (new_table_name,effective_table_name_for_qtable_name))
-
- table_name_mapping[effective_table_name_for_qtable_name] = new_table_name
-
- # TODO RLRL Preparation for writing materialized database as a qsql file
- # if source_database.sqlite_db.qcatalog_table_exists():
- # qcatalog_entry = source_database.sqlite_db.get_from_qcatalog_using_table_name(actual_table_name_in_db)
- # # TODO RLRL Encapsulate dictionary transform inside qcatalog access methods
- # materialized_db.add_to_qcatalog_table(new_table_name,OrderedDict(json.loads(qcatalog_entry['content_signature'])),
- # qcatalog_entry['creation_time'],
- # qcatalog_entry['source_type'],
- # qcatalog_entry['source_type'])
- # xprint("PQX Added to qcatalog",source_db_id,actual_table_name_in_db,'as',new_table_name)
- # else:
- # xprint("PQX Skipped adding to qcatalog",source_db_id,actual_table_name_in_db)
-
- # Compare against the db id - comparing the id string to the db object itself would always be unequal
- if source_db_id != self.query_level_db_id:
- self.detach_from_db(source_database.sqlite_db,materialized_db)
-
- return table_name_mapping
-
- def validate_query(self,sql_object,table_structures):
-
- for qtable_name in sql_object.qtable_names:
- relevant_table_structures = [table_structures[qtable_name]]
-
- column_names = None
- column_types = None
- for ts in relevant_table_structures:
- names = ts.column_names
- types = ts.python_column_types
- xprint("Comparing column names: %s with %s" % (column_names,names))
- if column_names is None:
- column_names = names
- else:
- if column_names != names:
- raise BadHeaderException("Column names differ for table %s: %s vs %s" % (
- qtable_name, ",".join(column_names), ",".join(names)))
-
- xprint("Comparing column types: %s with %s" % (column_types,types))
- if column_types is None:
- column_types = types
- else:
- if column_types != types:
- raise BadHeaderException("Column types differ for table %s: %s vs %s" % (
- qtable_name, ",".join(column_types), ",".join(types)))
-
- xprint("All column names match for qtable name %s: column names: %s column types: %s" % (ts.qtable_name,column_names,column_types))
-
- xprint("Query validated")
-
- def _execute(self,query_str,input_params=None,data_streams=None,stop_after_analysis=False,save_db_to_disk_filename=None):
- warnings = []
- error = None
- table_structures = []
-
- db_results_obj = None
-
- effective_input_params = self.default_input_params.merged_with(input_params)
-
- if type(query_str) != unicode:
- try:
- # Heuristic attempt to auto convert the query to unicode before failing
- query_str = query_str.decode('utf-8')
- except:
- error = QError(EncodedQueryException(''),"Query should be in unicode. Please make sure to provide a unicode literal string or decode it using the proper character encoding.",91)
- return QOutput(error = error)
-
-
- try:
- # Create SQL statement
- sql_object = Sql('%s' % query_str, self.data_streams)
-
- load_start_time = time.time()
- iprint("Going to ensure data is loaded. Currently loaded tables: %s" % str(self.loaded_table_structures_dict))
- new_table_structures = self._ensure_data_is_loaded_for_sql(sql_object,effective_input_params,data_streams,stop_after_analysis=stop_after_analysis)
- iprint("Ensured data is loaded. loaded tables: %s" % self.loaded_table_structures_dict)
-
- self.validate_query(sql_object,self.loaded_table_structures_dict)
-
- iprint("Query validated")
-
- sql_object.materialize_using(self.loaded_table_structures_dict)
-
- iprint("Materialized sql object")
-
- if save_db_to_disk_filename is not None:
- xprint("Saving query data to disk")
- dump_start_time = time.time()
- table_name_mapping = self.materialize_query_level_db(save_db_to_disk_filename,sql_object)
- print("Data has been saved into %s . Saving has taken %4.3f seconds" % (save_db_to_disk_filename,time.time()-dump_start_time), file=sys.stderr)
- effective_sql = sql_object.get_effective_sql(table_name_mapping)
- print("Query to run on the database: %s;" % effective_sql, file=sys.stderr)
- command_line = 'echo "%s" | sqlite3 %s' % (effective_sql,save_db_to_disk_filename)
- print("You can run the query directly from the command line using the following command: %s" % command_line, file=sys.stderr)
-
- # TODO Propagate dump results using a different output class instead of an empty one
- return QOutput()
-
- # Ensure that adhoc db is not in the middle of a transaction
- self.adhoc_db.conn.commit()
-
- all_databases = self.query_level_db.get_sqlite_database_list()
- xprint("Query level db: databases %s" % all_databases)
-
- # Execute the query and fetch the data
- db_results_obj = sql_object.execute_and_fetch(self.query_level_db)
- iprint("Query executed")
-
- if len(db_results_obj.results) == 0:
- warnings.append(QWarning(None, "Warning - data is empty"))
-
- return QOutput(
- data = db_results_obj.results,
- metadata = QMetadata(
- table_structures=self.loaded_table_structures_dict,
- new_table_structures=new_table_structures,
- output_column_name_list=db_results_obj.query_column_names),
- warnings = warnings,
- error = error)
- except InvalidQueryException as e:
- error = QError(e,str(e),118)
- except MissingHeaderException as e:
- error = QError(e,e.msg,117)
- except FileNotFoundException as e:
- error = QError(e,e.msg,30)
- except SqliteOperationalErrorException as e:
- xprint("Sqlite Operational error: %s" % e)
- msg = str(e.original_error)
- error = QError(e,"query error: %s" % msg,1)
- if "no such column" in msg and effective_input_params.skip_header:
- warnings.append(QWarning(e,'Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names. Another issue might be that the file contains a BOM. Files that are encoded with UTF8 and contain a BOM can be read by specifying `-e utf-8-sig` in the command line. Support for non-UTF8 encoding will be provided in the future.'))
- except ColumnCountMismatchException as e:
- error = QError(e,e.msg,2)
- except (UnicodeDecodeError, UnicodeError) as e:
- error = QError(e,"Cannot decode data. Try to change the encoding by setting it using the -e parameter. Error:%s" % e,3)
- except BadHeaderException as e:
- error = QError(e,"Bad header row: %s" % e.msg,35)
- except CannotUnzipDataStreamException as e:
- error = QError(e,"Cannot decompress standard input. Pipe the input through zcat in order to decompress.",36)
- except UniversalNewlinesExistException as e:
- error = QError(e,"Data contains universal newlines. Run q with -U to use universal newlines. Please note that q still doesn't support universal newlines for .gz files or for stdin. Route the data through a regular file to use -U.",103)
- # deprecated, but shouldn't be used: error = QError(e,"Standard Input must be provided in order to use it as a table",61)
- except CouldNotConvertStringToNumericValueException as e:
- error = QError(e,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg,58)
- except CouldNotParseInputException as e:
- error = QError(e,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg,59)
- except ColumnMaxLengthLimitExceededException as e:
- error = QError(e,e.msg,31)
- # deprecated, but shouldn't be used: error = QError(e,e.msg,79)
- except ContentSignatureDiffersException as e:
- error = QError(e,"%s vs %s: Content Signatures for table %s differ at %s (source value '%s' disk signature value '%s')" %
- (e.original_filename,e.other_filename,e.filenames_str,e.key,e.source_value,e.signature_value),80)
- except ContentSignatureDataDiffersException as e:
- error = QError(e,e.msg,81)
- except MaximumSourceFilesExceededException as e:
- error = QError(e,e.msg,82)
- except ContentSignatureNotFoundException as e:
- error = QError(e,e.msg,83)
- except NonExistentTableNameInQsql as e:
- msg = "Table %s could not be found in qsql file %s. Existing table names: %s" % (e.table_name,e.qsql_filename,",".join(e.existing_table_names))
- error = QError(e,msg,84)
- except NonExistentTableNameInSqlite as e:
- msg = "Table %s could not be found in sqlite file %s. Existing table names: %s" % (e.table_name,e.qsql_filename,",".join(e.existing_table_names))
- error = QError(e,msg,85)
- except TooManyTablesInQsqlException as e:
- msg = "Could not autodetect table name in qsql file. Existing tables: %s" % ",".join(e.existing_table_names)
- error = QError(e,msg,86)
- except NoTableInQsqlExcption as e:
- msg = "Could not autodetect table name in qsql file. File contains no record of a table"
- error = QError(e,msg,97)
- except TooManyTablesInSqliteException as e:
- msg = "Could not autodetect table name in sqlite file %s. Existing tables: %s" % (e.qsql_filename,",".join(e.existing_table_names))
- error = QError(e,msg,87)
- except NoTablesInSqliteException as e:
- msg = "sqlite file %s has no tables" % e.sqlite_filename
- error = QError(e,msg,88)
- except TooManyAttachedDatabasesException as e:
- msg = str(e)
- error = QError(e,msg,89)
- except UnknownFileTypeException as e:
- msg = str(e)
- error = QError(e,msg,95)
- except KeyboardInterrupt as e:
- warnings.append(QWarning(e,"Interrupted"))
- except Exception as e:
- global DEBUG
- if DEBUG:
- xprint(traceback.format_exc())
- error = QError(e,repr(e),199)
-
- return QOutput(data=None,warnings=warnings,error=error,metadata=QMetadata(table_structures=self.loaded_table_structures_dict,new_table_structures=self.loaded_table_structures_dict,output_column_name_list=[]))
-
- def execute(self,query_str,input_params=None,save_db_to_disk_filename=None):
- r = self._execute(query_str,input_params,stop_after_analysis=False,save_db_to_disk_filename=save_db_to_disk_filename)
- return r
-
- def unload(self):
- # TODO This would fail, since table structures are just value objects now. Will be fixed as part of making q a full python module
- for qtable_name,table_creator in six.iteritems(self.loaded_table_structures_dict):
- try:
- table_creator.drop_table()
- except:
- # Support no-table select queries
- pass
- self.loaded_table_structures_dict = OrderedDict()
-
- def analyze(self,query_str,input_params=None,data_streams=None):
- q_output = self._execute(query_str,input_params,data_streams=data_streams,stop_after_analysis=True)
-
- return q_output
-
-def escape_double_quotes_if_needed(v):
- x = v.replace(six.u('"'), six.u('""'))
- return x
-
-def quote_none_func(output_delimiter,v):
- return v
-
-def quote_minimal_func(output_delimiter,v):
- if v is None:
- return v
- t = type(v)
- if (t == str or t == unicode) and ((output_delimiter in v) or ('\n' in v) or ('"' in v)):
- return six.u('"{}"').format(escape_double_quotes_if_needed(v))
- return v
-
-def quote_nonnumeric_func(output_delimiter,v):
- if v is None:
- return v
- if type(v) == str or type(v) == unicode:
- return six.u('"{}"').format(escape_double_quotes_if_needed(v))
- return v
-
-def quote_all_func(output_delimiter,v):
- if type(v) == str or type(v) == unicode:
- return six.u('"{}"').format(escape_double_quotes_if_needed(v))
- else:
- return six.u('"{}"').format(v)
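-# A quick sketch of how the quoting funcs above behave, assuming ',' as the
-# output delimiter (illustrative values, not taken from a real run):
-#   quote_none_func(',', 'a,b')       -> a,b
-#   quote_minimal_func(',', 'a,b')    -> "a,b"    (quoted: contains the delimiter)
-#   quote_minimal_func(',', 'ab')     -> ab
-#   quote_nonnumeric_func(',', 42)    -> 42       (numbers stay unquoted)
-#   quote_all_func(',', 42)           -> "42"     (everything gets quoted)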
-
-class QOutputParams(object):
- def __init__(self,
- delimiter=' ',
- beautify=False,
- output_quoting_mode='minimal',
- formatting=None,
- output_header=False,
- encoding=None):
- self.delimiter = delimiter
- self.beautify = beautify
- self.output_quoting_mode = output_quoting_mode
- self.formatting = formatting
- self.output_header = output_header
- self.encoding = encoding
-
- def __str__(self):
- return "QOutputParams<%s>" % str(self.__dict__)
-
- def __repr__(self):
- return "QOutputParams(...)"
-
-class QOutputPrinter(object):
- output_quoting_modes = { 'minimal' : quote_minimal_func,
- 'all' : quote_all_func,
- 'nonnumeric' : quote_nonnumeric_func,
- 'none' : quote_none_func }
-
- def __init__(self,output_params,show_tracebacks=False):
- self.output_params = output_params
- self.show_tracebacks = show_tracebacks
-
- self.output_field_quoting_func = QOutputPrinter.output_quoting_modes[output_params.output_quoting_mode]
-
- def print_errors_and_warnings(self,f,results):
- if results.status == 'error':
- error = results.error
- print(error.msg, file=f)
- if self.show_tracebacks:
- print(error.traceback, file=f)
-
- for warning in results.warnings:
- print("%s" % warning.msg, file=f)
-
- def print_analysis(self,f_out,f_err,results):
- self.print_errors_and_warnings(f_err,results)
-
- if results.metadata is None:
- return
-
- if results.metadata.table_structures is None:
- return
-
- for qtable_name in results.metadata.table_structures:
- table_structures = results.metadata.table_structures[qtable_name]
- print("Table: %s" % qtable_name,file=f_out)
- print(" Sources:",file=f_out)
- dl = results.metadata.new_table_structures[qtable_name]
- print(" source_type: %s source: %s" % (dl.source_type,dl.source),file=f_out)
- print(" Fields:",file=f_out)
- for n,t in zip(table_structures.column_names,table_structures.sqlite_column_types):
- print(" `%s` - %s" % (n,t), file=f_out)
-
- def print_output(self,f_out,f_err,results):
- try:
- self._print_output(f_out,f_err,results)
- except (UnicodeEncodeError, UnicodeError) as e:
- print("Cannot encode data. Error:%s" % e, file=f_err)
- sys.exit(3)
- except IOError as e:
- if e.errno == 32:
- # broken pipe, that's ok
- pass
- else:
- # don't miss other problems for now
- raise
- except KeyboardInterrupt:
- pass
-
- def _print_output(self,f_out,f_err,results):
- self.print_errors_and_warnings(f_err,results)
-
- data = results.data
-
- if data is None:
- return
-
- # If the user requested beautifying the output
- if self.output_params.beautify:
- if self.output_params.output_header:
- data_with_possible_headers = data + [tuple(results.metadata.output_column_name_list)]
- else:
- data_with_possible_headers = data
- max_lengths = determine_max_col_lengths(data_with_possible_headers,self.output_field_quoting_func,self.output_params.delimiter)
-
- if self.output_params.formatting:
- formatting_dict = dict(
- [(x.split("=")[0], x.split("=")[1]) for x in self.output_params.formatting.split(",")])
- else:
- formatting_dict = {}
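- # For example (hypothetical), -f "1={:>10},3={:.2f}" yields
- # formatting_dict == {'1': '{:>10}', '3': '{:.2f}'}, i.e. right-align
- # output column 1 and format output column 3 as a two-decimal float.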
-
- try:
- if self.output_params.output_header and results.metadata.output_column_name_list is not None:
- data.insert(0,results.metadata.output_column_name_list)
- for rownum, row in enumerate(data):
- row_str = []
- skip_formatting = rownum == 0 and self.output_params.output_header
- for i, col in enumerate(row):
- if str(i + 1) in formatting_dict.keys() and not skip_formatting:
- fmt_str = formatting_dict[str(i + 1)]
- else:
- if self.output_params.beautify:
- fmt_str = six.u("{{0:<{}}}").format(max_lengths[i])
- else:
- fmt_str = six.u("{}")
-
- if col is not None:
- xx = self.output_field_quoting_func(self.output_params.delimiter,col)
- row_str.append(fmt_str.format(xx))
- else:
- row_str.append(fmt_str.format(""))
-
-
- xxxx = six.u(self.output_params.delimiter).join(row_str) + six.u("\n")
- f_out.write(xxxx)
- except (UnicodeEncodeError, UnicodeError) as e:
- print("Cannot encode data. Error:%s" % e, file=sys.stderr)
- sys.exit(3)
- except TypeError as e:
- print("Error while formatting output: %s" % e, file=sys.stderr)
- sys.exit(4)
- except IOError as e:
- if e.errno == 32:
- # broken pipe, that's ok
- pass
- else:
- # don't miss other problems for now
- raise
- except KeyboardInterrupt:
- pass
-
- try:
- # Prevent python bug when order of pipe shutdowns is reversed
- f_out.flush()
- except IOError as e:
- pass
-
-def get_option_with_default(p, option_type, option, default):
- try:
- if not p.has_option('options', option):
- return default
- if p.get('options',option) == 'None':
- return None
- if option_type == 'boolean':
- r = p.getboolean('options', option)
- return r
- elif option_type == 'int':
- r = p.getint('options', option)
- return r
- elif option_type == 'string':
- r = p.get('options', option)
- return r
- else:
- raise Exception("Unknown option type %s " % option_type)
- except ValueError as e:
- raise IncorrectDefaultValueException(option_type,option,p.get("options",option))
-
-QRC_FILENAME_ENVVAR = 'QRC_FILENAME'
-
-def dump_default_values_as_qrc(parser,exclusions):
- m = parser.parse_args([]).__dict__
- m.pop('leftover')
- print("[options]",file=sys.stdout)
- for k in sorted(m.keys()):
- if k not in exclusions:
- print("%s=%s" % (k,m[k]),file=sys.stdout)
-
-USAGE_TEXT = """
- q
-
- Example Execution for a delimited file:
-
- q "select * from myfile.csv"
-
- Example Execution for an sqlite3 database:
-
- q "select * from mydatabase.sqlite:::my_table_name"
-
- or
-
- q "select * from mydatabase.sqlite"
-
- if the database file contains only one table
-
- Auto-caching of delimited files can be activated through `-C readwrite` (writes new caches if needed) or `-C read` (only reads existing cache files)
-
- Setting the default caching mode (`-C`) can be done by writing a `~/.qrc` file. See docs for more info.
-
-q's purpose is to bring SQL expressive power to the Linux command line and to provide easy access to text as actual data.
-
-q allows the following:
-
-* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file
-* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory
-
-Changing the default values for parameters can be done by creating a `~/.qrc` file. Run q with `--dump-defaults` in order to dump a default `.qrc` file into stdout.
-
-See https://github.com/harelba/q for more details.
-
-"""
-
-def run_standalone():
- sqlite3.enable_callback_tracebacks(True)
-
- p, qrc_filename = parse_qrc_file()
-
- args, options, parser = initialize_command_line_parser(p, qrc_filename)
-
- dump_defaults_and_stop__if_needed(options, parser)
-
- dump_version_and_stop__if_needed(options)
-
- STDOUT, default_input_params, q_output_printer, query_strs = parse_options(args, options)
-
- data_streams_dict = initialize_default_data_streams()
-
- q_engine = QTextAsData(default_input_params=default_input_params,data_streams_dict=data_streams_dict)
-
- execute_queries(STDOUT, options, q_engine, q_output_printer, query_strs)
-
- q_engine.done()
-
- sys.exit(0)
-
-
-def dump_version_and_stop__if_needed(options):
- if options.version:
- print_credentials()
- sys.exit(0)
-
-
-def dump_defaults_and_stop__if_needed(options, parser):
- if options.dump_defaults:
- dump_default_values_as_qrc(parser, ['dump-defaults', 'version'])
- sys.exit(0)
-
-
-def execute_queries(STDOUT, options, q_engine, q_output_printer, query_strs):
- for query_str in query_strs:
- if options.analyze_only:
- q_output = q_engine.analyze(query_str)
- q_output_printer.print_analysis(STDOUT, sys.stderr, q_output)
- else:
- q_output = q_engine.execute(query_str, save_db_to_disk_filename=options.save_db_to_disk_filename)
- q_output_printer.print_output(STDOUT, sys.stderr, q_output)
-
- if q_output.status == 'error':
- sys.exit(q_output.error.errorcode)
-
-
-def initialize_command_line_parser(p, qrc_filename):
- try:
- default_verbose = get_option_with_default(p, 'boolean', 'verbose', False)
- default_save_db_to_disk = get_option_with_default(p, 'string', 'save_db_to_disk_filename', None)
- default_caching_mode = get_option_with_default(p, 'string', 'caching_mode', 'none')
-
- default_skip_header = get_option_with_default(p, 'boolean', 'skip_header', False)
- default_delimiter = get_option_with_default(p, 'string', 'delimiter', None)
- default_pipe_delimited = get_option_with_default(p, 'boolean', 'pipe_delimited', False)
- default_tab_delimited = get_option_with_default(p, 'boolean', 'tab_delimited', False)
- default_encoding = get_option_with_default(p, 'string', 'encoding', 'UTF-8')
- default_gzipped = get_option_with_default(p, 'boolean', 'gzipped', False)
- default_analyze_only = get_option_with_default(p, 'boolean', 'analyze_only', False)
- default_mode = get_option_with_default(p, 'string', 'mode', "relaxed")
- default_column_count = get_option_with_default(p, 'string', 'column_count', None)
- default_keep_leading_whitespace_in_values = get_option_with_default(p, 'boolean',
- 'keep_leading_whitespace_in_values', False)
- default_disable_double_double_quoting = get_option_with_default(p, 'boolean', 'disable_double_double_quoting',
- True)
- default_disable_escaped_double_quoting = get_option_with_default(p, 'boolean', 'disable_escaped_double_quoting',
- True)
- default_disable_column_type_detection = get_option_with_default(p, 'boolean', 'disable_column_type_detection',
- False)
- default_input_quoting_mode = get_option_with_default(p, 'string', 'input_quoting_mode', 'minimal')
- default_max_column_length_limit = get_option_with_default(p, 'int', 'max_column_length_limit', 131072)
- default_with_universal_newlines = get_option_with_default(p, 'boolean', 'with_universal_newlines', False)
-
- default_output_delimiter = get_option_with_default(p, 'string', 'output_delimiter', None)
- default_pipe_delimited_output = get_option_with_default(p, 'boolean', 'pipe_delimited_output', False)
- default_tab_delimited_output = get_option_with_default(p, 'boolean', 'tab_delimited_output', False)
- default_output_header = get_option_with_default(p, 'boolean', 'output_header', False)
- default_beautify = get_option_with_default(p, 'boolean', 'beautify', False)
- default_formatting = get_option_with_default(p, 'string', 'formatting', None)
- default_output_encoding = get_option_with_default(p, 'string', 'output_encoding', 'none')
- default_output_quoting_mode = get_option_with_default(p, 'string', 'output_quoting_mode', 'minimal')
- default_list_user_functions = get_option_with_default(p, 'boolean', 'list_user_functions', False)
- default_overwrite_qsql = get_option_with_default(p, 'boolean', 'overwrite_qsql', False)
-
- default_query_filename = get_option_with_default(p, 'string', 'query_filename', None)
- default_query_encoding = get_option_with_default(p, 'string', 'query_encoding', locale.getpreferredencoding())
- default_max_attached_sqlite_databases = get_option_with_default(p,'int','max_attached_sqlite_databases', 10)
- except IncorrectDefaultValueException as e:
- print("Incorrect value '%s' for option %s in .qrc file %s (option type is %s)" % (
- e.actual_value, e.option, qrc_filename, e.option_type))
- sys.exit(199)
- parser = ArgumentParser(prog="q",usage=USAGE_TEXT)
- parser.add_argument("-v", "--version", action="store_true", help="Print version")
- parser.add_argument("-V", "--verbose", default=default_verbose, action="store_true",
- help="Print debug info in case of problems")
- parser.add_argument("-S", "--save-db-to-disk", dest="save_db_to_disk_filename", default=default_save_db_to_disk,
- help="Save database to an sqlite database file")
- parser.add_argument("-C", "--caching-mode", default=default_caching_mode,
- help="Choose the autocaching mode (none/read/readwrite). Autocaches files to disk db so further queries will be faster. Caching is done to a side-file with the same name of the table, but with an added extension .qsql")
- parser.add_argument("--dump-defaults", action="store_true",
- help="Dump all default values for parameters and exit. Can be used in order to make sure .qrc file content is being read properly.")
- parser.add_argument("--max-attached-sqlite-databases", default=default_max_attached_sqlite_databases,type=int,
- help="Set the maximum number of concurrently-attached sqlite dbs. This is a compile time definition of sqlite. q's performance will slow down once this limit is reached for a query, since it will perform table copies in order to avoid that limit.")
- # -----------------------------------------------
- input_data_option_group = parser.add_argument_group("Input Data Options")
- input_data_option_group.add_argument("-H", "--skip-header", default=default_skip_header,
- action="store_true",
- help="Skip header row. This has been changed from earlier version - Only one header row is supported, and the header row is used for column naming")
- input_data_option_group.add_argument("-d", "--delimiter", default=default_delimiter,
- help="Field delimiter. If none specified, then space is used as the delimiter.")
- input_data_option_group.add_argument("-p", "--pipe-delimited", default=default_pipe_delimited,
- action="store_true",
- help="Same as -d '|'. Added for convenience and readability")
- input_data_option_group.add_argument("-t", "--tab-delimited", default=default_tab_delimited,
- action="store_true",
- help="Same as -d . Just a shorthand for handling standard tab delimited file You can use $'\\t' if you want (this is how Linux expects to provide tabs in the command line")
- input_data_option_group.add_argument("-e", "--encoding", default=default_encoding,
- help="Input file encoding. Defaults to UTF-8. set to none for not setting any encoding - faster, but at your own risk...")
- input_data_option_group.add_argument("-z", "--gzipped", default=default_gzipped, action="store_true",
- help="Data is gzipped. Useful for reading from stdin. For files, .gz means automatic gunzipping")
- input_data_option_group.add_argument("-A", "--analyze-only", default=default_analyze_only,
- action='store_true',
- help="Analyze sample input and provide information about data types")
- input_data_option_group.add_argument("-m", "--mode", default=default_mode,
- help="Data parsing mode. fluffy, relaxed and strict. In strict mode, the -c column-count parameter must be supplied as well")
- input_data_option_group.add_argument("-c", "--column-count", default=default_column_count,
- help="Specific column count when using relaxed or strict mode")
- input_data_option_group.add_argument("-k", "--keep-leading-whitespace", dest="keep_leading_whitespace_in_values",
- default=default_keep_leading_whitespace_in_values, action="store_true",
- help="Keep leading whitespace in values. Default behavior strips leading whitespace off values, in order to provide out-of-the-box usability for simple use cases. If you need to preserve whitespace, use this flag.")
- input_data_option_group.add_argument("--disable-double-double-quoting",
- default=default_disable_double_double_quoting, action="store_false",
- help="Disable support for double double-quoting for escaping the double quote character. By default, you can use \"\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
- input_data_option_group.add_argument("--disable-escaped-double-quoting",
- default=default_disable_escaped_double_quoting, action="store_false",
- help="Disable support for escaped double-quoting for escaping the double quote character. By default, you can use \\\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
- input_data_option_group.add_argument("--as-text", dest="disable_column_type_detection",
- default=default_disable_column_type_detection, action="store_true",
- help="Don't detect column types - All columns will be treated as text columns")
- input_data_option_group.add_argument("-w", "--input-quoting-mode",
- default=default_input_quoting_mode,
- help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.")
- input_data_option_group.add_argument("-M", "--max-column-length-limit",
- default=default_max_column_length_limit,
- help="Sets the maximum column length.")
- input_data_option_group.add_argument("-U", "--with-universal-newlines",
- default=default_with_universal_newlines, action="store_true",
- help="Expect universal newlines in the data. Limitation: -U works only with regular files for now, stdin or .gz files are not supported yet.")
- # -----------------------------------------------
- output_data_option_group = parser.add_argument_group("Output Options")
- output_data_option_group.add_argument("-D", "--output-delimiter",
- default=default_output_delimiter,
- help="Field delimiter for output. If none specified, then the -d delimiter is used if present, or space if no delimiter is specified")
- output_data_option_group.add_argument("-P", "--pipe-delimited-output",
- default=default_pipe_delimited_output, action="store_true",
- help="Same as -D '|'. Added for convenience and readability.")
- output_data_option_group.add_argument("-T", "--tab-delimited-output",
- default=default_tab_delimited_output, action="store_true",
- help="Same as -D . Just a shorthand for outputting tab delimited output. You can use -D $'\\t' if you want.")
- output_data_option_group.add_argument("-O", "--output-header", default=default_output_header,
- action="store_true",
- help="Output header line. Output column-names are determined from the query itself. Use column aliases in order to set your column names in the query. For example, 'select name FirstName,value1/value2 MyCalculation from ...'. This can be used even if there was no header in the input.")
- output_data_option_group.add_argument("-b", "--beautify", default=default_beautify,
- action="store_true",
- help="Beautify output according to actual values. Might be slow...")
- output_data_option_group.add_argument("-f", "--formatting", default=default_formatting,
- help="Output-level formatting, in the format X=fmt,Y=fmt etc, where X,Y are output column numbers (e.g. 1 for first SELECT column etc.")
- output_data_option_group.add_argument("-E", "--output-encoding",
- default=default_output_encoding,
- help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
- output_data_option_group.add_argument("-W", "--output-quoting-mode",
- default=default_output_quoting_mode,
- help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
- output_data_option_group.add_argument("-L", "--list-user-functions",
- default=default_list_user_functions, action="store_true",
- help="List all user functions")
- parser.add_argument("--overwrite-qsql", default=default_overwrite_qsql,
- help="When used, qsql files (both caches and store-to-db) will be overwritten if they already exist. Use with care.")
- # -----------------------------------------------
- query_option_group = parser.add_argument_group("Query Related Options")
- query_option_group.add_argument("-q", "--query-filename", default=default_query_filename,
- help="Read query from the provided filename instead of the command line, possibly using the provided query encoding (using -Q).")
- query_option_group.add_argument("-Q", "--query-encoding", default=default_query_encoding,
- help="query text encoding. Experimental. Please send your feedback on this")
- # -----------------------------------------------
- parser.add_argument('leftover', nargs='*')
- args = parser.parse_args()
- return args.leftover, args, parser
-
-
-def parse_qrc_file():
- p = configparser.ConfigParser()
- if QRC_FILENAME_ENVVAR in os.environ:
- qrc_filename = os.environ[QRC_FILENAME_ENVVAR]
- if qrc_filename != 'None':
- xprint("qrc filename is %s" % qrc_filename)
- if os.path.exists(qrc_filename):
- p.read([os.environ[QRC_FILENAME_ENVVAR]])
- else:
- print('QRC_FILENAME env var exists, but cannot find qrc file at %s' % qrc_filename, file=sys.stderr)
- sys.exit(244)
- else:
- pass # special handling of the 'None' env var value for QRC_FILENAME. Allows skipping the default ~/.qrc reading
- else:
- qrc_filename = os.path.expanduser('~/.qrc')
- p.read([qrc_filename, '.qrc'])
- return p, qrc_filename
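-# Examples of the env var handling above (illustrative invocations):
-#   QRC_FILENAME=/etc/q.qrc q "..."  reads defaults from /etc/q.qrc
-#   QRC_FILENAME=None q "..."        skips reading ~/.qrc altogether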
-
-
-def initialize_default_data_streams():
- data_streams_dict = {
- '-': DataStream('stdin', '-', sys.stdin)
- }
- return data_streams_dict
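-# The '-' entry lets queries read stdin as a table, e.g. (illustrative):
-#   seq 1 10 | q "select sum(c1) from -"
-# which resolves '-' to the stdin stream registered here.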
-
-
-def parse_options(args, options):
- if options.list_user_functions:
- print_user_functions()
- sys.exit(0)
- if len(args) == 0 and options.query_filename is None:
- print_credentials()
- print("Must provide at least one query in the command line, or through a file with the -q parameter",
- file=sys.stderr)
- sys.exit(1)
- if options.query_filename is not None:
- if len(args) != 0:
- print("Can't provide both a query file and a query on the command line", file=sys.stderr)
- sys.exit(1)
- try:
- f = open(options.query_filename, 'rb')
- query_strs = [f.read()]
- f.close()
- except:
- print("Could not read query from file %s" % options.query_filename, file=sys.stderr)
- sys.exit(1)
- else:
- if sys.stdin.encoding is not None:
- query_strs = [x.encode(sys.stdin.encoding) for x in args]
- else:
- query_strs = args
- if options.query_encoding is not None and options.query_encoding != 'none':
- try:
- for idx in range(len(query_strs)):
- query_strs[idx] = query_strs[idx].decode(options.query_encoding).strip()
-
- if len(query_strs[idx]) == 0:
- print("Query cannot be empty (query number %s)" % (idx + 1), file=sys.stderr)
- sys.exit(1)
-
- except Exception as e:
- print("Could not decode query number %s using the provided query encoding (%s)" % (
- idx + 1, options.query_encoding), file=sys.stderr)
- sys.exit(3)
- ###
- if options.mode not in ['relaxed', 'strict']:
- print("Parsing mode can either be relaxed or strict", file=sys.stderr)
- sys.exit(13)
- output_encoding = get_stdout_encoding(options.output_encoding)
- try:
- if six.PY3:
- STDOUT = codecs.getwriter(output_encoding)(sys.stdout.buffer)
- else:
- STDOUT = codecs.getwriter(output_encoding)(sys.stdout)
- except:
- print("Could not create output stream using output encoding %s" % (output_encoding), file=sys.stderr)
- sys.exit(200)
- # If the user flagged for a tab-delimited file then set the delimiter to tab
- if options.tab_delimited:
- if options.delimiter is not None and options.delimiter != '\t':
- print("Warning: -t parameter overrides -d parameter (%s)" % options.delimiter, file=sys.stderr)
- options.delimiter = '\t'
- # If the user flagged for a pipe-delimited file then set the delimiter to pipe
- if options.pipe_delimited:
- if options.delimiter is not None and options.delimiter != '|':
- print("Warning: -p parameter overrides -d parameter (%s)" % options.delimiter, file=sys.stderr)
- options.delimiter = '|'
- if options.delimiter is None:
- options.delimiter = ' '
- elif len(options.delimiter) != 1:
- print("Delimiter must be one character only", file=sys.stderr)
- sys.exit(5)
- if options.tab_delimited_output:
- if options.output_delimiter is not None and options.output_delimiter != '\t':
- print("Warning: -T parameter overrides -D parameter (%s)" % options.output_delimiter, file=sys.stderr)
- options.output_delimiter = '\t'
- if options.pipe_delimited_output:
- if options.output_delimiter is not None and options.output_delimiter != '|':
- print("Warning: -P parameter overrides -D parameter (%s)" % options.output_delimiter, file=sys.stderr)
- options.output_delimiter = '|'
- if options.output_delimiter:
- # If an output delimiter is explicitly specified, use it as-is
- pass
- else:
- # Otherwise,
- if options.delimiter:
- # if an input delimiter is specified, then we use it as the output as
- # well
- options.output_delimiter = options.delimiter
- else:
- # if no input delimiter is specified, then we use space as the default
- # (since no input delimiter means any whitespace)
- options.output_delimiter = " "
- try:
- max_column_length_limit = int(options.max_column_length_limit)
- except:
- print("Max column length limit must be an integer larger than 2 (%s)" % options.max_column_length_limit,
- file=sys.stderr)
- sys.exit(31)
- if max_column_length_limit < 3:
- print("Maximum column length must be larger than 2",file=sys.stderr)
- sys.exit(31)
-
- csv.field_size_limit(max_column_length_limit)
- xprint("Max column length limit is %s" % options.max_column_length_limit)
-
- if options.input_quoting_mode not in list(QTextAsData.input_quoting_modes.keys()):
- print("Input quoting mode can only be one of %s. It cannot be set to '%s'" % (
- ",".join(sorted(QTextAsData.input_quoting_modes.keys())), options.input_quoting_mode), file=sys.stderr)
- sys.exit(55)
- if options.output_quoting_mode not in list(QOutputPrinter.output_quoting_modes.keys()):
- print("Output quoting mode can only be one of %s. It cannot be set to '%s'" % (
- ",".join(sorted(QOutputPrinter.output_quoting_modes.keys())), options.output_quoting_mode), file=sys.stderr)
- sys.exit(56)
- if options.column_count is not None:
- expected_column_count = int(options.column_count)
- if expected_column_count < 1 or expected_column_count > int(options.max_column_length_limit):
- print("Column count must be between 1 and %s" % int(options.max_column_length_limit),file=sys.stderr)
- sys.exit(90)
- else:
- # infer automatically
- expected_column_count = None
- if options.encoding != 'none':
- try:
- codecs.lookup(options.encoding)
- except LookupError:
- print("Encoding %s could not be found" % options.encoding, file=sys.stderr)
- sys.exit(10)
- if options.save_db_to_disk_filename is not None:
- if options.analyze_only:
- print("Cannot save database to disk when running with -A (analyze-only) option.", file=sys.stderr)
- sys.exit(119)
-
- print("Going to save data into a disk database: %s" % options.save_db_to_disk_filename, file=sys.stderr)
- if os.path.exists(options.save_db_to_disk_filename):
- print("Disk database file %s already exists." % options.save_db_to_disk_filename, file=sys.stderr)
- sys.exit(77)
- # sys.exit(78) Deprecated, but shouldn't be reused
- if options.caching_mode not in ['none', 'read', 'readwrite']:
- print("caching mode must be none,read or readwrite",file=sys.stderr)
- sys.exit(85)
- read_caching = options.caching_mode in ['read', 'readwrite']
- write_caching = options.caching_mode in ['readwrite']
-
- if options.max_attached_sqlite_databases <= 3:
- print("Max attached sqlite databases must be larger than 3")
- sys.exit(99)
-
- default_input_params = QInputParams(skip_header=options.skip_header,
- delimiter=options.delimiter,
- input_encoding=options.encoding,
- gzipped_input=options.gzipped,
- with_universal_newlines=options.with_universal_newlines,
- parsing_mode=options.mode,
- expected_column_count=expected_column_count,
- keep_leading_whitespace_in_values=options.keep_leading_whitespace_in_values,
- disable_double_double_quoting=options.disable_double_double_quoting,
- disable_escaped_double_quoting=options.disable_escaped_double_quoting,
- input_quoting_mode=options.input_quoting_mode,
- disable_column_type_detection=options.disable_column_type_detection,
- max_column_length_limit=max_column_length_limit,
- read_caching=read_caching,
- write_caching=write_caching,
- max_attached_sqlite_databases=options.max_attached_sqlite_databases)
-
- output_params = QOutputParams(
- delimiter=options.output_delimiter,
- beautify=options.beautify,
- output_quoting_mode=options.output_quoting_mode,
- formatting=options.formatting,
- output_header=options.output_header,
- encoding=output_encoding)
- q_output_printer = QOutputPrinter(output_params, show_tracebacks=DEBUG)
-
- return STDOUT, default_input_params, q_output_printer, query_strs
-
-
-if __name__ == '__main__':
- run_standalone()
diff --git a/bump-version.py b/bump-version.py
new file mode 100755
index 00000000..bc0e259a
--- /dev/null
+++ b/bump-version.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+import re
+import subprocess
+from pathlib import Path
+
+INIT_FILE = Path("qtextasdata/__init__.py")
+
+def get_current_version():
+ """Read the current version from __init__.py"""
+ with open(INIT_FILE, "r") as f:
+ content = f.read()
+ match = re.search(r"q_version\s*=\s*['\"]([^'\"]+)['\"]", content)
+ if not match:
+ raise ValueError(f"Could not find version in {INIT_FILE}")
+ return match.group(1)
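+# The regex matches the version line in qtextasdata/__init__.py, e.g.:
+#   q_version = '4.0.0'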
+
+def update_version(new_version):
+ """Update the version in __init__.py"""
+ with open(INIT_FILE, "r") as f:
+ content = f.read()
+
+ updated_content = re.sub(r"(q_version\s*=\s*['\"])[^'\"]+(['\"])",
+ r"\g<1>" + new_version + r"\g<2>",
+ content)
+
+ with open(INIT_FILE, "w") as f:
+ f.write(updated_content)
+
+ print(f"Updated version to {new_version} in {INIT_FILE}")
+
+def bump_version(current_version, bump_type):
+ """Bump the version according to semver rules"""
+ if bump_type not in ["major", "minor", "patch", "prerelease"]:
+ raise ValueError("Bump type must be 'major', 'minor', 'patch', or 'prerelease'")
+
+ # Handle prerelease versions
+ if "-" in current_version:
+ base, prerelease = current_version.split("-", 1)
+ if bump_type == "prerelease":
+ # Increment prerelease number
+ pre_type, pre_num = re.match(r"([a-zA-Z]+)\.?(\d+)?", prerelease).groups()
+ if pre_num is None:
+ pre_num = 1
+ else:
+ pre_num = int(pre_num) + 1
+ return f"{base}-{pre_type}.{pre_num}"
+ else:
+ # If bumping a prerelease to release, use the base version
+ major, minor, patch = map(int, base.split("."))
+ else:
+ major, minor, patch = map(int, current_version.split("."))
+
+ if bump_type == "major":
+ return f"{major + 1}.0.0"
+ elif bump_type == "minor":
+ return f"{major}.{minor + 1}.0"
+ elif bump_type == "patch":
+ return f"{major}.{minor}.{patch + 1}"
+ elif bump_type == "prerelease":
+ # Create a new prerelease
+ return f"{major}.{minor}.{patch}-beta.1"
+
+def create_git_tag(version):
+ """Create a git tag for the new version"""
+ tag = f"v{version}"
+
+ try:
+ # Check if tag already exists
+ result = subprocess.run(["git", "tag", "-l", tag],
+ capture_output=True, text=True, check=True)
+ if tag in result.stdout:
+ print(f"Warning: Tag {tag} already exists!")
+ return False
+
+ # Create and push tag
+ subprocess.run(["git", "tag", "-a", tag, "-m", f"Release {version}"], check=True)
+ print(f"Created git tag: {tag}")
+
+ print("\nTo push the tag, run:")
+ print(f" git push origin {tag}")
+
+ return True
+ except subprocess.CalledProcessError as e:
+ print(f"Error creating git tag: {e}")
+ return False
+
+def show_help():
+ """Display help information"""
+ print("Usage: python bump-version.py [--tag]")
+ print("\nOptions:")
+ print(" major Increment the MAJOR version (incompatible API changes)")
+ print(" minor Increment the MINOR version (add functionality, backwards compatible)")
+ print(" patch Increment the PATCH version (bug fixes, backwards compatible)")
+ print(" prerelease Create or increment a prerelease version")
+ print(" --tag Create a git tag for the new version")
+ print(" --help Show this help message")
+ print("\nExamples:")
+ print(" python bump-version.py patch # 4.0.0 -> 4.0.1")
+ print(" python bump-version.py minor # 4.0.0 -> 4.1.0")
+ print(" python bump-version.py prerelease # 4.0.0 -> 4.0.0-beta.1")
+ print(" python bump-version.py patch --tag # 4.0.0 -> 4.0.1 and create tag v4.0.1")
+ sys.exit(0)
+
+def main():
+ if len(sys.argv) < 2 or "--help" in sys.argv or "-h" in sys.argv:
+ show_help()
+
+ bump_type = sys.argv[1].lower()
+ create_tag = "--tag" in sys.argv
+
+ try:
+ current_version = get_current_version()
+ print(f"Current version: {current_version}")
+
+ new_version = bump_version(current_version, bump_type)
+ print(f"New version: {new_version}")
+
+ confirm = input(f"Update version from {current_version} to {new_version}? [y/N] ")
+ if confirm.lower() != "y":
+ print("Version update cancelled.")
+ sys.exit(0)
+
+ update_version(new_version)
+
+ if create_tag:
+ create_git_tag(new_version)
+
+ except Exception as e:
+ print(f"Error: {e}")
+ sys.exit(1)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/pyoxidizer.bzl b/pyoxidizer.bzl
deleted file mode 100644
index 1f7fb135..00000000
--- a/pyoxidizer.bzl
+++ /dev/null
@@ -1,117 +0,0 @@
-# This file defines how PyOxidizer application building and packaging is
-# performed. See PyOxidizer's documentation at
-# https://pyoxidizer.readthedocs.io/en/stable/ for details of this
-# configuration file format.
-
-PYTHON_VERSION = VARS.get("PYTHON_VERSION","3.8")
-Q_VERSION = VARS.get("Q_VERSION","0.0.1")
-
-# Configuration files consist of functions which define build "targets."
-# This function creates a Python executable and installs it in a destination
-# directory.
-def make_exe():
- dist = default_python_distribution(python_version=PYTHON_VERSION)
-
- policy = dist.make_python_packaging_policy()
- policy.set_resource_handling_mode("classify")
- policy.resources_location = "in-memory"
- policy.resources_location_fallback = "filesystem-relative:Lib"
- policy.allow_in_memory_shared_library_loading = False
-
- python_config = dist.make_python_interpreter_config()
-
- python_config.run_module = "bin.q"
-
- exe = dist.to_python_executable(
- name="q",
-
- packaging_policy=policy,
-
- config=python_config,
- )
-
- exe.pip_install(["wheel"])
-
- exe.add_python_resources(exe.pip_install(["-r", "requirements.txt"]))
- exe.add_python_resources(exe.pip_install(["-e", "."]))
-
- exe.add_python_resources(exe.read_package_root(
- path="./",
- packages=["bin"],
- ))
-
- return exe
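-# With the targets registered at the bottom of this file, running
-# `pyoxidizer build` (optionally overriding variables, e.g.
-# `--var PYTHON_VERSION 3.8`) produces the standalone q executable
-# described by make_exe() above.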
-
-def make_embedded_resources(exe):
- return exe.to_embedded_resources()
-
-def make_install(exe):
- # Create an object that represents our installed application file layout.
- files = FileManifest()
-
- # Add the generated executable to our install layout in the root directory.
- files.add_python_resource(".", exe)
-
- return files
-
-def make_msi(exe):
- # See the full docs for more. But this will convert your Python executable
- # into a `WiXMSIBuilder` Starlark type, which will be converted to a Windows
- # .msi installer when it is built.
- builder = exe.to_wix_msi_builder(
- # Simple identifier of your app.
- "q",
- # The name of your application.
- "q-text-as-data",
- # The version of your application.
- Q_VERSION,
- # The author/manufacturer of your application.
- "Harel Ben-Attia"
- )
- return builder
-
-
-# Dynamically enable automatic code signing.
-def register_code_signers():
- # You will need to run with `pyoxidizer build --var ENABLE_CODE_SIGNING 1` for
- # this if block to be evaluated.
- if not VARS.get("ENABLE_CODE_SIGNING"):
- return
-
- # Use a code signing certificate in a .pfx/.p12 file, prompting the
- # user for its path and password to open.
- # pfx_path = prompt_input("path to code signing certificate file")
- # pfx_password = prompt_password(
- # "password for code signing certificate file",
- # confirm = True
- # )
- # signer = code_signer_from_pfx_file(pfx_path, pfx_password)
-
- # Use a code signing certificate in the Windows certificate store, specified
- # by its SHA-1 thumbprint. (This allows you to use YubiKeys and other
- # hardware tokens if they speak to the Windows certificate APIs.)
- # sha1_thumbprint = prompt_input(
- # "SHA-1 thumbprint of code signing certificate in Windows store"
- # )
- # signer = code_signer_from_windows_store_sha1_thumbprint(sha1_thumbprint)
-
- # Choose a code signing certificate automatically from the Windows
- # certificate store.
- # signer = code_signer_from_windows_store_auto()
-
- # Activate your signer so it gets called automatically.
- # signer.activate()
-
-
-# Call our function to set up automatic code signers.
-register_code_signers()
-
-# Tell PyOxidizer about the build targets defined above.
-register_target("exe", make_exe)
-register_target("resources", make_embedded_resources, depends=["exe"], default_build_script=True)
-register_target("install", make_install, depends=["exe"], default=True)
-register_target("msi_installer", make_msi, depends=["exe"])
-
-# Resolve whatever targets the invoker of this configuration file is requesting
-# be resolved.
-resolve_targets()
diff --git a/pytest.ini b/pytest.ini
index 57ea2be4..f565085b 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,5 @@
[pytest]
markers =
benchmark: Benchmark tests
+python_files = *Tests.py
+addopts = -m "not benchmark" -n auto
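+# Note: -n auto distributes tests across CPUs and requires the pytest-xdist
+# plugin; benchmark-marked tests are deselected by default (run them
+# explicitly with -m benchmark).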
diff --git a/qtextasdata/__init__.py b/qtextasdata/__init__.py
new file mode 100644
index 00000000..f2d2a41a
--- /dev/null
+++ b/qtextasdata/__init__.py
@@ -0,0 +1 @@
+q_version = '4.0.0'
diff --git a/qtextasdata/cli.py b/qtextasdata/cli.py
new file mode 100644
index 00000000..7ddbc98d
--- /dev/null
+++ b/qtextasdata/cli.py
@@ -0,0 +1,501 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2012-2025 Harel Ben-Attia
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details (doc/LICENSE contains
+# a copy of it)
+#
+#
+# Name : q (With respect to The Q Continuum)
+# Author : Harel Ben-Attia - harelba@gmail.com, harelba @ github, @harelba on twitter
+#
+#
+# q allows performing SQL-like statements on tabular text data.
+#
+# Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line.
+#
+# Full Documentation and details in https://harelba.github.io/q/
+#
+# Run with --help for command line details
+#
+
+from argparse import ArgumentParser
+import codecs
+from configparser import ConfigParser
+import csv
+import locale
+import os
+import sqlite3
+import sys
+from qtextasdata import q_version
+from qtextasdata.core import DataStream, QInputParams, QOutputParams, QOutputPrinter, QTextAsData
+from qtextasdata.exceptions import IncorrectDefaultValueException
+from qtextasdata.logging import DEBUG, xprint
+from qtextasdata.utilities import get_stdout_encoding, print_user_functions
+
+def print_credentials():
+ print("q version %s" % q_version, file=sys.stderr)
+ print("Python: %s" % " // ".join([str(x).strip() for x in sys.version.split("\n")]), file=sys.stderr)
+ print("Copyright (C) 2012-2021 Harel Ben-Attia (harelba@gmail.com, @harelba on twitter)", file=sys.stderr)
+ print("https://harelba.github.io/q/", file=sys.stderr)
+ print(file=sys.stderr)
+
+def get_option_with_default(p, option_type, option, default):
+ try:
+ if not p.has_option('options', option):
+ return default
+ if p.get('options',option) == 'None':
+ return None
+ if option_type == 'boolean':
+ r = p.getboolean('options', option)
+ return r
+ elif option_type == 'int':
+ r = p.getint('options', option)
+ return r
+ elif option_type == 'string':
+ r = p.get('options', option)
+ return r
+ else:
+ raise Exception("Unknown option type %s " % option_type)
+ except ValueError as e:
+ raise IncorrectDefaultValueException(option_type,option,p.get("options",option))
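+# Given a hypothetical ~/.qrc such as:
+#   [options]
+#   skip_header=True
+#   delimiter=,
+# get_option_with_default(p, 'boolean', 'skip_header', False) returns True,
+# overriding the hard-coded default.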
+
+QRC_FILENAME_ENVVAR = 'QRC_FILENAME'
+
+def dump_default_values_as_qrc(parser,exclusions):
+ m = parser.parse_args([]).__dict__
+ m.pop('leftover')
+ print("[options]",file=sys.stdout)
+ for k in sorted(m.keys()):
+ if k not in exclusions:
+ print("%s=%s" % (k,m[k]),file=sys.stdout)
+
+USAGE_TEXT = """
+ q
+
+ Example Execution for a delimited file:
+
+ q "select * from myfile.csv"
+
+ Example Execution for an sqlite3 database:
+
+ q "select * from mydatabase.sqlite:::my_table_name"
+
+ or
+
+ q "select * from mydatabase.sqlite"
+
+ if the database file contains only one table
+
+ Auto-caching of delimited files can be activated through `-C readwrite` (writes new caches if needed) or `-C read` (only reads existing cache files)
+
+ Setting the default caching mode (`-C`) can be done by writing a `~/.qrc` file. See docs for more info.
+
+q's purpose is to bring SQL expressive power to the Linux command line and to provide easy access to text as actual data.
+
+q allows the following:
+
+* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file
+* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory
+
+Changing the default values for parameters can be done by creating a `~/.qrc` file. Run q with `--dump-defaults` in order to dump a default `.qrc` file into stdout.
+
+See https://github.com/harelba/q for more details.
+
+"""
+
+def run_standalone():
+ sqlite3.enable_callback_tracebacks(True)
+
+ p, qrc_filename = parse_qrc_file()
+
+ args, options, parser = initialize_command_line_parser(p, qrc_filename)
+
+ dump_defaults_and_stop__if_needed(options, parser)
+
+ dump_version_and_stop__if_needed(options)
+
+ STDOUT, default_input_params, q_output_printer, query_strs = parse_options(args, options)
+
+ data_streams_dict = initialize_default_data_streams()
+
+ q_engine = QTextAsData(default_input_params=default_input_params,data_streams_dict=data_streams_dict)
+
+ execute_queries(STDOUT, options, q_engine, q_output_printer, query_strs)
+
+ q_engine.done()
+
+ sys.exit(0)
+
+
+def dump_version_and_stop__if_needed(options):
+ if options.version:
+ print_credentials()
+ sys.exit(0)
+
+
+def dump_defaults_and_stop__if_needed(options, parser):
+ if options.dump_defaults:
+ dump_default_values_as_qrc(parser, ['dump-defaults', 'version'])
+ sys.exit(0)
+
+
+def execute_queries(STDOUT, options, q_engine, q_output_printer, query_strs):
+ for query_str in query_strs:
+ if options.analyze_only:
+ q_output = q_engine.analyze(query_str)
+ q_output_printer.print_analysis(STDOUT, sys.stderr, q_output)
+ else:
+ q_output = q_engine.execute(query_str, save_db_to_disk_filename=options.save_db_to_disk_filename)
+ q_output_printer.print_output(STDOUT, sys.stderr, q_output)
+
+ if q_output.status == 'error':
+ sys.exit(q_output.error.errorcode)
+
+
+def initialize_command_line_parser(p, qrc_filename):
+ try:
+ default_verbose = get_option_with_default(p, 'boolean', 'verbose', False)
+ default_save_db_to_disk = get_option_with_default(p, 'string', 'save_db_to_disk_filename', None)
+ default_caching_mode = get_option_with_default(p, 'string', 'caching_mode', 'none')
+
+ default_skip_header = get_option_with_default(p, 'boolean', 'skip_header', False)
+ default_delimiter = get_option_with_default(p, 'string', 'delimiter', None)
+ default_pipe_delimited = get_option_with_default(p, 'boolean', 'pipe_delimited', False)
+ default_tab_delimited = get_option_with_default(p, 'boolean', 'tab_delimited', False)
+ default_encoding = get_option_with_default(p, 'string', 'encoding', 'UTF-8')
+ default_gzipped = get_option_with_default(p, 'boolean', 'gzipped', False)
+ default_analyze_only = get_option_with_default(p, 'boolean', 'analyze_only', False)
+ default_mode = get_option_with_default(p, 'string', 'mode', "relaxed")
+ default_column_count = get_option_with_default(p, 'string', 'column_count', None)
+ default_keep_leading_whitespace_in_values = get_option_with_default(p, 'boolean',
+ 'keep_leading_whitespace_in_values', False)
+ default_disable_double_double_quoting = get_option_with_default(p, 'boolean', 'disable_double_double_quoting',
+ True)
+ default_disable_escaped_double_quoting = get_option_with_default(p, 'boolean', 'disable_escaped_double_quoting',
+ True)
+ default_disable_column_type_detection = get_option_with_default(p, 'boolean', 'disable_column_type_detection',
+ False)
+ default_input_quoting_mode = get_option_with_default(p, 'string', 'input_quoting_mode', 'minimal')
+ default_max_column_length_limit = get_option_with_default(p, 'int', 'max_column_length_limit', 131072)
+ default_with_universal_newlines = get_option_with_default(p, 'boolean', 'with_universal_newlines', False)
+
+ default_output_delimiter = get_option_with_default(p, 'string', 'output_delimiter', None)
+ default_pipe_delimited_output = get_option_with_default(p, 'boolean', 'pipe_delimited_output', False)
+ default_tab_delimited_output = get_option_with_default(p, 'boolean', 'tab_delimited_output', False)
+ default_output_header = get_option_with_default(p, 'boolean', 'output_header', False)
+ default_beautify = get_option_with_default(p, 'boolean', 'beautify', False)
+ default_formatting = get_option_with_default(p, 'string', 'formatting', None)
+ default_output_encoding = get_option_with_default(p, 'string', 'output_encoding', 'none')
+ default_output_quoting_mode = get_option_with_default(p, 'string', 'output_quoting_mode', 'minimal')
+ default_list_user_functions = get_option_with_default(p, 'boolean', 'list_user_functions', False)
+ default_overwrite_qsql = get_option_with_default(p, 'boolean', 'overwrite_qsql', False)
+
+ default_query_filename = get_option_with_default(p, 'string', 'query_filename', None)
+ default_query_encoding = get_option_with_default(p, 'string', 'query_encoding', locale.getpreferredencoding())
+ default_max_attached_sqlite_databases = get_option_with_default(p,'int','max_attached_sqlite_databases', 10)
+ except IncorrectDefaultValueException as e:
+ print("Incorrect value '%s' for option %s in .qrc file %s (option type is %s)" % (
+ e.actual_value, e.option, qrc_filename, e.option_type))
+ sys.exit(199)
+ parser = ArgumentParser(prog="q",usage=USAGE_TEXT)
+ parser.add_argument("-v", "--version", action="store_true", help="Print version")
+ parser.add_argument("-V", "--verbose", default=default_verbose, action="store_true",
+ help="Print debug info in case of problems")
+ parser.add_argument("-S", "--save-db-to-disk", dest="save_db_to_disk_filename", default=default_save_db_to_disk,
+ help="Save database to an sqlite database file")
+ parser.add_argument("-C", "--caching-mode", default=default_caching_mode,
+ help="Choose the autocaching mode (none/read/readwrite). Autocaches files to disk db so further queries will be faster. Caching is done to a side-file with the same name of the table, but with an added extension .qsql")
+ parser.add_argument("--dump-defaults", action="store_true",
+ help="Dump all default values for parameters and exit. Can be used in order to make sure .qrc file content is being read properly.")
+ parser.add_argument("--max-attached-sqlite-databases", default=default_max_attached_sqlite_databases,type=int,
+ help="Set the maximum number of concurrently-attached sqlite dbs. This is a compile time definition of sqlite. q's performance will slow down once this limit is reached for a query, since it will perform table copies in order to avoid that limit.")
+ # -----------------------------------------------
+ input_data_option_group = parser.add_argument_group("Input Data Options")
+ input_data_option_group.add_argument("-H", "--skip-header", default=default_skip_header,
+ action="store_true",
+ help="Skip header row. This has been changed from earlier version - Only one header row is supported, and the header row is used for column naming")
+ input_data_option_group.add_argument("-d", "--delimiter", default=default_delimiter,
+ help="Field delimiter. If none specified, then space is used as the delimiter.")
+ input_data_option_group.add_argument("-p", "--pipe-delimited", default=default_pipe_delimited,
+ action="store_true",
+ help="Same as -d '|'. Added for convenience and readability")
+ input_data_option_group.add_argument("-t", "--tab-delimited", default=default_tab_delimited,
+ action="store_true",
+ help="Same as -d . Just a shorthand for handling standard tab delimited file You can use $'\\t' if you want (this is how Linux expects to provide tabs in the command line")
+ input_data_option_group.add_argument("-e", "--encoding", default=default_encoding,
+ help="Input file encoding. Defaults to UTF-8. set to none for not setting any encoding - faster, but at your own risk...")
+ input_data_option_group.add_argument("-z", "--gzipped", default=default_gzipped, action="store_true",
+ help="Data is gzipped. Useful for reading from stdin. For files, .gz means automatic gunzipping")
+ input_data_option_group.add_argument("-A", "--analyze-only", default=default_analyze_only,
+ action='store_true',
+ help="Analyze sample input and provide information about data types")
+ input_data_option_group.add_argument("-m", "--mode", default=default_mode,
+ help="Data parsing mode. fluffy, relaxed and strict. In strict mode, the -c column-count parameter must be supplied as well")
+ input_data_option_group.add_argument("-c", "--column-count", default=default_column_count,
+ help="Specific column count when using relaxed or strict mode")
+ input_data_option_group.add_argument("-k", "--keep-leading-whitespace", dest="keep_leading_whitespace_in_values",
+ default=default_keep_leading_whitespace_in_values, action="store_true",
+ help="Keep leading whitespace in values. Default behavior strips leading whitespace off values, in order to provide out-of-the-box usability for simple use cases. If you need to preserve whitespace, use this flag.")
+ input_data_option_group.add_argument("--disable-double-double-quoting",
+ default=default_disable_double_double_quoting, action="store_false",
+ help="Disable support for double double-quoting for escaping the double quote character. By default, you can use \"\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
+ input_data_option_group.add_argument("--disable-escaped-double-quoting",
+ default=default_disable_escaped_double_quoting, action="store_false",
+ help="Disable support for escaped double-quoting for escaping the double quote character. By default, you can use \\\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
+ input_data_option_group.add_argument("--as-text", dest="disable_column_type_detection",
+ default=default_disable_column_type_detection, action="store_true",
+ help="Don't detect column types - All columns will be treated as text columns")
+ input_data_option_group.add_argument("-w", "--input-quoting-mode",
+ default=default_input_quoting_mode,
+ help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.")
+ input_data_option_group.add_argument("-M", "--max-column-length-limit",
+ default=default_max_column_length_limit,
+ help="Sets the maximum column length.")
+ input_data_option_group.add_argument("-U", "--with-universal-newlines",
+ default=default_with_universal_newlines, action="store_true",
+ help="Expect universal newlines in the data. Limitation: -U works only with regular files for now, stdin or .gz files are not supported yet.")
+ # -----------------------------------------------
+ output_data_option_group = parser.add_argument_group("Output Options")
+ output_data_option_group.add_argument("-D", "--output-delimiter",
+ default=default_output_delimiter,
+ help="Field delimiter for output. If none specified, then the -d delimiter is used if present, or space if no delimiter is specified")
+ output_data_option_group.add_argument("-P", "--pipe-delimited-output",
+ default=default_pipe_delimited_output, action="store_true",
+ help="Same as -D '|'. Added for convenience and readability.")
+ output_data_option_group.add_argument("-T", "--tab-delimited-output",
+ default=default_tab_delimited_output, action="store_true",
+                                          help="Same as -D <tab>. Just a shorthand for outputting tab-delimited output. You can use -D $'\\t' if you want.")
+ output_data_option_group.add_argument("-O", "--output-header", default=default_output_header,
+ action="store_true",
+ help="Output header line. Output column-names are determined from the query itself. Use column aliases in order to set your column names in the query. For example, 'select name FirstName,value1/value2 MyCalculation from ...'. This can be used even if there was no header in the input.")
+ output_data_option_group.add_argument("-b", "--beautify", default=default_beautify,
+ action="store_true",
+ help="Beautify output according to actual values. Might be slow...")
+ output_data_option_group.add_argument("-f", "--formatting", default=default_formatting,
+                                          help="Output-level formatting, in the format X=fmt,Y=fmt etc, where X,Y are output column numbers (e.g. 1 for the first SELECT column, etc.)")
+ output_data_option_group.add_argument("-E", "--output-encoding",
+ default=default_output_encoding,
+ help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
+ output_data_option_group.add_argument("-W", "--output-quoting-mode",
+ default=default_output_quoting_mode,
+ help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
+ output_data_option_group.add_argument("-L", "--list-user-functions",
+ default=default_list_user_functions, action="store_true",
+ help="List all user functions")
+ parser.add_argument("--overwrite-qsql", default=default_overwrite_qsql,
+ help="When used, qsql files (both caches and store-to-db) will be overwritten if they already exist. Use with care.")
+ # -----------------------------------------------
+ query_option_group = parser.add_argument_group("Query Related Options")
+ query_option_group.add_argument("-q", "--query-filename", default=default_query_filename,
+ help="Read query from the provided filename instead of the command line, possibly using the provided query encoding (using -Q).")
+ query_option_group.add_argument("-Q", "--query-encoding", default=default_query_encoding,
+                                    help="Query text encoding. Experimental. Please send your feedback on this")
+ # -----------------------------------------------
+ parser.add_argument('leftover', nargs='*')
+ args = parser.parse_args()
+ return args.leftover, args, parser
+
+
+def parse_qrc_file():
+ p = ConfigParser()
+ if QRC_FILENAME_ENVVAR in os.environ:
+ qrc_filename = os.environ[QRC_FILENAME_ENVVAR]
+ if qrc_filename != 'None':
+ xprint("qrc filename is %s" % qrc_filename)
+ if os.path.exists(qrc_filename):
+                p.read([qrc_filename])
+ else:
+ print('QRC_FILENAME env var exists, but cannot find qrc file at %s' % qrc_filename, file=sys.stderr)
+ sys.exit(244)
+ else:
+            pass  # Special handling of the 'None' env var value for QRC_FILENAME. It allows skipping the default ~/.qrc reading.
+ else:
+ qrc_filename = os.path.expanduser('~/.qrc')
+ p.read([qrc_filename, '.qrc'])
+ return p, qrc_filename
+
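+# A minimal sketch of a qrc file that parse_qrc_file() would read (illustrative
+# only - the section and option names below are assumptions, since the consuming
+# options layer is not shown here):
+#
+#   [options]
+#   output_header=True
+#   delimiter=,
+#
+# Pointing the QRC_FILENAME env var at such a file (or placing it at ~/.qrc)
+# makes these values available as defaults; setting QRC_FILENAME=None skips the
+# default ~/.qrc lookup entirely.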
+
+def initialize_default_data_streams():
+ data_streams_dict = {
+ '-': DataStream('stdin', '-', sys.stdin)
+ }
+ return data_streams_dict
+
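+# Only stdin is pre-registered, under the name '-'. A caller embedding q as a
+# library could register additional named streams before constructing the engine.
+# A sketch (the stream name 'mydata' is hypothetical):
+#
+#   import io
+#   streams = initialize_default_data_streams()
+#   streams['mydata'] = DataStream('mydata', 'mydata', io.StringIO('a b\n1 2\n'))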
+
+def parse_options(args, options):
+ if options.list_user_functions:
+ print_user_functions()
+ sys.exit(0)
+ if len(args) == 0 and options.query_filename is None:
+ print_credentials()
+ print("Must provide at least one query in the command line, or through a file with the -q parameter",
+ file=sys.stderr)
+ sys.exit(1)
+ if options.query_filename is not None:
+ if len(args) != 0:
+ print("Can't provide both a query file and a query on the command line", file=sys.stderr)
+ sys.exit(1)
+        try:
+            with open(options.query_filename, 'rb') as f:
+                query_strs = [f.read()]
+        except Exception:
+            print("Could not read query from file %s" % options.query_filename, file=sys.stderr)
+            sys.exit(1)
+ else:
+ if sys.stdin.encoding is not None:
+ query_strs = [x.encode(sys.stdin.encoding) for x in args]
+ else:
+ query_strs = args
+ if options.query_encoding is not None and options.query_encoding != 'none':
+ try:
+ for idx in range(len(query_strs)):
+ query_strs[idx] = query_strs[idx].decode(options.query_encoding).strip()
+
+ if len(query_strs[idx]) == 0:
+ print("Query cannot be empty (query number %s)" % (idx + 1), file=sys.stderr)
+ sys.exit(1)
+
+ except Exception as e:
+ print("Could not decode query number %s using the provided query encoding (%s)" % (
+ idx + 1, options.query_encoding), file=sys.stderr)
+ sys.exit(3)
+ ###
+ if options.mode not in ['relaxed', 'strict']:
+ print("Parsing mode can either be relaxed or strict", file=sys.stderr)
+ sys.exit(13)
+ output_encoding = get_stdout_encoding(options.output_encoding)
+ try:
+ STDOUT = codecs.getwriter(output_encoding)(sys.stdout.buffer)
+    except Exception:
+ print("Could not create output stream using output encoding %s" % (output_encoding), file=sys.stderr)
+ sys.exit(200)
+ # If the user flagged for a tab-delimited file then set the delimiter to tab
+ if options.tab_delimited:
+ if options.delimiter is not None and options.delimiter != '\t':
+ print("Warning: -t parameter overrides -d parameter (%s)" % options.delimiter, file=sys.stderr)
+ options.delimiter = '\t'
+ # If the user flagged for a pipe-delimited file then set the delimiter to pipe
+ if options.pipe_delimited:
+ if options.delimiter is not None and options.delimiter != '|':
+ print("Warning: -p parameter overrides -d parameter (%s)" % options.delimiter, file=sys.stderr)
+ options.delimiter = '|'
+ if options.delimiter is None:
+ options.delimiter = ' '
+ elif len(options.delimiter) != 1:
+ print("Delimiter must be one character only", file=sys.stderr)
+ sys.exit(5)
+ if options.tab_delimited_output:
+ if options.output_delimiter is not None and options.output_delimiter != '\t':
+ print("Warning: -T parameter overrides -D parameter (%s)" % options.output_delimiter, file=sys.stderr)
+ options.output_delimiter = '\t'
+ if options.pipe_delimited_output:
+ if options.output_delimiter is not None and options.output_delimiter != '|':
+ print("Warning: -P parameter overrides -D parameter (%s)" % options.output_delimiter, file=sys.stderr)
+ options.output_delimiter = '|'
+    if not options.output_delimiter:
+        if options.delimiter:
+            # If an input delimiter was specified, use it for the output as well
+            options.output_delimiter = options.delimiter
+        else:
+            # No input delimiter means any whitespace, so default the output
+            # delimiter to a single space
+            options.output_delimiter = " "
+ try:
+ max_column_length_limit = int(options.max_column_length_limit)
+    except (ValueError, TypeError):
+ print("Max column length limit must be an integer larger than 2 (%s)" % options.max_column_length_limit,
+ file=sys.stderr)
+ sys.exit(31)
+ if max_column_length_limit < 3:
+ print("Maximum column length must be larger than 2",file=sys.stderr)
+ sys.exit(31)
+
+ csv.field_size_limit(max_column_length_limit)
+ xprint("Max column length limit is %s" % options.max_column_length_limit)
+
+ if options.input_quoting_mode not in list(QTextAsData.input_quoting_modes.keys()):
+ print("Input quoting mode can only be one of %s. It cannot be set to '%s'" % (
+ ",".join(sorted(QTextAsData.input_quoting_modes.keys())), options.input_quoting_mode), file=sys.stderr)
+ sys.exit(55)
+ if options.output_quoting_mode not in list(QOutputPrinter.output_quoting_modes.keys()):
+ print("Output quoting mode can only be one of %s. It cannot be set to '%s'" % (
+            ",".join(sorted(QOutputPrinter.output_quoting_modes.keys())), options.output_quoting_mode), file=sys.stderr)
+ sys.exit(56)
+    if options.column_count is not None:
+        try:
+            expected_column_count = int(options.column_count)
+        except (ValueError, TypeError):
+            print("Column count must be an integer (%s)" % options.column_count, file=sys.stderr)
+            sys.exit(90)
+        if expected_column_count < 1 or expected_column_count > max_column_length_limit:
+            print("Column count must be between 1 and %s" % max_column_length_limit,file=sys.stderr)
+            sys.exit(90)
+ else:
+ # infer automatically
+ expected_column_count = None
+ if options.encoding != 'none':
+ try:
+ codecs.lookup(options.encoding)
+ except LookupError:
+ print("Encoding %s could not be found" % options.encoding, file=sys.stderr)
+ sys.exit(10)
+ if options.save_db_to_disk_filename is not None:
+ if options.analyze_only:
+ print("Cannot save database to disk when running with -A (analyze-only) option.", file=sys.stderr)
+ sys.exit(119)
+
+ print("Going to save data into a disk database: %s" % options.save_db_to_disk_filename, file=sys.stderr)
+ if os.path.exists(options.save_db_to_disk_filename):
+ print("Disk database file %s already exists." % options.save_db_to_disk_filename, file=sys.stderr)
+ sys.exit(77)
+ # sys.exit(78) Deprecated, but shouldn't be reused
+ if options.caching_mode not in ['none', 'read', 'readwrite']:
+ print("caching mode must be none,read or readwrite",file=sys.stderr)
+ sys.exit(85)
+ read_caching = options.caching_mode in ['read', 'readwrite']
+ write_caching = options.caching_mode in ['readwrite']
+
+ if options.max_attached_sqlite_databases <= 3:
+        print("Max attached sqlite databases must be larger than 3", file=sys.stderr)
+ sys.exit(99)
+
+ default_input_params = QInputParams(skip_header=options.skip_header,
+ delimiter=options.delimiter,
+ input_encoding=options.encoding,
+ gzipped_input=options.gzipped,
+ with_universal_newlines=options.with_universal_newlines,
+ parsing_mode=options.mode,
+ expected_column_count=expected_column_count,
+ keep_leading_whitespace_in_values=options.keep_leading_whitespace_in_values,
+ disable_double_double_quoting=options.disable_double_double_quoting,
+ disable_escaped_double_quoting=options.disable_escaped_double_quoting,
+ input_quoting_mode=options.input_quoting_mode,
+ disable_column_type_detection=options.disable_column_type_detection,
+ max_column_length_limit=max_column_length_limit,
+ read_caching=read_caching,
+ write_caching=write_caching,
+ max_attached_sqlite_databases=options.max_attached_sqlite_databases)
+
+ output_params = QOutputParams(
+ delimiter=options.output_delimiter,
+ beautify=options.beautify,
+ output_quoting_mode=options.output_quoting_mode,
+ formatting=options.formatting,
+ output_header=options.output_header,
+ encoding=output_encoding)
+ q_output_printer = QOutputPrinter(output_params, show_tracebacks=DEBUG)
+
+ return STDOUT, default_input_params, q_output_printer, query_strs
+
diff --git a/qtextasdata/core.py b/qtextasdata/core.py
new file mode 100644
index 00000000..82eab7ac
--- /dev/null
+++ b/qtextasdata/core.py
@@ -0,0 +1,770 @@
+from collections import OrderedDict
+import traceback
+
+from qtextasdata.exceptions import BadHeaderException, CannotUnzipDataStreamException, ColumnCountMismatchException, ColumnMaxLengthLimitExceededException, ContentSignatureDataDiffersException, ContentSignatureDiffersException, ContentSignatureNotFoundException, CouldNotConvertStringToNumericValueException, CouldNotParseInputException, EncodedQueryException, FileNotFoundException, InvalidQueryException, MaximumSourceFilesExceededException, MissingHeaderException, NoTableInQsqlExcption, NoTablesInSqliteException, NonExistentTableNameInQsql, NonExistentTableNameInSqlite, SqliteOperationalErrorException, TooManyAttachedDatabasesException, TooManyTablesInQsqlException, TooManyTablesInSqliteException, UniversalNewlinesExistException, UnknownFileTypeException
+from qtextasdata.logging import DEBUG, iprint, xprint
+from qtextasdata.sql import DatabaseInfo, MaterialiedDataStreamState, MaterializedDelimitedFileState, MaterializedQsqlState, MaterializedSqliteState, MaterializedStateType, Sql, Sqlite3DB, detect_qtable_name_source_info
+import uuid
+import csv
+import time
+import sys
+
+from qtextasdata.utilities import quote_all_func, quote_minimal_func, quote_none_func, quote_nonnumeric_func
+
+def determine_max_col_lengths(m,output_field_quoting_func,output_delimiter):
+ if len(m) == 0:
+ return []
+ max_lengths = [0 for x in range(0, len(m[0]))]
+ for row_index in range(0, len(m)):
+ for col_index in range(0, len(m[0])):
+ # TODO Optimize this
+ new_len = len("{}".format(output_field_quoting_func(output_delimiter,m[row_index][col_index])))
+ if new_len > max_lengths[col_index]:
+ max_lengths[col_index] = new_len
+ return max_lengths
+
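+# A small worked example: for rows [('a','bbb'),('cc','d')] and a quoting function
+# that returns values unchanged, the result is [2, 3] - the per-column widths that
+# the beautifier later uses to pad each output column.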
+
+class QWarning(object):
+ def __init__(self,exception,msg):
+ self.exception = exception
+ self.msg = msg
+
+
+class QError(object):
+ def __init__(self,exception,msg,errorcode):
+ self.exception = exception
+ self.msg = msg
+ self.errorcode = errorcode
+ self.traceback = traceback.format_exc()
+
+ def __str__(self):
+        return "QError<errorcode=%s,msg=%s,exception=%s,traceback=%s>" % (self.errorcode,self.msg,self.exception,str(self.traceback))
+ __repr__ = __str__
+
+
+class QMetadata(object):
+    def __init__(self,table_structures=None,new_table_structures=None,output_column_name_list=None):
+        self.table_structures = table_structures if table_structures is not None else {}
+        self.new_table_structures = new_table_structures if new_table_structures is not None else {}
+        self.output_column_name_list = output_column_name_list
+
+ def __str__(self):
+        return "QMetadata<%s>" % (self.__dict__)
+ __repr__ = __str__
+
+
+class QOutput(object):
+    def __init__(self,data=None,metadata=None,warnings=None,error=None):
+        self.data = data
+        self.metadata = metadata
+
+        self.warnings = warnings if warnings is not None else []
+        self.error = error
+ if error is None:
+ self.status = 'ok'
+ else:
+ self.status = 'error'
+
+ def __str__(self):
+ s = []
+ s.append('status=%s' % self.status)
+ if self.error is not None:
+ s.append("error=%s" % self.error.msg)
+ if len(self.warnings) > 0:
+ s.append("warning_count=%s" % len(self.warnings))
+ if self.data is not None:
+ s.append("row_count=%s" % len(self.data))
+ else:
+ s.append("row_count=None")
+ if self.metadata is not None:
+ s.append("metadata=<%s>" % self.metadata)
+ else:
+ s.append("metadata=None")
+ return "QOutput<%s>" % ",".join(s)
+ __repr__ = __str__
+
+
+class QInputParams(object):
+ def __init__(self,skip_header=False,
+ delimiter=' ',input_encoding='UTF-8',gzipped_input=False,with_universal_newlines=False,parsing_mode='relaxed',
+ expected_column_count=None,keep_leading_whitespace_in_values=False,
+ disable_double_double_quoting=False,disable_escaped_double_quoting=False,
+ disable_column_type_detection=False,
+ input_quoting_mode='minimal',stdin_file=None,stdin_filename='-',
+ max_column_length_limit=131072,
+ read_caching=False,
+ write_caching=False,
+ max_attached_sqlite_databases = 10):
+ self.skip_header = skip_header
+ self.delimiter = delimiter
+ self.input_encoding = input_encoding
+ self.gzipped_input = gzipped_input
+ self.with_universal_newlines = with_universal_newlines
+ self.parsing_mode = parsing_mode
+ self.expected_column_count = expected_column_count
+ self.keep_leading_whitespace_in_values = keep_leading_whitespace_in_values
+ self.disable_double_double_quoting = disable_double_double_quoting
+ self.disable_escaped_double_quoting = disable_escaped_double_quoting
+ self.input_quoting_mode = input_quoting_mode
+ self.disable_column_type_detection = disable_column_type_detection
+ self.max_column_length_limit = max_column_length_limit
+ self.read_caching = read_caching
+ self.write_caching = write_caching
+ self.max_attached_sqlite_databases = max_attached_sqlite_databases
+
+ def merged_with(self,input_params):
+ params = QInputParams(**self.__dict__)
+ if input_params is not None:
+ params.__dict__.update(**input_params.__dict__)
+ return params
+
+ def __str__(self):
+ return "QInputParams<%s>" % str(self.__dict__)
+
+ def __repr__(self):
+ return "QInputParams(...)"
+
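+# Note that merged_with() copies this instance and then overwrites *every* field
+# with the values from input_params, defaults included - the merge is
+# whole-object, not per-field. A sketch:
+#
+#   base = QInputParams(delimiter=',')
+#   merged = base.merged_with(QInputParams(skip_header=True))
+#   # merged.skip_header is True, but merged.delimiter is ' ' again, because the
+#   # override's default delimiter replaced the base value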
+
+class DataStream(object):
+ # TODO Can stream-id be removed?
+ def __init__(self,stream_id,filename,stream):
+ self.stream_id = stream_id
+ self.filename = filename
+ self.stream = stream
+
+ def __str__(self):
+        return "QDataStream<stream_id=%s,filename=%s,stream=%s>" % (self.stream_id,self.filename,self.stream)
+ __repr__ = __str__
+
+
+class DataStreams(object):
+ def __init__(self, data_streams_dict):
+ assert type(data_streams_dict) == dict
+ self.validate(data_streams_dict)
+ self.data_streams_dict = data_streams_dict
+
+ def validate(self,d):
+ for k in d:
+ v = d[k]
+ if type(k) != str or type(v) != DataStream:
+ raise Exception('Bug - Invalid dict: %s' % str(d))
+
+ def get_for_filename(self, filename):
+ xprint("Data streams dict is %s. Trying to find %s" % (self.data_streams_dict,filename))
+ x = self.data_streams_dict.get(filename)
+ return x
+
+ def is_data_stream(self,filename):
+ return filename in self.data_streams_dict
+
+
+class QTextAsData(object):
+ def __init__(self,default_input_params=QInputParams(),data_streams_dict=None):
+ self.engine_id = str(uuid.uuid4()).replace("-","_")
+
+ self.default_input_params = default_input_params
+ xprint("Default input params: %s" % self.default_input_params)
+
+ self.loaded_table_structures_dict = OrderedDict()
+ self.databases = OrderedDict()
+
+ if data_streams_dict is not None:
+ self.data_streams = DataStreams(data_streams_dict)
+ else:
+ self.data_streams = DataStreams({})
+
+ # Create DB object
+ self.query_level_db_id = 'query_e_%s' % self.engine_id
+ self.query_level_db = Sqlite3DB(self.query_level_db_id,
+ 'file:%s?mode=memory&cache=shared' % self.query_level_db_id,'',create_qcatalog=True)
+ self.adhoc_db_id = 'adhoc_e_%s' % self.engine_id
+ self.adhoc_db_name = 'file:%s?mode=memory&cache=shared' % self.adhoc_db_id
+ self.adhoc_db = Sqlite3DB(self.adhoc_db_id,self.adhoc_db_name,'',create_qcatalog=True)
+ self.query_level_db.conn.execute("attach '%s' as %s" % (self.adhoc_db_name,self.adhoc_db_id))
+
+ self.add_db_to_database_list(DatabaseInfo(self.query_level_db_id,self.query_level_db,needs_closing=True))
+ self.add_db_to_database_list(DatabaseInfo(self.adhoc_db_id,self.adhoc_db,needs_closing=True))
+
+ def done(self):
+ xprint("Inside done: Database list is %s" % self.databases)
+ for db_id in reversed(self.databases.keys()):
+ database_info = self.databases[db_id]
+ if database_info.needs_closing:
+ xprint("Gonna close database %s - %s" % (db_id,self.databases[db_id]))
+ self.databases[db_id].sqlite_db.done()
+ xprint("Database %s has been closed" % db_id)
+ else:
+ xprint("No need to close database %s" % db_id)
+ xprint("Closed all databases")
+
+ input_quoting_modes = { 'minimal' : csv.QUOTE_MINIMAL,
+ 'all' : csv.QUOTE_ALL,
+ # nonnumeric is not supported for input quoting modes, since we determine the data types
+ # ourselves instead of letting the csv module try to identify the types
+ 'none' : csv.QUOTE_NONE }
+
+ def determine_proper_dialect(self,input_params):
+
+ input_quoting_mode_csv_numeral = QTextAsData.input_quoting_modes[input_params.input_quoting_mode]
+
+ if input_params.keep_leading_whitespace_in_values:
+ skip_initial_space = False
+ else:
+ skip_initial_space = True
+
+ dialect = {'skipinitialspace': skip_initial_space,
+ 'delimiter': input_params.delimiter, 'quotechar': '"' }
+ dialect['quoting'] = input_quoting_mode_csv_numeral
+ dialect['doublequote'] = input_params.disable_double_double_quoting
+
+ if input_params.disable_escaped_double_quoting:
+ dialect['escapechar'] = '\\'
+
+ return dialect
+
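+    # For the QInputParams defaults, the dialect computed above comes out as:
+    #   {'skipinitialspace': True, 'delimiter': ' ', 'quotechar': '"',
+    #    'quoting': csv.QUOTE_MINIMAL, 'doublequote': False}
+    # The disable_* flags are passed through as-is, so the caller (e.g. the CLI
+    # layer) is responsible for handing in values with the intended polarity.
+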
+ def get_dialect_id(self,filename):
+ return 'q_dialect_%s' % filename
+
+ def _open_files_and_get_mfss(self,qtable_name,input_params,dialect):
+ materialized_file_dict = OrderedDict()
+
+ materialized_state_type,table_source_type,source_info = detect_qtable_name_source_info(qtable_name,self.data_streams,read_caching_enabled=input_params.read_caching)
+ xprint("Detected source type %s source info %s" % (materialized_state_type,source_info))
+
+ if materialized_state_type == MaterializedStateType.DATA_STREAM:
+ (data_stream,) = source_info
+ ms = MaterialiedDataStreamState(table_source_type,qtable_name,input_params,dialect,self.engine_id,data_stream,stream_target_db=self.adhoc_db)
+ effective_qtable_name = data_stream.stream_id
+ elif materialized_state_type == MaterializedStateType.QSQL_FILE:
+ (qsql_filename,table_name) = source_info
+ ms = MaterializedQsqlState(table_source_type,qtable_name, qsql_filename=qsql_filename, table_name=table_name,
+ engine_id=self.engine_id, input_params=input_params, dialect_id=dialect)
+ effective_qtable_name = '%s:::%s' % (qsql_filename, table_name)
+ elif materialized_state_type == MaterializedStateType.SQLITE_FILE:
+ (sqlite_filename,table_name) = source_info
+ ms = MaterializedSqliteState(table_source_type,qtable_name, sqlite_filename=sqlite_filename, table_name=table_name,
+ engine_id=self.engine_id)
+ effective_qtable_name = '%s:::%s' % (sqlite_filename, table_name)
+ elif materialized_state_type == MaterializedStateType.DELIMITED_FILE:
+ (source_qtable_name,_) = source_info
+ ms = MaterializedDelimitedFileState(table_source_type,source_qtable_name, input_params, dialect, self.engine_id)
+ effective_qtable_name = source_qtable_name
+ else:
+            assert False, "Unknown file type for qtable %s - should have exited earlier with an exception" % qtable_name
+
+ assert effective_qtable_name not in materialized_file_dict
+ materialized_file_dict[effective_qtable_name] = ms
+
+ xprint("MS dict: %s" % str(materialized_file_dict))
+
+ return list([item for item in materialized_file_dict.values()])
+
+ def _load_mfs(self,mfs,input_params,dialect_id,stop_after_analysis):
+ xprint("Loading MFS:", mfs)
+
+ materialized_state_type = mfs.get_materialized_state_type()
+ xprint("Detected materialized state type for %s: %s" % (mfs.qtable_name,materialized_state_type))
+
+ mfs.initialize()
+
+        if materialized_state_type not in [MaterializedStateType.DATA_STREAM]:
+ if stop_after_analysis or self.should_copy_instead_of_attach(input_params):
+ xprint("Should copy instead of attaching. Forcing db to use to adhoc db")
+ forced_db_to_use = self.adhoc_db
+ else:
+ forced_db_to_use = None
+ else:
+ forced_db_to_use = None
+
+ mfs.choose_db_to_use(forced_db_to_use,stop_after_analysis)
+ xprint("Chosen db to use: source %s source_type %s db_id %s db_to_use %s" % (mfs.source,mfs.source_type,mfs.db_id,mfs.db_to_use))
+
+ database_info,relevant_table = mfs.make_data_available(stop_after_analysis)
+
+ if not self.is_adhoc_db(mfs.db_to_use) and not self.should_copy_instead_of_attach(input_params):
+ if not self.already_attached_to_query_level_db(mfs.db_to_use):
+ self.attach_to_db(mfs.db_to_use, self.query_level_db)
+ self.add_db_to_database_list(database_info)
+ else:
+                xprint("DB %s is already attached to query level db. No need to attach it again." % mfs.db_to_use.db_id)
+
+ mfs.finalize()
+
+ xprint("MFS Loaded")
+
+ return mfs.source,mfs.source_type
+
+ def add_db_to_database_list(self,database_info):
+ db_id = database_info.db_id
+ assert db_id is not None
+ assert database_info.sqlite_db is not None
+ if db_id in self.databases:
+ # TODO Convert to assertion
+ if id(database_info.sqlite_db) != id(self.databases[db_id].sqlite_db):
+ raise Exception('Bug - database already in database list: db_id %s: old %s new %s' % (db_id,self.databases[db_id],database_info))
+ else:
+ return
+ self.databases[db_id] = database_info
+
+ def is_adhoc_db(self,db_to_use):
+ return db_to_use.db_id == self.adhoc_db_id
+
+ def should_copy_instead_of_attach(self,input_params):
+ attached_database_count = len(self.query_level_db.get_sqlite_database_list())
+ x = attached_database_count >= input_params.max_attached_sqlite_databases
+ xprint("should_copy_instead_of_attach: attached_database_count=%s should_copy=%s" % (attached_database_count,x))
+ return x
+
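+    # Example of the attach-vs-copy decision: with the default limit of 10, once
+    # the query-level db already has 10 databases attached (the pragma list
+    # includes the query-level db itself and the adhoc db), any further source is
+    # copied into the adhoc db instead of being attached, trading some speed for
+    # staying below sqlite's attached-database limit.
+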
+ def _load_data(self,qtable_name,input_params=QInputParams(),stop_after_analysis=False):
+ xprint("Attempting to load data for materialized file names %s" % qtable_name)
+
+ q_dialect = self.determine_proper_dialect(input_params)
+ xprint("Dialect is %s" % q_dialect)
+ dialect_id = self.get_dialect_id(qtable_name)
+ csv.register_dialect(dialect_id, **q_dialect)
+
+ xprint("qtable metadata for loading is %s" % qtable_name)
+ mfss = self._open_files_and_get_mfss(qtable_name,
+ input_params,
+ dialect_id)
+        assert len(mfss) == 1, "a single materialized state should now encapsulate an entire table"
+ mfs = mfss[0]
+
+ xprint("MFS to load: %s" % mfs)
+
+ if qtable_name in self.loaded_table_structures_dict.keys():
+ xprint("Atomic filename %s found. no need to load" % qtable_name)
+ return None
+
+ xprint("qtable %s not found - loading" % qtable_name)
+
+
+ self._load_mfs(mfs, input_params, dialect_id, stop_after_analysis)
+ xprint("Loaded: source-type %s source %s mfs_structure %s" % (mfs.source_type, mfs.source, mfs.mfs_structure))
+
+ assert qtable_name not in self.loaded_table_structures_dict, "loaded_table_structures_dict has been changed to have a non-list value"
+ self.loaded_table_structures_dict[qtable_name] = mfs.mfs_structure
+
+ return mfs.mfs_structure
+
+ def already_attached_to_query_level_db(self,db_to_attach):
+ attached_dbs = list(map(lambda x:x[1],self.query_level_db.get_sqlite_database_list()))
+ return db_to_attach.db_id in attached_dbs
+
+ def attach_to_db(self, target_db, source_db):
+ q = "attach '%s' as %s" % (target_db.sqlite_db_url,target_db.db_id)
+ xprint("Attach query: %s" % q)
+ try:
+ c = source_db.execute_and_fetch(q)
+        except SqliteOperationalErrorException as e:
+            if 'too many attached databases' in str(e):
+                raise TooManyAttachedDatabasesException('There are too many attached databases. Use a proper --max-attached-sqlite-databases parameter which is below the maximum. Original error: %s' % str(e))
+            else:
+                # Anything else must not be swallowed silently - re-raise it
+                raise
+
+ def detach_from_db(self, target_db, source_db):
+ q = "detach %s" % (target_db.db_id)
+ xprint("Detach query: %s" % q)
+        source_db.execute_and_fetch(q)
+
+ def load_data(self,filename,input_params=QInputParams(),stop_after_analysis=False):
+ return self._load_data(filename,input_params,stop_after_analysis=stop_after_analysis)
+
+ def _ensure_data_is_loaded_for_sql(self,sql_object,input_params,data_streams=None,stop_after_analysis=False):
+ xprint("Ensuring Data load")
+ new_table_structures = OrderedDict()
+
+ # For each "table name"
+ for qtable_name in sql_object.qtable_names:
+ tss = self._load_data(qtable_name,input_params,stop_after_analysis=stop_after_analysis)
+ if tss is not None:
+ xprint("New Table Structures:",new_table_structures)
+ assert qtable_name not in new_table_structures, "new_table_structures was changed not to contain a list as a value"
+ new_table_structures[qtable_name] = tss
+
+ return new_table_structures
+
+ def materialize_query_level_db(self,save_db_to_disk_filename,sql_object):
+ # TODO More robust creation - Create the file in a separate folder and move it to the target location only after success
+
+ materialized_db = Sqlite3DB("materialized","file:%s" % save_db_to_disk_filename,save_db_to_disk_filename,create_qcatalog=False)
+ table_name_mapping = OrderedDict()
+
+ # For each table in the query
+ effective_table_names = sql_object.get_qtable_name_effective_table_names()
+
+ for i, qtable_name in enumerate(effective_table_names):
+ # table name, in the format db_id.table_name
+ effective_table_name_for_qtable_name = effective_table_names[qtable_name]
+
+ source_db_id, actual_table_name_in_db = effective_table_name_for_qtable_name.split(".", 1)
+ # The DatabaseInfo instance for this db
+ source_database = self.databases[source_db_id]
+ if source_db_id != self.query_level_db_id:
+ self.attach_to_db(source_database.sqlite_db,materialized_db)
+
+ ts = self.loaded_table_structures_dict[qtable_name]
+ proposed_new_table_name = ts.planned_table_name
+ xprint("Proposed table name is %s" % proposed_new_table_name)
+
+ new_table_name = materialized_db.find_new_table_name(proposed_new_table_name)
+
+ xprint("Materializing",source_db_id,actual_table_name_in_db,"as",new_table_name)
+ # Copy the table into the materialized database
+ xx = materialized_db.execute_and_fetch('CREATE TABLE %s AS SELECT * FROM %s' % (new_table_name,effective_table_name_for_qtable_name))
+
+ table_name_mapping[effective_table_name_for_qtable_name] = new_table_name
+
+ # TODO RLRL Preparation for writing materialized database as a qsql file
+ # if source_database.sqlite_db.qcatalog_table_exists():
+ # qcatalog_entry = source_database.sqlite_db.get_from_qcatalog_using_table_name(actual_table_name_in_db)
+ # # TODO RLRL Encapsulate dictionary transform inside qcatalog access methods
+ # materialized_db.add_to_qcatalog_table(new_table_name,OrderedDict(json.loads(qcatalog_entry['content_signature'])),
+ # qcatalog_entry['creation_time'],
+ # qcatalog_entry['source_type'],
+ # qcatalog_entry['source_type'])
+ # xprint("PQX Added to qcatalog",source_db_id,actual_table_name_in_db,'as',new_table_name)
+ # else:
+ # xprint("PQX Skipped adding to qcatalog",source_db_id,actual_table_name_in_db)
+
+ if source_db_id != self.query_level_db:
+ self.detach_from_db(source_database.sqlite_db,materialized_db)
+
+ return table_name_mapping
+
+ def validate_query(self,sql_object,table_structures):
+
+ for qtable_name in sql_object.qtable_names:
+ relevant_table_structures = [table_structures[qtable_name]]
+
+ column_names = None
+ column_types = None
+ for ts in relevant_table_structures:
+ names = ts.column_names
+ types = ts.python_column_types
+ xprint("Comparing column names: %s with %s" % (column_names,names))
+ if column_names is None:
+ column_names = names
+ else:
+ if column_names != names:
+ raise BadHeaderException("Column names differ for table %s: %s vs %s" % (
+ qtable_name, ",".join(column_names), ",".join(names)))
+
+ xprint("Comparing column types: %s with %s" % (column_types,types))
+ if column_types is None:
+ column_types = types
+ else:
+ if column_types != types:
+ raise BadHeaderException("Column types differ for table %s: %s vs %s" % (
+ qtable_name, ",".join(column_types), ",".join(types)))
+
+ xprint("All column names match for qtable name %s: column names: %s column types: %s" % (ts.qtable_name,column_names,column_types))
+
+ xprint("Query validated")
+
+ def _execute(self,query_str,input_params=None,data_streams=None,stop_after_analysis=False,save_db_to_disk_filename=None):
+ warnings = []
+ error = None
+ table_structures = []
+
+ db_results_obj = None
+
+ effective_input_params = self.default_input_params.merged_with(input_params)
+
+ if type(query_str) != str:
+ try:
+ # Heuristic attempt to auto convert the query to unicode before failing
+ query_str = query_str.decode('utf-8')
+            except Exception:
+                error = QError(EncodedQueryException(''),"Query should be in unicode. Please make sure to provide a unicode literal string or decode it using the proper character encoding.",91)
+ return QOutput(error = error)
+
+
+ try:
+ # Create SQL statement
+ sql_object = Sql('%s' % query_str, self.data_streams)
+
+ load_start_time = time.time()
+ iprint("Going to ensure data is loaded. Currently loaded tables: %s" % str(self.loaded_table_structures_dict))
+ new_table_structures = self._ensure_data_is_loaded_for_sql(sql_object,effective_input_params,data_streams,stop_after_analysis=stop_after_analysis)
+ iprint("Ensured data is loaded. loaded tables: %s" % self.loaded_table_structures_dict)
+
+ self.validate_query(sql_object,self.loaded_table_structures_dict)
+
+ iprint("Query validated")
+
+ sql_object.materialize_using(self.loaded_table_structures_dict)
+
+ iprint("Materialized sql object")
+
+ if save_db_to_disk_filename is not None:
+ xprint("Saving query data to disk")
+ dump_start_time = time.time()
+ table_name_mapping = self.materialize_query_level_db(save_db_to_disk_filename,sql_object)
+ print("Data has been saved into %s . Saving has taken %4.3f seconds" % (save_db_to_disk_filename,time.time()-dump_start_time), file=sys.stderr)
+ effective_sql = sql_object.get_effective_sql(table_name_mapping)
+ print("Query to run on the database: %s;" % effective_sql, file=sys.stderr)
+ command_line = 'echo "%s" | sqlite3 %s' % (effective_sql,save_db_to_disk_filename)
+ print("You can run the query directly from the command line using the following command: %s" % command_line, file=sys.stderr)
+
+ # TODO Propagate dump results using a different output class instead of an empty one
+ return QOutput()
+
+ # Ensure that adhoc db is not in the middle of a transaction
+ self.adhoc_db.conn.commit()
+
+ all_databases = self.query_level_db.get_sqlite_database_list()
+ xprint("Query level db: databases %s" % all_databases)
+
+ # Execute the query and fetch the data
+ db_results_obj = sql_object.execute_and_fetch(self.query_level_db)
+ iprint("Query executed")
+
+ if len(db_results_obj.results) == 0:
+ warnings.append(QWarning(None, "Warning - data is empty"))
+
+ return QOutput(
+ data = db_results_obj.results,
+ metadata = QMetadata(
+ table_structures=self.loaded_table_structures_dict,
+ new_table_structures=new_table_structures,
+ output_column_name_list=db_results_obj.query_column_names),
+ warnings = warnings,
+ error = error)
+ except InvalidQueryException as e:
+ error = QError(e,str(e),118)
+ except MissingHeaderException as e:
+ error = QError(e,e.msg,117)
+ except FileNotFoundException as e:
+ error = QError(e,e.msg,30)
+ except SqliteOperationalErrorException as e:
+ xprint("Sqlite Operational error: %s" % e)
+ msg = str(e.original_error)
+ error = QError(e,"query error: %s" % msg,1)
+ if "no such column" in msg and effective_input_params.skip_header:
+                warnings.append(QWarning(e,'Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names. Another issue might be that the file contains a BOM. Files that are encoded with UTF8 and contain a BOM can be read by specifying `-e utf-8-sig` in the command line. Support for non-UTF8 encoding will be provided in the future.'))
+ except ColumnCountMismatchException as e:
+ error = QError(e,e.msg,2)
+ except (UnicodeDecodeError, UnicodeError) as e:
+ error = QError(e,"Cannot decode data. Try to change the encoding by setting it using the -e parameter. Error:%s" % e,3)
+ except BadHeaderException as e:
+ error = QError(e,"Bad header row: %s" % e.msg,35)
+ except CannotUnzipDataStreamException as e:
+ error = QError(e,"Cannot decompress standard input. Pipe the input through zcat in order to decompress.",36)
+ except UniversalNewlinesExistException as e:
+ error = QError(e,"Data contains universal newlines. Run q with -U to use universal newlines. Please note that q still doesn't support universal newlines for .gz files or for stdin. Route the data through a regular file to use -U.",103)
+ # deprecated, but shouldn't be used: error = QError(e,"Standard Input must be provided in order to use it as a table",61)
+ except CouldNotConvertStringToNumericValueException as e:
+ error = QError(e,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg,58)
+ except CouldNotParseInputException as e:
+ error = QError(e,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg,59)
+ except ColumnMaxLengthLimitExceededException as e:
+ error = QError(e,e.msg,31)
+ # deprecated, but shouldn't be used: error = QError(e,e.msg,79)
+ except ContentSignatureDiffersException as e:
+ error = QError(e,"%s vs %s: Content Signatures for table %s differ at %s (source value '%s' disk signature value '%s')" %
+ (e.original_filename,e.other_filename,e.filenames_str,e.key,e.source_value,e.signature_value),80)
+ except ContentSignatureDataDiffersException as e:
+ error = QError(e,e.msg,81)
+ except MaximumSourceFilesExceededException as e:
+ error = QError(e,e.msg,82)
+ except ContentSignatureNotFoundException as e:
+ error = QError(e,e.msg,83)
+ except NonExistentTableNameInQsql as e:
+ msg = "Table %s could not be found in qsql file %s . Existing table names: %s" % (e.table_name,e.qsql_filename,",".join(e.existing_table_names))
+ error = QError(e,msg,84)
+ except NonExistentTableNameInSqlite as e:
+ msg = "Table %s could not be found in sqlite file %s . Existing table names: %s" % (e.table_name,e.qsql_filename,",".join(e.existing_table_names))
+ error = QError(e,msg,85)
+ except TooManyTablesInQsqlException as e:
+ msg = "Could not autodetect table name in qsql file. Existing Tables %s" % ",".join(e.existing_table_names)
+ error = QError(e,msg,86)
+ except NoTableInQsqlExcption as e:
+ msg = "Could not autodetect table name in qsql file. File contains no record of a table"
+ error = QError(e,msg,97)
+ except TooManyTablesInSqliteException as e:
+ msg = "Could not autodetect table name in sqlite file %s . Existing tables: %s" % (e.qsql_filename,",".join(e.existing_table_names))
+ error = QError(e,msg,87)
+ except NoTablesInSqliteException as e:
+ msg = "sqlite file %s has no tables" % e.sqlite_filename
+ error = QError(e,msg,88)
+ except TooManyAttachedDatabasesException as e:
+ msg = str(e)
+ error = QError(e,msg,89)
+ except UnknownFileTypeException as e:
+ msg = str(e)
+ error = QError(e,msg,95)
+ except KeyboardInterrupt as e:
+ warnings.append(QWarning(e,"Interrupted"))
+ except Exception as e:
+ if DEBUG:
+ xprint(traceback.format_exc())
+ error = QError(e,repr(e),199)
+
+ return QOutput(data=None,warnings = warnings,error = error , metadata=QMetadata(table_structures=self.loaded_table_structures_dict,new_table_structures=self.loaded_table_structures_dict,output_column_name_list=[]))
+
+ def execute(self,query_str,input_params=None,save_db_to_disk_filename=None):
+ r = self._execute(query_str,input_params,stop_after_analysis=False,save_db_to_disk_filename=save_db_to_disk_filename)
+ return r
+
+ def unload(self):
+ # TODO This would fail, since table structures are just value objects now. Will be fixed as part of making q a full python module
+ for qtable_name,table_creator in self.loaded_table_structures_dict.items():
+ table_creator.close_file()
+ self.loaded_table_structures_dict = OrderedDict()
+
+ def analyze(self,query_str,input_params=None,data_streams=None):
+ q_output = self._execute(query_str,input_params,data_streams=data_streams,stop_after_analysis=True)
+
+ return q_output
+
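+# A minimal end-to-end usage sketch for the engine above (illustrative only - it
+# assumes a delimited file named 'example.csv' exists in the working directory):
+#
+#   q_engine = QTextAsData(QInputParams(delimiter=','))
+#   q_output = q_engine.execute('select c1,count(*) from example.csv group by c1')
+#   printer = QOutputPrinter(QOutputParams(delimiter=','))
+#   printer.print_output(sys.stdout, sys.stderr, q_output)
+#   q_engine.done()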
+
+class QOutputParams(object):
+ def __init__(self,
+ delimiter=' ',
+ beautify=False,
+ output_quoting_mode='minimal',
+ formatting=None,
+ output_header=False,
+ encoding=None):
+ self.delimiter = delimiter
+ self.beautify = beautify
+ self.output_quoting_mode = output_quoting_mode
+ self.formatting = formatting
+ self.output_header = output_header
+ self.encoding = encoding
+
+ def __str__(self):
+ return "QOutputParams<%s>" % str(self.__dict__)
+
+ def __repr__(self):
+ return "QOutputParams(...)"
+
+
+class QOutputPrinter(object):
+ output_quoting_modes = { 'minimal' : quote_minimal_func,
+ 'all' : quote_all_func,
+ 'nonnumeric' : quote_nonnumeric_func,
+ 'none' : quote_none_func }
+
+ def __init__(self,output_params,show_tracebacks=False):
+ self.output_params = output_params
+ self.show_tracebacks = show_tracebacks
+
+ self.output_field_quoting_func = QOutputPrinter.output_quoting_modes[output_params.output_quoting_mode]
+
+ def print_errors_and_warnings(self,f,results):
+ if results.status == 'error':
+ error = results.error
+ print(error.msg, file=f)
+ if self.show_tracebacks:
+ print(error.traceback, file=f)
+
+ for warning in results.warnings:
+ print("%s" % warning.msg, file=f)
+
+ def print_analysis(self,f_out,f_err,results):
+ self.print_errors_and_warnings(f_err,results)
+
+ if results.metadata is None:
+ return
+
+ if results.metadata.table_structures is None:
+ return
+
+ for qtable_name in results.metadata.table_structures:
+ table_structures = results.metadata.table_structures[qtable_name]
+ print("Table: %s" % qtable_name,file=f_out)
+ print(" Sources:",file=f_out)
+ dl = results.metadata.new_table_structures[qtable_name]
+ print(" source_type: %s source: %s" % (dl.source_type,dl.source),file=f_out)
+ print(" Fields:",file=f_out)
+ for n,t in zip(table_structures.column_names,table_structures.sqlite_column_types):
+ print(" `%s` - %s" % (n,t), file=f_out)
+
+ def print_output(self,f_out,f_err,results):
+ try:
+ self._print_output(f_out,f_err,results)
+ except (UnicodeEncodeError, UnicodeError) as e:
+ print("Cannot encode data. Error:%s" % e, file=f_err)
+ sys.exit(3)
+ except IOError as e:
+ if e.errno == 32:
+ # broken pipe, that's ok
+ pass
+ else:
+ # don't miss other problems for now
+ raise
+ except KeyboardInterrupt:
+ pass
+
+ def _print_output(self,f_out,f_err,results):
+ self.print_errors_and_warnings(f_err,results)
+
+ data = results.data
+
+ if data is None:
+ return
+
+ # If the user requested beautifying the output
+ if self.output_params.beautify:
+ if self.output_params.output_header:
+ data_with_possible_headers = data + [tuple(results.metadata.output_column_name_list)]
+ else:
+ data_with_possible_headers = data
+ max_lengths = determine_max_col_lengths(data_with_possible_headers,self.output_field_quoting_func,self.output_params.delimiter)
+
+ if self.output_params.formatting:
+ formatting_dict = dict(
+ [(x.split("=")[0], x.split("=")[1]) for x in self.output_params.formatting.split(",")])
+ else:
+ formatting_dict = {}
+
+ try:
+ if self.output_params.output_header and results.metadata.output_column_name_list is not None:
+ data.insert(0,results.metadata.output_column_name_list)
+ for rownum, row in enumerate(data):
+ row_str = []
+ skip_formatting = rownum == 0 and self.output_params.output_header
+ for i, col in enumerate(row):
+ if str(i + 1) in formatting_dict.keys() and not skip_formatting:
+ fmt_str = formatting_dict[str(i + 1)]
+ else:
+ if self.output_params.beautify:
+ fmt_str = str("{{0:<{}}}").format(max_lengths[i])
+ else:
+ fmt_str = str("{}")
+
+                    if col is not None:
+                        quoted_value = self.output_field_quoting_func(self.output_params.delimiter,col)
+                        row_str.append(fmt_str.format(quoted_value))
+                    else:
+                        row_str.append(fmt_str.format(""))
+
+                row_output = str(self.output_params.delimiter).join(row_str) + "\n"
+                f_out.write(row_output)
+ except (UnicodeEncodeError, UnicodeError) as e:
+ print("Cannot encode data. Error:%s" % e, file=sys.stderr)
+ sys.exit(3)
+ except TypeError as e:
+ print("Error while formatting output: %s" % e, file=sys.stderr)
+ sys.exit(4)
+ except IOError as e:
+ if e.errno == 32:
+ # broken pipe, that's ok
+ pass
+ else:
+                # don't miss other problems for now
+ raise
+ except KeyboardInterrupt:
+ pass
+
+ try:
+ # Prevent python bug when order of pipe shutdowns is reversed
+ f_out.flush()
+ except IOError as e:
+ pass
diff --git a/qtextasdata/csv_reader.py b/qtextasdata/csv_reader.py
new file mode 100644
index 00000000..822407a2
--- /dev/null
+++ b/qtextasdata/csv_reader.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+import csv
+from qtextasdata.logging import xprint
+from qtextasdata.exceptions import (
+ CouldNotParseInputException,
+ CouldNotConvertStringToNumericValueException,
+ ColumnMaxLengthLimitExceededException,
+ UniversalNewlinesExistException
+)
+
+def py3_encoded_csv_reader(encoding, f, dialect,row_data_only=False,**kwargs):
+ try:
+ xprint("f is %s" % str(f))
+ xprint("dialect is %s" % dialect)
+ csv_reader = csv.reader(f, dialect, **kwargs)
+
+ if row_data_only:
+ for row in csv_reader:
+ yield row
+ else:
+ for row in csv_reader:
+ yield (f.filename(),f.isfirstline(),row)
+
+ except UnicodeDecodeError as e1:
+ raise CouldNotParseInputException(e1)
+ except ValueError as e:
+ # TODO Add test for this
+        if str(e).startswith('could not convert string to'):
+ raise CouldNotConvertStringToNumericValueException(str(e))
+ else:
+ raise CouldNotParseInputException(str(e))
+ except Exception as e:
+ if str(e).startswith("field larger than field limit"):
+ raise ColumnMaxLengthLimitExceededException(str(e))
+ elif 'universal-newline' in str(e):
+ raise UniversalNewlinesExistException()
+ else:
+ raise
+
+encoded_csv_reader = py3_encoded_csv_reader
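+
+# A minimal usage sketch with row_data_only=True, where any iterable of text lines
+# is accepted (with row_data_only=False the reader expects a fileinput-style object
+# that provides filename() and isfirstline()). Note that the encoding argument is
+# kept for API compatibility and is not used by this py3 implementation:
+#
+#   import io
+#   csv.register_dialect('sketch_dialect', delimiter=',', quotechar='"')
+#   for row in encoded_csv_reader('utf-8', io.StringIO('a,b\n1,2\n'),
+#                                 'sketch_dialect', row_data_only=True):
+#       print(row)  # ['a', 'b'], then ['1', '2']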
diff --git a/qtextasdata/exceptions.py b/qtextasdata/exceptions.py
new file mode 100644
index 00000000..d1a4ef0d
--- /dev/null
+++ b/qtextasdata/exceptions.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+
+class CouldNotConvertStringToNumericValueException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+    def __str__(self):
+ return repr(self.msg)
+
+class SqliteOperationalErrorException(Exception):
+
+ def __init__(self, msg,original_error):
+ self.msg = msg
+ self.original_error = original_error
+
+    def __str__(self):
+ return repr(self.msg) + "//" + repr(self.original_error)
+
+class IncorrectDefaultValueException(Exception):
+
+ def __init__(self, option_type,option,actual_value):
+ self.option_type = option_type
+ self.option = option
+ self.actual_value = actual_value
+
+    def __str__(self):
+        return "Incorrect default value for option %s of type %s: %s" % (self.option,self.option_type,self.actual_value)
+
+class NonExistentTableNameInQsql(Exception):
+
+ def __init__(self, qsql_filename,table_name,existing_table_names):
+ self.qsql_filename = qsql_filename
+ self.table_name = table_name
+ self.existing_table_names = existing_table_names
+
+class NonExistentTableNameInSqlite(Exception):
+
+ def __init__(self, qsql_filename,table_name,existing_table_names):
+ self.qsql_filename = qsql_filename
+ self.table_name = table_name
+ self.existing_table_names = existing_table_names
+
+class TooManyTablesInQsqlException(Exception):
+
+ def __init__(self, qsql_filename,existing_table_names):
+ self.qsql_filename = qsql_filename
+ self.existing_table_names = existing_table_names
+
+class NoTableInQsqlExcption(Exception):
+
+ def __init__(self, qsql_filename):
+ self.qsql_filename = qsql_filename
+
+class TooManyTablesInSqliteException(Exception):
+
+ def __init__(self, qsql_filename,existing_table_names):
+ self.qsql_filename = qsql_filename
+ self.existing_table_names = existing_table_names
+
+class NoTablesInSqliteException(Exception):
+
+ def __init__(self, sqlite_filename):
+ self.sqlite_filename = sqlite_filename
+
+class ColumnMaxLengthLimitExceededException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+    def __str__(self):
+ return repr(self.msg)
+
+class CouldNotParseInputException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+    def __str__(self):
+ return repr(self.msg)
+
+class BadHeaderException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+    def __str__(self):
+ return repr(self.msg)
+
+class EncodedQueryException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+    def __str__(self):
+ return repr(self.msg)
+
+
+class CannotUnzipDataStreamException(Exception):
+
+ def __init__(self):
+ pass
+
+class UniversalNewlinesExistException(Exception):
+
+ def __init__(self):
+ pass
+
+class EmptyDataException(Exception):
+
+ def __init__(self):
+ pass
+
+class MissingHeaderException(Exception):
+
+ def __init__(self,msg):
+ self.msg = msg
+
+class InvalidQueryException(Exception):
+
+ def __init__(self,msg):
+ self.msg = msg
+
+class TooManyAttachedDatabasesException(Exception):
+
+ def __init__(self,msg):
+ self.msg = msg
+
+class FileNotFoundException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+    def __str__(self):
+ return repr(self.msg)
+
+class UnknownFileTypeException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+    def __str__(self):
+ return repr(self.msg)
+
+
+class ColumnCountMismatchException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+class ContentSignatureNotFoundException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+class StrictModeColumnCountMismatchException(Exception):
+
+ def __init__(self,atomic_fn, expected_col_count,actual_col_count,lines_read):
+ self.atomic_fn = atomic_fn
+ self.expected_col_count = expected_col_count
+ self.actual_col_count = actual_col_count
+ self.lines_read = lines_read
+
+class FluffyModeColumnCountMismatchException(Exception):
+
+ def __init__(self,atomic_fn, expected_col_count,actual_col_count,lines_read):
+ self.atomic_fn = atomic_fn
+ self.expected_col_count = expected_col_count
+ self.actual_col_count = actual_col_count
+ self.lines_read = lines_read
+
+class ContentSignatureDiffersException(Exception):
+
+ def __init__(self,original_filename, other_filename, filenames_str,key,source_value,signature_value):
+ self.original_filename = original_filename
+ self.other_filename = other_filename
+ self.filenames_str = filenames_str
+ self.key = key
+ self.source_value = source_value
+ self.signature_value = signature_value
+
+
+class ContentSignatureDataDiffersException(Exception):
+
+ def __init__(self,msg):
+ self.msg = msg
+
+
+class InvalidQSqliteFileException(Exception):
+
+ def __init__(self,msg):
+ self.msg = msg
+
+
+class MaximumSourceFilesExceededException(Exception):
+
+ def __init__(self,msg):
+ self.msg = msg
+
diff --git a/qtextasdata/logging.py b/qtextasdata/logging.py
new file mode 100644
index 00000000..3c332b40
--- /dev/null
+++ b/qtextasdata/logging.py
@@ -0,0 +1,25 @@
+import os
+import sys
+import datetime
+
+
+DEBUG = bool(os.environ.get('Q_DEBUG', None)) or '-V' in sys.argv
+SQL_DEBUG = False
+
+if DEBUG:
+ def xprint(*args,**kwargs):
+ print(datetime.datetime.now(datetime.timezone.utc).isoformat()," DEBUG ",*args,file=sys.stderr,**kwargs)
+
+ def iprint(*args,**kwargs):
+ print(datetime.datetime.now(datetime.timezone.utc).isoformat()," INFO ",*args,file=sys.stderr,**kwargs)
+
+ def sqlprint(*args,**kwargs):
+ pass
+else:
+ def xprint(*args,**kwargs): pass
+ def iprint(*args,**kwargs): pass
+ def sqlprint(*args,**kwargs): pass
+
+if SQL_DEBUG:
+ def sqlprint(*args,**kwargs):
+ print(datetime.datetime.now(datetime.timezone.utc).isoformat(), " SQL ", *args, file=sys.stderr, **kwargs)
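+
+# Example: setting Q_DEBUG (or passing -V on the command line) turns xprint and
+# iprint into timestamped stderr writers, e.g.
+#
+#   Q_DEBUG=1 q "select 1"
+#
+# emits lines like "<ISO timestamp>  DEBUG  ..." on stderr, while in the default
+# case all three functions are no-ops. SQL_DEBUG is a developer-only switch that
+# has to be flipped in the source itself.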
diff --git a/qtextasdata/sql.py b/qtextasdata/sql.py
new file mode 100644
index 00000000..804c1e39
--- /dev/null
+++ b/qtextasdata/sql.py
@@ -0,0 +1,1828 @@
+#!/usr/bin/env python
+
+from collections import OrderedDict
+import datetime
+import glob
+import sqlite3
+from qtextasdata.csv_reader import encoded_csv_reader
+from qtextasdata.exceptions import BadHeaderException, CannotUnzipDataStreamException, ColumnCountMismatchException, ColumnMaxLengthLimitExceededException, ContentSignatureDataDiffersException, ContentSignatureDiffersException, FileNotFoundException, FluffyModeColumnCountMismatchException, InvalidQueryException, MaximumSourceFilesExceededException, MissingHeaderException, NoTableInQsqlExcption, NoTablesInSqliteException, NonExistentTableNameInQsql, NonExistentTableNameInSqlite, StrictModeColumnCountMismatchException, TooManyTablesInQsqlException, TooManyTablesInSqliteException, UniversalNewlinesExistException, UnknownFileTypeException
+from qtextasdata.utilities import ( sqlite_type_to_python_type, user_functions, md5 )
+from qtextasdata.logging import xprint,sqlprint
+import json
+import hashlib
+from uuid import uuid4
+import io
+import time
+from qtextasdata.exceptions import (
+ SqliteOperationalErrorException
+)
+import sys
+import os
+
+SHOW_SQL = False
+
+class DatabaseInfo(object):
+ def __init__(self,db_id,sqlite_db,needs_closing):
+ self.db_id = db_id
+ self.sqlite_db = sqlite_db
+ self.needs_closing = needs_closing
+
+ def __str__(self):
+        return "DatabaseInfo<sqlite_db=%s,needs_closing=%s>" % (self.sqlite_db,self.needs_closing)
+ __repr__ = __str__
+
+class Sqlite3DBResults(object):
+ def __init__(self,query_column_names,results):
+ self.query_column_names = query_column_names
+ self.results = results
+
+ def __str__(self):
+        return "Sqlite3DBResults<result_count=%s,query_column_names=%s>" % (len(self.results),str(self.query_column_names))
+ __repr__ = __str__
+
+class Sqlite3DB(object):
+ # TODO Add metadata table with qsql file version
+
+ QCATALOG_TABLE_NAME = '_qcatalog'
+ NUMERIC_COLUMN_TYPES = {int, float}
+ PYTHON_TO_SQLITE_TYPE_NAMES = { str: 'TEXT', int: 'INT', float: 'REAL', None: 'TEXT' }
+
+
+ def __str__(self):
+        return "Sqlite3DB<url=%s>" % self.sqlite_db_url
+ __repr__ = __str__
+
+ def __init__(self, db_id, sqlite_db_url, sqlite_db_filename, create_qcatalog, show_sql=SHOW_SQL):
+ self.show_sql = show_sql
+ self.create_qcatalog = create_qcatalog
+
+ self.db_id = db_id
+ # TODO Is this needed anymore?
+ self.sqlite_db_filename = sqlite_db_filename
+ self.sqlite_db_url = sqlite_db_url
+ self.conn = sqlite3.connect(self.sqlite_db_url, uri=True)
+ self.last_temp_table_id = 10000
+ self.cursor = self.conn.cursor()
+ self.add_user_functions()
+
+ if create_qcatalog:
+ self.create_qcatalog_table()
+ else:
+ xprint('Not creating qcatalog for db_id %s' % db_id)
+
+ def retrieve_all_table_names(self):
+ return [x[0] for x in self.execute_and_fetch("select tbl_name from sqlite_master where type='table'").results]
+
+ def get_sqlite_table_info(self,table_name):
+ return self.execute_and_fetch('PRAGMA table_info(%s)' % table_name).results
+
+ def get_sqlite_database_list(self):
+ return self.execute_and_fetch('pragma database_list').results
+
+ def find_new_table_name(self,planned_table_name):
+ existing_table_names = self.retrieve_all_table_names()
+
+ possible_indices = range(1,1000)
+
+ for index in possible_indices:
+ if index == 1:
+ suffix = ''
+ else:
+ suffix = '_%s' % index
+
+ table_name_attempt = '%s%s' % (planned_table_name,suffix)
+
+ if table_name_attempt not in existing_table_names:
+ xprint("Found free table name %s in db %s for planned table name %s" % (table_name_attempt,self.db_id,planned_table_name))
+ return table_name_attempt
+
+ # TODO Add test for this
+ raise Exception('Cannot find free table name in db %s for planned table name %s' % (self.db_id,planned_table_name))
+
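+    # Naming scheme example: for a planned table name 'mytable', the candidates
+    # tried above are 'mytable', 'mytable_2', 'mytable_3', ... and the first name
+    # not already present in this database wins.
+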
+ def create_qcatalog_table(self):
+ if not self.qcatalog_table_exists():
+ xprint("qcatalog table does not exist. Creating it")
+ r = self.conn.execute("""CREATE TABLE %s (
+ qcatalog_entry_id text not null primary key,
+ content_signature_key text,
+ temp_table_name text,
+ content_signature text,
+ creation_time text,
+ source_type text,
+ source text)""" % self.QCATALOG_TABLE_NAME).fetchall()
+ else:
+ xprint("qcatalog table already exists. No need to create it")
+
+ def qcatalog_table_exists(self):
+ return sqlite_table_exists(self.conn,self.QCATALOG_TABLE_NAME)
+
+ def calculate_content_signature_key(self,content_signature):
+ assert type(content_signature) == OrderedDict
+ pp = json.dumps(content_signature,sort_keys=True)
+ xprint("Calculating content signature for:",pp)
+ return hashlib.sha1(pp.encode('utf-8')).hexdigest()
+
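+    # The key is the sha1 hexdigest of the canonical JSON serialization
+    # (sort_keys=True), so two content signatures with identical keys and values
+    # map to the same key regardless of insertion order - which is what lets
+    # get_from_qcatalog() below look tables up by content.
+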
+ def add_to_qcatalog_table(self, temp_table_name, content_signature, creation_time,source_type, source):
+ assert source is not None
+ assert source_type is not None
+ content_signature_key = self.calculate_content_signature_key(content_signature)
+ xprint("db_id: %s Adding to qcatalog table: %s. Calculated signature key %s" % (self.db_id, temp_table_name,content_signature_key))
+ r = self.execute_and_fetch(
+ 'INSERT INTO %s (qcatalog_entry_id,content_signature_key, temp_table_name,content_signature,creation_time,source_type,source) VALUES (?,?,?,?,?,?,?)' % self.QCATALOG_TABLE_NAME,
+ (str(uuid4()),content_signature_key,temp_table_name,json.dumps(content_signature),creation_time,source_type,source))
+ # Ensure transaction is completed
+ self.conn.commit()
+
+ def get_from_qcatalog(self, content_signature):
+ content_signature_key = self.calculate_content_signature_key(content_signature)
+ xprint("Finding table in db_id %s that matches content signature key %s" % (self.db_id,content_signature_key))
+
+ field_names = ["content_signature_key", "temp_table_name", "content_signature", "creation_time","source_type","source","qcatalog_entry_id"]
+
+ q = "SELECT %s FROM %s where content_signature_key = ?" % (",".join(field_names),self.QCATALOG_TABLE_NAME)
+ r = self.execute_and_fetch(q,(content_signature_key,))
+
+ if r is None:
+ return None
+
+ if len(r.results) == 0:
+ return None
+
+ if len(r.results) > 1:
+ raise Exception("Bug - Exactly one result should have been provided: %s" % str(r.results))
+
+ d = dict(zip(field_names,r.results[0]))
+ return d
+
+ def get_from_qcatalog_using_table_name(self, temp_table_name):
+ xprint("getting from qcatalog using table name")
+
+ field_names = ["content_signature", "temp_table_name","creation_time","source_type","source","content_signature_key","qcatalog_entry_id"]
+
+ q = "SELECT %s FROM %s where temp_table_name = ?" % (",".join(field_names),self.QCATALOG_TABLE_NAME)
+ xprint("Query from qcatalog %s params %s" % (q,str(temp_table_name,)))
+ r = self.execute_and_fetch(q,(temp_table_name,))
+ xprint("results: ",r.results)
+
+ if r is None:
+ return None
+
+ if len(r.results) == 0:
+ return None
+
+ if len(r.results) > 1:
+ raise Exception("Bug - Exactly one result should have been provided: %s" % str(r.results))
+
+ d = dict(zip(field_names,r.results[0]))
+ # content_signature should be the first in the list of field_names
+ cs = OrderedDict(json.loads(r.results[0][0]))
+ if self.calculate_content_signature_key(cs) != d['content_signature_key']:
+ raise Exception('Table contains an invalid entry - content signature key is not matching the actual content signature')
+ return d
+
+ def get_all_from_qcatalog(self):
+ xprint("getting from qcatalog using table name")
+
+ field_names = ["temp_table_name", "content_signature", "creation_time","source_type","source","qcatalog_entry_id"]
+
+ q = "SELECT %s FROM %s" % (",".join(field_names),self.QCATALOG_TABLE_NAME)
+ xprint("Query from qcatalog %s" % q)
+ r = self.execute_and_fetch(q)
+
+ if r is None:
+ return None
+
+ def convert(res):
+ d = dict(zip(field_names, res))
+ cs = OrderedDict(json.loads(res[1]))
+ d['content_signature_key'] = self.calculate_content_signature_key(cs)
+ return d
+
+ rr = [convert(r) for r in r.results]
+
+ return rr
+
+ def done(self):
+ xprint("Closing database %s" % self.db_id)
+ try:
+ self.conn.commit()
+ self.conn.close()
+ xprint("Database %s closed" % self.db_id)
+ except Exception as e:
+ xprint("Could not close database %s" % self.db_id)
+ raise
+
+ def add_user_functions(self):
+ for udf in user_functions:
+ if type(udf.func_or_obj) == type(object):
+ # a class is registered as an aggregate function
+ self.conn.create_aggregate(udf.name,udf.param_count,udf.func_or_obj)
+ elif type(udf.func_or_obj) == type(md5):
+ # a plain python function is registered as a scalar function
+ self.conn.create_function(udf.name,udf.param_count,udf.func_or_obj)
+ else:
+ raise Exception("Invalid user function definition %s" % str(udf))
+
+ def is_numeric_type(self, column_type):
+ return column_type in Sqlite3DB.NUMERIC_COLUMN_TYPES
+
+ def update_many(self, sql, params):
+ try:
+ sqlprint(sql, " params: " + str(params))
+ self.cursor.executemany(sql, params)
+ _ = self.cursor.fetchall()
+ finally:
+ pass # cursor.close()
+
+ def execute_and_fetch(self, q,params = None):
+ try:
+ try:
+ if self.show_sql:
+ print(repr(q))
+ if params is None:
+ r = self.cursor.execute(q)
+ else:
+ r = self.cursor.execute(q,params)
+ if self.cursor.description is not None:
+ # we decode the column names, so they can be encoded to any output format later on
+ query_column_names = [c[0] for c in self.cursor.description]
+ else:
+ query_column_names = None
+ result = self.cursor.fetchall()
+ finally:
+ pass # cursor.close()
+ except sqlite3.OperationalError as e:
+ raise SqliteOperationalErrorException("Failed executing sqlite query %s with params %s . error: %s" % (q,params,str(e)),e)
+ return Sqlite3DBResults(query_column_names,result)
+
+ def _get_as_list_str(self, l):
+ return ",".join(['"%s"' % x.replace('"', '""') for x in l])
+
+ def generate_insert_row(self, table_name, column_names):
+ col_names_str = self._get_as_list_str(column_names)
+ question_marks = ", ".join(["?" for i in range(0, len(column_names))])
+ return 'INSERT INTO %s (%s) VALUES (%s)' % (table_name, col_names_str, question_marks)
+
+ # Get a list of column names so that the original column order is preserved
+ def generate_create_table(self, table_name, column_names, column_dict):
+ # Convert dict from python types to db types
+ column_name_to_db_type = dict(
+ (n, Sqlite3DB.PYTHON_TO_SQLITE_TYPE_NAMES[t]) for n, t in column_dict.items())
+ column_defs = ','.join(['"%s" %s' % (
+ n.replace('"', '""'), column_name_to_db_type[n]) for n in column_names])
+ return 'CREATE TABLE %s (%s)' % (table_name, column_defs)
+
+ def generate_temp_table_name(self):
+ # WTF - From my own past mutable-self
+ self.last_temp_table_id += 1
+ tn = "temp_table_%s" % self.last_temp_table_id
+ return tn
+
+ def generate_drop_table(self, table_name):
+ return "DROP TABLE %s" % table_name
+
+ def drop_table(self, table_name):
+ return self.execute_and_fetch(self.generate_drop_table(table_name))
+
+ def attach_and_copy_table(self, from_db, relevant_table,stop_after_analysis):
+ xprint("Attaching %s into db %s and copying table %s into it" % (from_db,self,relevant_table))
+ temp_db_id = 'temp_db_id'
+ q = "attach '%s' as %s" % (from_db.sqlite_db_url,temp_db_id)
+ xprint("Attach query: %s" % q)
+ c = self.execute_and_fetch(q)
+
+ new_temp_table_name = 'temp_table_%s' % (self.last_temp_table_id + 1)
+ fully_qualified_table_name = '%s.%s' % (temp_db_id,relevant_table)
+
+ if stop_after_analysis:
+ limit = ' limit 100'
+ else:
+ limit = ''
+
+ copy_query = 'create table %s as select * from %s %s' % (new_temp_table_name,fully_qualified_table_name,limit)
+ copy_results = self.execute_and_fetch(copy_query)
+ xprint("Copied %s.%s into %s in db_id %s. Results %s" % (temp_db_id,relevant_table,new_temp_table_name,self.db_id,copy_results))
+ self.last_temp_table_id += 1
+
+ xprint("Copied table into %s. Detaching db that was attached temporarily" % self.db_id)
+
+ q = "detach database %s" % temp_db_id
+ xprint("detach query: %s" % q)
+ c = self.execute_and_fetch(q)
+ xprint(c)
+ return new_temp_table_name
+
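+# --- Illustrative sketch (editor's addition, not part of q's API) ---
+# The qcatalog above keys each cached table by the sha1 of the canonical
+# (sort_keys=True) JSON dump of its content signature, mirroring
+# Sqlite3DB.calculate_content_signature_key. A standalone reproduction:
+def _example_qcatalog_signature_key():
+    import hashlib, json
+    from collections import OrderedDict
+    signature = OrderedDict({"skip_header": False, "input_delimiter": ","})
+    canonical = json.dumps(signature, sort_keys=True)
+    # equal content yields equal keys, regardless of dict insertion order
+    return hashlib.sha1(canonical.encode('utf-8')).hexdigest()
+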
+# Simplistic Sql "parsing" class... We'll eventually require a real SQL parser which will provide us with a parse tree
+#
+# A "qtable" is a filename which behaves like an SQL table...
+class Sql(object):
+
+ def __init__(self, sql, data_streams):
+ # Currently supports only standard SELECT statements
+
+ # Holds original SQL
+ self.sql = sql
+ # Holds sql parts
+ self.sql_parts = sql.split()
+ self.data_streams = data_streams
+
+ self.qtable_metadata_dict = OrderedDict()
+
+ # Set of qtable names
+ self.qtable_names = []
+ # Dict from qtable names to their positions in sql_parts. Value here is a *list* of positions,
+ # since it is possible that the same qtable_name (file) is referenced in multiple positions
+ # and we don't want the database table to be recreated for each
+ # reference
+ self.qtable_name_positions = {}
+ # Dict from qtable names to their effective (actual database) table
+ # names
+ self.qtable_name_effective_table_names = {}
+
+ self.query_column_names = None
+
+ # Go over all sql parts
+ idx = 0
+ while idx < len(self.sql_parts):
+ # Get the part string
+ part = self.sql_parts[idx]
+ # If it's a FROM or a JOIN
+ if part.upper() in ['FROM', 'JOIN']:
+ # and there is nothing after it,
+ if idx == len(self.sql_parts) - 1:
+ # Just fail
+ raise InvalidQueryException(
+ 'FROM/JOIN is missing a table name after it')
+
+ qtable_name = self.sql_parts[idx + 1]
+ # Otherwise, the next part contains the qtable name. In most cases the next part will be only the qtable name.
+ # We handle one special case here, where this is a subquery as a column: "SELECT (SELECT ... FROM qtable),100 FROM ...".
+ # In that case, there will be an ending parenthesis as part of the name, and we want to handle this case gracefully.
+ # This is obviously a hack of a hack :) Just until we have
+ # complete parsing capabilities
+ if ')' in qtable_name:
+ leftover = qtable_name[qtable_name.index(')'):]
+ self.sql_parts.insert(idx + 2, leftover)
+ qtable_name = qtable_name[:qtable_name.index(')')]
+ self.sql_parts[idx + 1] = qtable_name
+
+ if qtable_name[0] != '(':
+ normalized_qtable_name = self.normalize_qtable_name(qtable_name)
+ xprint("Normalized qtable name for %s is %s" % (qtable_name,normalized_qtable_name))
+ self.qtable_names += [normalized_qtable_name]
+
+ if normalized_qtable_name not in self.qtable_name_positions.keys():
+ self.qtable_name_positions[normalized_qtable_name] = []
+
+ self.qtable_name_positions[normalized_qtable_name].append(idx + 1)
+ self.sql_parts[idx + 1] = normalized_qtable_name
+ idx += 2
+ else:
+ idx += 1
+ else:
+ idx += 1
+ xprint("Final sql parts: %s" % self.sql_parts)
+
+ def normalize_qtable_name(self,qtable_name):
+ if self.data_streams.is_data_stream(qtable_name):
+ return qtable_name
+
+ if ':::' in qtable_name:
+ qsql_filename, table_name = qtable_name.split(":::", 1)
+ return '%s:::%s' % (os.path.realpath(os.path.abspath(qsql_filename)),table_name)
+ else:
+ return os.path.realpath(os.path.abspath(qtable_name))
+
+ def set_effective_table_name(self, qtable_name, effective_table_name):
+ if qtable_name in self.qtable_name_effective_table_names.keys():
+ if self.qtable_name_effective_table_names[qtable_name] != effective_table_name:
+ raise Exception(
+ "Already set effective table name for qtable %s. Trying to change the effective table name from %s to %s" %
+ (qtable_name,self.qtable_name_effective_table_names[qtable_name],effective_table_name))
+
+ xprint("Setting effective table name for %s - effective table name is set to %s" % (qtable_name,effective_table_name))
+ self.qtable_name_effective_table_names[
+ qtable_name] = effective_table_name
+
+ def get_effective_sql(self,table_name_mapping=None):
+ if any(qtable_name not in self.qtable_name_effective_table_names for qtable_name in self.qtable_names):
+ assert False, 'There are qtables without effective tables'
+
+ effective_sql = [x for x in self.sql_parts]
+
+ xprint("Effective table names",self.qtable_name_effective_table_names)
+ for qtable_name, positions in self.qtable_name_positions.items():
+ xprint("Positions for qtable name %s are %s" % (qtable_name,positions))
+ for pos in positions:
+ if table_name_mapping is not None:
+ x = self.qtable_name_effective_table_names[qtable_name]
+ effective_sql[pos] = table_name_mapping[x]
+ else:
+ effective_sql[pos] = self.qtable_name_effective_table_names[qtable_name]
+
+ return " ".join(effective_sql)
+
+
+ def get_qtable_name_effective_table_names(self):
+ return self.qtable_name_effective_table_names
+
+ def execute_and_fetch(self, db):
+ x = self.get_effective_sql()
+ xprint("Final query: %s" % x)
+ db_results_obj = db.execute_and_fetch(x)
+ return db_results_obj
+
+ def materialize_using(self,loaded_table_structures_dict):
+ xprint("Materializing sql object: %s" % str(self.qtable_names))
+ xprint("loaded table structures dict %s" % loaded_table_structures_dict)
+ for qtable_name in self.qtable_names:
+ table_structure = loaded_table_structures_dict[qtable_name]
+
+ table_name_in_disk_db = table_structure.get_table_name_for_querying()
+
+ effective_table_name = '%s.%s' % (table_structure.db_id, table_name_in_disk_db)
+
+ # for a single file - no need to create a union, just use the table name
+ self.set_effective_table_name(qtable_name, effective_table_name)
+ xprint("Materialized filename %s to effective table name %s" % (qtable_name,effective_table_name))
+
+
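+# --- Illustrative sketch (editor's addition, not part of q's API) ---
+# The Sql class above tokenizes on whitespace and records the token after each
+# FROM/JOIN as a qtable name, remembering every position so that a file that is
+# referenced twice is only loaded once. A minimal reproduction of that scan:
+def _example_find_qtable_positions(sql):
+    parts = sql.split()
+    positions = {}
+    for idx, part in enumerate(parts[:-1]):
+        if part.upper() in ('FROM', 'JOIN'):
+            positions.setdefault(parts[idx + 1], []).append(idx + 1)
+    return positions
+
+# e.g. _example_find_qtable_positions("select a from x.csv JOIN y.csv")
+# returns {'x.csv': [3], 'y.csv': [5]}
+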
+class TableColumnInferer(object):
+
+ def __init__(self, input_params):
+ self.inferred = False
+ self.mode = input_params.parsing_mode
+ self.rows = []
+ self.skip_header = input_params.skip_header
+ self.header_row = None
+ self.header_row_filename = None
+ self.expected_column_count = input_params.expected_column_count
+ self.input_delimiter = input_params.delimiter
+ self.disable_column_type_detection = input_params.disable_column_type_detection
+
+ def _generate_content_signature(self):
+ return OrderedDict({
+ "inferred": self.inferred,
+ "mode": self.mode,
+ "rows": "\n".join([",".join(x) for x in self.rows]),
+ "skip_header": self.skip_header,
+ "header_row": self.header_row,
+ "expected_column_count": self.expected_column_count,
+ "input_delimiter": self.input_delimiter,
+ "disable_column_type_detection": self.disable_column_type_detection
+ })
+
+ def analyze(self, filename, col_vals):
+ if self.inferred:
+ assert False, "Already inferred columns"
+
+ if self.skip_header and self.header_row is None:
+ self.header_row = col_vals
+ self.header_row_filename = filename
+ else:
+ self.rows.append(col_vals)
+
+ if len(self.rows) < 100:
+ return False
+
+ self.do_analysis()
+ return True
+
+ def force_analysis(self):
+ # This method is called whenever there is no more data, and an analysis needs
+ # to be performed immediately, regardless of the amount of sample data that has
+ # been collected
+ self.do_analysis()
+
+ def determine_type_of_value(self, value):
+ if self.disable_column_type_detection:
+ return str
+
+ if value is not None:
+ value = value.strip()
+ if value == '' or value is None:
+ return None
+
+ try:
+ int(value)
+ return int
+ except:
+ pass
+
+ try:
+ f = float(value)
+ return float
+ except:
+ pass
+
+ return str
+
+ def determine_type_of_value_list(self, value_list):
+ type_list = [self.determine_type_of_value(v) for v in value_list]
+ all_types = set(type_list)
+ if len(all_types) == 1:
+ # all the sample lines are of the same type
+ return type_list[0]
+ else:
+ # check for the number of types without nulls,
+ type_list_without_nulls = list(filter(
+ lambda x: x is not None, type_list))
+ # If all the sample lines are of the same type,
+ if len(set(type_list_without_nulls)) == 1:
+ # return it
+ return type_list_without_nulls[0]
+ else:
+ # If there are only two types, one float an one int, then choose a float type
+ if len(set(type_list_without_nulls)) == 2 and float in type_list_without_nulls and int in type_list_without_nulls:
+ return float
+ return str
+
+ def do_analysis(self):
+ if self.mode == 'strict':
+ self._do_strict_analysis()
+ elif self.mode in ['relaxed']:
+ self._do_relaxed_analysis()
+ else:
+ raise Exception('Unknown parsing mode %s' % self.mode)
+
+ if self.column_count == 1 and self.expected_column_count != 1 and self.expected_column_count is not None:
+ print(f"Warning: column count is one (expected column count is {self.expected_column_count} - did you provide the correct delimiter?", file=sys.stderr)
+
+ self.infer_column_types()
+ self.infer_column_names()
+ self.inferred = True
+
+ def validate_column_names(self, value_list):
+ column_name_errors = []
+ for v in value_list:
+ if v is None:
+ # we allow column names to be None - in relaxed mode they'll be filled with default names
+ continue
+ if ',' in v:
+ column_name_errors.append(
+ (v, "Column name cannot contain commas"))
+ continue
+ if self.input_delimiter in v:
+ column_name_errors.append(
+ (v, "Column name cannot contain the input delimiter. Please make sure you've set the correct delimiter"))
+ continue
+ if '\n' in v:
+ column_name_errors.append(
+ (v, "Column name cannot contain newline"))
+ continue
+ if v != v.strip():
+ column_name_errors.append(
+ (v, "Column name contains leading/trailing spaces"))
+ continue
+ try:
+ v.encode("utf-8", "strict").decode("utf-8")
+ except:
+ column_name_errors.append(
+ (v, "Column name must be UTF-8 Compatible"))
+ continue
+ # We're checking for column duplication for each field in order to be able to still provide it along with other errors
+ if len(list(filter(lambda x: x == v,value_list))) > 1:
+ entry = (v, "Column name is duplicated")
+ # Don't duplicate the error report itself
+ if entry not in column_name_errors:
+ column_name_errors.append(entry)
+ continue
+ nul_index = v.find("\x00")
+ if nul_index >= 0:
+ column_name_errors.append(
+ (v, "Column name cannot contain NUL"))
+ continue
+ t = self.determine_type_of_value(v)
+ if t != str:
+ column_name_errors.append((v, "Column name must be a string"))
+ return column_name_errors
+
+ def infer_column_names(self):
+ if self.header_row is not None:
+ column_name_errors = self.validate_column_names(self.header_row)
+ if len(column_name_errors) > 0:
+ raise BadHeaderException("Header must contain only strings and not numbers or empty strings: '%s'\n%s" % (
+ ",".join(self.header_row), "\n".join(["'%s': %s" % (x, y) for x, y in column_name_errors])))
+
+ # use header row in order to name columns
+ if len(self.header_row) < self.column_count:
+ if self.mode == 'strict':
+ raise ColumnCountMismatchException("Strict mode. Header row contains less columns than expected column count(%s vs %s)" % (
+ len(self.header_row), self.column_count))
+ elif self.mode in ['relaxed']:
+ # in relaxed mode, add columns to fill the missing ones
+ self.header_row = self.header_row + \
+ ['c%s' % (x + len(self.header_row) + 1)
+ for x in range(self.column_count - len(self.header_row))]
+ elif len(self.header_row) > self.column_count:
+ if self.mode == 'strict':
+ raise ColumnCountMismatchException("Strict mode. Header row contains more columns than expected column count (%s vs %s)" % (
+ len(self.header_row), self.column_count))
+ elif self.mode in ['relaxed']:
+ # In relaxed mode, just cut the extra column names
+ self.header_row = self.header_row[:self.column_count]
+ self.column_names = self.header_row
+ else:
+ # Column names are cX starting from 1
+ self.column_names = ['c%s' % (i + 1)
+ for i in range(self.column_count)]
+
+ def _do_relaxed_analysis(self):
+ column_count_list = [len(col_vals) for col_vals in self.rows]
+
+ if len(self.rows) == 0:
+ if self.header_row is None:
+ self.column_count = 0
+ else:
+ self.column_count = len(self.header_row)
+ else:
+ if self.expected_column_count is not None:
+ self.column_count = self.expected_column_count
+ else:
+ # If not specified, we'll take the largest row in the sample rows
+ self.column_count = max(column_count_list)
+
+ def get_column_count_summary(self, column_count_list):
+ counts = {}
+ for column_count in column_count_list:
+ counts[column_count] = counts.get(column_count, 0) + 1
+ return ", ".join(["{} rows with {} columns".format(v, k) for k, v in counts.items()])
+
+ def _do_strict_analysis(self):
+ column_count_list = [len(col_vals) for col_vals in self.rows]
+
+ if len(set(column_count_list)) != 1:
+ raise ColumnCountMismatchException('Strict mode. Column count is expected to be identical. Multiple column counts exist at the first part of the file. Try to check your delimiter, or change to relaxed mode. Details: %s' % (
+ self.get_column_count_summary(column_count_list)))
+
+ self.column_count = len(self.rows[0])
+
+ if self.expected_column_count is not None and self.column_count != self.expected_column_count:
+ raise ColumnCountMismatchException('Strict mode. Column count is expected to be %s but is %s' % (
+ self.expected_column_count, self.column_count))
+
+ self.infer_column_types()
+
+ def infer_column_types(self):
+ assert self.column_count > -1
+ self.column_types = []
+ self.column_types2 = []
+ for column_number in range(self.column_count):
+ column_value_list = [
+ row[column_number] if column_number < len(row) else None for row in self.rows]
+ column_type = self.determine_type_of_value_list(column_value_list)
+ self.column_types.append(column_type)
+
+ column_value_list2 = [row[column_number] if column_number < len(
+ row) else None for row in self.rows[1:]]
+ column_type2 = self.determine_type_of_value_list(
+ column_value_list2)
+ self.column_types2.append(column_type2)
+
+ comparison = map(
+ lambda x: x[0] == x[1], zip(self.column_types, self.column_types2))
+ if False in comparison and not self.skip_header:
+ number_of_column_types = len(set(self.column_types))
+ if number_of_column_types == 1 and list(set(self.column_types))[0] == str:
+ print('Warning - There seems to be a header line in the file, but -H has not been specified. All fields will be detected as text fields, and the header line will appear as part of the data', file=sys.stderr)
+
+ def get_column_dict(self):
+ return OrderedDict(zip(self.column_names, self.column_types))
+
+ def get_column_count(self):
+ return self.column_count
+
+ def get_column_names(self):
+ return self.column_names
+
+ def get_column_types(self):
+ return self.column_types
+
+
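+# --- Illustrative sketch (editor's addition, not part of q's API) ---
+# The inferer above tries int, then float, then falls back to str for each
+# sampled value, and a column whose non-null samples mix int and float
+# collapses to float. A condensed, standalone reproduction of that rule:
+def _example_infer_column_type(sample_values):
+    def value_type(v):
+        if v is None or v.strip() == '':
+            return None
+        for candidate in (int, float):
+            try:
+                candidate(v)
+                return candidate
+            except ValueError:
+                pass
+        return str
+    types = set(value_type(v) for v in sample_values) - {None}
+    if len(types) == 1:
+        return types.pop()
+    if types == {int, float}:
+        return float
+    return str if types else None
+
+# e.g. _example_infer_column_type(["1", "2.5", ""]) evaluates to float
+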
+def normalized_filename(filename):
+ return filename
+
+
+class TableCreatorState(object):
+ INITIALIZED = 'INITIALIZED'
+ ANALYZED = 'ANALYZED'
+ FULLY_READ = 'FULLY_READ'
+
+
+class MaterializedStateType(object):
+ UNKNOWN = 'unknown'
+ DELIMITED_FILE = 'delimited-file'
+ QSQL_FILE = 'qsql-file'
+ SQLITE_FILE = 'sqlite-file'
+ DATA_STREAM = 'data-stream'
+
+
+class TableSourceType(object):
+ DELIMITED_FILE = 'file'
+ DELIMITED_FILE_WITH_UNUSED_QSQL = 'file-with-unused-qsql'
+ QSQL_FILE = 'qsql-file'
+ QSQL_FILE_WITH_ORIGINAL = 'qsql-file-with-original'
+ SQLITE_FILE = 'sqlite-file'
+ DATA_STREAM = 'data-stream'
+
+
+def is_sqlite_file(filename):
+ # SQLite database file header is 100 bytes
+ if os.path.isfile(filename) and os.access(filename, os.R_OK):
+ with open(filename, 'rb') as fd:
+ header = fd.read(100)
+ return header[:16] == b"SQLite format 3\x00"
+ else:
+ return False
+
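+# --- Illustrative note (editor's addition) ---
+# Every SQLite 3 database begins with the 16-byte magic string
+# b"SQLite format 3\x00"; the check above reads the full 100-byte header that
+# the format defines, but only the first 16 bytes are needed to identify it.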
+
+def sqlite_table_exists(cursor,table_name):
+ results = cursor.execute("select count(*) from sqlite_master where type='table' and tbl_name == '%s'" % table_name).fetchall()
+ return results[0][0] == 1
+
+
+def is_qsql_file(filename):
+ if not is_sqlite_file(filename):
+ return False
+
+ db = Sqlite3DB('check_qsql_db',filename,filename,create_qcatalog=False)
+ qcatalog_exists = db.qcatalog_table_exists()
+ db.done()
+ return qcatalog_exists
+
+
+def normalize_filename_to_table_name(filename):
+ xprint("Normalizing filename %s" % filename)
+ if filename[0].isdigit():
+ xprint("Filename starts with a digit, adding prefix")
+ filename = 't_%s' % filename
+ if filename.lower().endswith(".qsql"):
+ filename = filename[:-5]
+ elif filename.lower().endswith('.sqlite'):
+ filename = filename[:-7]
+ elif filename.lower().endswith('.sqlite3'):
+ filename = filename[:-8]
+ return filename.replace("-","_dash_").replace(".","_dot_").replace('?','_qm_').replace("/","_slash_").replace("\\","_backslash_").replace(":","_colon_").replace(" ","_space_").replace("+","_plus_")
+
+
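+# --- Illustrative usage (editor's addition, not part of q's API) ---
+# A few sample mappings produced by normalize_filename_to_table_name; the
+# helper below just pairs example inputs with their normalized results:
+def _example_normalize_filename_mappings():
+    return [
+        ('2021-sales.csv', normalize_filename_to_table_name('2021-sales.csv')),  # 't_2021_dash_sales_dot_csv'
+        ('data.qsql', normalize_filename_to_table_name('data.qsql')),            # 'data'
+        ('my file.sqlite', normalize_filename_to_table_name('my file.sqlite')),  # 'my_space_file'
+    ]
+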
+def validate_content_signature(original_filename, source_signature,other_filename, content_signature,scope=None,dump=False):
+ if dump:
+ xprint("Comparing: source value: %s target value: %s" % (source_signature,content_signature))
+
+ s = "%s vs %s:" % (original_filename,other_filename)
+ if scope is None:
+ scope = []
+ for k in source_signature:
+ if type(source_signature[k]) == OrderedDict:
+ validate_content_signature(original_filename, source_signature[k],other_filename, content_signature[k],scope + [k])
+ else:
+ if k not in content_signature:
+ raise ContentSignatureDataDiffersException("%s Content Signatures differ. %s is missing from content signature" % (s,k))
+ if source_signature[k] != content_signature[k]:
+ if k == 'rows':
+ raise ContentSignatureDataDiffersException("%s Content Signatures differ at %s.%s (actual analysis data differs)" % (s,".".join(scope),k))
+ else:
+ raise ContentSignatureDiffersException(original_filename, other_filename, original_filename,".".join(scope + [k]),source_signature[k],content_signature[k])
+
+
+def skip_BOM(f):
+ # BOM is a byte order mark that some editors add as the first
+ # three bytes in a file to mark it as a UTF8 file.
+ pos = f.tell()
+ # Not all file objects support peek([size]), and since we don't really know
+ # if peek is available, we'll handle it in a try/except clause and seek back to the original position if things fail
+ try:
+ BOM = f.peek(3)
+ if BOM.startswith(b'\xef\xbb\xbf'):
+ f.read(3)
+ except:
+ f.seek(pos)
+
+
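+# --- Illustrative sketch (editor's addition, not part of q's API) ---
+# skip_BOM relies on peek(), which buffered binary streams provide. A
+# self-contained demonstration using an in-memory buffered stream:
+def _example_skip_bom():
+    import io
+    f = io.BufferedReader(io.BytesIO(b'\xef\xbb\xbfa,b\n1,2\n'))
+    skip_BOM(f)
+    return f.read()  # b'a,b\n1,2\n' - the three BOM bytes were consumed
+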
+class DelimitedFileReader(object):
+ def __init__(self,atomic_fns, input_params, dialect, f = None,external_f_name = None):
+ if f is not None:
+ assert len(atomic_fns) == 0
+
+ self.atomic_fns = atomic_fns
+ self.input_params = input_params
+ self.dialect = dialect
+
+ self.f = f
+ self.lines_read = 0
+ self.file_number = -1
+
+ self.skipped_bom = False
+
+ self.is_open = f is not None
+
+ self.external_f = f is not None
+ self.external_f_name = external_f_name
+
+ def get_lines_read(self):
+ return self.lines_read
+
+ def get_size_hash(self):
+ if self.atomic_fns is None or len(self.atomic_fns) == 0:
+ return "data-stream-size"
+ else:
+ return ",".join(map(str,[os.stat(atomic_fn).st_size for atomic_fn in self.atomic_fns]))
+
+ def get_last_modification_time_hash(self):
+ if self.atomic_fns is None or len(self.atomic_fns) == 0:
+ return "data stream-lmt"
+ else:
+ x = ",".join(map(lambda x: ':%s:' % x,[os.stat(x).st_mtime_ns for x in self.atomic_fns]))
+ res = hashlib.sha1(x.encode('utf-8')).hexdigest() + '///' + x
+ xprint("Hash of last modification time is %s" % res)
+ return res
+
+ def open_file(self):
+ if self.external_f:
+ xprint("External f has been provided. No need to open the file")
+ return
+
+ # TODO Support universal newlines for gzipped and stdin data as well
+
+ xprint("XX Opening file %s" % ",".join(self.atomic_fns))
+ import fileinput
+
+ def q_openhook(filename, mode):
+ if self.input_params.gzipped_input or filename.endswith('.gz'):
+ import gzip
+ f = gzip.open(filename,mode='rt',encoding=self.input_params.input_encoding)
+ else:
+ if self.input_params.with_universal_newlines:
+ f = io.open(filename, 'r', newline=None, encoding=self.input_params.input_encoding)
+ else:
+ # keep line endings untranslated, so stray carriage returns can be detected downstream
+ f = io.open(filename, 'r', newline='', encoding=self.input_params.input_encoding)
+
+ if self.input_params.input_encoding == 'utf-8-sig' and not self.skipped_bom:
+ skip_BOM(f)
+
+ return f
+
+ f = fileinput.input(self.atomic_fns,mode='r',openhook=q_openhook)
+
+ self.f = f
+ self.is_open = True
+ xprint("Actually opened file %s" % self.f)
+ return f
+
+ def close_file(self):
+ if not self.is_open:
+ # TODO Convert to assertion
+ raise Exception("Bug - file should already be open: %s" % ",".join(self.atomic_fns))
+
+ self.f.close()
+ xprint("XX Closed file %s" % ",".join(self.atomic_fns))
+
+ def generate_rows(self):
+ csv_reader = encoded_csv_reader(self.input_params.input_encoding, self.f, dialect=self.dialect,row_data_only=self.external_f)
+ try:
+ # TODO Some order with regard to separating data-streams for actual files
+ if self.external_f:
+ for col_vals in csv_reader:
+ self.lines_read += 1
+ yield self.external_f_name,0, self.lines_read == 1, col_vals
+ else:
+ for file_name,is_first_line,col_vals in csv_reader:
+ if is_first_line:
+ self.file_number = self.file_number + 1
+ self.lines_read += 1
+ yield file_name,self.file_number,is_first_line,col_vals
+ except ColumnMaxLengthLimitExceededException as e:
+ msg = "Column length is larger than the maximum. Offending file is '%s' - Line is %s, counting from 1 (encoding %s). The line number is the raw line number of the file, ignoring whether there's a header or not" % (",".join(self.atomic_fns),self.lines_read + 1,self.input_params.input_encoding)
+ raise ColumnMaxLengthLimitExceededException(msg)
+ except UniversalNewlinesExistException as e2:
+ # No need to translate the exception, but we want it to be explicitly defined here for clarity
+ raise UniversalNewlinesExistException()
+
+
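+# --- Illustrative sketch (editor's addition, not part of q's API) ---
+# DelimitedFileReader above chains multiple input files through fileinput with
+# a custom openhook that is gzip- and encoding-aware. A stripped-down
+# equivalent of that open logic (filenames/encoding are example parameters):
+def _example_chained_reader(filenames, encoding='utf-8'):
+    import fileinput, gzip, io
+    def openhook(filename, mode):
+        if filename.endswith('.gz'):
+            return gzip.open(filename, mode='rt', encoding=encoding)
+        return io.open(filename, 'r', newline=None, encoding=encoding)
+    # yields lines from all files as if they were one concatenated stream
+    return fileinput.input(filenames, mode='r', openhook=openhook)
+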
+class MaterializedState(object):
+ def __init__(self, table_source_type,qtable_name, engine_id):
+ xprint("Creating new MS: %s %s" % (id(self), qtable_name))
+
+ self.table_source_type = table_source_type
+
+ self.qtable_name = qtable_name
+ self.engine_id = engine_id
+
+ self.db_to_use = None
+ self.db_id = None
+
+ self.source_type = None
+ self.source = None
+
+ self.mfs_structure = None
+
+ self.start_time = None
+ self.end_time = None
+ self.duration = None
+
+ self.effective_table_name = None
+
+
+ def get_materialized_state_type(self):
+ return MaterializedStateType.UNKNOWN
+
+ def get_planned_table_name(self):
+ assert False, 'not implemented'
+
+ def autodetect_table_name(self):
+ xprint("Autodetecting table name. db_to_use=%s" % self.db_to_use)
+ existing_table_names = self.db_to_use.retrieve_all_table_names()
+ xprint("Existing table names: %s" % existing_table_names)
+
+ possible_indices = range(1,1000)
+
+ for index in possible_indices:
+ if index == 1:
+ suffix = ''
+ else:
+ suffix = '_%s' % index
+
+ table_name_attempt = '%s%s' % (self.get_planned_table_name(),suffix)
+ xprint("Table name attempt: index=%s name=%s" % (index,table_name_attempt))
+
+ if table_name_attempt not in existing_table_names:
+ xprint("Found free table name %s for source type %s source %s" % (table_name_attempt,self.source_type,self.source))
+ return table_name_attempt
+
+ raise Exception('Cannot find free table name for source type %s source %s' % (self.source_type,self.source))
+
+ def initialize(self):
+ self.start_time = time.time()
+
+ def finalize(self):
+ self.end_time = time.time()
+ self.duration = self.end_time - self.start_time
+
+ def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
+ assert False, 'not implemented'
+
+ def make_data_available(self,stop_after_analysis):
+ assert False, 'not implemented'
+
+
+class MaterializedStateTableStructure(object):
+ def __init__(self,qtable_name, atomic_fns, db_id, column_names, python_column_types, sqlite_column_types, table_name_for_querying,source_type,source,planned_table_name):
+ self.qtable_name = qtable_name
+ self.atomic_fns = atomic_fns
+ self.db_id = db_id
+ self.column_names = column_names
+ self.python_column_types = python_column_types
+ self.table_name_for_querying = table_name_for_querying
+ self.source_type = source_type
+ self.source = source
+ self.planned_table_name = planned_table_name
+
+ if sqlite_column_types is not None:
+ self.sqlite_column_types = sqlite_column_types
+ else:
+ self.sqlite_column_types = [Sqlite3DB.PYTHON_TO_SQLITE_TYPE_NAMES[t].lower() for t in python_column_types]
+
+ def get_table_name_for_querying(self):
+ return self.table_name_for_querying
+
+ def __str__(self):
+ return "MaterializedStateTableStructure<%s>" % self.__dict__
+ __repr__ = __str__
+
+
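+# --- Illustrative note (editor's addition) ---
+# When sqlite_column_types is not passed in, the structure above derives it
+# from the python column types via Sqlite3DB.PYTHON_TO_SQLITE_TYPE_NAMES,
+# so for example a [str, int, float] layout becomes the corresponding
+# lowercased sqlite type names (e.g. text/integer/real in the usual mapping).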
+class TableCreator(object):
+ def __str__(self):
+ return "TableCreator<%s>" % str(self)
+ __repr__ = __str__
+
+ def __init__(self, qtable_name, delimited_file_reader,input_params,sqlite_db=None,target_sqlite_table_name=None):
+
+ self.qtable_name = qtable_name
+ self.delimited_file_reader = delimited_file_reader
+
+ self.db_id = sqlite_db.db_id
+
+ self.sqlite_db = sqlite_db
+ self.target_sqlite_table_name = target_sqlite_table_name
+
+ self.skip_header = input_params.skip_header
+ self.gzipped = input_params.gzipped_input
+ self.table_created = False
+
+ self.encoding = input_params.input_encoding
+ self.mode = input_params.parsing_mode
+ self.expected_column_count = input_params.expected_column_count
+ self.input_delimiter = input_params.delimiter
+ self.with_universal_newlines = input_params.with_universal_newlines
+
+ self.column_inferer = TableColumnInferer(input_params)
+
+ self.pre_creation_rows = []
+ self.buffered_inserts = []
+ self.effective_column_names = None
+
+ # Column type indices for columns that contain numeric types. Lazily initialized
+ # so column inferer can do its work before this information is needed
+ self.numeric_column_indices = None
+
+ self.state = TableCreatorState.INITIALIZED
+
+ self.content_signature = None
+
+ def _generate_content_signature(self):
+ if self.state != TableCreatorState.ANALYZED:
+ # TODO Change to assertion
+ raise Exception('Bug - Wrong state %s. Table needs to be analyzed before a content signature can be calculated' % self.state)
+
+ size = self.delimited_file_reader.get_size_hash()
+ last_modification_time = self.delimited_file_reader.get_last_modification_time_hash()
+
+ m = OrderedDict({
+ "_signature_version": "v1",
+ "skip_header": self.skip_header,
+ "gzipped": self.gzipped,
+ "with_universal_newlines": self.with_universal_newlines,
+ "encoding": self.encoding,
+ "mode": self.mode,
+ "expected_column_count": self.expected_column_count,
+ "input_delimiter": self.input_delimiter,
+ "inferer": self.column_inferer._generate_content_signature(),
+ "original_file_size": size,
+ "last_modification_time": last_modification_time
+ })
+
+ return m
+
+ def validate_extra_header_if_needed(self, file_number, filename,col_vals):
+ xprint("HHX validate",file_number,filename,col_vals)
+ if not self.skip_header:
+ xprint("No need to validate header")
+ return False
+
+ if file_number == 0:
+ xprint("First file, no need to validate extra header")
+ return False
+
+ header_already_exists = self.column_inferer.header_row is not None
+
+ if header_already_exists:
+ xprint("Validating extra header")
+ if tuple(self.column_inferer.header_row) != tuple(col_vals):
+ raise BadHeaderException("Extra header '{}' in file '{}' mismatches original header '{}' from file '{}'. Table name is '{}'".format(
+ ",".join(col_vals),filename,
+ ",".join(self.column_inferer.header_row),
+ self.column_inferer.header_row_filename,
+ self.qtable_name))
+ xprint("header already exists: %s" % self.column_inferer.header_row)
+ else:
+ xprint("Header doesn't already exist")
+
+ return header_already_exists
+
+ def _populate(self,dialect,stop_after_analysis=False):
+ total_data_lines_read = 0
+ try:
+ try:
+ for file_name,file_number,is_first_line,col_vals in self.delimited_file_reader.generate_rows():
+ if is_first_line:
+ if self.validate_extra_header_if_needed(file_number,file_name,col_vals):
+ continue
+ self._insert_row(file_name, col_vals)
+ if stop_after_analysis:
+ if self.column_inferer.inferred:
+ xprint("Stopping after analysis")
+ return
+ if self.delimited_file_reader.get_lines_read() == 0 and self.skip_header:
+ raise MissingHeaderException("Header line is expected but missing in file %s" % ",".join(self.delimited_file_reader.atomic_fns))
+
+ total_data_lines_read += self.delimited_file_reader.lines_read - (1 if self.skip_header else 0)
+ xprint("Total Data lines read %s" % total_data_lines_read)
+ except StrictModeColumnCountMismatchException as e:
+ raise ColumnCountMismatchException(
+ 'Strict mode - Expected %s columns instead of %s columns in file %s row %s. Either use relaxed mode or check your delimiter' % (
+ e.expected_col_count, e.actual_col_count, normalized_filename(e.atomic_fn), e.lines_read))
+ except FluffyModeColumnCountMismatchException as e:
+ raise ColumnCountMismatchException(
+ 'Deprecated fluffy mode - Too many columns in file %s row %s (%s fields instead of %s fields). Consider moving to either relaxed or strict mode' % (
+ normalized_filename(e.atomic_fn), e.lines_read, e.actual_col_count, e.expected_col_count))
+ finally:
+ self._flush_inserts()
+
+ if not self.table_created:
+ self.column_inferer.force_analysis()
+ self._do_create_table(self.qtable_name)
+
+ self.sqlite_db.conn.commit()
+
+ def perform_analyze(self, dialect):
+ xprint("Analyzing... %s" % dialect)
+ if self.state == TableCreatorState.INITIALIZED:
+ self._populate(dialect,stop_after_analysis=True)
+ self.state = TableCreatorState.ANALYZED
+
+ self.content_signature = self._generate_content_signature()
+ content_signature_key = self.sqlite_db.calculate_content_signature_key(self.content_signature)
+ xprint("Setting content signature after analysis: %s" % content_signature_key)
+ else:
+ # TODO Convert to assertion
+ raise Exception('Bug - Wrong state %s' % self.state)
+
+ def perform_read_fully(self, dialect):
+ if self.state == TableCreatorState.ANALYZED:
+ self._populate(dialect,stop_after_analysis=False)
+ self.state = TableCreatorState.FULLY_READ
+ else:
+ # TODO Convert to assertion
+ raise Exception('Bug - Wrong state %s' % self.state)
+
+ def _flush_pre_creation_rows(self, filename):
+ for i, col_vals in enumerate(self.pre_creation_rows):
+ if self.skip_header and i == 0:
+ # skip header line
+ continue
+ self._insert_row(filename, col_vals)
+ self._flush_inserts()
+ self.pre_creation_rows = []
+
+ def _insert_row(self, filename, col_vals):
+ # If table has not been created yet
+ if not self.table_created:
+ # Try to create it along with another "example" line of data
+ self.try_to_create_table(filename, col_vals)
+
+ # If the table is still not created, then we don't have enough data, just
+ # store the data and return
+ if not self.table_created:
+ self.pre_creation_rows.append(col_vals)
+ return
+
+
+ # The table already exists, so we can just add a new row
+ self._insert_row_i(col_vals)
+
+ def initialize_numeric_column_indices_if_needed(self):
+ # Lazy initialization of numeric column indices
+ if self.numeric_column_indices is None:
+ column_types = self.column_inferer.get_column_types()
+ self.numeric_column_indices = [idx for idx, column_type in enumerate(
+ column_types) if self.sqlite_db.is_numeric_type(column_type)]
+
+ def nullify_values_if_needed(self, col_vals):
+ new_vals = col_vals[:]
+ col_count = len(col_vals)
+ for i in self.numeric_column_indices:
+ if i >= col_count:
+ continue
+ v = col_vals[i]
+ if v == '':
+ new_vals[i] = None
+ return new_vals
+
+ def normalize_col_vals(self, col_vals):
+ # Make sure that numeric column indices are initialized
+ self.initialize_numeric_column_indices_if_needed()
+
+ col_vals = self.nullify_values_if_needed(col_vals)
+
+ expected_col_count = self.column_inferer.get_column_count()
+ actual_col_count = len(col_vals)
+ if self.mode == 'strict':
+ if actual_col_count != expected_col_count:
+ raise StrictModeColumnCountMismatchException(",".join(self.delimited_file_reader.atomic_fns), expected_col_count,actual_col_count,self.delimited_file_reader.get_lines_read())
+ return col_vals
+
+ # in all non strict mode, we add dummy data to missing columns
+
+ if actual_col_count < expected_col_count:
+ col_vals = col_vals + \
+ [None for x in range(expected_col_count - actual_col_count)]
+
+ # in relaxed mode, we merge all extra columns to the last column value
+ if self.mode == 'relaxed':
+ if actual_col_count > expected_col_count:
+ merged_last_column = col_vals[:expected_col_count - 1] + \
+ [self.input_delimiter.join([v if v is not None else '' for v in
+ col_vals[expected_col_count - 1:]])]
+ return merged_last_column
+ else:
+ return col_vals
+
+ assert False, "Unidentified parsing mode %s" % self.mode
+
+ def _insert_row_i(self, col_vals):
+ col_vals = self.normalize_col_vals(col_vals)
+
+ if self.effective_column_names is None:
+ self.effective_column_names = self.column_inferer.column_names[:len(col_vals)]
+
+ if len(self.effective_column_names) > 0:
+ self.buffered_inserts.append(col_vals)
+ else:
+ self.buffered_inserts.append([""])
+
+ if len(self.buffered_inserts) < 5000:
+ return
+ self._flush_inserts()
+
+ def _flush_inserts(self):
+ # If the table is still not created, then we don't have enough data
+ if not self.table_created:
+ return
+
+ if len(self.buffered_inserts) > 0:
+ insert_row_stmt = self.sqlite_db.generate_insert_row(
+ self.target_sqlite_table_name, self.effective_column_names)
+
+ self.sqlite_db.update_many(insert_row_stmt, self.buffered_inserts)
+ self.buffered_inserts = []
+
+ def try_to_create_table(self, filename, col_vals):
+ if self.table_created:
+ # TODO Convert to assertion
+ raise Exception('Table is already created')
+
+ # Add that line to the column inferer
+ result = self.column_inferer.analyze(filename, col_vals)
+ # If inferer succeeded,
+ if result:
+ self._do_create_table(filename)
+ else:
+ pass # We don't have enough information for creating the table yet
+
+ def _do_create_table(self,filename):
+ # Get the column definition dict from the inferer
+ column_dict = self.column_inferer.get_column_dict()
+
+ # Guard against empty tables (instead of preventing the creation, just create with a dummy column)
+ if len(column_dict) == 0:
+ column_dict = { 'dummy_column_for_empty_tables' : str }
+ ordered_column_names = [ 'dummy_column_for_empty_tables' ]
+ else:
+ ordered_column_names = self.column_inferer.get_column_names()
+
+ # Create the CREATE TABLE statement
+ create_table_stmt = self.sqlite_db.generate_create_table(
+ self.target_sqlite_table_name, ordered_column_names, column_dict)
+ # And create the table itself
+ self.sqlite_db.execute_and_fetch(create_table_stmt)
+ # Mark the table as created
+ self.table_created = True
+ self._flush_pre_creation_rows(filename)
+
+
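+# --- Illustrative sketch (editor's addition, not part of q's API) ---
+# The TableCreator above buffers rows until the column inferer has seen enough
+# sample lines (100, per TableColumnInferer.analyze) or input ends, creates the
+# table, replays the buffered rows, and from then on flushes inserts in batches
+# of 5000. The buffering control flow in miniature:
+def _example_table_creator_flow(rows, analyze_threshold=100, batch_size=5000):
+    pending, flushed_batches, table_created = [], [], False
+    for row in rows:
+        pending.append(row)
+        if not table_created and len(pending) >= analyze_threshold:
+            table_created = True             # schema inferred, table created
+            flushed_batches.append(pending)  # buffered rows are replayed
+            pending = []
+        elif table_created and len(pending) >= batch_size:
+            flushed_batches.append(pending)
+            pending = []
+    if pending:
+        flushed_batches.append(pending)      # force_analysis()/final flush analogue
+    return flushed_batches
+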
+class MaterializedDelimitedFileState(MaterializedState):
+ def __init__(self, table_source_type,qtable_name, input_params, dialect_id,engine_id,target_table_name=None):
+ super().__init__(table_source_type,qtable_name,engine_id)
+
+ self.input_params = input_params
+ self.dialect_id = dialect_id
+ self.target_table_name = target_table_name
+
+ self.content_signature = None
+
+ self.atomic_fns = None
+
+ self.can_store_as_cached = None
+
+ def get_materialized_state_type(self):
+ return MaterializedStateType.DELIMITED_FILE
+
+ def initialize(self):
+ super(MaterializedDelimitedFileState, self).initialize()
+
+ self.atomic_fns = self.materialize_file_list(self.qtable_name)
+ self.delimited_file_reader = DelimitedFileReader(self.atomic_fns,self.input_params,self.dialect_id)
+
+ self.source_type = self.table_source_type
+ self.source = ",".join(self.atomic_fns)
+
+ return
+
+ def materialize_file_list(self,qtable_name):
+ materialized_file_list = []
+
+ unfound_files = []
+ # First check if the file exists without globbing. This will ensure that we don't support non-existent files
+ if os.path.exists(qtable_name):
+ # If it exists, then just use it
+ found_files = [qtable_name]
+ else:
+ # If not, then try with globs (and sort for predictability)
+ found_files = list(sorted(glob.glob(qtable_name)))
+ # If no files
+ if len(found_files) == 0:
+ unfound_files += [qtable_name]
+ materialized_file_list += found_files
+
+ # If any pattern matched no files at all, fail
+ if len(unfound_files) == 1:
+ raise FileNotFoundException(
+ "No files matching '%s' have been found" % unfound_files[0])
+ elif len(unfound_files) > 1:
+ # TODO Add test for this
+ raise FileNotFoundException(
+ "The following files have not been found for table %s: %s" % (qtable_name,",".join(unfound_files)))
+
+ # Filter out qsql files, so that they are not treated as delimited files
+ filtered_file_list = list(filter(lambda x: not x.endswith('.qsql'),materialized_file_list))
+ xprint("Filtered qsql files from glob search. Original file count: %s new file count: %s" % (len(materialized_file_list),len(filtered_file_list)))
+
+ l = len(filtered_file_list)
+ # If this proves to be a problem for users in terms of usability, then we'll just materialize the files
+ # into the adhoc db, as with the db attach limit of sqlite
+ if l > 500:
+ msg = "Maximum source files for table must be 500. Table is name is %s Number of actual files is %s" % (qtable_name,l)
+ raise MaximumSourceFilesExceededException(msg)
+
+ absolute_path_list = [os.path.abspath(x) for x in filtered_file_list]
+ return absolute_path_list
+
+ def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
+ if forced_db_to_use is not None:
+ self.db_id = forced_db_to_use.db_id
+ self.db_to_use = forced_db_to_use
+ self.can_store_as_cached = False
+ assert self.target_table_name is None
+ self.target_table_name = self.autodetect_table_name()
+ return
+
+ self.can_store_as_cached = True
+
+ self.db_id = '%s' % self._generate_db_name(self.atomic_fns[0])
+ xprint("Database id is %s" % self.db_id)
+ self.db_to_use = Sqlite3DB(self.db_id, 'file:%s?mode=memory&cache=shared' % self.db_id, 'memory<%s>' % self.db_id,create_qcatalog=True)
+
+ if self.target_table_name is None:
+ self.target_table_name = self.autodetect_table_name()
+
+
+ def __analyze_delimited_file(self,database_info):
+ xprint("Analyzing delimited file")
+ if self.target_table_name is not None:
+ target_sqlite_table_name = self.target_table_name
+ else:
+ assert False, 'Bug - target table name must have been set before analyzing the delimited file'
+
+ xprint("Target sqlite table name is %s" % target_sqlite_table_name)
+ # Create the matching database table and populate it
+ table_creator = TableCreator(self.qtable_name, self.delimited_file_reader,self.input_params, sqlite_db=database_info.sqlite_db,
+ target_sqlite_table_name=target_sqlite_table_name)
+ table_creator.perform_analyze(self.dialect_id)
+ xprint("after perform_analyze")
+ self.content_signature = table_creator._generate_content_signature()
+
+ now = datetime.datetime.now(datetime.timezone.utc).isoformat()
+
+ database_info.sqlite_db.add_to_qcatalog_table(target_sqlite_table_name,
+ self.content_signature,
+ now,
+ self.source_type,
+ self.source)
+ return table_creator
+
+ def _generate_disk_db_filename(self, filenames_str):
+ fn = '%s.qsql' % (os.path.abspath(filenames_str).replace("+","__"))
+ return fn
+
+
+ def _get_should_read_from_cache(self, disk_db_filename):
+ disk_db_file_exists = os.path.exists(disk_db_filename)
+
+ should_read_from_cache = self.input_params.read_caching and disk_db_file_exists
+
+ return should_read_from_cache
+
+ def calculate_should_read_from_cache(self):
+ # TODO cache filename is chosen according to first filename only, which makes multi-file (glob) caching difficult
+ # cache writing is blocked for now in these cases. Will be added in the future (see save_cache_to_disk_if_needed)
+ disk_db_filename = self._generate_disk_db_filename(self.atomic_fns[0])
+ should_read_from_cache = self._get_should_read_from_cache(disk_db_filename)
+ xprint("should read from cache %s" % should_read_from_cache)
+ return disk_db_filename,should_read_from_cache
+
+ def get_planned_table_name(self):
+ return normalize_filename_to_table_name(os.path.basename(self.atomic_fns[0]))
+
+ def make_data_available(self,stop_after_analysis):
+ xprint("In make_data_available. db_id %s db_to_use %s" % (self.db_id,self.db_to_use))
+ assert self.db_id is not None
+
+ disk_db_filename, should_read_from_cache = self.calculate_should_read_from_cache()
+ xprint("disk_db_filename=%s should_read_from_cache=%s" % (disk_db_filename,should_read_from_cache))
+
+ database_info = DatabaseInfo(self.db_id,self.db_to_use, needs_closing=True)
+ xprint("db %s (%s) has been added to the database list" % (self.db_id, self.db_to_use))
+
+ self.delimited_file_reader.open_file()
+
+ table_creator = self.__analyze_delimited_file(database_info)
+
+ self.mfs_structure = MaterializedStateTableStructure(self.qtable_name, self.atomic_fns, self.db_id,
+ table_creator.column_inferer.get_column_names(),
+ table_creator.column_inferer.get_column_types(),
+ None,
+ self.target_table_name,
+ self.source_type,
+ self.source,
+ self.get_planned_table_name())
+
+ content_signature = table_creator.content_signature
+ content_signature_key = self.db_to_use.calculate_content_signature_key(content_signature)
+ xprint("table creator signature key: %s" % content_signature_key)
+
+ relevant_table = self.db_to_use.get_from_qcatalog(content_signature)['temp_table_name']
+
+ if not stop_after_analysis:
+ table_creator.perform_read_fully(self.dialect_id)
+
+ self.save_cache_to_disk_if_needed(disk_db_filename, table_creator)
+
+
+ self.delimited_file_reader.close_file()
+
+ return database_info, relevant_table
+
+ def save_cache_to_disk_if_needed(self, disk_db_filename, table_creator):
+ if len(self.atomic_fns) > 1:
+ xprint("Cannot save cache for multi-files for now, deciding auto-naming for cache is challenging. Will be added in the future.")
+ return
+
+ effective_write_caching = self.input_params.write_caching
+ if effective_write_caching:
+ if self.can_store_as_cached:
+ assert self.table_source_type != TableSourceType.DELIMITED_FILE_WITH_UNUSED_QSQL
+ xprint("Going to write file cache for %s. Disk filename is %s" % (",".join(self.atomic_fns), disk_db_filename))
+ self._store_qsql(table_creator.sqlite_db, disk_db_filename)
+ else:
+ xprint("Database has been provided externally. Skipping storing a cached version of the data")
+
+ def _store_qsql(self, source_sqlite_db, disk_db_filename):
+ xprint("Storing data as disk db")
+ disk_db_conn = sqlite3.connect(disk_db_filename)
+ with disk_db_conn:
+ source_sqlite_db.conn.backup(disk_db_conn)
+ xprint("Written db to disk: disk db filename %s" % (disk_db_filename))
+ disk_db_conn.close()
+
+ def _generate_db_name(self, qtable_name):
+ return 'e_%s_fn_%s' % (self.engine_id,normalize_filename_to_table_name(qtable_name))
+
+
+def detect_qtable_name_source_info(qtable_name,data_streams,read_caching_enabled):
+ data_stream = data_streams.get_for_filename(qtable_name)
+ xprint("Found data stream %s" % data_stream)
+
+ if data_stream is not None:
+ return MaterializedStateType.DATA_STREAM, TableSourceType.DATA_STREAM,(data_stream,)
+
+ if ':::' in qtable_name:
+ qsql_filename, table_name = qtable_name.split(":::", 1)
+ if not os.path.exists(qsql_filename):
+ raise FileNotFoundException("Could not find file %s" % qsql_filename)
+
+ if is_qsql_file(qsql_filename):
+ return MaterializedStateType.QSQL_FILE, TableSourceType.QSQL_FILE, (qsql_filename, table_name,)
+ if is_sqlite_file(qsql_filename):
+ return MaterializedStateType.SQLITE_FILE, TableSourceType.SQLITE_FILE, (qsql_filename, table_name,)
+ raise UnknownFileTypeException("Cannot detect the type of table %s" % qtable_name)
+ else:
+ if is_qsql_file(qtable_name):
+ return MaterializedStateType.QSQL_FILE, TableSourceType.QSQL_FILE, (qtable_name, None)
+ if is_sqlite_file(qtable_name):
+ return MaterializedStateType.SQLITE_FILE, TableSourceType.SQLITE_FILE, (qtable_name, None)
+ matching_qsql_file_candidate = qtable_name + '.qsql'
+
+ table_source_type = TableSourceType.DELIMITED_FILE
+ if is_qsql_file(matching_qsql_file_candidate):
+ if read_caching_enabled:
+ xprint("Found matching qsql file for original file %s (matching file %s) and read caching is enabled. Using it" % (qtable_name,matching_qsql_file_candidate))
+ return MaterializedStateType.QSQL_FILE, TableSourceType.QSQL_FILE_WITH_ORIGINAL, (matching_qsql_file_candidate, None)
+ else:
+ xprint("Found matching qsql file for original file %s (matching file %s), but read caching is disabled. Not using it" % (qtable_name,matching_qsql_file_candidate))
+ table_source_type = TableSourceType.DELIMITED_FILE_WITH_UNUSED_QSQL
+
+
+ return MaterializedStateType.DELIMITED_FILE,table_source_type ,(qtable_name, None)
+
+
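+# --- Illustrative note (editor's addition) ---
+# Resolution order implemented by detect_qtable_name_source_info above:
+#   1. a registered data stream (e.g. stdin)          -> DATA_STREAM
+#   2. 'file:::table' syntax, by file type            -> QSQL_FILE or SQLITE_FILE
+#   3. the file itself is a qsql / sqlite file        -> QSQL_FILE / SQLITE_FILE
+#   4. a sibling '<name>.qsql' cache, caching enabled -> QSQL_FILE_WITH_ORIGINAL
+#   5. otherwise                                      -> DELIMITED_FILE
+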
+class MaterialiedDataStreamState(MaterializedDelimitedFileState):
+ def __init__(self, table_source_type, qtable_name, input_params, dialect_id, engine_id, data_stream, stream_target_db): ## should pass adhoc_db
+ assert data_stream is not None
+
+ super().__init__(table_source_type, qtable_name, input_params, dialect_id, engine_id,target_table_name=None)
+
+ self.data_stream = data_stream
+
+ self.stream_target_db = stream_target_db
+
+ self.target_table_name = None
+
+ def get_planned_table_name(self):
+ return 'data_stream_%s' % (normalize_filename_to_table_name(self.source))
+
+ def get_materialized_state_type(self):
+ return MaterializedStateType.DATA_STREAM
+
+ def initialize(self):
+ self.start_time = time.time()
+ if self.input_params.gzipped_input:
+ raise CannotUnzipDataStreamException()
+
+ self.source_type = self.table_source_type
+ self.source = self.data_stream.stream_id
+
+ self.delimited_file_reader = DelimitedFileReader([], self.input_params, self.dialect_id, f=self.data_stream.stream,external_f_name=self.source)
+
+ def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
+ assert forced_db_to_use is None
+
+ self.db_id = self.stream_target_db.db_id
+ self.db_to_use = self.stream_target_db
+
+ self.target_table_name = self.autodetect_table_name()
+
+ return
+
+ def calculate_should_read_from_cache(self):
+ # No disk_db_filename, and no reading from cache when reading a datastream
+ return None, False
+
+ def finalize(self):
+ super(MaterialiedDataStreamState, self).finalize()
+
+ def save_cache_to_disk_if_needed(self, disk_db_filename, table_creator):
+ xprint("Saving to cache is disabled for data streams")
+ return
+
+
+class MaterializedSqliteState(MaterializedState):
+ def __init__(self,table_source_type,qtable_name,sqlite_filename,table_name, engine_id):
+ super(MaterializedSqliteState, self).__init__(table_source_type,qtable_name,engine_id)
+ self.sqlite_filename = sqlite_filename
+ self.table_name = table_name
+
+ self.table_name_autodetected = None
+
+ def initialize(self):
+ super(MaterializedSqliteState, self).initialize()
+
+ self.table_name_autodetected = False
+ if self.table_name is None:
+ self.table_name = self.autodetect_table_name()
+ self.table_name_autodetected = True
+ return
+
+ self.validate_table_name()
+
+ def get_planned_table_name(self):
+ if self.table_name_autodetected:
+ return normalize_filename_to_table_name(os.path.basename(self.qtable_name))
+ else:
+ return self.table_name
+
+
+ def autodetect_table_name(self):
+ db = Sqlite3DB('temp_db','file:%s?immutable=1' % self.sqlite_filename,self.sqlite_filename,create_qcatalog=False)
+ try:
+ table_names = list(sorted(db.retrieve_all_table_names()))
+ if len(table_names) == 1:
+ return table_names[0]
+ elif len(table_names) == 0:
+ raise NoTablesInSqliteException(self.sqlite_filename)
+ else:
+ raise TooManyTablesInSqliteException(self.sqlite_filename,table_names)
+ finally:
+ db.done()
+
+ def validate_table_name(self):
+ db = Sqlite3DB('temp_db', 'file:%s?immutable=1' % self.sqlite_filename, self.sqlite_filename,
+ create_qcatalog=False)
+ try:
+ table_names = list(db.retrieve_all_table_names())
+ if self.table_name.lower() not in map(lambda x:x.lower(),table_names):
+ raise NonExistentTableNameInSqlite(self.sqlite_filename, self.table_name, table_names)
+ finally:
+ db.done()
+
+ def finalize(self):
+ super(MaterializedSqliteState, self).finalize()
+
+ def get_materialized_state_type(self):
+ return MaterializedStateType.SQLITE_FILE
+
+ def _generate_qsql_only_db_name__temp(self, filenames_str):
+ return 'e_%s_fn_%s' % (self.engine_id,hashlib.sha1(filenames_str.encode('utf-8')).hexdigest())
+
+ def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
+ self.source = self.sqlite_filename
+ self.source_type = self.table_source_type
+
+ self.db_id = '%s' % self._generate_qsql_only_db_name__temp(self.qtable_name)
+
+ x = 'file:%s?immutable=1' % self.sqlite_filename
+ self.db_to_use = Sqlite3DB(self.db_id, x, self.sqlite_filename,create_qcatalog=False)
+
+ if forced_db_to_use:
+ xprint("Forced sqlite db_to_use %s" % forced_db_to_use)
+ new_table_name = forced_db_to_use.attach_and_copy_table(self.db_to_use,self.table_name,stop_after_analysis)
+ self.table_name = new_table_name
+ self.db_id = forced_db_to_use.db_id
+ self.db_to_use = forced_db_to_use
+
+ return
+
+ def make_data_available(self,stop_after_analysis):
+ xprint("db %s (%s) has been added to the database list" % (self.db_id, self.db_to_use))
+
+ database_info,relevant_table = DatabaseInfo(self.db_id,self.db_to_use, needs_closing=True), self.table_name
+
+ column_names, column_types, sqlite_column_types = self._extract_information()
+
+ self.mfs_structure = MaterializedStateTableStructure(self.qtable_name, [self.qtable_name], self.db_id,
+ column_names, column_types, sqlite_column_types,
+ self.table_name,
+ self.source_type,self.source,
+ self.get_planned_table_name())
+ return database_info, relevant_table
+
+ def _extract_information(self):
+ table_list = self.db_to_use.retrieve_all_table_names()
+ if len(table_list) == 1:
+ table_name = table_list[0]
+ xprint("Only one table in sqlite database, choosing it: %s" % table_name)
+ else:
+ # self.table_name has either been autodetected, or validated as an existing table up the stack
+ table_name = self.table_name
+ xprint("Multiple tables in sqlite file. Using provided table name %s" % self.table_name)
+
+ table_info = self.db_to_use.get_sqlite_table_info(table_name)
+ xprint('Table info is %s' % table_info)
+ column_names = list(map(lambda x: x[1], table_info))
+ sqlite_column_types = list(map(lambda x: x[2].lower(),table_info))
+ column_types = list(map(lambda x: sqlite_type_to_python_type(x[2]), table_info))
+ xprint("Column names and types for table %s: %s" % (table_name, list(zip(column_names, zip(sqlite_column_types,column_types)))))
+ self.content_signature = OrderedDict()
+
+ return column_names, column_types, sqlite_column_types
+
+
+class MaterializedQsqlState(MaterializedState):
+ def __init__(self,table_source_type,qtable_name,qsql_filename,table_name, engine_id,input_params,dialect_id):
+ super(MaterializedQsqlState, self).__init__(table_source_type,qtable_name,engine_id)
+ self.qsql_filename = qsql_filename
+ self.table_name = table_name
+
+ # These are for cases where the qsql file is just a cache and the original is still there, used for content
+ # validation
+ self.input_params = input_params
+ self.dialect_id = dialect_id
+
+ self.table_name_autodetected = None
+
+ def initialize(self):
+ super(MaterializedQsqlState, self).initialize()
+
+ self.table_name_autodetected = False
+ if self.table_name is None:
+ self.table_name = self.autodetect_table_name()
+ self.table_name_autodetected = True
+ return
+
+ self.validate_table_name()
+
+ def get_planned_table_name(self):
+ if self.table_name_autodetected:
+ return normalize_filename_to_table_name(os.path.basename(self.qtable_name))
+ else:
+ return self.table_name
+
+
+ def autodetect_table_name(self):
+ db = Sqlite3DB('temp_db','file:%s?immutable=1' % self.qsql_filename,self.qsql_filename,create_qcatalog=False)
+ assert db.qcatalog_table_exists()
+ try:
+ qcatalog_entries = db.get_all_from_qcatalog()
+ if len(qcatalog_entries) == 0:
+ raise NoTableInQsqlExcption(self.qsql_filename)
+ elif len(qcatalog_entries) == 1:
+ return qcatalog_entries[0]['temp_table_name']
+ else:
+ # TODO Add a test for this
+ table_names = list(sorted([x['temp_table_name'] for x in qcatalog_entries]))
+ raise TooManyTablesInQsqlException(self.qsql_filename,table_names)
+ finally:
+ db.done()
+
+ def validate_table_name(self):
+ db = Sqlite3DB('temp_db', 'file:%s?immutable=1' % self.qsql_filename, self.qsql_filename,
+ create_qcatalog=False)
+ assert db.qcatalog_table_exists()
+ try:
+ entry = db.get_from_qcatalog_using_table_name(self.table_name)
+ if entry is None:
+ qcatalog_entries = db.get_all_from_qcatalog()
+ table_names = list(sorted([x['temp_table_name'] for x in qcatalog_entries]))
+ raise NonExistentTableNameInQsql(self.qsql_filename,self.table_name,table_names)
+ finally:
+ db.done()
+
+ def finalize(self):
+ super(MaterializedQsqlState, self).finalize()
+
+ def get_materialized_state_type(self):
+ return MaterializedStateType.QSQL_FILE
+
+ def _generate_qsql_only_db_name__temp(self, filenames_str):
+ return 'e_%s_fn_%s' % (self.engine_id,hashlib.sha1(filenames_str.encode('utf-8')).hexdigest())
+
+ def choose_db_to_use(self,forced_db_to_use=None,stop_after_analysis=False):
+ self.source = self.qsql_filename
+ self.source_type = self.table_source_type
+
+ self.db_id = '%s' % self._generate_qsql_only_db_name__temp(self.qtable_name)
+
+ x = 'file:%s?immutable=1' % self.qsql_filename
+ self.db_to_use = Sqlite3DB(self.db_id, x, self.qsql_filename,create_qcatalog=False)
+
+ if forced_db_to_use:
+ xprint("Forced qsql to use forced_db: %s" % forced_db_to_use)
+
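+ # Copy the table's qcatalog entry (content signature, creation time and
+ # source) into the forced db along with the table itself, so that later
+ # cache validation against the original file keeps working.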
+ # TODO RLRL Move query to Sqlite3DB
+ all_table_names = [(x[0],x[1]) for x in self.db_to_use.execute_and_fetch("select content_signature_key,temp_table_name from %s" % self.db_to_use.QCATALOG_TABLE_NAME).results]
+ csk,t = list(filter(lambda x: x[1] == self.table_name,all_table_names))[0]
+ xprint("Copying table %s from db_id %s" % (t,self.db_id))
+ d = self.db_to_use.get_from_qcatalog_using_table_name(t)
+
+ new_table_name = forced_db_to_use.attach_and_copy_table(self.db_to_use,self.table_name,stop_after_analysis)
+
+ xprint("CS",d['content_signature'])
+ cs = OrderedDict(json.loads(d['content_signature']))
+ forced_db_to_use.add_to_qcatalog_table(new_table_name, cs, d['creation_time'],
+ d['source_type'], d['source'])
+
+ self.table_name = new_table_name
+ self.db_id = forced_db_to_use.db_id
+ self.db_to_use = forced_db_to_use
+
+ return
+
+ def make_data_available(self,stop_after_analysis):
+ xprint("db %s (%s) has been added to the database list" % (self.db_id, self.db_to_use))
+
+ database_info,relevant_table = self._read_table_from_cache(stop_after_analysis)
+
+ column_names, column_types, sqlite_column_types = self._extract_information()
+
+ self.mfs_structure = MaterializedStateTableStructure(self.qtable_name, [self.qtable_name], self.db_id,
+ column_names, column_types, sqlite_column_types,
+ self.table_name,
+ self.source_type,self.source,
+ self.get_planned_table_name())
+ return database_info, relevant_table
+
+ def _extract_information(self):
+ assert self.db_to_use.qcatalog_table_exists()
+ table_info = self.db_to_use.get_sqlite_table_info(self.table_name)
+ xprint('table_name=%s Table info is %s' % (self.table_name,table_info))
+
+ x = self.db_to_use.get_from_qcatalog_using_table_name(self.table_name)
+
+ column_names = list(map(lambda x: x[1], table_info))
+ sqlite_column_types = list(map(lambda x: x[2].lower(),table_info))
+ column_types = list(map(lambda x: sqlite_type_to_python_type(x[2]), table_info))
+ self.content_signature = OrderedDict(
+ **json.loads(x['content_signature']))
+ xprint('Inferred column names and types from qsql: %s' % list(zip(column_names, zip(sqlite_column_types,column_types))))
+
+ return column_names, column_types, sqlite_column_types
+
+ def _backing_original_file_exists(self):
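+ # True when the qsql file was generated as a cache next to its original
+ # source file, i.e. its name is exactly '<qtable_name>.qsql'.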
+ return ('%s.qsql' % self.qtable_name) == self.qsql_filename
+
+ def _read_table_from_cache(self, stop_after_analysis):
+ if self._backing_original_file_exists():
+ xprint("Found a matching source file for qsql file with qtable name %s. Checking content signature by creating a temp MFDS + analysis" % self.qtable_name)
+ mdfs = MaterializedDelimitedFileState(TableSourceType.DELIMITED_FILE,self.qtable_name,self.input_params,self.dialect_id,self.engine_id,target_table_name=None)
+ mdfs.initialize()
+ mdfs.choose_db_to_use(forced_db_to_use=None,stop_after_analysis=stop_after_analysis)
+ _,_ = mdfs.make_data_available(stop_after_analysis=True)
+
+ original_file_content_signature = mdfs.content_signature
+ original_file_content_signature_key = self.db_to_use.calculate_content_signature_key(original_file_content_signature)
+
+ qcatalog_entry = self.db_to_use.get_from_qcatalog_using_table_name(self.table_name)
+
+ if qcatalog_entry is None:
+ raise Exception('missing content signature!')
+
+ xprint("Actual Signature Key: %s Expected Signature Key: %s" % (qcatalog_entry['content_signature_key'],original_file_content_signature_key))
+ actual_content_signature = json.loads(qcatalog_entry['content_signature'])
+
+ xprint("Validating content signatures: original %s vs qsql %s" % (original_file_content_signature,actual_content_signature))
+ validate_content_signature(self.qtable_name, original_file_content_signature, self.qsql_filename, actual_content_signature,dump=True)
+ mdfs.finalize()
+ return DatabaseInfo(self.db_id,self.db_to_use, needs_closing=True), self.table_name
+
diff --git a/qtextasdata/utilities.py b/qtextasdata/utilities.py
new file mode 100644
index 00000000..c0bb3ed7
--- /dev/null
+++ b/qtextasdata/utilities.py
@@ -0,0 +1,313 @@
+import os
+import math
+import re
+import hashlib
+import locale
+import sys
+
+def get_stdout_encoding(encoding_override=None):
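+ # Prefer an explicit override; otherwise use the terminal's own encoding
+ # when writing to a tty, and the locale's preferred encoding when piped.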
+ if encoding_override is not None and encoding_override != 'none':
+ return encoding_override
+
+ if sys.stdout.isatty():
+ return sys.stdout.encoding
+ else:
+ return locale.getpreferredencoding()
+
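+# Supported SHA variants, keyed by the algorithm number accepted by sha().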
+sha_algorithms = {
+ 1 : hashlib.sha1,
+ 224: hashlib.sha224,
+ 256: hashlib.sha256,
+ 384: hashlib.sha384,
+ 512: hashlib.sha512
+}
+
+def sha(data,algorithm,encoding):
+ try:
+ f = sha_algorithms[algorithm]
+ return f(str(data).encode(encoding)).hexdigest()
+ except Exception as e:
+ print(e)
+
+# For backward compatibility only (doesn't handle encoding well enough)
+def sha1(data):
+ return hashlib.sha1(str(data).encode('utf-8')).hexdigest()
+
+# TODO Add caching of compiled regexps - Will be added after benchmarking capability is baked in
+def regexp(regular_expression, data):
+ if data is not None:
+ if not isinstance(data, str):
+ data = str(data)
+ return re.search(regular_expression, data) is not None
+ else:
+ return False
+
+def regexp_extract(regular_expression, data,group_number):
+ if data is not None:
+ if not isinstance(data, str):
+ data = str(data)
+ m = re.search(regular_expression, data)
+ if m is not None:
+ return m.groups()[group_number]
+ else:
+ return False
+
+def md5(data,encoding):
+ m = hashlib.md5()
+ m.update(str(data).encode(encoding))
+ return m.hexdigest()
+
+def sqrt(data):
+ return math.sqrt(data)
+
+def power(data,p):
+ return data**p
+
+def file_ext(data):
+ if data is None:
+ return None
+
+ return os.path.splitext(data)[1]
+
+def file_folder(data):
+ if data is None:
+ return None
+ return os.path.split(data)[0]
+
+def file_basename(data):
+ if data is None:
+ return None
+ return os.path.split(data)[1]
+
+def file_basename_no_ext(data):
+ if data is None:
+ return None
+
+ return os.path.split(os.path.splitext(data)[0])[-1]
+
+def percentile(l, p):
+ # TODO Alpha implementation, need to provide multiple interpolation methods, and add tests
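+ # Linear interpolation between closest ranks: k is the fractional index of
+ # percentile p in the sorted list l, and f/c are the ranks just below/above it.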
+ if not l:
+ return None
+ k = p*(len(l) - 1)
+ f = math.floor(k)
+ c = math.ceil(k)
+ if c == f:
+ return l[int(k)]
+ return (c-k) * l[int(f)] + (k-f) * l[int(c)]
+
+# TODO Streaming Percentile to prevent memory consumption blowup for large datasets
+class StrictPercentile(object):
+ def __init__(self):
+ self.values = []
+ self.p = None
+
+ def step(self,value,p):
+ if self.p is None:
+ self.p = p
+ self.values.append(value)
+
+ def finalize(self):
+ if len(self.values) == 0 or (self.p < 0 or self.p > 1):
+ return None
+ else:
+ return percentile(sorted(self.values),self.p)
+
+class StdevPopulation(object):
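+ # Online variance via Welford's algorithm: M is the running mean and S the
+ # sum of squared deviations from it, so values don't need to be stored.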
+ def __init__(self):
+ self.M = 0.0
+ self.S = 0.0
+ self.k = 0
+
+ def step(self, value):
+ try:
+ # Ignore nulls
+ if value is None:
+ return
+ val = float(value) # if fails, skips this iteration, which also ignores nulls
+ tM = self.M
+ self.k += 1
+ self.M += ((val - tM) / self.k)
+ self.S += ((val - tM) * (val - self.M))
+ except ValueError:
+ # TODO propagate udf errors to console
+ raise Exception("Data is not numeric when calculating stddev (%s)" % value)
+
+ def finalize(self):
+ if self.k <= 1: # avoid division by zero
+ return None
+ else:
+ return math.sqrt(self.S / (self.k))
+
+class StdevSample(object):
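+ # Same Welford recurrence as StdevPopulation, but finalize() divides by k-1
+ # (Bessel's correction) to produce the sample standard deviation.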
+ def __init__(self):
+ self.M = 0.0
+ self.S = 0.0
+ self.k = 0
+
+ def step(self, value):
+ try:
+ # Ignore nulls
+ if value is None:
+ return
+ val = float(value) # if fails, skips this iteration, which also ignores nulls
+ tM = self.M
+ self.k += 1
+ self.M += ((val - tM) / self.k)
+ self.S += ((val - tM) * (val - self.M))
+ except ValueError:
+ # TODO propagate udf errors to console
+ raise Exception("Data is not numeric when calculating stddev (%s)" % value)
+
+ def finalize(self):
+ if self.k <= 1: # avoid division by zero
+ return None
+ else:
+ return math.sqrt(self.S / (self.k-1))
+
+class FunctionType(object):
+ REGULAR = 1
+ AGG = 2
+
+class UserFunctionDef(object):
+ def __init__(self,func_type,name,usage,description,func_or_obj,param_count):
+ self.func_type = func_type
+ self.name = name
+ self.usage = usage
+ self.description = description
+ self.func_or_obj = func_or_obj
+ self.param_count = param_count
+
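+# Registry of user-defined functions exposed to queries. REGULAR entries are
+# plain callables; AGG entries are aggregate classes exposing the
+# step()/finalize() contract that sqlite3's create_aggregate() expects.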
+user_functions = [
+ UserFunctionDef(FunctionType.REGULAR,
+ "regexp","regexp(<regexp>,<expr>) = <1|0>",
+ "Find regexp in string expression. Returns 1 if found or 0 if not",
+ regexp,
+ 2),
+ UserFunctionDef(FunctionType.REGULAR,
+ "regexp_extract","regexp_extract(<regexp>,<expr>,group_number) = <substring|null>",
+ "Get regexp capture group content",
+ regexp_extract,
+ 3),
+ UserFunctionDef(FunctionType.REGULAR,
+ "sha","sha(<expr>,<algorithm>,<encoding>) = <hex-string-of-sha>",
+ "Calculate sha of some expression. Algorithm can be one of 1,224,256,384,512. For now encoding must be manually provided. Will use the input encoding automatically in the future.",
+ sha,
+ 3),
+ UserFunctionDef(FunctionType.REGULAR,
+ "sha1","sha1(<expr>) = <hex-string-of-sha>",
+ "Exists for backward compatibility only, since it doesn't handle encoding properly. Calculates sha1 of some expression",
+ sha1,
+ 1),
+ UserFunctionDef(FunctionType.REGULAR,
+ "md5","md5(<expr>,<encoding>) = <hex-string-of-md5>",
+ "Calculate md5 of expression. Returns a hex-string of the result. Currently requires to manually provide the encoding of the data. Will be taken automatically from the input encoding in the future.",
+ md5,
+ 2),
+ UserFunctionDef(FunctionType.REGULAR,
+ "sqrt","sqrt(<expr>) = <square-root>",
+ "Calculate the square root of the expression",
+ sqrt,
+ 1),
+ UserFunctionDef(FunctionType.REGULAR,
+ "power","power(<expr1>,<expr2>) = <result>",
+ "Raise expr1 to the power of expr2",
+ power,
+ 2),
+ UserFunctionDef(FunctionType.REGULAR,
+ "file_ext","file_ext(<expr>) = <filename-extension-or-empty-string>",
+ "Get the extension of a filename",
+ file_ext,
+ 1),
+ UserFunctionDef(FunctionType.REGULAR,
+ "file_folder","file_folder(<expr>) = <folder-name-of-filename>",
+ "Get the folder part of a filename",
+ file_folder,
+ 1),
+ UserFunctionDef(FunctionType.REGULAR,
+ "file_basename","file_basename(<expr>) = <basename-of-filename-including-extension>",
+ "Get the basename of a filename, including extension if any",
+ file_basename,
+ 1),
+ UserFunctionDef(FunctionType.REGULAR,
+ "file_basename_no_ext","file_basename_no_ext(<expr>) = <basename-of-filename-without-extension>",
+ "Get the basename of a filename, without the extension if there is one",
+ file_basename_no_ext,
+ 1),
+ UserFunctionDef(FunctionType.AGG,
+ "percentile","percentile(<expr>,<percentile-in-the-range-0-to-1>) = <percentile-value>",
+ "Calculate the strict percentile of a set of values.",
+ StrictPercentile,
+ 2),
+ UserFunctionDef(FunctionType.AGG,
+ "stddev_pop","stddev_pop(<expr>) = <stddev-value>",
+ "Calculate the population standard deviation of a set of values",
+ StdevPopulation,
+ 1),
+ UserFunctionDef(FunctionType.AGG,
+ "stddev_sample","stddev_sample(<expr>) = <stddev-value>",
+ "Calculate the sample standard deviation of a set of values",
+ StdevSample,
+ 1)
+]
+
+def print_user_functions():
+ for udf in user_functions:
+ print("Function: %s" % udf.name)
+ print(" Usage: %s" % udf.usage)
+ print(" Description: %s" % udf.description)
+
+def get_sqlite_type_affinity(sqlite_type):
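+ # Follows SQLite's column affinity rules (https://www.sqlite.org/datatype3.html):
+ # substring matching on the declared type, with NUMERIC as the fallback.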
+ sqlite_type = sqlite_type.upper()
+ if 'INT' in sqlite_type:
+ return 'INTEGER'
+ elif 'CHAR' in sqlite_type or 'TEXT' in sqlite_type or 'CLOB' in sqlite_type:
+ return 'TEXT'
+ elif 'BLOB' in sqlite_type:
+ return 'BLOB'
+ elif 'REAL' in sqlite_type or 'FLOA' in sqlite_type or 'DOUB' in sqlite_type:
+ return 'REAL'
+ else:
+ return 'NUMERIC'
+
+def sqlite_type_to_python_type(sqlite_type):
+ SQLITE_AFFINITY_TO_PYTHON_TYPE_NAMES = {
+ 'INTEGER': int,
+ 'TEXT': str,
+ 'BLOB': bytes,
+ 'REAL': float,
+ 'NUMERIC': float
+ }
+ return SQLITE_AFFINITY_TO_PYTHON_TYPE_NAMES[get_sqlite_type_affinity(sqlite_type)]
+
+
+def escape_double_quotes_if_needed(v):
+ x = v.replace('"', '""')
+ return x
+
+
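+# The quote_*_func family mirrors the csv module's quoting modes (QUOTE_NONE,
+# QUOTE_MINIMAL, QUOTE_NONNUMERIC and QUOTE_ALL): each takes the output
+# delimiter and a single value, returning the value quoted as needed.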
+def quote_none_func(output_delimiter,v):
+ return v
+
+
+def quote_minimal_func(output_delimiter,v):
+ if v is None:
+ return v
+ if isinstance(v, str) and (output_delimiter in v or '\n' in v or '\r' in v):
+ return '"{}"'.format(escape_double_quotes_if_needed(v))
+ return v
+
+
+def quote_nonnumeric_func(output_delimiter,v):
+ if v is None:
+ return v
+ if isinstance(v, str):
+ return '"{}"'.format(escape_double_quotes_if_needed(v))
+ return v
+
+
+def quote_all_func(output_delimiter,v):
+ if isinstance(v, str):
+ return '"{}"'.format(escape_double_quotes_if_needed(v))
+ else:
+ return str('"{}"').format(v)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 3ad7d2bf..e69de29b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +0,0 @@
-six==1.11.0
-flake8==3.6.0
-setuptools<45.0.0
diff --git a/run-tests.sh b/run-tests.sh
deleted file mode 100755
index f28e57e6..00000000
--- a/run-tests.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-pytest -m 'not benchmark' "$@"
diff --git a/setup.py b/setup.py
index 6d0fac7e..87559c1f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,17 +1,15 @@
#!/usr/bin/env python
-from setuptools import setup
-import setuptools
-
-q_version = '3.1.6'
+from setuptools import setup, find_packages
+from qtextasdata import q_version
with open("README.markdown", "r", encoding="utf-8") as fh:
long_description = fh.read()
setup(
- name='q',
+ name='qtextasdata',
url='https://github.com/harelba/q',
- license='LICENSE',
+ license='Apache License 2.0',
version=q_version,
author='Harel Ben-Attia',
description="Run SQL directly on CSV or TSV files",
@@ -19,13 +17,38 @@
long_description_content_type="text/markdown",
author_email='harelba@gmail.com',
install_requires=[
- 'six==1.11.0'
],
- package_dir={"": "bin"},
- packages=setuptools.find_packages(where="bin"),
+ packages=find_packages(exclude=['tests', 'tests.*']),
entry_points={
'console_scripts': [
- 'q = bin.q:run_standalone'
+ 'q = qtextasdata.cli:run_standalone'
]
- }
+ },
+ classifiers=[
+ 'Development Status :: 5 - Production/Stable',
+ 'Environment :: Console',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: System Administrators',
+ 'Intended Audience :: Science/Research',
+ 'License :: OSI Approved :: Apache Software License',
+ 'Operating System :: OS Independent',
+ 'Programming Language :: Python',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: SQL',
+ 'Topic :: Database',
+ 'Topic :: Scientific/Engineering',
+ 'Topic :: Scientific/Engineering :: Information Analysis',
+ 'Topic :: Text Processing :: General',
+ 'Topic :: Utilities',
+ ],
+ keywords='csv tsv sql data analysis query',
+ project_urls={
+ 'Bug Reports': 'https://github.com/harelba/q/issues',
+ 'Source': 'https://github.com/harelba/q',
+ 'Documentation': 'https://github.com/harelba/q/blob/master/README.markdown',
+ },
)
diff --git a/test-requirements.txt b/test-requirements.txt
index 7e995bae..c922d095 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1,3 +1,3 @@
-pytest==6.2.2
-flake8==3.6.0
-six==1.11.0
+pytest==8.3.5
+pytest-xdist==3.5.0
+flake8==7.0.0
diff --git a/test/AnalysisTests.py b/test/AnalysisTests.py
new file mode 100644
index 00000000..623f51ef
--- /dev/null
+++ b/test/AnalysisTests.py
@@ -0,0 +1,182 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command, DEBUG, b
+from test.test_data import header_row_with_spaces, sample_data_no_header, sample_data_with_header
+
+class AnalysisTests(AbstractQTestCase):
+
+ def test_analyze_result(self):
+ d = "\n".join(['%s\t%s\t%s' % (x+1,x+1,x+1) for x in range(100)])
+ tmpfile = self.create_file_with_data(b(d))
+
+ cmd = Q_EXECUTABLE + ' -c 1 "select count(*) from %s" -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 5)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1], b(' Sources:'))
+ self.assertEqual(o[2], b(' source_type: file source: %s' %(tmpfile.name)))
+ self.assertEqual(o[3], b(' Fields:'))
+ self.assertEqual(o[4], b(' `c1` - text'))
+
+ self.cleanup(tmpfile)
+
+ def test_analyze_result_with_data_stream(self):
+ d = "\n".join(['%s\t%s\t%s' % (x+1,x+1,x+1) for x in range(100)])
+ tmpfile = self.create_file_with_data(b(d))
+
+ cmd = 'cat %s | %s -c 1 "select count(*) from -" -A' % (tmpfile.name,Q_EXECUTABLE)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 5)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('Table: -'))
+ self.assertEqual(o[1], b(' Sources:'))
+ self.assertEqual(o[2], b(' source_type: data-stream source: stdin'))
+ self.assertEqual(o[3], b(' Fields:'))
+ self.assertEqual(o[4], b(' `c1` - text'))
+
+ self.cleanup(tmpfile)
+
+ def test_column_analysis(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+
+ cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(o[0], b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s' % tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4], b(' `c1` - text'))
+ self.assertEqual(o[5], b(' `c2` - int'))
+ self.assertEqual(o[6], b(' `c3` - int'))
+
+ self.cleanup(tmpfile)
+
+ def test_column_analysis_with_mixed_ints_and_floats(self):
+ tmpfile = self.create_file_with_data(b("""planet_id,name,diameter_km,length_of_day_hours\n1000,Earth,12756,24\n2000,Mars,6792,24.7\n3000,Jupiter,142984,9.9"""))
+
+ cmd = Q_EXECUTABLE + ' -d , -H "select * from %s" -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),8)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0], b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s' % tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4], b(' `planet_id` - int'))
+ self.assertEqual(o[5], b(' `name` - text'))
+ self.assertEqual(o[6], b(' `diameter_km` - int'))
+ self.assertEqual(o[7], b(' `length_of_day_hours` - real'))
+
+ self.cleanup(tmpfile)
+
+ def test_column_analysis_with_mixed_ints_and_floats_and_nulls(self):
+ tmpfile = self.create_file_with_data(b("""planet_id,name,diameter_km,length_of_day_hours\n1000,Earth,12756,24\n2000,Mars,6792,24.7\n2500,Venus,,\n3000,Jupiter,142984,9.9"""))
+
+ cmd = Q_EXECUTABLE + ' -d , -H "select * from %s" -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),8)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0], b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s' % tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4], b(' `planet_id` - int'))
+ self.assertEqual(o[5], b(' `name` - text'))
+ self.assertEqual(o[6], b(' `diameter_km` - int'))
+ self.assertEqual(o[7], b(' `length_of_day_hours` - real'))
+
+ self.cleanup(tmpfile)
+
+ def test_column_analysis_no_header(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+
+ cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(o[0], b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s' % tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4], b(' `c1` - text'))
+ self.assertEqual(o[5], b(' `c2` - int'))
+ self.assertEqual(o[6], b(' `c3` - int'))
+
+ def test_column_analysis_with_unexpected_header(self):
+ tmpfile = self.create_file_with_data(sample_data_with_header)
+ cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 7)
+ self.assertEqual(len(e), 1)
+
+ self.assertEqual(o[0], b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s' % tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4],b(' `c1` - text'))
+ self.assertEqual(o[5],b(' `c2` - text'))
+ self.assertEqual(o[6],b(' `c3` - text'))
+
+ self.assertEqual(
+ e[0], b('Warning - There seems to be header line in the file, but -H has not been specified. All fields will be detected as text fields, and the header line will appear as part of the data'))
+
+ self.cleanup(tmpfile)
+
+ def test_column_analysis_for_spaces_in_header_row(self):
+ tmpfile = self.create_file_with_data(
+ header_row_with_spaces + b("\n") + sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select name,\\`value 1\\` from %s" -H -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 7)
+
+ self.assertEqual(o[0], b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s' % tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4], b(' `name` - text'))
+ self.assertEqual(o[5], b(' `value 1` - int'))
+ self.assertEqual(o[6], b(' `value2` - int'))
+
+ self.cleanup(tmpfile)
+
+ def test_column_analysis_with_header(self):
+ tmpfile = self.create_file_with_data(sample_data_with_header)
+ cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A -H' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode, 0)
+ self.assertEqual(len(o),7)
+ self.assertEqual(len(e),2)
+ self.assertEqual(o[0], b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s' % tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4], b(' `name` - text'))
+ self.assertEqual(o[5], b(' `value1` - int'))
+ self.assertEqual(o[6], b(' `value2` - int'))
+
+ self.assertEqual(e[0],b('query error: no such column: c1'))
+ self.assertTrue(e[1].startswith(b('Warning - There seems to be a ')))
+
+ self.cleanup(tmpfile)
\ No newline at end of file
diff --git a/test/BasicModuleTests.py b/test/BasicModuleTests.py
new file mode 100644
index 00000000..5a4f0c25
--- /dev/null
+++ b/test/BasicModuleTests.py
@@ -0,0 +1,494 @@
+import codecs
+import sys
+from io import StringIO
+
+from qtextasdata.core import DataStream, QInputParams, QTextAsData
+from test.base import AbstractQTestCase
+from test.utils import DEBUG, b
+from test.test_data import header_row_with_spaces, sample_data_no_header, sample_data_with_header
+
+class BasicModuleTests(AbstractQTestCase):
+
+ def test_engine_isolation(self):
+ tmpfile1 = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+ tmpfile2 = self.create_file_with_data(b("d e f\n10 20 30\n40 50 60"))
+
+ # Run file 1 on engine 1
+ q1 = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
+ r = q1.execute('select * from %s' % tmpfile1.name)
+ print("QueryQuery",file=sys.stdout)
+
+ self.assertTrue(r.status == 'ok')
+ self.assertEqual(len(r.warnings),0)
+ self.assertEqual(len(r.data),2)
+ self.assertEqual(r.metadata.output_column_name_list,['a','b','c'])
+ self.assertEqual(r.data,[(1,2,3),(4,5,6)])
+ self.assertTrue(tmpfile1.name in r.metadata.table_structures)
+ self.assertTrue(tmpfile1.name in r.metadata.new_table_structures)
+ self.assertEqual(r.metadata.table_structures[tmpfile1.name].atomic_fns,[tmpfile1.name])
+ self.assertEqual(r.metadata.table_structures[tmpfile1.name].source_type,'file')
+ self.assertEqual(r.metadata.table_structures[tmpfile1.name].source,tmpfile1.name)
+
+ # run file 1 on engine 2
+ q2 = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
+ r2 = q2.execute('select * from %s' % tmpfile1.name)
+ print("QueryQuery",file=sys.stdout)
+
+ self.assertTrue(r2.status == 'ok')
+ self.assertEqual(len(r2.warnings),0)
+ self.assertEqual(len(r2.data),2)
+ self.assertEqual(r2.metadata.output_column_name_list,['a','b','c'])
+ self.assertEqual(r2.data,[(1,2,3),(4,5,6)])
+ self.assertTrue(tmpfile1.name in r2.metadata.table_structures)
+ self.assertTrue(tmpfile1.name in r2.metadata.new_table_structures)
+ self.assertEqual(r2.metadata.table_structures[tmpfile1.name].atomic_fns,[tmpfile1.name])
+ self.assertEqual(r2.metadata.table_structures[tmpfile1.name].source_type,'file')
+ self.assertEqual(r2.metadata.table_structures[tmpfile1.name].source,tmpfile1.name)
+
+ # run file 2 on engine 1
+ r3 = q1.execute('select * from %s' % tmpfile2.name)
+ print("QueryQuery",file=sys.stdout)
+
+ print(r3)
+ self.assertTrue(r3.status == 'ok')
+ self.assertEqual(len(r3.warnings),0)
+ self.assertEqual(len(r3.data),2)
+ self.assertEqual(r3.metadata.output_column_name_list,['d','e','f'])
+ self.assertEqual(r3.data,[(10,20,30),(40,50,60)])
+ self.assertTrue(tmpfile2.name in r3.metadata.table_structures)
+ self.assertTrue(tmpfile2.name in r3.metadata.new_table_structures)
+ self.assertEqual(r3.metadata.table_structures[tmpfile2.name].atomic_fns,[tmpfile2.name])
+ self.assertEqual(r3.metadata.table_structures[tmpfile2.name].source,tmpfile2.name)
+ self.assertEqual(r3.metadata.table_structures[tmpfile2.name].source_type,'file')
+
+ q1.done()
+ q2.done()
+
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
+
+ def test_simple_query(self):
+ tmpfile = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+
+ q = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
+ r = q.execute('select * from %s' % tmpfile.name)
+
+ self.assertTrue(r.status == 'ok')
+ self.assertEqual(len(r.warnings),0)
+ self.assertEqual(len(r.data),2)
+ self.assertEqual(r.metadata.output_column_name_list,['a','b','c'])
+ self.assertEqual(r.data,[(1,2,3),(4,5,6)])
+ self.assertTrue(tmpfile.name in r.metadata.table_structures)
+ self.assertTrue(tmpfile.name in r.metadata.new_table_structures)
+ self.assertEqual(r.metadata.table_structures[tmpfile.name].atomic_fns,[tmpfile.name])
+ self.assertEqual(r.metadata.table_structures[tmpfile.name].source_type,'file')
+ self.assertEqual(r.metadata.table_structures[tmpfile.name].source,tmpfile.name)
+
+ q.done()
+ self.cleanup(tmpfile)
+
+ def test_loaded_data_reuse(self):
+ tmpfile = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+
+ q = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
+ r1 = q.execute('select * from %s' % tmpfile.name)
+
+ r2 = q.execute('select * from %s' % tmpfile.name)
+
+ self.assertTrue(r1.status == 'ok')
+ self.assertEqual(len(r1.warnings),0)
+ self.assertEqual(len(r1.data),2)
+ self.assertEqual(r1.metadata.output_column_name_list,['a','b','c'])
+ self.assertEqual(r1.data,[(1,2,3),(4,5,6)])
+ self.assertTrue(tmpfile.name in r1.metadata.table_structures)
+ self.assertTrue(tmpfile.name in r1.metadata.new_table_structures)
+ self.assertEqual(r1.metadata.table_structures[tmpfile.name].atomic_fns,[tmpfile.name])
+ self.assertEqual(r1.metadata.table_structures[tmpfile.name].source_type,'file')
+ self.assertEqual(r1.metadata.table_structures[tmpfile.name].source,tmpfile.name)
+
+ self.assertTrue(r2.status == 'ok')
+ self.assertTrue(tmpfile.name in r2.metadata.table_structures)
+ self.assertTrue(tmpfile.name not in r2.metadata.new_table_structures)
+ self.assertEqual(r2.data,r1.data)
+ self.assertEqual(r2.metadata.output_column_name_list,r1.metadata.output_column_name_list)
+ self.assertEqual(len(r2.warnings),0)
+
+ q.done()
+
+ self.cleanup(tmpfile)
+
+ def test_stdin_injection(self):
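+ # Inject a file handle as the '-' data stream, standing in for real stdin.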
+ tmpfile = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+
+ data_streams_dict = {
+ '-': DataStream('stdin','-',codecs.open(tmpfile.name,'rb',encoding='utf-8'))
+ }
+ q = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=data_streams_dict)
+ r = q.execute('select * from -')
+
+ self.assertTrue(r.status == 'ok')
+ self.assertEqual(len(r.warnings),0)
+ self.assertEqual(len(r.data),2)
+ self.assertEqual(r.metadata.output_column_name_list,['a','b','c'])
+ self.assertEqual(r.data,[(1,2,3),(4,5,6)])
+ self.assertEqual(r.metadata.new_table_structures['-'],r.metadata.table_structures['-'])
+ self.assertEqual(r.metadata.table_structures['-'].column_names,['a','b','c'])
+ self.assertEqual(r.metadata.table_structures['-'].python_column_types,[int,int,int])
+ self.assertEqual(r.metadata.table_structures['-'].sqlite_column_types,['int','int','int'])
+ self.assertEqual(r.metadata.table_structures['-'].source_type,'data-stream')
+ self.assertEqual(r.metadata.table_structures['-'].source,'stdin')
+
+ q.done()
+ self.cleanup(tmpfile)
+
+ def test_named_stdin_injection(self):
+ tmpfile = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+
+ data_streams_dict = {
+ 'my_stdin_data': DataStream('my_stdin_data','my_stdin_data',codecs.open(tmpfile.name,'rb',encoding='utf-8'))
+ }
+
+ q = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=data_streams_dict)
+ r = q.execute('select a from my_stdin_data')
+
+ self.assertTrue(r.status == 'ok')
+ self.assertEqual(len(r.warnings),0)
+ self.assertEqual(len(r.data),2)
+ self.assertEqual(r.metadata.output_column_name_list,['a'])
+ self.assertEqual(r.data,[(1,),(4,)])
+ self.assertTrue('my_stdin_data' in r.metadata.table_structures)
+ self.assertTrue('my_stdin_data' in r.metadata.new_table_structures)
+ self.assertEqual(r.metadata.table_structures['my_stdin_data'].qtable_name,'my_stdin_data')
+
+ q.done()
+ self.cleanup(tmpfile)
+
+ def test_data_stream_isolation(self):
+ tmpfile1 = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+ tmpfile2 = self.create_file_with_data(b("d e f\n7 8 9\n10 11 12"))
+
+ data_streams_dict = {
+ 'a-': DataStream('a-','a-',codecs.open(tmpfile1.name, 'rb', encoding='utf-8')),
+ 'b-': DataStream('b-','b-',codecs.open(tmpfile2.name, 'rb', encoding='utf-8'))
+ }
+
+ q = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=data_streams_dict)
+ r1 = q.execute('select * from a-')
+
+ self.assertTrue(r1.status == 'ok')
+ self.assertEqual(len(r1.warnings),0)
+ self.assertEqual(len(r1.data),2)
+ self.assertEqual(r1.metadata.output_column_name_list,['a','b','c'])
+ self.assertEqual(r1.data,[(1,2,3),(4,5,6)])
+ self.assertTrue('a-' in r1.metadata.table_structures)
+ self.assertEqual(len(r1.metadata.table_structures),1)
+ self.assertEqual(r1.metadata.table_structures['a-'].source_type, 'data-stream')
+ self.assertEqual(r1.metadata.table_structures['a-'].source, 'a-')
+ self.assertEqual(r1.metadata.table_structures['a-'].column_names, ['a','b','c'])
+ self.assertEqual(r1.metadata.table_structures['a-'].python_column_types, [int,int,int])
+ self.assertEqual(r1.metadata.table_structures['a-'].sqlite_column_types, ['int','int','int'])
+
+ r2 = q.execute('select * from b-')
+
+ self.assertTrue(r2.status == 'ok')
+ self.assertEqual(len(r2.warnings),0)
+ self.assertEqual(len(r2.data),2)
+ self.assertEqual(r2.metadata.output_column_name_list,['d','e','f'])
+ self.assertEqual(r2.data,[(7,8,9),(10,11,12)])
+
+ self.assertEqual(len(r1.metadata.table_structures),2)
+ self.assertTrue('b-' in r1.metadata.table_structures)
+ self.assertEqual(r1.metadata.table_structures['b-'].source_type, 'data-stream')
+ self.assertEqual(r1.metadata.table_structures['b-'].source, 'b-')
+ self.assertEqual(r1.metadata.table_structures['b-'].column_names, ['d','e','f'])
+ self.assertEqual(r1.metadata.table_structures['b-'].python_column_types, [int,int,int])
+ self.assertEqual(r1.metadata.table_structures['b-'].sqlite_column_types, ['int','int','int'])
+
+ q.done()
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
+
+ def test_multiple_stdin_injection(self):
+ tmpfile1 = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+ tmpfile2 = self.create_file_with_data(b("d e f\n7 8 9\n10 11 12"))
+
+ data_streams_dict = {
+ 'my_stdin_data1': DataStream('my_stdin_data1','my_stdin_data1',codecs.open(tmpfile1.name,'rb',encoding='utf-8')),
+ 'my_stdin_data2': DataStream('my_stdin_data2','my_stdin_data2',codecs.open(tmpfile2.name,'rb',encoding='utf-8'))
+ }
+ q = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=data_streams_dict)
+ r1 = q.execute('select * from my_stdin_data1')
+
+ self.assertTrue(r1.status == 'ok')
+ self.assertEqual(len(r1.warnings),0)
+ self.assertEqual(len(r1.data),2)
+ self.assertEqual(r1.metadata.output_column_name_list,['a','b','c'])
+ self.assertEqual(r1.data,[(1,2,3),(4,5,6)])
+ self.assertTrue('my_stdin_data1' in r1.metadata.table_structures)
+ self.assertTrue('my_stdin_data1' in r1.metadata.new_table_structures)
+ self.assertEqual(r1.metadata.table_structures['my_stdin_data1'].qtable_name,'my_stdin_data1')
+
+ r2 = q.execute('select * from my_stdin_data2')
+
+ self.assertTrue(r2.status == 'ok')
+ self.assertEqual(len(r2.warnings),0)
+ self.assertEqual(len(r2.data),2)
+ self.assertEqual(r2.metadata.output_column_name_list,['d','e','f'])
+ self.assertEqual(r2.data,[(7,8,9),(10,11,12)])
+ # There should be another data load, even though it's the same 'filename' as before
+ self.assertTrue('my_stdin_data2' in r2.metadata.table_structures)
+ self.assertTrue('my_stdin_data2' in r2.metadata.new_table_structures)
+ self.assertEqual(r2.metadata.table_structures['my_stdin_data2'].qtable_name,'my_stdin_data2')
+
+ r3 = q.execute('select aa.*,bb.* from my_stdin_data1 aa join my_stdin_data2 bb')
+
+ self.assertTrue(r3.status == 'ok')
+ self.assertEqual(len(r3.warnings),0)
+ self.assertEqual(len(r3.data),4)
+ self.assertEqual(r3.metadata.output_column_name_list,['a','b','c','d','e','f'])
+ self.assertEqual(r3.data,[(1,2,3,7,8,9),(1,2,3,10,11,12),(4,5,6,7,8,9),(4,5,6,10,11,12)])
+ self.assertTrue('my_stdin_data1' in r3.metadata.table_structures)
+ self.assertTrue('my_stdin_data1' not in r3.metadata.new_table_structures)
+
+ q.done()
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
+
+ def test_different_input_params_for_different_files(self):
+ tmpfile1 = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+ tmpfile2 = self.create_file_with_data(b("7\t8\t9\n10\t11\t12"))
+
+ q = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
+
+ q.load_data(tmpfile1.name,QInputParams(skip_header=True,delimiter=' '))
+ q.load_data(tmpfile2.name,QInputParams(skip_header=False,delimiter='\t'))
+
+ r = q.execute('select aa.*,bb.* from %s aa join %s bb' % (tmpfile1.name,tmpfile2.name))
+
+ self.assertTrue(r.status == 'ok')
+ self.assertEqual(len(r.warnings),0)
+ self.assertEqual(len(r.data),4)
+ self.assertEqual(r.metadata.output_column_name_list,['a','b','c','c1','c2','c3'])
+ self.assertEqual(r.data,[(1,2,3,7,8,9),(1,2,3,10,11,12),(4,5,6,7,8,9),(4,5,6,10,11,12)])
+ self.assertTrue(tmpfile1.name not in r.metadata.new_table_structures)
+ self.assertTrue(tmpfile2.name not in r.metadata.new_table_structures)
+
+ q.done()
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
+
+ def test_different_input_params_for_different_files_2(self):
+ tmpfile1 = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+ tmpfile2 = self.create_file_with_data(b("7\t8\t9\n10\t11\t12"))
+
+ q = QTextAsData()
+
+ q.load_data(tmpfile1.name,QInputParams(skip_header=True,delimiter=' '))
+ q.load_data(tmpfile2.name,QInputParams(skip_header=False,delimiter='\t'))
+
+ r = q.execute('select aa.*,bb.* from %s aa join %s bb' % (tmpfile1.name,tmpfile2.name))
+
+ self.assertTrue(r.status == 'ok')
+ self.assertEqual(len(r.warnings),0)
+ self.assertEqual(len(r.data),4)
+ self.assertEqual(r.metadata.output_column_name_list,['a','b','c','c1','c2','c3'])
+ self.assertEqual(r.data,[(1,2,3,7,8,9),(1,2,3,10,11,12),(4,5,6,7,8,9),(4,5,6,10,11,12)])
+ self.assertTrue(tmpfile1.name not in r.metadata.new_table_structures)
+ self.assertTrue(tmpfile2.name not in r.metadata.new_table_structures)
+
+ q.done()
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
+
+ def test_input_params_override(self):
+ tmpfile = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+
+ default_input_params = QInputParams()
+
+ for k in default_input_params.__dict__.keys():
+ setattr(default_input_params,k,'GARBAGE')
+
+ q = QTextAsData(default_input_params)
+
+ r = q.execute('select * from %s' % tmpfile.name)
+
+ self.assertTrue(r.status == 'error')
+
+ overwriting_input_params = QInputParams(skip_header=True,delimiter=' ')
+
+ r2 = q.execute('select * from %s' % tmpfile.name,input_params=overwriting_input_params)
+
+ self.assertTrue(r2.status == 'ok')
+ self.assertEqual(len(r2.warnings),0)
+ self.assertEqual(len(r2.data),2)
+ self.assertEqual(r2.metadata.output_column_name_list,['a','b','c'])
+ self.assertEqual(r2.data,[(1,2,3),(4,5,6)])
+ self.assertTrue(tmpfile.name in r2.metadata.table_structures)
+ self.assertTrue(tmpfile.name in r2.metadata.new_table_structures)
+ self.assertEqual(r2.metadata.table_structures[tmpfile.name].atomic_fns,[tmpfile.name])
+ self.assertEqual(r2.metadata.table_structures[tmpfile.name].source,tmpfile.name)
+ self.assertEqual(r2.metadata.table_structures[tmpfile.name].source_type,'file')
+
+ q.done()
+ self.cleanup(tmpfile)
+
+ def test_input_params_merge(self):
+ input_params = QInputParams()
+
+ for k in input_params.__dict__.keys():
+ setattr(input_params,k,'GARBAGE')
+
+ merged_input_params = input_params.merged_with(QInputParams())
+
+ for k in merged_input_params.__dict__.keys():
+ self.assertTrue(getattr(merged_input_params,k) != 'GARBAGE')
+
+ for k in input_params.__dict__.keys():
+ self.assertTrue(getattr(merged_input_params,k) != 'GARBAGE')
+
+ def test_table_analysis_with_syntax_error(self):
+
+ q = QTextAsData()
+
+ q_output = q.analyze("bad syntax")
+
+ q.done()
+ self.assertTrue(q_output.status == 'error')
+ self.assertTrue(q_output.error.msg.startswith('query error'))
+
+ def test_execute_response(self):
+ tmpfile = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+
+ q = QTextAsData()
+
+ q_output = q.execute("select a,c from %s" % tmpfile.name,QInputParams(skip_header=True))
+
+ self.assertTrue(q_output.status == 'ok')
+ self.assertTrue(q_output.error is None)
+ self.assertEqual(len(q_output.warnings),0)
+ self.assertEqual(len(q_output.data),2)
+ self.assertEqual(q_output.data,[ (1,3),(4,6) ])
+ self.assertTrue(q_output.metadata is not None)
+
+ metadata = q_output.metadata
+
+ self.assertEqual(metadata.output_column_name_list, [ 'a','c'])
+ self.assertTrue(tmpfile.name in metadata.new_table_structures)
+ self.assertEqual(len(metadata.table_structures),1)
+
+ table_structure = metadata.new_table_structures[tmpfile.name]
+
+ self.assertEqual(table_structure.column_names,[ 'a','b','c'])
+ self.assertEqual(table_structure.python_column_types,[ int,int,int])
+ self.assertEqual(table_structure.sqlite_column_types,[ 'int','int','int'])
+ self.assertEqual(table_structure.qtable_name, tmpfile.name)
+ self.assertEqual(table_structure.atomic_fns,[tmpfile.name])
+ self.assertEqual(table_structure.source_type,'file')
+ self.assertEqual(table_structure.source,tmpfile.name)
+
+ q.done()
+ self.cleanup(tmpfile)
+
+ def test_analyze_response(self):
+ tmpfile = self.create_file_with_data(b("a b c\n1 2 3\n4 5 6"))
+
+ q = QTextAsData()
+
+ q_output = q.analyze("select a,c from %s" % tmpfile.name,QInputParams(skip_header=True))
+
+ self.assertTrue(q_output.status == 'ok')
+ self.assertTrue(q_output.error is None)
+ self.assertEqual(len(q_output.warnings),0)
+ self.assertEqual(len(q_output.data),2)
+ self.assertEqual(q_output.data,[ (1,3),(4,6) ])
+ self.assertTrue(q_output.metadata is not None)
+
+ metadata = q_output.metadata
+
+ self.assertEqual(metadata.output_column_name_list, [ 'a','c'])
+ self.assertEqual(len(metadata.table_structures),1)
+ self.assertTrue(tmpfile.name in metadata.new_table_structures)
+
+ table_structure = metadata.table_structures[tmpfile.name]
+
+ self.assertEqual(table_structure.column_names,[ 'a','b','c'])
+ self.assertEqual(table_structure.python_column_types,[ int,int,int])
+ self.assertEqual(table_structure.sqlite_column_types,[ 'int','int','int'])
+ self.assertEqual(table_structure.qtable_name, tmpfile.name)
+ self.assertEqual(table_structure.atomic_fns,[tmpfile.name])
+ self.assertEqual(table_structure.source_type,'file')
+ self.assertEqual(table_structure.source,tmpfile.name)
+
+ q.done()
+ self.cleanup(tmpfile)
+
+ def test_load_data_from_string_without_previous_data_load(self):
+ input_str = 'column1,column2,column3\n' + '\n'.join(['value1,2.5,value3'] * 1000)
+
+
+ data_streams_dict = {
+ 'my_data': DataStream('my_data_stream_id','my_data',StringIO(input_str))
+ }
+ q = QTextAsData(default_input_params=QInputParams(skip_header=True,delimiter=','),data_streams_dict=data_streams_dict)
+
+ q_output = q.execute('select column2,column3 from my_data')
+
+ self.assertTrue(q_output.status == 'ok')
+ self.assertTrue(q_output.error is None)
+ self.assertEqual(len(q_output.warnings),0)
+ self.assertEqual(len(q_output.data),1000)
+ self.assertEqual(len(set(q_output.data)),1)
+ self.assertEqual(list(set(q_output.data))[0],(2.5,'value3'))
+
+ metadata = q_output.metadata
+
+ self.assertEqual(metadata.output_column_name_list,['column2','column3'])
+ self.assertTrue('my_data' in metadata.new_table_structures)
+ self.assertEqual(len(metadata.table_structures),1)
+
+ table_structure = metadata.table_structures['my_data']
+
+ self.assertEqual(table_structure.column_names,['column1','column2','column3'])
+ self.assertEqual(table_structure.sqlite_column_types,['text','real','text'])
+ self.assertEqual(table_structure.python_column_types,[str,float,str])
+ self.assertEqual(table_structure.qtable_name, 'my_data')
+ self.assertEqual(table_structure.source_type, 'data-stream')
+ self.assertEqual(table_structure.source, 'my_data_stream_id')
+
+ q.done()
+
+ def test_load_data_from_string_with_previous_data_load(self):
+ input_str = 'column1,column2,column3\n' + '\n'.join(['value1,2.5,value3'] * 1000)
+
+ data_streams_dict = {
+ 'my_data': DataStream('a','my_data',StringIO(input_str))
+ }
+ q = QTextAsData(default_input_params=QInputParams(skip_header=True,delimiter=','),data_streams_dict=data_streams_dict)
+
+ dl = q.load_data('my_data',QInputParams(skip_header=True,delimiter=','))
+
+ q_output = q.execute('select column2,column3 from my_data')
+
+ self.assertTrue(q_output.status == 'ok')
+ self.assertTrue(q_output.error is None)
+ self.assertEqual(len(q_output.warnings),0)
+ self.assertEqual(len(q_output.data),1000)
+ self.assertEqual(len(set(q_output.data)),1)
+ self.assertEqual(list(set(q_output.data))[0],(2.5,'value3'))
+
+ metadata = q_output.metadata
+
+ self.assertEqual(metadata.output_column_name_list,['column2','column3'])
+ self.assertTrue('my_data' not in metadata.new_table_structures)
+ self.assertEqual(len(metadata.table_structures),1)
+
+ table_structure = metadata.table_structures['my_data']
+
+ self.assertEqual(table_structure.column_names,['column1','column2','column3'])
+ self.assertEqual(table_structure.sqlite_column_types,['text','real','text'])
+ self.assertEqual(table_structure.python_column_types,[str,float,str])
+ self.assertEqual(table_structure.qtable_name, 'my_data')
+
+ q.done()
\ No newline at end of file
diff --git a/test/BasicTests.py b/test/BasicTests.py
new file mode 100644
index 00000000..79885f65
--- /dev/null
+++ b/test/BasicTests.py
@@ -0,0 +1,654 @@
+import os
+
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command, DEBUG, b
+from test.test_data import (uneven_ls_output, find_output, header_row, sample_data_rows,
+ sample_data_rows_with_empty_string, sample_data_no_header,
+ sample_data_with_empty_string_no_header, sample_data_with_header,
+ sample_data_with_missing_header_names, generate_sample_data_with_header,
+ sample_quoted_data, double_double_quoted_data, escaped_double_quoted_data,
+ combined_quoted_data, sample_quoted_data2, sample_quoted_data2_with_newline,
+ one_column_data, sample_data_rows_with_spaces, sample_data_with_spaces_no_header,
+ header_row_with_spaces, sample_data_with_spaces_with_header, long_value1,
+ int_value, sample_data_with_long_values)
+
+class BasicTests(AbstractQTestCase):
+
+ def test_basic_aggregation(self):
+ retcode, o, e = run_command(
+ 'seq 1 10 | ' + Q_EXECUTABLE + ' "select sum(c1),avg(c1) from -"')
+ self.assertTrue(retcode == 0)
+ self.assertTrue(len(o) == 1)
+ self.assertTrue(len(e) == 0)
+
+ s = sum(range(1, 11))
+ self.assertTrue(o[0] == b('%s %s' % (s, s / 10.0)))
+
+ def test_select_one_column(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+
+ cmd = Q_EXECUTABLE + ' -d , "select c1 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(b(" ").join(o), b('a b c'))
+
+ self.cleanup(tmpfile)
+
+ def test_column_separation(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], sample_data_rows[0])
+ self.assertEqual(o[1], sample_data_rows[1])
+ self.assertEqual(o[2], sample_data_rows[2])
+
+ self.cleanup(tmpfile)
+
+ def test_header_exception_on_numeric_header_data(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s" -A -H' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 3)
+ self.assertTrue(
+ b('Bad header row: Header must contain only strings') in e[0])
+ self.assertTrue(b("Column name must be a string") in e[1])
+ self.assertTrue(b("Column name must be a string") in e[2])
+
+ self.cleanup(tmpfile)
+
+ def test_different_header_in_second_file(self):
+ folder_name = self.create_folder_with_files({
+ 'file1': self.arrays_to_csv_file_content(b(','),[b('a'),b('b')],[[b(str(x)),b(str(x))] for x in range(1,6)]),
+ 'file2': self.arrays_to_csv_file_content(b(','),[b('c'),b('d')],[[b(str(x)),b(str(x))] for x in range(1,6)])
+ },prefix="xx",suffix="aa")
+
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s/*" -H' % (folder_name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 35)
+ self.assertEqual(len(e),1)
+ self.assertEqual(e[0],b("Bad header row: Extra header 'c,d' in file '%s/file2' mismatches original header 'a,b' from file '%s/file1'. Table name is '%s/*'" % (folder_name,folder_name,folder_name)))
+
+ def test_data_with_header(self):
+ tmpfile = self.create_file_with_data(sample_data_with_header)
+ cmd = Q_EXECUTABLE + ' -d , "select name from %s" -H' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(b(" ").join(o), b("a b c"))
+
+ self.cleanup(tmpfile)
+
+ def test_output_header_when_input_header_exists(self):
+ tmpfile = self.create_file_with_data(sample_data_with_header)
+ cmd = Q_EXECUTABLE + ' -d , "select name from %s" -H -O' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 4)
+ self.assertEqual(o[0],b('name'))
+ self.assertEqual(o[1],b('a'))
+ self.assertEqual(o[2],b('b'))
+ self.assertEqual(o[3],b('c'))
+
+ self.cleanup(tmpfile)
+
+ def test_generated_column_name_warning_when_header_line_exists(self):
+ tmpfile = self.create_file_with_data(sample_data_with_header)
+ cmd = Q_EXECUTABLE + ' -d , "select c3 from %s" -H' % tmpfile.name
+
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 2)
+ self.assertTrue(b('no such column: c3') in e[0])
+ self.assertTrue(
+ e[1].startswith(b('Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names')))
+
+ self.cleanup(tmpfile)
+
+ def test_empty_data(self):
+ tmpfile = self.create_file_with_data(b(''))
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ self.assertTrue(b('Warning - data is empty') in e[0])
+
+ self.cleanup(tmpfile)
+
+ def test_empty_data_with_header_param(self):
+ tmpfile = self.create_file_with_data(b(''))
+ cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -H' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ m = b("Header line is expected but missing in file %s" % tmpfile.name)
+ self.assertTrue(m in e[0])
+
+ self.cleanup(tmpfile)
+
+ def test_one_row_of_data_without_header_param(self):
+ tmpfile = self.create_file_with_data(header_row)
+ cmd = Q_EXECUTABLE + ' -d , "select c2 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('value1'))
+
+ self.cleanup(tmpfile)
+
+ def test_one_row_of_data_with_header_param(self):
+ tmpfile = self.create_file_with_data(header_row)
+ cmd = Q_EXECUTABLE + ' -d , "select name from %s" -H' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ self.assertTrue(b('Warning - data is empty') in e[0])
+
+ self.cleanup(tmpfile)
+
+ def test_dont_leading_keep_whitespace_in_values(self):
+ tmpfile = self.create_file_with_data(sample_data_with_spaces_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select c1 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 3)
+
+ self.assertEqual(o[0], b('a'))
+ self.assertEqual(o[1], b('b'))
+ self.assertEqual(o[2], b('c'))
+
+ self.cleanup(tmpfile)
+
+ def test_keep_leading_whitespace_in_values(self):
+ tmpfile = self.create_file_with_data(sample_data_with_spaces_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -k' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 3)
+
+ self.assertEqual(o[0], b('a'))
+ self.assertEqual(o[1], b(' b'))
+ self.assertEqual(o[2], b('c'))
+
+ self.cleanup(tmpfile)
+
+ def test_no_impact_of_keeping_leading_whitespace_on_integers(self):
+ tmpfile = self.create_file_with_data(sample_data_with_spaces_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select c2 from %s" -k -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ f = open("/var/tmp/XXX","wb")
+ f.write(b("\n").join(o))
+ f.write(b("STDERR:"))
+ f.write(b("\n").join(e))
+ f.close()
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 7)
+
+
+ self.assertEqual(o[0], b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1], b(' Sources:'))
+ self.assertEqual(o[2], b(' source_type: file source: %s') % b(tmpfile.name))
+ self.assertEqual(o[3], b(' Fields:'))
+ self.assertEqual(o[4], b(' `c1` - text'))
+ self.assertEqual(o[5], b(' `c2` - int'))
+ self.assertEqual(o[6], b(' `c3` - int'))
+
+
+ self.cleanup(tmpfile)
+
+ def test_spaces_in_header_row(self):
+ tmpfile = self.create_file_with_data(
+ header_row_with_spaces + b("\n") + sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select name,\\`value 1\\` from %s" -H' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 3)
+
+ self.assertEqual(o[0], b('a,1'))
+ self.assertEqual(o[1], b('b,2'))
+ self.assertEqual(o[2], b('c,'))
+
+ self.cleanup(tmpfile)
+
+ def test_no_query_in_command_line(self):
+ cmd = Q_EXECUTABLE + ' -d , ""'
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 1)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(len(o), 0)
+
+ self.assertEqual(e[0],b('Query cannot be empty (query number 1)'))
+
+ def test_empty_query_in_command_line(self):
+ cmd = Q_EXECUTABLE + ' -d , " "'
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 1)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(len(o), 0)
+
+ self.assertEqual(e[0],b('Query cannot be empty (query number 1)'))
+
+ def test_failure_in_query_stops_processing_queries(self):
+ cmd = Q_EXECUTABLE + ' -d , "select 500" "select 300" "wrong-query" "select 8000"'
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 1)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(o[0],b('500'))
+ self.assertEqual(o[1],b('300'))
+
+ def test_multiple_queries_in_command_line(self):
+ cmd = Q_EXECUTABLE + ' -d , "select 500" "select 300+100" "select 300" "select 200"'
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 4)
+
+ self.assertEqual(o[0],b('500'))
+ self.assertEqual(o[1],b('400'))
+ self.assertEqual(o[2],b('300'))
+ self.assertEqual(o[3],b('200'))
+
+ def test_literal_calculation_query(self):
+ cmd = Q_EXECUTABLE + ' -d , "select 1+40/6"'
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 1)
+
+ self.assertEqual(o[0],b('7'))
+
+ def test_literal_calculation_query_float_result(self):
+ cmd = Q_EXECUTABLE + ' -d , "select 1+40/6.0"'
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 1)
+
+ self.assertEqual(o[0],b('7.666666666666667'))
+
+ def test_use_query_file(self):
+ tmp_data_file = self.create_file_with_data(sample_data_with_header)
+ tmp_query_file = self.create_file_with_data(b("select name from %s" % tmp_data_file.name))
+
+ cmd = Q_EXECUTABLE + ' -d , -q %s -H' % tmp_query_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 3)
+
+ self.assertEqual(o[0], b('a'))
+ self.assertEqual(o[1], b('b'))
+ self.assertEqual(o[2], b('c'))
+
+ self.cleanup(tmp_data_file)
+ self.cleanup(tmp_query_file)
+
+ def test_use_query_file_with_incorrect_query_encoding(self):
+ tmp_data_file = self.create_file_with_data(sample_data_with_header)
+ tmp_query_file = self.create_file_with_data(b("select name,'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)
+
+ cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q ascii' % tmp_query_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,3)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+
+ self.assertTrue(e[0].startswith(b('Could not decode query number 1 using the provided query encoding (ascii)')))
+
+ self.cleanup(tmp_data_file)
+ self.cleanup(tmp_query_file)
+
+ def test_output_header_with_non_ascii_names(self):
+ OUTPUT_ENCODING = 'utf-8'
+
+ tmp_data_file = self.create_file_with_data(sample_data_with_header)
+ tmp_query_file = self.create_file_with_data(b("select name,'Hr\xc3\xa1\xc4\x8d' Hr\xc3\xa1\xc4\x8d from %s" % tmp_data_file.name),encoding=None)
+
+ cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -O -E %s' % (tmp_query_file.name,OUTPUT_ENCODING)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),4)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(o[0].decode(OUTPUT_ENCODING), u'name,Hr\xe1\u010d')
+ self.assertEqual(o[1].decode(OUTPUT_ENCODING), u'a,Hr\xe1\u010d')
+ self.assertEqual(o[2].decode(OUTPUT_ENCODING), u'b,Hr\xe1\u010d')
+ self.assertEqual(o[3].decode(OUTPUT_ENCODING), u'c,Hr\xe1\u010d')
+
+ self.cleanup(tmp_data_file)
+ self.cleanup(tmp_query_file)
+
+ def test_use_query_file_with_query_encoding(self):
+ OUTPUT_ENCODING = 'utf-8'
+
+ tmp_data_file = self.create_file_with_data(sample_data_with_header)
+ tmp_query_file = self.create_file_with_data(b("select name,'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)
+
+ cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -E %s' % (tmp_query_file.name,OUTPUT_ENCODING)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 3)
+
+ self.assertEqual(o[0].decode(OUTPUT_ENCODING), u'a,Hr\xe1\u010d')
+ self.assertEqual(o[1].decode(OUTPUT_ENCODING), u'b,Hr\xe1\u010d')
+ self.assertEqual(o[2].decode(OUTPUT_ENCODING), u'c,Hr\xe1\u010d')
+
+ self.cleanup(tmp_data_file)
+ self.cleanup(tmp_query_file)
+
+ def test_use_query_file_and_command_line(self):
+ tmp_data_file = self.create_file_with_data(sample_data_with_header)
+ tmp_query_file = self.create_file_with_data(b("select name from %s" % tmp_data_file.name))
+
+ cmd = Q_EXECUTABLE + ' -d , -q %s -H "select * from ppp"' % tmp_query_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 1)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(len(o), 0)
+
+ self.assertTrue(e[0].startswith(b("Can't provide both a query file and a query on the command line")))
+
+ self.cleanup(tmp_data_file)
+ self.cleanup(tmp_query_file)
+
+ def test_select_output_encoding(self):
+ tmp_data_file = self.create_file_with_data(sample_data_with_header)
+ tmp_query_file = self.create_file_with_data(b("select 'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)
+
+ for target_encoding in ['utf-8','ibm852']:
+ cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -E %s' % (tmp_query_file.name,target_encoding)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 3)
+
+ self.assertEqual(o[0].decode(target_encoding), u'Hr\xe1\u010d')
+ self.assertEqual(o[1].decode(target_encoding), u'Hr\xe1\u010d')
+ self.assertEqual(o[2].decode(target_encoding), u'Hr\xe1\u010d')
+
+ self.cleanup(tmp_data_file)
+ self.cleanup(tmp_query_file)
+
+ def test_select_failed_output_encoding(self):
+ tmp_data_file = self.create_file_with_data(sample_data_with_header)
+ tmp_query_file = self.create_file_with_data(b("select 'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)
+
+ cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -E ascii' % tmp_query_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 3)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(len(o), 0)
+
+ self.assertTrue(e[0].startswith(b('Cannot encode data')))
+
+ self.cleanup(tmp_data_file)
+ self.cleanup(tmp_query_file)
+
+
+ def test_use_query_file_with_empty_query(self):
+ tmp_query_file = self.create_file_with_data(b(" "))
+
+ cmd = Q_EXECUTABLE + ' -d , -q %s -H' % tmp_query_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 1)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(len(o), 0)
+
+ self.assertTrue(e[0].startswith(b("Query cannot be empty")))
+
+ self.cleanup(tmp_query_file)
+
+ def test_use_non_existent_query_file(self):
+ cmd = Q_EXECUTABLE + ' -d , -q non-existent-query-file -H'
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 1)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(len(o), 0)
+
+ self.assertTrue(e[0].startswith(b("Could not read query from file")))
+
+ def test_nonexistent_file(self):
+ cmd = Q_EXECUTABLE + ' "select * from non-existent-file"'
+
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode,0)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+
+ self.assertEqual(e[0],b("No files matching '%s/non-existent-file' have been found" % os.getcwd()))
+
+ def test_default_column_max_length_parameter__short_enough(self):
+ huge_text = b("x" * 131000)
+
+ file_data = b("a,b,c\n1,{},3\n".format(huge_text))
+
+ tmpfile = self.create_file_with_data(file_data)
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b('1'))
+
+ self.cleanup(tmpfile)
+
+ def test_default_column_max_length_parameter__too_long(self):
+ huge_text = b("x") * 132000
+
+ file_data = b("a,b,c\n1,{},3\n".format(huge_text))
+
+ tmpfile = self.create_file_with_data(file_data)
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 31)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ self.assertTrue(e[0].startswith(b("Column length is larger than the maximum")))
+ self.assertTrue(b("Offending file is '{}'".format(tmpfile.name)) in e[0])
+ self.assertTrue(b('Line is 2') in e[0])
+
+ self.cleanup(tmpfile)
+
+ def test_column_max_length_parameter(self):
+ file_data = b("a,b,c\nvery-long-text,2,3\n")
+ tmpfile = self.create_file_with_data(file_data)
+
+ cmd = Q_EXECUTABLE + ' -H -d , -M 3 "select a from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 31)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ self.assertTrue(e[0].startswith(b("Column length is larger than the maximum")))
+ self.assertTrue((b("Offending file is '%s'" % tmpfile.name)) in e[0])
+ self.assertTrue(b('Line is 2') in e[0])
+
+ cmd2 = Q_EXECUTABLE + ' -H -d , -M 300 "select a from %s"' % tmpfile.name
+ retcode2, o2, e2 = run_command(cmd2)
+
+ self.assertEqual(retcode2, 0)
+ self.assertEqual(len(o2), 1)
+ self.assertEqual(len(e2), 0)
+
+ self.assertEqual(o2[0],b('very-long-text'))
+
+ self.cleanup(tmpfile)
+
+ def test_invalid_column_max_length_parameter(self):
+ file_data = b("a,b,c\nvery-long-text,2,3\n")
+ tmpfile = self.create_file_with_data(file_data)
+
+ cmd = Q_EXECUTABLE + ' -H -d , -M xx "select a from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 31)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ self.assertEqual(e[0],b('Max column length limit must be an integer larger than 2 (xx)'))
+
+ self.cleanup(tmpfile)
+
+ def test_duplicate_column_name_detection(self):
+ file_data = b("a,b,a\n10,20,30\n30,40,50")
+ tmpfile = self.create_file_with_data(file_data)
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 35)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 2)
+
+ self.assertTrue(e[0].startswith(b('Bad header row:')))
+ self.assertEqual(e[1],b("'a': Column name is duplicated"))
+
+ self.cleanup(tmpfile)
+
+ def test_join_with_stdin(self):
+ x = [b(a) for a in map(str,range(1,101))]
+ large_file_data = b("val\n") + b("\n").join(x)
+ tmpfile = self.create_file_with_data(large_file_data)
+
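+ # stdin provides an 'id' header followed by the odd numbers 1,3,5,7,9 (seq 1 2 10),
+ # and is joined as table '-' (aliased stdin) against the file on id*10 = val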
+ cmd = '(echo id ; seq 1 2 10) | %s -c 1 -H -O "select stdin.*,f.* from - stdin left join %s f on (stdin.id * 10 = f.val)"' % (Q_EXECUTABLE,tmpfile.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 6)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b('id val'))
+ self.assertEqual(o[1],b('1 10'))
+ self.assertEqual(o[2],b('3 30'))
+ self.assertEqual(o[3],b('5 50'))
+ self.assertEqual(o[4],b('7 70'))
+ self.assertEqual(o[5],b('9 90'))
+
+ self.cleanup(tmpfile)
+
+ def test_concatenated_files(self):
+ file_data1 = b("a,b,c\n10,11,12\n20,21,22")
+ tmpfile1 = self.create_file_with_data(file_data1)
+ tmpfile1_folder = os.path.dirname(tmpfile1.name)
+ tmpfile1_filename = os.path.basename(tmpfile1.name)
+ expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
+
+ file_data2 = b("a,b,c\n30,31,32\n40,41,42")
+ tmpfile2 = self.create_file_with_data(file_data2)
+ tmpfile2_folder = os.path.dirname(tmpfile2.name)
+ tmpfile2_filename = os.path.basename(tmpfile2.name)
+ expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -O -H -d , "select * from %s UNION ALL select * from %s" -C none' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 5)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('a,b,c'))
+ self.assertEqual(o[1],b('10,11,12'))
+ self.assertEqual(o[2],b('20,21,22'))
+ self.assertEqual(o[3],b('30,31,32'))
+ self.assertEqual(o[4],b('40,41,42'))
+
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
+
+ def test_out_of_range_expected_column_count(self):
+ cmd = '%s "select count(*) from some_table" -c -1' % Q_EXECUTABLE
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 90)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0], b('Column count must be between 1 and 131072'))
+
+ def test_out_of_range_expected_column_count__with_explicit_limit(self):
+ cmd = '%s "select count(*) from some_table" -c -1 -M 100' % Q_EXECUTABLE
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 90)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0], b('Column count must be between 1 and 100'))
+
+ def test_other_out_of_range_expected_column_count__with_explicit_limit(self):
+ cmd = '%s "select count(*) from some_table" -c 101 -M 100' % Q_EXECUTABLE
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 90)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0], b('Column count must be between 1 and 100'))
+
+ def test_explicit_limit_of_columns__data_is_ok(self):
+ file_data1 = b("191\n192\n")
+ tmpfile1 = self.create_file_with_data(file_data1)
+
+ cmd = '%s "select count(*) from %s" -c 1 -M 3' % (Q_EXECUTABLE,tmpfile1.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], b('2'))
+
+ self.cleanup(tmpfile1)
\ No newline at end of file
diff --git a/test/BenchmarkTests.py b/test/BenchmarkTests.py
new file mode 100644
index 00000000..46e19516
--- /dev/null
+++ b/test/BenchmarkTests.py
@@ -0,0 +1,208 @@
+import itertools
+import os
+import re
+import time
+from gzip import GzipFile
+
+import pytest
+
+from test.base import AbstractQTestCase
+from test.utils import DEBUG, Q_EXECUTABLE, b, run_command
+
+
+class BenchmarkResults(object):
+ def __init__(self, lines, columns, attempt_results, mean, stddev):
+ self.lines = lines
+ self.columns = columns
+ self.attempt_results = attempt_results
+ self.mean = mean
+ self.stddev = stddev
+
+ def __str__(self):
+ return "{}".format(self.__dict__)
+ __repr__ = __str__
+
+
+class BenchmarkAttemptResults(object):
+ def __init__(self, attempt, lines, columns, duration,return_code):
+ self.attempt = attempt
+ self.lines = lines
+ self.columns = columns
+ self.duration = duration
+ self.return_code = return_code
+
+ def __str__(self):
+ return "{}".format(self.__dict__)
+ __repr__ = __str__
+
+
+@pytest.mark.benchmark
+class BenchmarkTests(AbstractQTestCase):
+
+ BENCHMARK_DIR = os.environ.get('Q_BENCHMARK_DATA_DIR')
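+ # Q_BENCHMARK_DATA_DIR points at a scratch folder where the generated benchmark
+ # CSV files (and their .qsql caches) are kept between runs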
+
+ def _ensure_benchmark_data_dir_exists(self):
+ try:
+ os.mkdir(BenchmarkTests.BENCHMARK_DIR)
+ except OSError:
+ # The benchmark data folder already exists
+ pass
+
+ def _create_benchmark_file_if_needed(self):
+ self._ensure_benchmark_data_dir_exists()
+
+ if os.path.exists('{}/benchmark-file.csv'.format(BenchmarkTests.BENCHMARK_DIR)):
+ return
+
+ g = GzipFile('unit-file.csv.gz')
+ d = g.read().decode('utf-8')
+ f = open('{}/benchmark-file.csv'.format(BenchmarkTests.BENCHMARK_DIR), 'w')
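+ # Concatenate the decompressed unit file 100 times to create the base benchmark file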
+ for i in range(100):
+ f.write(d)
+ f.close()
+
+ def _prepare_test_file(self, lines, columns):
+
+ filename = '{}/_benchmark_data__lines_{}_columns_{}.csv'.format(BenchmarkTests.BENCHMARK_DIR,lines, columns)
+
+ if os.path.exists(filename):
+ return filename
+
+ c = ['c{}'.format(x + 1) for x in range(columns)]
+
+ # write a header line
+ ff = open(filename,'w')
+ ff.write(",".join(c))
+ ff.write('\n')
+ ff.close()
+
+ r, o, e = run_command(('head -{} {}/benchmark-file.csv | ' + Q_EXECUTABLE + ' -d , "select {} from -" >> {}').format(lines, BenchmarkTests.BENCHMARK_DIR, ','.join(c), filename))
+ self.assertEqual(r, 0)
+ # Create file cache as part of preparation
+ r, o, e = run_command(Q_EXECUTABLE + ' -C readwrite -d , "select count(*) from %s"' % filename)
+ self.assertEqual(r, 0)
+ return filename
+
+ def _decide_result(self,attempt_results):
+
+ failed = list(filter(lambda a: a.return_code != 0,attempt_results))
+
+ if len(failed) == 0:
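+ # Compute the mean and the population standard deviation (ddof=0) of the
+ # attempt durations - the attempts are treated as the full population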
+ mean = sum([x.duration for x in attempt_results]) / len(attempt_results)
+ sum_squared = sum([(x.duration - mean)**2 for x in attempt_results])
+ ddof = 0
+ pvar = sum_squared / (len(attempt_results) - ddof)
+ stddev = pvar ** 0.5
+ else:
+ mean = None
+ stddev = None
+
+ return BenchmarkResults(
+ attempt_results[0].lines,
+ attempt_results[0].columns,
+ attempt_results,
+ mean,
+ stddev
+ )
+
+ def _perform_test_performance_matrix(self,name,generate_cmd_function):
+ results = []
+
+ benchmark_results_folder = os.environ.get("Q_BENCHMARK_RESULTS_FOLDER",'')
+ if benchmark_results_folder == "":
+ raise Exception("Q_BENCHMARK_RESULTS_FOLDER must be provided as an environment variable")
+
+ self._create_benchmark_file_if_needed()
+ for columns in [1, 5, 10, 20, 50, 100]:
+ for lines in [1, 10, 100, 1000, 10000, 100000, 1000000]:
+ attempt_results = []
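+ # Each (lines, columns) combination is measured 10 times; the attempts are
+ # aggregated into mean/stddev by _decide_result below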
+ for attempt in range(10):
+ filename = self._prepare_test_file(lines, columns)
+ if DEBUG:
+ print("Testing {}".format(filename))
+ t0 = time.time()
+ r, o, e = run_command(generate_cmd_function(filename,lines,columns))
+ duration = time.time() - t0
+ attempt_result = BenchmarkAttemptResults(attempt, lines, columns, duration, r)
+ attempt_results += [attempt_result]
+ if DEBUG:
+ print("Results: {}".format(attempt_result.__dict__))
+ final_result = self._decide_result(attempt_results)
+ results += [final_result]
+
+ series_fields = ['lines','columns']
+ value_fields = ['mean','stddev']
+
+ all_fields = series_fields + value_fields
+
+ output_filename = '{}/{}.benchmark-results'.format(benchmark_results_folder,name)
+ output_file = open(output_filename,'w')
+ for columns,g in itertools.groupby(sorted(results,key=lambda x:x.columns),key=lambda x:x.columns):
+ x = "\t".join(series_fields + ['{}_{}'.format(name, f) for f in value_fields])
+ print(x,file = output_file)
+ for result in g:
+ print("\t".join(map(str,[getattr(result,f) for f in all_fields])),file=output_file)
+ output_file.close()
+
+ print("results have been written to : {}".format(output_filename))
+ if DEBUG:
+ print("RESULTS FOR {}".format(name))
+ print(open(output_filename,'r').read())
+
+ def test_q_matrix(self):
+ Q_BENCHMARK_NAME = os.environ.get('Q_BENCHMARK_NAME')
+ if Q_BENCHMARK_NAME is None:
+ raise Exception('Q_BENCHMARK_NAME must be provided as an env var')
+
+ def generate_q_cmd(data_filename, line_count, column_count):
+ Q_BENCHMARK_ADDITIONAL_PARAMS = os.environ.get('Q_BENCHMARK_ADDITIONAL_PARAMS') or ''
+ additional_params = ' ' + Q_BENCHMARK_ADDITIONAL_PARAMS
+ return '{} -d , {} "select count(*) from {}"'.format(Q_EXECUTABLE,additional_params, data_filename)
+ self._perform_test_performance_matrix(Q_BENCHMARK_NAME,generate_q_cmd)
+
+ def _get_textql_version(self):
+ r,o,e = run_command("textql --version")
+ if r != 0:
+ raise Exception("Could not find textql")
+ if len(e) != 0:
+ raise Exception("Errors while getting textql version")
+ return o[0]
+
+ def _get_octosql_version(self):
+ r,o,e = run_command("octosql --version")
+ if r != 0:
+ raise Exception("Could not find octosql")
+ if len(e) != 0:
+ raise Exception("Errors while getting octosql version")
+ version = re.findall(r'v[0-9]+\.[0-9]+\.[0-9]+', str(o[0], encoding='utf-8'))[0]
+ return version
+
+ def test_textql_matrix(self):
+ def generate_textql_cmd(data_filename,line_count,column_count):
+ return 'textql -dlm , -sql "select count(*)" {}'.format(data_filename)
+
+ name = 'textql_%s' % self._get_textql_version()
+ self._perform_test_performance_matrix(name,generate_textql_cmd)
+
+ def test_octosql_matrix(self):
+ config_fn = self.random_tmp_filename('octosql', 'config')
+ def generate_octosql_cmd(data_filename,line_count,column_count):
+ j = """
+dataSources:
+ - name: bmdata
+ type: csv
+ config:
+ path: "{}"
+ headerRow: false
+ batchSize: 10000
+""".format(data_filename)[1:]
+ f = open(config_fn,'w')
+ f.write(j)
+ f.close()
+ return 'octosql -c {} -o batch-csv "select count(*) from bmdata a"'.format(config_fn)
+
+ name = 'octosql_%s' % self._get_octosql_version()
+ self._perform_test_performance_matrix(name,generate_octosql_cmd)
\ No newline at end of file
diff --git a/test/CachingTests.py b/test/CachingTests.py
new file mode 100644
index 00000000..cd75ff17
--- /dev/null
+++ b/test/CachingTests.py
@@ -0,0 +1,457 @@
+import os
+import sqlite3
+
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, b, run_command
+
+
+class CachingTests(AbstractQTestCase):
+
+ def test_cache_empty_file(self):
+ file_data = b("a,b,c")
+ tmpfile = self.create_file_with_data(file_data)
+ tmpfile_folder = os.path.dirname(tmpfile.name)
+ tmpfile_filename = os.path.basename(tmpfile.name)
+ tmpfile_expected_table_name = os.path.basename(tmpfile.name)
+ expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C none' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b("Warning - data is empty"))
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b("Warning - data is empty"))
+
+ # After readwrite caching has been activated, the cache file is expected to exist
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ # Read the cache file directly, to make sure it's a valid sqlite file
+ db = sqlite3.connect(expected_cache_filename)
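+ # _qcatalog is the qsql metadata table - each row maps a cached source to its
+ # temp table name and content signature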
+ table_list = db.execute("select content_signature_key,temp_table_name,content_signature,creation_time,source_type,source from _qcatalog where temp_table_name == '%s'" % (tmpfile_expected_table_name)).fetchall()
+ self.assertEqual(len(table_list), 1)
+ table_metadata = table_list[0]
+ results = db.execute("select * from %s" % table_metadata[1]).fetchall()
+ self.assertEqual(len(results), 0)
+
+ self.cleanup(tmpfile)
+
+ def test_reading_the_wrong_cache__original_file_having_different_data(self):
+ file_data1 = b("a,b,c\n10,20,30\n30,40,50")
+
+ tmpfile1 = self.create_file_with_data(file_data1)
+ tmpfile1_folder = os.path.dirname(tmpfile1.name)
+ tmpfile1_filename = os.path.basename(tmpfile1.name)
+ expected_cache_filename = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], b('10'))
+ self.assertEqual(o[1], b('30'))
+
+ # Ensure cache has been created
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ # Overwrite the original file
+ file_data2 = b("a,b,c\n10,20,30\n30,40,50\n50,60,70")
+ self.write_file(tmpfile1.name,file_data2)
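+ # The .qsql cache still holds the analysis of the old two-row file, so its
+ # content signature no longer matches the file and a -C read run must fail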
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 81)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0], b('%s vs %s.qsql: Content Signatures differ at inferer.rows (actual analysis data differs)' % \
+ (tmpfile1.name,tmpfile1.name)))
+
+
+ def test_reading_the_wrong_cache__original_file_having_different_delimiter(self):
+ file_data1 = b("a,b,c\n10,20,30\n30,40,50")
+
+ tmpfile1 = self.create_file_with_data(file_data1)
+ tmpfile1_folder = os.path.dirname(tmpfile1.name)
+ tmpfile1_filename = os.path.basename(tmpfile1.name)
+ expected_cache_filename = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], b('10'))
+ self.assertEqual(o[1], b('30'))
+
+ # Ensure cache has been created
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ # Overwrite the original file
+ file_data2 = b("a\tb\tc\n10\t20\t30\n30\t40\t50")
+ self.write_file(tmpfile1.name,file_data2)
+
+ cmd = Q_EXECUTABLE + ' -H -t "select a from %s" -C read' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 80)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ x = b("%s vs %s.qsql: Content Signatures for table %s differ at input_delimiter (source value '\t' disk signature value ',')" % \
+ (tmpfile1.name,tmpfile1.name,tmpfile1.name))
+ self.assertEqual(e[0], x)
+
+ def test_rename_cache_and_read_from_it(self):
+ # create a file, along with its qsql
+ file_data1 = b("a,b,c\n10,20,30\n30,40,50")
+
+ tmpfile1 = self.create_file_with_data(file_data1)
+ tmpfile1_folder = os.path.dirname(tmpfile1.name)
+ tmpfile1_filename = os.path.basename(tmpfile1.name)
+ expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], b('10'))
+ self.assertEqual(o[1], b('30'))
+ # Ensure cache has been created
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
+ tmp_fn = self.generate_tmpfile_name("aa","qsql")
+ os.rename(expected_cache_filename1,tmp_fn)
+
+ cmd = '%s "select a from %s"' % (Q_EXECUTABLE,tmp_fn)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], b('10'))
+ self.assertEqual(o[1], b('30'))
+
+
+ def test_reading_the_wrong_cache__qsql_file_not_having_a_matching_content_signature(self):
+ # create a file, along with its qsql
+ file_data1 = b("a,b,c\n10,20,30\n30,40,50")
+
+ tmpfile1 = self.create_file_with_data(file_data1)
+ tmpfile1_folder = os.path.dirname(tmpfile1.name)
+ tmpfile1_filename = os.path.basename(tmpfile1.name)
+ expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], b('10'))
+ self.assertEqual(o[1], b('30'))
+ # Ensure cache has been created
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
+ file_data2 = b("c,d,e\n10,20,30\n30,40,50")
+
+ # create another file with a different header, along with its qsql
+ tmpfile2 = self.create_file_with_data(file_data2)
+ tmpfile2_folder = os.path.dirname(tmpfile2.name)
+ tmpfile2_filename = os.path.basename(tmpfile2.name)
+ expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select c from %s" -C readwrite' % tmpfile2.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], b('10'))
+ self.assertEqual(o[1], b('30'))
+ # Ensure cache has been created
+ self.assertTrue(os.path.exists(expected_cache_filename2))
+
+ # now take the second qsql file as if it was the first. Execution on file 1 should fail, since the qsql file
+ # does not really contain the table we're after
+
+ os.remove(expected_cache_filename1)
+ os.rename(expected_cache_filename2,expected_cache_filename1)
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 80)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ x = b("%s vs %s.qsql: Content Signatures for table %s differ at inferer.header_row (source value '['a', 'b', 'c']' disk signature value '['c', 'd', 'e']')" % (tmpfile1.name,tmpfile1.name,tmpfile1.name))
+ self.assertEqual(e[0], x)
+
+ def test_reading_the_wrong_cache__qsql_file_not_having_any_content_signature(self):
+ # create a file, along with its qsql
+ file_data1 = b("a,b,c\n10,20,30\n30,40,50")
+
+ tmpfile1 = self.create_file_with_data(file_data1)
+ tmpfile1_folder = os.path.dirname(tmpfile1.name)
+ tmpfile1_filename = os.path.basename(tmpfile1.name)
+ expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], b('10'))
+ self.assertEqual(o[1], b('30'))
+ # Ensure cache has been created
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
+ file_data2 = b("c,d,e\n10,20,30\n30,40,50")
+
+ # delete qcatalog content, so no entries will be available
+ c = sqlite3.connect(expected_cache_filename1)
+ c.execute('delete from _qcatalog')
+ c.commit()
+ c.close()
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 97)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+ self.assertEqual(e[0],b("Could not autodetect table name in qsql file. File contains no record of a table"))
+
+
+ def test_cache_full_flow(self):
+ file_data = b("a,b,c\n10,20,30\n30,40,50")
+ tmpfile = self.create_file_with_data(file_data)
+ tmpfile_folder = os.path.dirname(tmpfile.name)
+ tmpfile_filename = os.path.basename(tmpfile.name)
+ expected_tmpfile_table_name = tmpfile_filename
+ expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C none' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('10'))
+ self.assertEqual(o[1],b('30'))
+
+ # Ensure cache has not been created
+ self.assertTrue(not os.path.exists(expected_cache_filename))
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('10'))
+ self.assertEqual(o[1],b('30'))
+
+ # Ensure cache has not been created, as cache mode is "read" only
+ self.assertTrue(not os.path.exists(expected_cache_filename))
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('10'))
+ self.assertEqual(o[1],b('30'))
+
+ # After readwrite caching has been activated, the cache file is expected to exist
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ # Read the cache file directly, to make sure it's a valid sqlite file
+ db = sqlite3.connect(expected_cache_filename)
+ table_list = db.execute("select content_signature_key,temp_table_name,content_signature,creation_time,source_type,source from _qcatalog where temp_table_name == '%s'" % expected_tmpfile_table_name).fetchall()
+ self.assertEqual(len(table_list), 1)
+ table_metadata = table_list[0]
+ results = db.execute("select * from %s" % table_metadata[1]).fetchall()
+ self.assertEqual(results[0],(10,20,30))
+ self.assertEqual(results[1],(30,40,50))
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('10'))
+ self.assertEqual(o[1],b('30'))
+
+ # After readwrite caching has been activated, the cache file is expected to exist
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ self.cleanup(tmpfile)
+
+ def test_cache_full_flow_with_concatenated_files(self):
+ file_data1 = b("a,b,c\n10,11,12\n20,21,22")
+ tmpfile1 = self.create_file_with_data(file_data1)
+ tmpfile1_folder = os.path.dirname(tmpfile1.name)
+ tmpfile1_filename = os.path.basename(tmpfile1.name)
+ expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
+
+ file_data2 = b("a,b,c\n30,31,32\n40,41,42")
+ tmpfile2 = self.create_file_with_data(file_data2)
+ tmpfile2_folder = os.path.dirname(tmpfile2.name)
+ tmpfile2_filename = os.path.basename(tmpfile2.name)
+ expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -O -H -d , "select * from (select * from %s UNION ALL select * from %s)" -C readwrite' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 5)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('a,b,c'))
+ self.assertEqual(o[1],b('10,11,12'))
+ self.assertEqual(o[2],b('20,21,22'))
+ self.assertEqual(o[3],b('30,31,32'))
+ self.assertEqual(o[4],b('40,41,42'))
+
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+ self.assertTrue(os.path.exists(expected_cache_filename2))
+
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
+
+
+ def test_analyze_result_with_cache_file(self):
+ file_data = b("a,b,c\n10,20,30\n30,40,50")
+ tmpfile = self.create_file_with_data(file_data)
+ tmpfile_folder = os.path.dirname(tmpfile.name)
+ tmpfile_filename = os.path.basename(tmpfile.name)
+ expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
+
+ # Ensure cache has not been created yet
+ self.assertTrue(not os.path.exists(expected_cache_filename))
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('10'))
+ self.assertEqual(o[1],b('30'))
+
+ # Ensure cache is now created
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),7)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(o[0],b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: qsql-file-with-original source: %s.qsql' % tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4],b(' `a` - int'))
+ self.assertEqual(o[5],b(' `b` - int'))
+ self.assertEqual(o[6],b(' `c` - int'))
+
+ # delete the newly created cache
+ os.remove(expected_cache_filename)
+
+ # Now rerun the analysis without the cache file
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),7)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(o[0],b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s' % tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4],b(' `a` - int'))
+ self.assertEqual(o[5],b(' `b` - int'))
+ self.assertEqual(o[6],b(' `c` - int'))
+
+ self.cleanup(tmpfile)
+
+ def test_partial_caching_exists(self):
+ file1_data = b("a,b,c\n10,20,30\n30,40,50\n60,70,80")
+ tmpfile1 = self.create_file_with_data(file1_data)
+ tmpfile1_folder = os.path.dirname(tmpfile1.name)
+ tmpfile1_filename = os.path.basename(tmpfile1.name)
+ expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
+
+ file2_data = b("b,x\n10,linewith10\n20,linewith20\n30,linewith30\n40,linewith40")
+ tmpfile2 = self.create_file_with_data(file2_data)
+ tmpfile2_folder = os.path.dirname(tmpfile2.name)
+ tmpfile2_filename = os.path.basename(tmpfile2.name)
+ expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')
+
+ # Use only first file, and cache
+ cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('10'))
+ self.assertEqual(o[1],b('30'))
+
+ # Ensure cache has been created for file 1
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
+ # Use both files with read caching, one should be read from cache, the other from the file
+ cmd = Q_EXECUTABLE + ' -H -d , "select file1.a,file1.b,file1.c,file2.x from %s file1 left join %s file2 on (file1.b = file2.b)" -C read' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('10,20,30,linewith20'))
+ self.assertEqual(o[1],b('30,40,50,linewith40'))
+ self.assertEqual(o[2],b('60,70,80,'))
+
+ # Ensure cache has NOT been created for file 2
+ self.assertTrue(not os.path.exists(expected_cache_filename2))
+
+ # Now rerun the query, this time with readwrite caching, so the second file cache will be written
+ cmd = Q_EXECUTABLE + ' -H -d , "select file1.a,file1.b,file1.c,file2.x from %s file1 left join %s file2 on (file1.b = file2.b)" -C readwrite' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('10,20,30,linewith20'))
+ self.assertEqual(o[1],b('30,40,50,linewith40'))
+ self.assertEqual(o[2],b('60,70,80,'))
+
+ # Ensure cache has now been created for file 2
+ self.assertTrue(os.path.exists(expected_cache_filename2))
+
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
\ No newline at end of file
diff --git a/test/DelimiterTests.py b/test/DelimiterTests.py
new file mode 100644
index 00000000..3a584ef1
--- /dev/null
+++ b/test/DelimiterTests.py
@@ -0,0 +1,162 @@
+from test.base import AbstractQTestCase
+from test.test_data import sample_data_no_header, sample_data_rows
+from test.utils import Q_EXECUTABLE, b, run_command
+
+
+class DelimiterTests(AbstractQTestCase):
+
+ def test_delimition_mistake_with_header(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+
+ cmd = Q_EXECUTABLE + ' -d " " "select * from %s" -H' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 2)
+
+ self.assertTrue(e[0].startswith(b("Bad header row")))
+ self.assertTrue(b("Column name cannot contain commas") in e[1])
+
+ self.cleanup(tmpfile)
+
+ def test_tab_delimition_parameter(self):
+ tmpfile = self.create_file_with_data(
+ sample_data_no_header.replace(b(","), b("\t")))
+ cmd = Q_EXECUTABLE + ' -t "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], sample_data_rows[0].replace(b(","), b("\t")))
+ self.assertEqual(o[1], sample_data_rows[1].replace(b(","), b("\t")))
+ self.assertEqual(o[2], sample_data_rows[2].replace(b(","), b("\t")))
+
+ self.cleanup(tmpfile)
+
+ def test_pipe_delimition_parameter(self):
+ tmpfile = self.create_file_with_data(
+ sample_data_no_header.replace(b(","), b("|")))
+ cmd = Q_EXECUTABLE + ' -p "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], sample_data_rows[0].replace(b(","), b("|")))
+ self.assertEqual(o[1], sample_data_rows[1].replace(b(","), b("|")))
+ self.assertEqual(o[2], sample_data_rows[2].replace(b(","), b("|")))
+
+ self.cleanup(tmpfile)
+
+ def test_tab_delimition_parameter__with_manual_override_attempt(self):
+ tmpfile = self.create_file_with_data(
+ sample_data_no_header.replace(b(","), b("\t")))
+ cmd = Q_EXECUTABLE + ' -t -d , "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(o[0], sample_data_rows[0].replace(b(","), b("\t")))
+ self.assertEqual(o[1], sample_data_rows[1].replace(b(","), b("\t")))
+ self.assertEqual(o[2], sample_data_rows[2].replace(b(","), b("\t")))
+ self.assertEqual(e[0],b('Warning: -t parameter overrides -d parameter (,)'))
+
+ self.cleanup(tmpfile)
+
+ def test_pipe_delimition_parameter__with_manual_override_attempt(self):
+ tmpfile = self.create_file_with_data(
+ sample_data_no_header.replace(b(","), b("|")))
+ cmd = Q_EXECUTABLE + ' -p -d , "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(o[0], sample_data_rows[0].replace(b(","), b("|")))
+ self.assertEqual(o[1], sample_data_rows[1].replace(b(","), b("|")))
+ self.assertEqual(o[2], sample_data_rows[2].replace(b(","), b("|")))
+ self.assertEqual(e[0],b('Warning: -p parameter overrides -d parameter (,)'))
+
+ self.cleanup(tmpfile)
+
+ def test_output_delimiter(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , -D "|" "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], sample_data_rows[0].replace(b(","), b("|")))
+ self.assertEqual(o[1], sample_data_rows[1].replace(b(","), b("|")))
+ self.assertEqual(o[2], sample_data_rows[2].replace(b(","), b("|")))
+
+ self.cleanup(tmpfile)
+
+ def test_output_delimiter_tab_parameter(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , -T "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], sample_data_rows[0].replace(b(","), b("\t")))
+ self.assertEqual(o[1], sample_data_rows[1].replace(b(","), b("\t")))
+ self.assertEqual(o[2], sample_data_rows[2].replace(b(","), b("\t")))
+
+ self.cleanup(tmpfile)
+
+ def test_output_delimiter_pipe_parameter(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , -P "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], sample_data_rows[0].replace(b(","), b("|")))
+ self.assertEqual(o[1], sample_data_rows[1].replace(b(","), b("|")))
+ self.assertEqual(o[2], sample_data_rows[2].replace(b(","), b("|")))
+
+ self.cleanup(tmpfile)
+
+ def test_output_delimiter_tab_parameter__with_manual_override_attempt(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , -T -D "|" "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 1)
+
+ self.assertEqual(o[0], sample_data_rows[0].replace(b(","), b("\t")))
+ self.assertEqual(o[1], sample_data_rows[1].replace(b(","), b("\t")))
+ self.assertEqual(o[2], sample_data_rows[2].replace(b(","), b("\t")))
+ self.assertEqual(e[0], b('Warning: -T parameter overrides -D parameter (|)'))
+
+ self.cleanup(tmpfile)
+
+ def test_output_delimiter_pipe_parameter__with_manual_override_attempt(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , -P -D ":" "select c1,c2,c3 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 1)
+
+ self.assertEqual(o[0], sample_data_rows[0].replace(b(","), b("|")))
+ self.assertEqual(o[1], sample_data_rows[1].replace(b(","), b("|")))
+ self.assertEqual(o[2], sample_data_rows[2].replace(b(","), b("|")))
+ self.assertEqual(e[0],b('Warning: -P parameter overrides -D parameter (:)'))
+
+ self.cleanup(tmpfile)
\ No newline at end of file
diff --git a/test/EncodingTests.py b/test/EncodingTests.py
new file mode 100644
index 00000000..2657a823
--- /dev/null
+++ b/test/EncodingTests.py
@@ -0,0 +1,28 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, b, run_command
+
+
+class EncodingTests(AbstractQTestCase):
+
+ def test_utf8_with_bom_encoding(self):
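+ # The data starts with a UTF-8 BOM (\xef\xbb\xbf); reading it with -e utf-8-sig
+ # should strip the BOM so the first header name comes out clean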
+ utf_8_data_with_bom = b('\xef\xbb\xbf"typeid","limit","apcost","date","checkpointId"\n"1","2","5","1,2,3,4,5,6,7","3000,3001,3002"\n"2","2","5","1,2,3,4,5,6,7","3003,3004,3005"\n')
+ tmp_data_file = self.create_file_with_data(utf_8_data_with_bom,encoding=None)
+
+ cmd = Q_EXECUTABLE + ' -d , -H -O -e utf-8-sig "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),3)
+
+ self.assertEqual(o[0],b('typeid,limit,apcost,date,checkpointId'))
+ self.assertEqual(o[1],b('1,2,5,"1,2,3,4,5,6,7","3000,3001,3002"'))
+ self.assertEqual(o[2],b('2,2,5,"1,2,3,4,5,6,7","3003,3004,3005"'))
+
+ self.cleanup(tmp_data_file)
\ No newline at end of file
diff --git a/test/FormattingTests.py b/test/FormattingTests.py
new file mode 100644
index 00000000..67adcbf0
--- /dev/null
+++ b/test/FormattingTests.py
@@ -0,0 +1,100 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, b, run_command
+
+
+class FormattingTests(AbstractQTestCase):
+
+ def test_column_formatting(self):
+ # TODO Decide if this breaking change is reasonable
+ #cmd = 'seq 1 10 | ' + Q_EXECUTABLE + ' -f 1=%4.3f,2=%4.3f "select sum(c1),avg(c1) from -" -c 1'
+ cmd = 'seq 1 10 | ' + Q_EXECUTABLE + ' -f 1={:4.3f},2={:4.3f} "select sum(c1),avg(c1) from -" -c 1'
+
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('55.000 5.500'))
+
+ def test_column_formatting_with_output_header(self):
+ perl_regex = "'s/1\n/column_name\n1\n/;'"
+ # TODO Decide if this breaking change is reasonable
+ #cmd = 'seq 1 10 | perl -pe ' + perl_regex + ' | ' + Q_EXECUTABLE + ' -f 1=%4.3f,2=%4.3f "select sum(column_name) mysum,avg(column_name) myavg from -" -c 1 -H -O'
+ cmd = 'seq 1 10 | LANG=C perl -pe ' + perl_regex + ' | ' + Q_EXECUTABLE + ' -f 1={:4.3f},2={:4.3f} "select sum(column_name) mysum,avg(column_name) myavg from -" -c 1 -H -O'
+
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('mysum myavg'))
+ self.assertEqual(o[1], b('55.000 5.500'))
+
+ def py3_test_successfully_parse_universal_newlines_without_explicit_flag(self):
+ def list_as_byte_list(l):
+ return list(map(lambda x:b(x),l))
+
+ expected_output = list(map(lambda x:list_as_byte_list(x),[['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-May-07', '6850000', 'USD', 'b'],
+ ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Oct-06', '6000000', 'USD', 'a'],
+ ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Jan-08', '25000000', 'USD', 'c'],
+ ['mycityfaces', 'MyCityFaces', '7', 'web', 'Scottsdale', 'AZ', '1-Jan-08', '50000', 'USD', 'seed'],
+ ['flypaper', 'Flypaper', '', 'web', 'Phoenix', 'AZ', '1-Feb-08', '3000000', 'USD', 'a'],
+ ['infusionsoft', 'Infusionsoft', '105', 'software', 'Gilbert', 'AZ', '1-Oct-07', '9000000', 'USD', 'a']]))
+
+ data = b('permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round\rlifelock,LifeLock,,web,Tempe,AZ,1-May-07,6850000,USD,b\rlifelock,LifeLock,,web,Tempe,AZ,1-Oct-06,6000000,USD,a\rlifelock,LifeLock,,web,Tempe,AZ,1-Jan-08,25000000,USD,c\rmycityfaces,MyCityFaces,7,web,Scottsdale,AZ,1-Jan-08,50000,USD,seed\rflypaper,Flypaper,,web,Phoenix,AZ,1-Feb-08,3000000,USD,a\rinfusionsoft,Infusionsoft,105,software,Gilbert,AZ,1-Oct-07,9000000,USD,a')
+ tmp_data_file = self.create_file_with_data(data)
+
+ cmd = Q_EXECUTABLE + ' -d , -H "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 6)
+
+ actual_output = list(map(lambda row: row.split(b(",")),o))
+
+ self.assertEqual(actual_output,expected_output)
+
+ self.cleanup(tmp_data_file)
+
+ test_parsing_universal_newlines_without_explicit_flag = py3_test_successfully_parse_universal_newlines_without_explicit_flag
+
+ def test_universal_newlines_parsing_flag(self):
+ def list_as_byte_list(l):
+ return list(map(lambda x:b(x),l))
+
+ expected_output = list(map(lambda x:list_as_byte_list(x),[['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-May-07', '6850000', 'USD', 'b'],
+ ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Oct-06', '6000000', 'USD', 'a'],
+ ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Jan-08', '25000000', 'USD', 'c'],
+ ['mycityfaces', 'MyCityFaces', '7', 'web', 'Scottsdale', 'AZ', '1-Jan-08', '50000', 'USD', 'seed'],
+ ['flypaper', 'Flypaper', '', 'web', 'Phoenix', 'AZ', '1-Feb-08', '3000000', 'USD', 'a'],
+ ['infusionsoft', 'Infusionsoft', '105', 'software', 'Gilbert', 'AZ', '1-Oct-07', '9000000', 'USD', 'a']]))
+
+ data = b('permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round\rlifelock,LifeLock,,web,Tempe,AZ,1-May-07,6850000,USD,b\rlifelock,LifeLock,,web,Tempe,AZ,1-Oct-06,6000000,USD,a\rlifelock,LifeLock,,web,Tempe,AZ,1-Jan-08,25000000,USD,c\rmycityfaces,MyCityFaces,7,web,Scottsdale,AZ,1-Jan-08,50000,USD,seed\rflypaper,Flypaper,,web,Phoenix,AZ,1-Feb-08,3000000,USD,a\rinfusionsoft,Infusionsoft,105,software,Gilbert,AZ,1-Oct-07,9000000,USD,a')
+ tmp_data_file = self.create_file_with_data(data)
+
+ cmd = Q_EXECUTABLE + ' -d , -H -U "select permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+
+ if len(e) in (1, 2):
+ # In python 3.7, there's a deprecation warning for the 'U' file opening mode, which is ok for now
+ self.assertTrue(b"DeprecationWarning: 'U' mode is deprecated" in e[0])
+ elif len(e) != 0:
+ # Nothing should be written to stderr in other versions
+ self.fail('Unidentified output in stderr')
+
+ self.assertEqual(len(o), 6)
+
+ actual_output = list(map(lambda row: row.split(b(",")),o))
+
+ self.assertEqual(actual_output,expected_output)
+
+ self.cleanup(tmp_data_file)
\ No newline at end of file
diff --git a/test/GzippingTests.py b/test/GzippingTests.py
new file mode 100644
index 00000000..0d06d19f
--- /dev/null
+++ b/test/GzippingTests.py
@@ -0,0 +1,24 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, b, run_command
+
+
+class GzippingTests(AbstractQTestCase):
+
+ def test_gzipped_file(self):
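+ # The raw bytes below are a gzipped file which is expected to decompress to the
+ # numbers 1..10, one per line, matching the sum/avg assertions below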
+ tmpfile = self.create_file_with_data(
+ b('\x1f\x8b\x08\x08\xf2\x18\x12S\x00\x03xxxxxx\x003\xe42\xe22\xe62\xe12\xe52\xe32\xe7\xb2\xe0\xb2\xe424\xe0\x02\x00\xeb\xbf\x8a\x13\x15\x00\x00\x00'))
+
+ cmd = Q_EXECUTABLE + ' -z "select sum(c1),avg(c1) from %s"' % tmpfile.name
+
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ s = sum(range(1, 11))
+ self.assertEqual(o[0], b('%s %s' % (s, s / 10.0)))
+
+ self.cleanup(tmpfile)
\ No newline at end of file
diff --git a/test/ManyOpenFilesTests.py b/test/ManyOpenFilesTests.py
new file mode 100644
index 00000000..f5c66a0a
--- /dev/null
+++ b/test/ManyOpenFilesTests.py
@@ -0,0 +1,297 @@
+import collections
+import os
+
+from test.base import AbstractQTestCase
+from test.test_data import sample_data_no_header, sample_data_rows
+from test.utils import DEBUG, Q_EXECUTABLE, b, batch, run_command
+
+
+class ManyOpenFilesTests(AbstractQTestCase):
+
+
+ def test_multi_file_header_skipping(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 5
+
+ numbers = list(range(1,1+BATCH_SIZE*FILE_COUNT))
+ numbers_as_text = batch([str(x) for x in numbers],n=BATCH_SIZE)
+
+ content_list = list(map(b,['a\n' + "\n".join(x)+'\n' for x in numbers_as_text]))
+
+ filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d,'split-files','multi-header')
+
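+ # With -H, the 'a' header line of each globbed file should be skipped, leaving
+ # exactly BATCH_SIZE*FILE_COUNT data rows across all files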
+ cmd = '%s -d , -H -c 1 "select count(a),sum(a) from %s/*" -C none' % (Q_EXECUTABLE,tmpfolder)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b("%s,%s" % (BATCH_SIZE*FILE_COUNT,sum(numbers))))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_that_globs_dont_max_out_sqlite_attached_database_limits(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 40
+
+ numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
+
+ content_list = map(b,["\n".join(x)+'\n' for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
+
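+ # 40 files are globbed with a limit of only 10 attached sqlite databases - q is
+ # expected to handle this, presumably by not attaching all sources at once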
+ cmd = 'cd %s && %s -c 1 "select count(*) from *" -C none --max-attached-sqlite-databases=10' % (tmpfolder,Q_EXECUTABLE)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b(str(BATCH_SIZE*FILE_COUNT)))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_maxing_out_max_attached_database_limits__regular_files(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 40
+
+ numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
+
+ content_list = map(b,["\n".join(x)+'\n' for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
+
+ unioned_subquery = " UNION ALL ".join(["select * from %s/%s" % (tmpfolder,filename) for filename in filename_list])
+ cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C none --max-attached-sqlite-databases=10' % (tmpfolder,Q_EXECUTABLE,unioned_subquery)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b(str(BATCH_SIZE*FILE_COUNT)))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_maxing_out_max_attached_database_limits__with_qsql_files_below_attached_limit(self):
+ MAX_ATTACHED_SQLITE_DATABASES = 10
+
+ BATCH_SIZE = 50
+ FILE_COUNT = MAX_ATTACHED_SQLITE_DATABASES - 1
+
+ numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
+
+ content_list = map(b,["\n".join(x)+'\n' for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
+
+ # Execute the query with -C readwrite, so all qsql files will be created
+ unioned_subquery = " UNION ALL ".join(["select * from %s/%s" % (tmpfolder,filename) for filename in filename_list])
+ cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite --max-attached-sqlite-databases=%s' % (tmpfolder,Q_EXECUTABLE,unioned_subquery,MAX_ATTACHED_SQLITE_DATABASES)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b(str(BATCH_SIZE*FILE_COUNT)))
+
+ # Now execute the same query with -C readwrite, so all files will be read directly from the qsql files
+ cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite' % (tmpfolder,Q_EXECUTABLE,unioned_subquery)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b(str(BATCH_SIZE*FILE_COUNT)))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_maxing_out_max_attached_database_limits__with_qsql_files_above_attached_limit(self):
+ MAX_ATTACHED_SQLITE_DATABASES = 10
+
+ BATCH_SIZE = 50
+ # The difference from test_maxing_out_max_attached_database_limits__with_qsql_files_below_attached_limit:
+ # here we try to cache twice as many files as the number of databases that can be attached.
+ # The expectation is that only some of the files will end up cached.
+ FILE_COUNT = MAX_ATTACHED_SQLITE_DATABASES * 2
+
+ numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
+
+ content_list = map(b,["\n".join(x)+'\n' for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
+
+ # Execute the query with -C readwrite, so all qsql files will be created
+ unioned_subquery = " UNION ALL ".join(["select * from %s/%s" % (tmpfolder,filename) for filename in filename_list])
+ cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite --max-attached-sqlite-databases=%s' % (tmpfolder,Q_EXECUTABLE,unioned_subquery,MAX_ATTACHED_SQLITE_DATABASES)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b(str(BATCH_SIZE*FILE_COUNT)))
+
+ # Now execute the same query with -C readwrite, so all files will be read directly from the qsql files
+ cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite' % (tmpfolder,Q_EXECUTABLE,unioned_subquery)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b(str(BATCH_SIZE*FILE_COUNT)))
+
+ from glob import glob
+ files_in_folder = [os.path.basename(x) for x in glob('%s/*' % (tmpfolder))]
+
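+ # Only some of the files are expected to have been cached - qsql files for the
+ # first (limit - 2) sources only; presumably the two remaining attach slots are
+ # taken by q's own internal databases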
+ expected_files_in_folder = filename_list + list(map(lambda x: 'file-%s.qsql' % x,range(MAX_ATTACHED_SQLITE_DATABASES-2)))
+
+ self.assertEqual(sorted(files_in_folder),sorted(expected_files_in_folder))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_maxing_out_max_attached_database_limits__with_directly_using_qsql_files(self):
+ MAX_ATTACHED_SQLITE_DATABASES = 10
+
+ BATCH_SIZE = 50
+ FILE_COUNT = MAX_ATTACHED_SQLITE_DATABASES * 2
+
+ numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
+
+ content_list = map(b,["\n".join(x)+'\n' for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
+
+ # Prepare qsql for each of the files (separately, just for simplicity)
+ for fn in filename_list:
+ cmd = 'cd %s && %s -c 1 "select count(*) from %s" -C readwrite' % (tmpfolder,Q_EXECUTABLE,fn)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ # Now execute a big query which uses the created qsql files
+ unioned_subquery = " UNION ALL ".join(["select * from %s/%s.qsql" % (tmpfolder,filename) for filename in filename_list])
+
+ cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite' % (tmpfolder,Q_EXECUTABLE,unioned_subquery)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b(str(BATCH_SIZE*FILE_COUNT)))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_too_many_open_files_for_one_table(self):
+ # File opening used to be parallel, which could exhaust OS open-file limits. q now
+ # enforces a hard per-table limit on source files, and this test verifies the error
+ # raised when that limit is exceeded
+
+ MAX_ALLOWED_FILES = 500
+
+ BATCH_SIZE = 2
+ FILE_COUNT = MAX_ALLOWED_FILES + 1
+
+ numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
+
+ content_list = map(b,["\n".join(x) for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
+
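+ # "from *" makes q treat all files in the current directory as a single table (hence the
+ # "cd %s &&" prefix); with more than MAX_ALLOWED_FILES files, q is expected to fail cleanly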
+ cmd = 'cd %s && %s -c 1 "select count(*) from * where 1 = 1 or c1 != 2" -C none' % (tmpfolder,Q_EXECUTABLE)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 82)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
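+ # The expected message below is compared verbatim against q's actual output, so its
+ # wording (including the "Table is name is" slip) is intentionally left as-is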
+ expected_error = b('Maximum source files for table must be %s. Table is name is %s/* Number of actual files is %s' % (MAX_ALLOWED_FILES,os.path.realpath(tmpfolder),FILE_COUNT))
+ self.assertEqual(e[0],expected_error)
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_many_open_files_for_one_table(self):
+ # Previously file opening was parallel, causing too-many-open-files
+
+ BATCH_SIZE = 2
+ FILE_COUNT = 500
+
+ numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
+
+ content_list = map(b,["\n".join(x) for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
+
+ cmd = 'cd %s && %s -c 1 "select count(*) from * where 1 = 1 or c1 != 2" -C none' % (tmpfolder,Q_EXECUTABLE)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b(str(BATCH_SIZE*FILE_COUNT)))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_many_open_files_for_two_tables(self):
+ BATCH_SIZE = 2
+ FILE_COUNT = 500
+
+ numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)
+
+ content_list = map(b, ["\n".join(x) for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder1 = self.create_folder_with_files(d, 'split-files1', 'blah')
+ tmpfolder2 = self.create_folder_with_files(d, 'split-files1', 'blah')
+
+ cmd = '%s -c 1 "select count(*) from %s/* a left join %s/* b on (a.c1 = b.c1)" -C none' % (
+ Q_EXECUTABLE,
+ tmpfolder1,
+ tmpfolder2)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b(str(BATCH_SIZE * FILE_COUNT)))
+
+ self.cleanup_folder(tmpfolder1)
+ self.cleanup_folder(tmpfolder2)
\ No newline at end of file
diff --git a/test/MultiHeaderTests.py b/test/MultiHeaderTests.py
new file mode 100644
index 00000000..d470ca4d
--- /dev/null
+++ b/test/MultiHeaderTests.py
@@ -0,0 +1,80 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command, b
+from test.test_data import sample_data_rows, sample_data_with_header, generate_sample_data_with_header
+
+class MultiHeaderTests(AbstractQTestCase):
+ def test_output_header_when_multiple_input_headers_exist(self):
+ TMPFILE_COUNT = 5
+ tmpfiles = [self.create_file_with_data(sample_data_with_header) for x in range(TMPFILE_COUNT)]
+
+ unioned_subquery = " UNION ALL ".join(map(lambda x:"select * from %s" % x.name, tmpfiles))
+
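+ # -H treats the first line of each input as a header; -O emits a header row in the output,
+ # which is why exactly one header line is expected below despite the five input files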
+ cmd = Q_EXECUTABLE + ' -d , "select name,value1,value2 from (%s) order by name" -H -O' % unioned_subquery
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), TMPFILE_COUNT*3+1)
+ self.assertEqual(o[0], b("name,value1,value2"))
+
+ for i in range(TMPFILE_COUNT):
+ self.assertEqual(o[1+i],sample_data_rows[0])
+ for i in range(TMPFILE_COUNT):
+ self.assertEqual(o[TMPFILE_COUNT+1+i],sample_data_rows[1])
+ for i in range(TMPFILE_COUNT):
+ self.assertEqual(o[TMPFILE_COUNT*2+1+i],sample_data_rows[2])
+
+ for oi in o[1:]:
+ self.assertTrue(b('name') not in oi)
+
+ for i in range(TMPFILE_COUNT):
+ self.cleanup(tmpfiles[i])
+
+ def test_output_header_when_extra_header_column_names_are_different__concatenation_replacement(self):
+ tmpfile1 = self.create_file_with_data(sample_data_with_header)
+ tmpfile2 = self.create_file_with_data(generate_sample_data_with_header(b('othername,value1,value2')))
+
+ cmd = Q_EXECUTABLE + ' -d , "select name,value1,value2 from (select * from %s union all select * from %s) order by name" -H -O' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 7)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o, [
+ b('name,value1,value2'),
+ b('a,1,0'),
+ b('a,1,0'),
+ b('b,2,0'),
+ b('b,2,0'),
+ b('c,,0'),
+ b('c,,0')
+ ])
+
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
+
+ def test_output_header_when_extra_header_has_different_number_of_columns(self):
+ tmpfile1 = self.create_file_with_data(sample_data_with_header)
+ tmpfile2 = self.create_file_with_data(generate_sample_data_with_header(b('name,value1')))
+
+ cmd = Q_EXECUTABLE + ' -d , "select name,value1,value2 from (select * from %s UNION ALL select * from %s) order by name" -H -O' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 7)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o, [
+ b('name,value1,value2'),
+ b('a,1,0'),
+ b('a,1,0'),
+ b('b,2,0'),
+ b('b,2,0'),
+ b('c,,0'),
+ b('c,,0')
+ ])
+
+ self.cleanup(tmpfile1)
+ self.cleanup(tmpfile2)
\ No newline at end of file
diff --git a/test/OldSaveDbToDiskTests.py b/test/OldSaveDbToDiskTests.py
new file mode 100644
index 00000000..7d5ee8b1
--- /dev/null
+++ b/test/OldSaveDbToDiskTests.py
@@ -0,0 +1,362 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command, sqlite_dict_factory, b
+
+import os
+import re
+import sqlite3
+
+
+class OldSaveDbToDiskTests(AbstractQTestCase):
+
+ def test_join_with_stdin_and_save(self):
+ x = [b(a) for a in map(str,range(1,101))]
+ large_file_data = b("val\n") + b("\n").join(x)
+ tmpfile = self.create_file_with_data(large_file_data)
+ tmpfile_expected_table_name = os.path.basename(tmpfile.name)
+
+ disk_db_filename = self.random_tmp_filename('save-to-db','sqlite')
+
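+ # -S saves all loaded data into an on-disk sqlite database instead of just answering the
+ # query; q then prints (to stderr) the query that can be run directly against that file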
+ cmd = '(echo id ; seq 1 2 10) | ' + Q_EXECUTABLE + ' -c 1 -H -O "select stdin.*,f.* from - stdin left join %s f on (stdin.id * 10 = f.val)" -S %s' % \
+ (tmpfile.name,disk_db_filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
+ self.assertEqual(e[0],b('Going to save data into a disk database: %s' % disk_db_filename))
+ self.assertTrue(e[1].startswith(b('Data has been saved into %s . Saving has taken ' % disk_db_filename)))
+ self.assertEqual(e[2],b('Query to run on the database: select stdin.*,f.* from data_stream_stdin stdin left join %s f on (stdin.id * 10 = f.val);' % \
+ tmpfile_expected_table_name))
+ self.assertEqual(e[3],b('You can run the query directly from the command line using the following command: echo "select stdin.*,f.* from data_stream_stdin stdin left join %s f on (stdin.id * 10 = f.val)" | sqlite3 %s' %
+ (tmpfile_expected_table_name,disk_db_filename)))
+
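+ # Extract the suggested query from q's stderr and replay it natively against the saved db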
+ P = re.compile(b("^Query to run on the database: (?P<query_to_run_on_db>.*)$"))
+ m = P.search(e[2])
+ query_to_run_on_db = m.groupdict()['query_to_run_on_db']
+
+ self.assertTrue(os.path.exists(disk_db_filename))
+
+ # validate disk db content natively
+ c = sqlite3.connect(disk_db_filename)
+ c.row_factory = sqlite_dict_factory
+ t0_results = c.execute('select * from data_stream_stdin').fetchall()
+ self.assertEqual(len(t0_results),5)
+ self.assertEqual(sorted(list(t0_results[0].keys())), ['id'])
+ self.assertEqual(list(map(lambda x:x['id'],t0_results)),[1,3,5,7,9])
+ t1_results = c.execute('select * from %s' % tmpfile_expected_table_name).fetchall()
+ self.assertEqual(len(t1_results),100)
+ self.assertEqual(sorted(list(t1_results[0].keys())), ['val'])
+ self.assertEqual("\n".join(list(map(lambda x:str(x['val']),t1_results))),"\n".join(map(str,range(1,101))))
+
+ query_results = c.execute(query_to_run_on_db.decode('utf-8')).fetchall()
+
+ self.assertEqual(query_results[0],{ 'id': 1 , 'val': 10})
+ self.assertEqual(query_results[1],{ 'id': 3 , 'val': 30})
+ self.assertEqual(query_results[2],{ 'id': 5 , 'val': 50})
+ self.assertEqual(query_results[3],{ 'id': 7 , 'val': 70})
+ self.assertEqual(query_results[4],{ 'id': 9 , 'val': 90})
+
+ self.cleanup(tmpfile)
+
+ def test_join_with_qsql_file(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+
+ header = [b('aa'), b('bb'), b('cc')]
+
+ new_tmp_folder = self.create_folder_with_files({
+ 'some_csv_file': self.arrays_to_csv_file_content(b(','),header,numbers1),
+ 'some_qsql_database.qsql' : self.arrays_to_qsql_file_content(header,numbers2)
+ },prefix='xx',suffix='yy')
+
+ effective_filename1 = '%s/some_csv_file' % new_tmp_folder
+ effective_filename2 = '%s/some_qsql_database.qsql' % new_tmp_folder
+
+ cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(small_file.aa) from %s large_file left join %s small_file on (small_file.aa == large_file.bb)"' % \
+ (effective_filename1,effective_filename2)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('50005000,55'))
+
+ # TODO RLRL Check if needed anymore
+
+ # def test_creation_of_qsql_database(self):
+ # numbers = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+ # header = [b('aa'), b('bb'), b('cc')]
+ #
+ # qsql_filename = self.create_qsql_file_with_content_and_return_filename(header,numbers)
+ #
+ # conn = sqlite3.connect(qsql_filename)
+ # qcatalog = conn.execute('select temp_table_name,source_type,source from _qcatalog').fetchall()
+ # print(qcatalog)
+ #
+ # cmd = '%s "select count(*) from %s" -A' % (Q_EXECUTABLE,qsql_filename)
+ # retcode, o, e = run_command(cmd)
+ # print(o)
+
+ def test_join_with_qsql_file_and_save(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+
+ header = [b('aa'), b('bb'), b('cc')]
+
+ saved_qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
+
+ new_tmp_folder = self.create_folder_with_files({
+ 'some_csv_file': self.arrays_to_csv_file_content(b(','),header,numbers1),
+ 'some_qsql_database' : self.arrays_to_csv_file_content(b(','),header,numbers2)
+ },prefix='xx',suffix='yy')
+ cmd = '%s -d , -H "select count(*) from %s/some_qsql_database" -C readwrite' % (Q_EXECUTABLE,new_tmp_folder)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode,0)
+ os.remove('%s/some_qsql_database' % new_tmp_folder)
+
+ effective_filename1 = '%s/some_csv_file' % new_tmp_folder
+ effective_filename2 = '%s/some_qsql_database.qsql' % new_tmp_folder
+
+ cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(small_file.aa) from %s large_file left join %s small_file on (small_file.aa == large_file.bb)" -S %s' % \
+ (effective_filename1,effective_filename2,saved_qsql_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+
+ conn = sqlite3.connect(saved_qsql_with_multiple_tables)
+ c1 = conn.execute('select count(*) from some_csv_file').fetchall()
+ c2 = conn.execute('select count(*) from some_qsql_database').fetchall()
+
+ self.assertEqual(c1[0][0],10000)
+ self.assertEqual(c2[0][0],10)
+
+
+ def test_saving_to_db_with_same_basename_files(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+
+ header = [b('aa'), b('bb'), b('cc')]
+
+ qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
+
+ new_tmp_folder = self.create_folder_with_files({
+ 'filename1': self.arrays_to_csv_file_content(b(','),header,numbers1),
+ 'otherfolder/filename1' : self.arrays_to_csv_file_content(b(','),header,numbers2)
+ },prefix='xx',suffix='yy')
+
+ effective_filename1 = '%s/filename1' % new_tmp_folder
+ effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder
+
+ expected_stored_table_name1 = 'filename1'
+ expected_stored_table_name2 = 'filename1_2'
+
+ cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
+ (effective_filename1,effective_filename2,qsql_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+ self.assertEqual(e[0], b('Going to save data into a disk database: %s' % qsql_with_multiple_tables))
+ self.assertTrue(e[1].startswith(b('Data has been saved into %s . Saving has taken' % qsql_with_multiple_tables)))
+ self.assertEqual(e[2],b('Query to run on the database: select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb);' % \
+ (expected_stored_table_name1,expected_stored_table_name2)))
+ self.assertEqual(e[3],b('You can run the query directly from the command line using the following command: echo "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" | sqlite3 %s' % \
+ (expected_stored_table_name1,expected_stored_table_name2,qsql_with_multiple_tables)))
+
+ conn = sqlite3.connect(qsql_with_multiple_tables)
+ c1 = conn.execute('select count(*) from filename1').fetchall()
+ c2 = conn.execute('select count(*) from filename1_2').fetchall()
+
+ self.assertEqual(c1[0][0],10000)
+ self.assertEqual(c2[0][0],10)
+
+
+ def test_error_when_not_specifying_table_name_in_multi_table_qsql(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+
+ header = [b('aa'), b('bb'), b('cc')]
+
+ qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
+
+ new_tmp_folder = self.create_folder_with_files({
+ 'filename1': self.arrays_to_csv_file_content(b(','),header,numbers1),
+ 'otherfolder/filename1' : self.arrays_to_csv_file_content(b(','),header,numbers2)
+ },prefix='xx',suffix='yy')
+
+ effective_filename1 = '%s/filename1' % new_tmp_folder
+ effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder
+
+ expected_stored_table_name1 = 'filename1'
+ expected_stored_table_name2 = 'filename1_2'
+
+ cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
+ (effective_filename1,effective_filename2,qsql_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
+ # Actual tests
+
+ cmd = '%s "select count(*) from %s"' % (Q_EXECUTABLE,qsql_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 87)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+ self.assertEqual(e[0],b('Could not autodetect table name in sqlite file %s . Existing tables: %s,%s' % (qsql_with_multiple_tables,expected_stored_table_name1,expected_stored_table_name2)))
+
+ def test_error_when_not_specifying_table_name_in_multi_table_sqlite(self):
+ sqlite_with_multiple_tables = self.generate_tmpfile_name(suffix='.sqlite')
+
+ c = sqlite3.connect(sqlite_with_multiple_tables)
+ c.execute('create table my_table_1 (x int, y int)').fetchall()
+ c.execute('create table my_table_2 (x int, y int)').fetchall()
+ c.close()
+
+ cmd = '%s "select count(*) from %s"' % (Q_EXECUTABLE,sqlite_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 87)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b('Could not autodetect table name in sqlite file %s . Existing tables: my_table_1,my_table_2' % sqlite_with_multiple_tables))
+
+ def test_querying_from_multi_table_sqlite_using_explicit_table_name(self):
+ sqlite_with_multiple_tables = self.generate_tmpfile_name(suffix='.sqlite')
+
+ c = sqlite3.connect(sqlite_with_multiple_tables)
+ c.execute('create table my_table_1 (x int, y int)').fetchall()
+ c.execute('insert into my_table_1 (x,y) values (100,200),(300,400)').fetchall()
+ c.execute('commit').fetchall()
+ c.execute('create table my_table_2 (x int, y int)').fetchall()
+ c.close()
+
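+ # The file:::table syntax explicitly selects a table inside a multi-table sqlite/qsql
+ # file - required here, since (as tested above) autodetection fails with multiple tables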
+ cmd = '%s -d , "select * from %s:::my_table_1"' % (Q_EXECUTABLE,sqlite_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('100,200'))
+ self.assertEqual(o[1],b('300,400'))
+
+ # Check again, this time with a different output delimiter and with explicit column names
+ cmd = '%s -t "select x,y from %s:::my_table_1"' % (Q_EXECUTABLE,sqlite_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('100\t200'))
+ self.assertEqual(o[1],b('300\t400'))
+
+
+ def test_error_when_specifying_nonexistent_table_name_in_multi_table_qsql(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+
+ header = [b('aa'), b('bb'), b('cc')]
+
+ qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
+
+ new_tmp_folder = self.create_folder_with_files({
+ 'filename1': self.arrays_to_csv_file_content(b(','),header,numbers1),
+ 'otherfolder/filename1' : self.arrays_to_csv_file_content(b(','),header,numbers2)
+ },prefix='xx',suffix='yy')
+
+ effective_filename1 = '%s/filename1' % new_tmp_folder
+ effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder
+
+ expected_stored_table_name1 = 'filename1'
+ expected_stored_table_name2 = 'filename1_2'
+
+ cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
+ (effective_filename1,effective_filename2,qsql_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
+ # Actual tests
+
+ cmd = '%s "select count(*) from %s:::non_existent_table"' % (Q_EXECUTABLE,qsql_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 85)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+ self.assertEqual(e[0],b('Table non_existent_table could not be found in sqlite file %s . Existing table names: %s,%s' % \
+ (qsql_with_multiple_tables,expected_stored_table_name1,expected_stored_table_name2)))
+
+ def test_querying_multi_table_qsql_file(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+
+ header = [b('aa'), b('bb'), b('cc')]
+
+ qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
+
+ new_tmp_folder = self.create_folder_with_files({
+ 'filename1': self.arrays_to_csv_file_content(b(','),header,numbers1),
+ 'otherfolder/filename1' : self.arrays_to_csv_file_content(b(','),header,numbers2)
+ },prefix='xx',suffix='yy')
+
+ effective_filename1 = '%s/filename1' % new_tmp_folder
+ effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder
+
+ expected_stored_table_name1 = 'filename1'
+ expected_stored_table_name2 = 'filename1_2'
+
+ cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
+ (effective_filename1,effective_filename2,qsql_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
+ # Actual tests
+
+ cmd = '%s "select count(*) from %s:::%s"' % (Q_EXECUTABLE,qsql_with_multiple_tables,expected_stored_table_name1)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('10000'))
+
+ cmd = '%s "select count(*) from %s:::%s"' % (Q_EXECUTABLE,qsql_with_multiple_tables,expected_stored_table_name2)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('10'))
+
+ def test_preventing_db_overwrite(self):
+ db_filename = self.random_tmp_filename('store-to-disk', 'db')
+ self.assertFalse(os.path.exists(db_filename))
+
+ retcode, o, e = run_command('seq 1 1000 | ' + Q_EXECUTABLE + ' "select count(*) from -" -c 1 -S %s' % db_filename)
+
+ self.assertEqual(retcode, 0)
+ self.assertTrue(os.path.exists(db_filename))
+
+ retcode2, o2, e2 = run_command('seq 1 1000 | ' + Q_EXECUTABLE + ' "select count(*) from -" -c 1 -S %s' % db_filename)
+ self.assertNotEqual(retcode2, 0)
+ self.assertTrue(e2[0].startswith(b('Going to save data into a disk database')))
+ self.assertEqual(e2[1], b('Disk database file {} already exists.'.format(db_filename)))
+
+ os.remove(db_filename)
\ No newline at end of file
diff --git a/test/ParsingModeTests.py b/test/ParsingModeTests.py
new file mode 100644
index 00000000..354bfa33
--- /dev/null
+++ b/test/ParsingModeTests.py
@@ -0,0 +1,326 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command, b
+from test.test_data import (uneven_ls_output,sample_data_no_header,sample_data_with_empty_string_no_header,sample_data_with_missing_header_names,one_column_data)
+
+class ParsingModeTests(AbstractQTestCase):
+
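+ # In "relaxed" mode q tolerates rows whose column count differs from the detected one
+ # (extra trailing values are merged into the last column, as shown below), while in
+ # "strict" mode any mismatch is an error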
+ def test_strict_mode_column_count_mismatch_error(self):
+ tmpfile = self.create_file_with_data(uneven_ls_output)
+ cmd = Q_EXECUTABLE + ' -m strict "select count(*) from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ self.assertTrue(b("Column Count is expected to identical") in e[0])
+
+ self.cleanup(tmpfile)
+
+ def test_strict_mode_too_large_specific_column_count(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , -m strict -c 4 "select count(*) from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ self.assertEqual(
+ e[0], b("Strict mode. Column count is expected to be 4 but is 3"))
+
+ self.cleanup(tmpfile)
+
+ def test_strict_mode_too_small_specific_column_count(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , -m strict -c 2 "select count(*) from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ self.assertEqual(
+ e[0], b("Strict mode. Column count is expected to be 2 but is 3"))
+
+ self.cleanup(tmpfile)
+
+ def test_relaxed_mode_missing_columns_in_header(self):
+ tmpfile = self.create_file_with_data(
+ sample_data_with_missing_header_names)
+ cmd = Q_EXECUTABLE + ' -d , -m relaxed "select count(*) from %s" -H -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 7)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s') % b(tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4],b(' `name` - text'))
+ self.assertEqual(o[5],b(' `value1` - int'))
+ self.assertEqual(o[6],b(' `c3` - int'))
+
+ self.cleanup(tmpfile)
+
+ def test_strict_mode_missing_columns_in_header(self):
+ tmpfile = self.create_file_with_data(
+ sample_data_with_missing_header_names)
+ cmd = Q_EXECUTABLE + ' -d , -m strict "select count(*) from %s" -H -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+
+ self.assertEqual(
+ e[0], b('Strict mode. Header row contains less columns than expected column count(2 vs 3)'))
+
+ self.cleanup(tmpfile)
+
+ def test_output_delimiter_with_missing_fields(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s" -D ";"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('a;1;0'))
+ self.assertEqual(o[1], b('b;2;0'))
+ self.assertEqual(o[2], b('c;;0'))
+
+ self.cleanup(tmpfile)
+
+ def test_handling_of_null_integers(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select avg(c2) from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('1.5'))
+
+ self.cleanup(tmpfile)
+
+ def test_empty_integer_values_converted_to_null(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s where c2 is null"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('c,,0'))
+
+ self.cleanup(tmpfile)
+
+ def test_empty_string_values_not_converted_to_null(self):
+ tmpfile = self.create_file_with_data(
+ sample_data_with_empty_string_no_header)
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s where c2 == %s"' % (
+ tmpfile.name, "''")
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('c,,0'))
+
+ self.cleanup(tmpfile)
+
+ def test_relaxed_mode_detected_columns(self):
+ tmpfile = self.create_file_with_data(uneven_ls_output)
+ cmd = Q_EXECUTABLE + ' -m relaxed "select count(*) from %s" -A' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+
+ column_rows = o[o.index(b(' Fields:'))+1:]
+
+ self.assertEqual(len(column_rows), 11)
+
+ column_tuples = [x.strip().split(b(" ")) for x in column_rows]
+ column_names = [x[0] for x in column_tuples]
+ column_types = [x[2] for x in column_tuples]
+
+ self.assertEqual(column_names, [b('`c{}`'.format(x)) for x in range(1, 12)])
+ self.assertEqual(column_types, list(map(lambda x:b(x),[
+ 'text', 'int', 'text', 'text', 'int', 'text', 'int', 'int', 'text', 'text', 'text'])))
+
+ self.cleanup(tmpfile)
+
+ def test_relaxed_mode_detected_columns_with_specific_column_count(self):
+ tmpfile = self.create_file_with_data(uneven_ls_output)
+ cmd = Q_EXECUTABLE + ' -m relaxed "select count(*) from %s" -A -c 9' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+
+ column_rows = o[o.index(b(' Fields:'))+1:]
+
+ self.assertEqual(len(column_rows), 9)
+
+ column_tuples = [x.strip().split(b(" ")) for x in column_rows]
+ column_names = [x[0] for x in column_tuples]
+ column_types = [x[2] for x in column_tuples]
+
+ self.assertEqual(column_names, [b('`c{}`'.format(x)) for x in range(1, 10)])
+ self.assertEqual(
+ column_types, list(map(lambda x:b(x),['text', 'int', 'text', 'text', 'int', 'text', 'int', 'int', 'text'])))
+
+ self.cleanup(tmpfile)
+
+ def test_relaxed_mode_last_column_data_with_specific_column_count(self):
+ tmpfile = self.create_file_with_data(uneven_ls_output)
+ cmd = Q_EXECUTABLE + ' -m relaxed "select c9 from %s" -c 9' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 9)
+ self.assertEqual(len(e), 0)
+
+ expected_output = list(map(lambda x:b(x),["/selinux", "/mnt", "/srv", "/lost+found", '"/initrd.img.old -> /boot/initrd.img-3.8.0-19-generic"',
+ "/cdrom", "/home", '"/vmlinuz -> boot/vmlinuz-3.8.0-19-generic"', '"/initrd.img -> boot/initrd.img-3.8.0-19-generic"']))
+
+ self.assertEqual(o, expected_output)
+
+ self.cleanup(tmpfile)
+
+ def test_1_column_warning_in_relaxed_mode(self):
+ tmpfile = self.create_file_with_data(one_column_data)
+ cmd = Q_EXECUTABLE + ' -m relaxed "select c1 from %s" -d ,' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('data without commas 1'))
+ self.assertEqual(o[1],b('data without commas 2'))
+
+ self.cleanup(tmpfile)
+
+ def test_1_column_warning_in_strict_mode(self):
+ tmpfile = self.create_file_with_data(one_column_data)
+ cmd = Q_EXECUTABLE + ' "select c1 from %s" -d , -m strict' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('data without commas 1'))
+ self.assertEqual(o[1],b('data without commas 2'))
+
+ self.cleanup(tmpfile)
+
+
+ def test_1_column_warning_suppression_in_relaxed_mode_when_column_count_is_specific(self):
+ tmpfile = self.create_file_with_data(one_column_data)
+ cmd = Q_EXECUTABLE + ' "select c1 from %s" -d , -m relaxed -c 1' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('data without commas 1'))
+ self.assertEqual(o[1],b('data without commas 2'))
+
+ self.cleanup(tmpfile)
+
+ def test_1_column_warning_suppression_in_strict_mode_when_column_count_is_specific(self):
+ tmpfile = self.create_file_with_data(one_column_data)
+ cmd = Q_EXECUTABLE + ' "select c1 from %s" -d , -m strict -c 1' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('data without commas 1'))
+ self.assertEqual(o[1],b('data without commas 2'))
+
+ self.cleanup(tmpfile)
+
+ def test_fluffy_mode__as_relaxed_mode(self):
+ tmpfile = self.create_file_with_data(uneven_ls_output)
+ cmd = Q_EXECUTABLE + ' -m relaxed "select c9 from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 9)
+ self.assertEqual(len(e), 0)
+
+ expected_output = list(map(lambda x:b(x),["/selinux", "/mnt", "/srv", "/lost+found",
+ "/initrd.img.old", "/cdrom", "/home", "/vmlinuz", "/initrd.img"]))
+
+ self.assertEqual(o, expected_output)
+
+ self.cleanup(tmpfile)
+
+ def test_relaxed_mode_column_count_mismatch__was_previously_fluffy_mode_test(self):
+ data_row = b("column1 column2 column3 column4")
+ data_list = [data_row] * 1000
+ data_list[950] = b("column1 column2 column3 column4 column5")
+ tmpfile = self.create_file_with_data(b("\n").join(data_list))
+
+ cmd = Q_EXECUTABLE + ' -m relaxed "select * from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),1000)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[950],b('column1 column2 column3 "column4 column5"'))
+
+ self.cleanup(tmpfile)
+
+ def test_strict_mode_column_count_mismatch__less_columns(self):
+ data_row = b("column1 column2 column3 column4")
+ data_list = [data_row] * 1000
+ data_list[750] = b("column1 column3 column4")
+ tmpfile = self.create_file_with_data(b("\n").join(data_list))
+
+ cmd = Q_EXECUTABLE + ' -m strict "select * from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode,0)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+ self.assertTrue(e[0].startswith(b("Strict mode - Expected 4 columns instead of 3 columns")))
+ self.assertTrue(b(' row 751.') in e[0])
+
+ self.cleanup(tmpfile)
+
+ def test_strict_mode_column_count_mismatch__more_columns(self):
+ data_row = b("column1 column2 column3 column4")
+ data_list = [data_row] * 1000
+ data_list[750] = b("column1 column2 column3 column4 column5")
+ tmpfile = self.create_file_with_data(b("\n").join(data_list))
+
+ cmd = Q_EXECUTABLE + ' -m strict "select * from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode,0)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+ self.assertTrue(e[0].startswith(b("Strict mode - Expected 4 columns instead of 5 columns")))
+ self.assertTrue(b(' row 751.') in e[0])
+
+ self.cleanup(tmpfile)
\ No newline at end of file
diff --git a/test/QrcTests.py b/test/QrcTests.py
new file mode 100644
index 00000000..ea6c9fb0
--- /dev/null
+++ b/test/QrcTests.py
@@ -0,0 +1,166 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command, b
+from test.test_data import sample_data_no_header
+
+import os
+import uuid
+
+
+class QrcTests(AbstractQTestCase):
+
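+ # q reads default option values from an ini-style "qrc" file. The QRC_FILENAME environment
+ # variable points q at a specific qrc file (presumably overriding its default lookup
+ # locations), which is what these tests rely on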
+ def test_explicit_qrc_filename_not_found(self):
+ non_existent_filename = str(uuid.uuid4())
+ env_to_inject = { 'QRC_FILENAME': non_existent_filename}
+ cmd = Q_EXECUTABLE + ' "select 1"'
+ retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)
+
+ self.assertEqual(retcode, 244)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0], b('QRC_FILENAME env var exists, but cannot find qrc file at %s' % non_existent_filename))
+
+ def test_explicit_qrc_filename_that_exists(self):
+ tmp_qrc_file = self.create_file_with_data(b('''[options]
+output_delimiter=|
+'''))
+ env_to_inject = { 'QRC_FILENAME': tmp_qrc_file.name}
+ cmd = Q_EXECUTABLE + ' "select 1,2"'
+ retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0], b('1|2'))
+
+ self.cleanup(tmp_qrc_file)
+
+ def test_all_default_options(self):
+ # Create a qrc file that explicitly sets every supported option, using values that differ from the built-in defaults
+ tmp_qrc_file = self.create_file_with_data(b('''[options]
+analyze_only=True
+beautify=True
+caching_mode=readwrite
+column_count=32
+delimiter=,
+disable_column_type_detection=True
+disable_double_double_quoting=False
+disable_escaped_double_quoting=False
+encoding=ascii
+formatting=xxx
+gzipped=True
+input_quoting_mode=all
+keep_leading_whitespace_in_values=True
+list_user_functions=True
+max_attached_sqlite_databases=888
+max_column_length_limit=8888
+mode=strict
+output_delimiter=|
+output_encoding=utf-8
+output_header=True
+output_quoting_mode=all
+overwrite_qsql=False
+pipe_delimited=True
+pipe_delimited_output=True
+query_encoding=ascii
+query_filename=query-filename
+save_db_to_disk_filename=save-db-to-disk-filename
+skip_header=True
+tab_delimited=True
+tab_delimited_output=true
+verbose=True
+with_universal_newlines=True
+'''))
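+ # --dump-defaults prints the effective option values back out in qrc format, so loading a
+ # fully-populated qrc file and dumping it checks that every option round-trips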
+ env_to_inject = { 'QRC_FILENAME': tmp_qrc_file.name}
+ cmd = Q_EXECUTABLE + ' --dump-defaults'
+ retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 34)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b('[options]'))
+ o = o[1:]
+
+ m = {}
+ for r in o:
+ key,val = r.split(b("="),1)
+ m[key] = val
+
+ self.assertEqual(m[b('analyze_only')],b('True'))
+ self.assertEqual(m[b('beautify')],b('True'))
+ self.assertEqual(m[b('caching_mode')],b('readwrite'))
+ self.assertEqual(m[b('column_count')],b('32'))
+ self.assertEqual(m[b('delimiter')],b(','))
+ self.assertEqual(m[b('disable_column_type_detection')],b('True'))
+ self.assertEqual(m[b('disable_double_double_quoting')],b('False'))
+ self.assertEqual(m[b('disable_escaped_double_quoting')],b('False'))
+ self.assertEqual(m[b('encoding')],b('ascii'))
+ self.assertEqual(m[b('formatting')],b('xxx'))
+ self.assertEqual(m[b('gzipped')],b('True'))
+ self.assertEqual(m[b('input_quoting_mode')],b('all'))
+ self.assertEqual(m[b('keep_leading_whitespace_in_values')],b('True'))
+ self.assertEqual(m[b('list_user_functions')],b('True'))
+ self.assertEqual(m[b('max_attached_sqlite_databases')],b('888'))
+ self.assertEqual(m[b('max_column_length_limit')],b('8888'))
+ self.assertEqual(m[b('mode')],b('strict'))
+ self.assertEqual(m[b('output_delimiter')],b('|'))
+ self.assertEqual(m[b('output_encoding')],b('utf-8'))
+ self.assertEqual(m[b('output_header')],b('True'))
+ self.assertEqual(m[b('output_quoting_mode')],b('all'))
+ self.assertEqual(m[b('overwrite_qsql')],b('False'))
+ self.assertEqual(m[b('pipe_delimited')],b('True'))
+ self.assertEqual(m[b('pipe_delimited_output')],b('True'))
+ self.assertEqual(m[b('query_encoding')],b('ascii'))
+ self.assertEqual(m[b('query_filename')],b('query-filename'))
+ self.assertEqual(m[b('save_db_to_disk_filename')],b('save-db-to-disk-filename'))
+ self.assertEqual(m[b('skip_header')],b('True'))
+ self.assertEqual(m[b('tab_delimited')],b('True'))
+ self.assertEqual(m[b('tab_delimited_output')],b('True'))
+ self.assertEqual(m[b('verbose')],b('True'))
+ self.assertEqual(m[b('with_universal_newlines')],b('True'))
+
+ self.cleanup(tmp_qrc_file)
+
+ def test_caching_readwrite_using_qrc_file(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+ tmpfile_folder = os.path.dirname(tmpfile.name)
+ tmpfile_filename = os.path.basename(tmpfile.name)
+ expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),3)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('a,1,0'))
+ self.assertEqual(o[1],b('b,2,0'))
+ self.assertEqual(o[2],b('c,,0'))
+
+ # Ensure that the default caching mode does not create a cache file
+ self.assertTrue(not os.path.exists(expected_cache_filename))
+
+ tmp_qrc_file = self.create_file_with_data(b('''[options]
+caching_mode=readwrite
+'''))
+ env_to_inject = { 'QRC_FILENAME': tmp_qrc_file.name }
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),3)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('a,1,0'))
+ self.assertEqual(o[1],b('b,2,0'))
+ self.assertEqual(o[2],b('c,,0'))
+
+ # Ensure that the caching mode from the qrc file took effect (a cache file should now exist)
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ self.cleanup(tmp_qrc_file)
+ self.cleanup(tmpfile)
\ No newline at end of file
diff --git a/test/QsqlUsageTests.py b/test/QsqlUsageTests.py
new file mode 100644
index 00000000..1b36f0a2
--- /dev/null
+++ b/test/QsqlUsageTests.py
@@ -0,0 +1,608 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command, b
+from test.utils import DEBUG
+import os
+import sqlite3
+
+
+class QsqlUsageTests(AbstractQTestCase):
+
+ def test_concatenate_same_qsql_file_with_single_table(self):
+ numbers = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+
+ qsql_file_data = self.arrays_to_qsql_file_content([b('aa'), b('bb'), b('cc')], numbers)
+
+ tmpfile = self.create_file_with_data(qsql_file_data,suffix='.qsql')
+
+ cmd = Q_EXECUTABLE + ' -t "select count(*) from (select * from %s union all select * from %s)"' % (tmpfile.name,tmpfile.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('20000'))
+
+ def test_query_qsql_with_single_table(self):
+ numbers = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+
+ qsql_file_data = self.arrays_to_qsql_file_content([b('aa'), b('bb'), b('cc')], numbers)
+
+ tmpfile = self.create_file_with_data(qsql_file_data)
+
+ cmd = Q_EXECUTABLE + ' -t "select sum(aa),sum(bb),sum(cc) from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('50005000\t50005000\t50005000'))
+
+ def test_query_qsql_with_single_table_with_explicit_non_existent_tablename(self):
+ numbers = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+
+ qsql_file_data = self.arrays_to_qsql_file_content([b('aa'), b('bb'), b('cc')], numbers)
+
+ tmpfile = self.create_file_with_data(qsql_file_data)
+
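+ # A qsql file is a plain sqlite database with an extra _qcatalog metadata table, which
+ # records (among other things) the table name q chose for each stored table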
+ c = sqlite3.connect(tmpfile.name)
+ actual_table_name = c.execute('select temp_table_name from _qcatalog').fetchall()[0][0]
+ c.close()
+
+
+ cmd = '%s -t "select sum(aa),sum(bb),sum(cc) from %s:::non-existent"' % (Q_EXECUTABLE,tmpfile.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 84)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+ self.assertEqual(e[0],b('Table non-existent could not be found in qsql file %s . Existing table names: %s' % (tmpfile.name,actual_table_name)))
+
+ def test_query_qsql_with_single_table_with_explicit_table_name(self):
+ numbers = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+
+ qsql_file_data = self.arrays_to_qsql_file_content([b('aa'), b('bb'), b('cc')], numbers)
+
+ tmpfile = self.create_file_with_data(qsql_file_data)
+
+ c = sqlite3.connect(tmpfile.name)
+ actual_table_name = c.execute('select temp_table_name from _qcatalog').fetchall()[0][0]
+ c.close()
+
+
+ cmd = '%s -t "select sum(aa),sum(bb),sum(cc) from %s:::%s"' % (Q_EXECUTABLE,tmpfile.name,actual_table_name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('50005000\t50005000\t50005000'))
+
+ def test_query_multi_qsql_with_single_table(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ qsql_file_data1 = self.arrays_to_qsql_file_content([b('aa'), b('bb'), b('cc')], numbers1)
+ tmpfile1 = self.create_file_with_data(qsql_file_data1,suffix='.qsql')
+
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+ qsql_file_data2 = self.arrays_to_qsql_file_content([b('aa'), b('bb'), b('cc')], numbers2)
+ tmpfile2 = self.create_file_with_data(qsql_file_data2,suffix='.qsql')
+
+ cmd = Q_EXECUTABLE + ' -t "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)"' % (tmpfile2.name,tmpfile1.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('55\t55\t55'))
+
+ def test_query_concatenated_qsqls_each_with_single_table(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ qsql_file_data1 = self.arrays_to_qsql_file_content([b('aa'), b('bb'), b('cc')], numbers1)
+ tmpfile1 = self.create_file_with_data(qsql_file_data1,suffix='.qsql')
+
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+ qsql_file_data2 = self.arrays_to_qsql_file_content([b('aa'), b('bb'), b('cc')], numbers2)
+ tmpfile2 = self.create_file_with_data(qsql_file_data2,suffix='.qsql')
+
+ cmd = Q_EXECUTABLE + ' -t "select sum(aa),sum(bb),sum(cc) from (select * from %s union all select * from %s)"' % (tmpfile2.name,tmpfile1.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('50005055\t50005055\t50005055'))
+
+ def test_concatenated_qsql_and_data_stream__column_names_mismatch(self):
+ N1 = 10000
+ N2 = 100
+
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, N1 + 1)]
+ csv_file_data1 = self.arrays_to_csv_file_content(b('\t'),[b('aa'), b('bb'), b('cc')], numbers1)
+ tmpfile1 = self.create_file_with_data(csv_file_data1)
+ expected_cache_filename1 = '%s.qsql' % tmpfile1.name
+
+ cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
+ cmd = 'seq 1 %s | %s -c 1 "select count(*) from (select * from %s UNION ALL select * from -)"' % (N2, Q_EXECUTABLE,expected_cache_filename1)
+
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 1)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+ self.assertEqual(e[0],b('query error: SELECTs to the left and right of UNION ALL do not have the same number of result columns'))
+
+ def test_concatenated_qsql_and_data_stream(self):
+ N1 = 10000
+ N2 = 100
+
+ numbers1 = [[b(str(i))] for i in range(1, N1 + 1)]
+ csv_file_data1 = self.arrays_to_csv_file_content(b('\t'),[b('c1')], numbers1)
+ tmpfile1 = self.create_file_with_data(csv_file_data1)
+ expected_cache_filename1 = '%s.qsql' % tmpfile1.name
+
+ cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
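+ # "-" refers to the piped stdin data stream, unioned here with the cached qsql file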
+ cmd = 'seq 1 %s | %s -t -c 1 "select count(*),sum(c1) from (select * from %s UNION ALL select * from -)"' % (N2, Q_EXECUTABLE,expected_cache_filename1)
+
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('%s\t%s' % (N1+N2,sum(range(1,N1+1)) + sum(range(1,N2+1)))))
+
+ def test_concatenated_qsql_and_data_stream__explicit_table_name(self):
+ N1 = 10000
+ N2 = 100
+
+ numbers1 = [[b(str(i))] for i in range(1, N1 + 1)]
+ csv_file_data1 = self.arrays_to_csv_file_content(b('\t'),[b('c1')], numbers1)
+ tmpfile1 = self.create_file_with_data(csv_file_data1)
+ tmpfile1_expected_table_name = os.path.basename(tmpfile1.name)
+
+ expected_cache_filename1 = '%s.qsql' % tmpfile1.name
+
+ cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
+ cmd = 'seq 1 %s | %s -t -c 1 "select count(*),sum(c1) from (select * from %s:::%s UNION ALL select * from -)"' % (N2, Q_EXECUTABLE,expected_cache_filename1,tmpfile1_expected_table_name)
+
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('%s\t%s' % (N1+N2,sum(range(1,N1+1)) + sum(range(1,N2+1)))))
+
+ def test_write_to_qsql__check_chosen_table_name(self):
+ numbers1 = [[b(str(i))] for i in range(1, 10001)]
+ csv_file_data1 = self.arrays_to_csv_file_content(b('\t'),[b('c1')], numbers1)
+ tmpfile1 = self.create_file_with_data(csv_file_data1)
+ expected_cache_filename1 = '%s.qsql' % tmpfile1.name
+
+ cmd = Q_EXECUTABLE + ' -c 1 -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
+ c = sqlite3.connect(expected_cache_filename1)
+ qcatalog_entries = c.execute('select temp_table_name from _qcatalog').fetchall()
+ self.assertEqual(len(qcatalog_entries),1)
+ self.assertEqual(qcatalog_entries[0][0],os.path.basename(tmpfile1.name))
+
+ def test_concatenated_mixes_qsql_with_single_table_and_csv(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ csv_file_data1 = self.arrays_to_csv_file_content(b('\t'),[b('aa'), b('bb'), b('cc')], numbers1)
+ tmpfile1 = self.create_file_with_data(csv_file_data1)
+ expected_cache_filename1 = '%s.qsql' % tmpfile1.name
+
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+ csv_file_data2 = self.arrays_to_csv_file_content(b('\t'),[b('aa'), b('bb'), b('cc')], numbers2)
+ tmpfile2 = self.create_file_with_data(csv_file_data2)
+ expected_cache_filename2 = '%s.qsql' % tmpfile2.name
+
+
+ cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
+ cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile2.name
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertTrue(os.path.exists(expected_cache_filename2))
+
+ # CSV and qsql files prepared. Now test all four combinations
+
+ cmd = Q_EXECUTABLE + ' -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s union all select * from %s)"' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),2)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('cnt\tsum_aa\tsum_bb\tsum_cc'))
+ self.assertEqual(o[1],b('10010\t50005055\t50005055\t50005055'))
+
+ cmd = Q_EXECUTABLE + ' -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s union all select * from %s.qsql)"' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),2)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('cnt\tsum_aa\tsum_bb\tsum_cc'))
+ self.assertEqual(o[1],b('10010\t50005055\t50005055\t50005055'))
+
+ cmd = Q_EXECUTABLE + ' -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s.qsql union all select * from %s)"' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),2)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('cnt\tsum_aa\tsum_bb\tsum_cc'))
+ self.assertEqual(o[1],b('10010\t50005055\t50005055\t50005055'))
+
+ cmd = Q_EXECUTABLE + ' -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s.qsql union all select * from %s.qsql)"' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),2)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('cnt\tsum_aa\tsum_bb\tsum_cc'))
+ self.assertEqual(o[1],b('10010\t50005055\t50005055\t50005055'))
+
+ def test_analysis_of_concatenated_mixes_qsql_with_single_table_and_csv(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ csv_file_data1 = self.arrays_to_csv_file_content(b('\t'),[b('aa'), b('bb'), b('cc')], numbers1)
+ tmpfile1 = self.create_file_with_data(csv_file_data1)
+ expected_cache_filename1 = '%s.qsql' % tmpfile1.name
+
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+ csv_file_data2 = self.arrays_to_csv_file_content(b('\t'),[b('aa'), b('bb'), b('cc')], numbers2)
+ tmpfile2 = self.create_file_with_data(csv_file_data2)
+ expected_cache_filename2 = '%s.qsql' % tmpfile2.name
+
+ cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertTrue(os.path.exists(expected_cache_filename1))
+
+ cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile2.name
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertTrue(os.path.exists(expected_cache_filename2))
+
+ # CSV and qsql files prepared
+
+ # Helper assertion, used multiple times below, each time with a different combination
+
+ def do_check(caching_mode,
+ file1_source_type,file1_table_postfix,file1_postfix,
+ file2_source_type,file2_table_postfix,file2_postfix):
+ cmd = '%s -C %s -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s%s UNION ALL select * from %s%s)" -A' % (
+ Q_EXECUTABLE,
+ caching_mode,
+ tmpfile1.name,
+ file1_table_postfix,
+ tmpfile2.name,
+ file2_table_postfix)
+
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),14)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o, [
+ b('Table: %s%s' % (tmpfile1.name,file1_table_postfix)),
+ b(' Sources:'),
+ b(' source_type: %s source: %s%s' % (file1_source_type,tmpfile1.name,file1_postfix)),
+ b(' Fields:'),
+ b(' `aa` - int'),
+ b(' `bb` - int'),
+ b(' `cc` - int'),
+ b('Table: %s%s' % (tmpfile2.name,file2_table_postfix)),
+ b(' Sources:'),
+ b(' source_type: %s source: %s%s' % (file2_source_type,tmpfile2.name,file2_postfix)),
+ b(' Fields:'),
+ b(' `aa` - int'),
+ b(' `bb` - int'),
+ b(' `cc` - int')])
+
+ # Now test *the analysis results* of all four combinations, adding `-C read` so the
+ # qsql files will be used. Running with `-C none` would have caused the qsql files not to be used even if they exist
+
+ do_check(caching_mode='read',
+ file1_source_type='qsql-file-with-original',file1_table_postfix='',file1_postfix='.qsql',
+ file2_source_type='qsql-file-with-original',file2_table_postfix='',file2_postfix='.qsql')
+ do_check('read',
+ file1_source_type='qsql-file-with-original',file1_table_postfix='',file1_postfix='.qsql',
+ file2_source_type='qsql-file',file2_table_postfix='.qsql',file2_postfix='.qsql')
+ do_check('read',
+ file1_source_type='qsql-file',file1_table_postfix='.qsql',file1_postfix='.qsql',
+ file2_source_type='qsql-file-with-original',file2_table_postfix='',file2_postfix='.qsql')
+ do_check('read',
+ file1_source_type='qsql-file',file1_table_postfix='.qsql',file1_postfix='.qsql',
+ file2_source_type='qsql-file',file2_table_postfix='.qsql',file2_postfix='.qsql')
+
+ # Now test all the combinations again, this time with `-C none`, to make sure that by
+ # default the qsql files are not used, and that -A shows that fact
+
+ do_check(caching_mode='none',
+ file1_source_type='file-with-unused-qsql',file1_table_postfix='',file1_postfix='',
+ file2_source_type='file-with-unused-qsql',file2_table_postfix='',file2_postfix='')
+ do_check('none',
+ file1_source_type='file-with-unused-qsql',file1_table_postfix='',file1_postfix='',
+ file2_source_type='qsql-file',file2_table_postfix='.qsql',file2_postfix='.qsql')
+ do_check('none',
+ file1_source_type='qsql-file',file1_table_postfix='.qsql',file1_postfix='.qsql',
+ file2_source_type='file-with-unused-qsql',file2_table_postfix='',file2_postfix='')
+ do_check('none',
+ file1_source_type='qsql-file',file1_table_postfix='.qsql',file1_postfix='.qsql',
+ file2_source_type='qsql-file',file2_table_postfix='.qsql',file2_postfix='.qsql')
+
+ def test_mixed_qsql_with_single_table_and_csv__missing_header_parameter_for_csv(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ qsql_file_data1 = self.arrays_to_qsql_file_content([b('aa'), b('bb'), b('cc')], numbers1)
+ tmpfile1 = self.create_file_with_data(qsql_file_data1,suffix='.qsql')
+
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+ csv_file_data2 = self.arrays_to_csv_file_content(b('\t'),[b('aa'), b('bb'), b('cc')], numbers2)
+ tmpfile2 = self.create_file_with_data(csv_file_data2)
+
+ cmd = Q_EXECUTABLE + ' -t "select sum(aa),sum(bb),sum(cc) from (select * from %s union all select * from %s)"' % (tmpfile1.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+
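+ # Expected per-column sum: sum(1..10000) + sum(1..10) = 50005000 + 55 = 50005055. The value comes back as a float, presumably because the missing -H causes the csv columns to be loaded as text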
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b('Warning - There seems to be header line in the file, but -H has not been specified. All fields will be detected as text fields, and the header line will appear as part of the data'))
+ self.assertEqual(o[0],b('50005055.0\t50005055.0\t50005055.0'))
+
+ def test_qsql_with_multiple_tables_direct_use(self):
+ numbers1 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 10001)]
+ qsql_filename1 = self.create_qsql_file_with_content_and_return_filename([b('aa'), b('bb'), b('cc')],numbers1)
+ expected_stored_table_name1 = os.path.basename(qsql_filename1)[:-5]
+
+ numbers2 = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 11)]
+ qsql_filename2 = self.create_qsql_file_with_content_and_return_filename([b('aa'), b('bb'), b('cc')],numbers2)
+ expected_stored_table_name2 = os.path.basename(qsql_filename2)[:-5]
+
+ qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
+
+ cmd = '%s -t "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s large_file left join %s small_file on (large_file.aa == small_file.bb)" -S %s' % \
+ (Q_EXECUTABLE,qsql_filename1,qsql_filename2,qsql_with_multiple_tables)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+ self.assertEqual(e[0], b('Going to save data into a disk database: %s' % qsql_with_multiple_tables))
+ self.assertTrue(e[1].startswith(b('Data has been saved into %s . Saving has taken' % qsql_with_multiple_tables)))
+ self.assertEqual(e[2],b('Query to run on the database: select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s large_file left join %s small_file on (large_file.aa == small_file.bb);' % \
+ (expected_stored_table_name1,expected_stored_table_name2)))
+ self.assertEqual(e[3],b('You can run the query directly from the command line using the following command: echo "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s large_file left join %s small_file on (large_file.aa == small_file.bb)" | sqlite3 %s' % \
+ (expected_stored_table_name1,expected_stored_table_name2,qsql_with_multiple_tables)))
+
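+ # The <file>:::<table> syntax addresses a specific table inside a multi-table qsql/sqlite file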
+ cmd = '%s -d , "select count(*) cnt,sum(aa),sum(bb),sum(cc) from %s:::%s"' % (Q_EXECUTABLE,qsql_with_multiple_tables,expected_stored_table_name1)
+ r, o, e = run_command(cmd)
+
+ self.assertEqual(r,0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('10000,50005000,50005000,50005000'))
+
+ def test_direct_use_of_sqlite_db_with_one_table(self):
+ tmpfile = self.create_file_with_data(b(''),suffix='.sqlite')
+ os.remove(tmpfile.name)
+ c = sqlite3.connect(tmpfile.name)
+ c.execute(' create table mytable (x int, y int)').fetchall()
+ c.execute(' insert into mytable (x,y) values (100,200),(300,400)').fetchall()
+ c.commit()
+ c.close()
+
+ cmd = Q_EXECUTABLE + ' -t "select sum(x),sum(y) from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('400\t600'))
+
+ cmd = Q_EXECUTABLE + ' -t "select sum(x),sum(y) from %s:::mytable"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('400\t600'))
+
+ def test_direct_use_of_sqlite_db_with_one_table__nonexistent_table(self):
+ tmpfile = self.create_file_with_data(b(''),suffix='.sqlite')
+ os.remove(tmpfile.name)
+ c = sqlite3.connect(tmpfile.name)
+ c.execute(' create table some_numbers (x int, y int)').fetchall()
+ c.execute(' insert into some_numbers (x,y) values (100,200),(300,400)').fetchall()
+ c.commit()
+ c.close()
+
+ cmd = Q_EXECUTABLE + ' -t "select sum(x),sum(y) from %s:::non_existent"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 85)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b('Table non_existent could not be found in sqlite file %s . Existing table names: some_numbers' % (tmpfile.name)))
+
+
+ def test_qsql_creation_and_direct_use(self):
+ numbers = [[b(str(i)),b(str(i)),b(str(i))] for i in range(1,10001)]
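+ # sum(1..10000) = 10000*10001/2 = 50005000, which is the expected value for each column below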
+
+ file_data = self.arrays_to_csv_file_content(b('\t'),[b('aa'),b('bb'),b('cc')],numbers)
+
+ tmpfile = self.create_file_with_data(file_data)
+ tmpfile_folder = os.path.dirname(tmpfile.name)
+ tmpfile_filename = os.path.basename(tmpfile.name)
+ expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -t "select sum(aa),sum(bb),sum(cc) from %s" -H -C readwrite' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('50005000\t50005000\t50005000'))
+
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ self.cleanup(tmpfile)
+
+ # Get the data using a comma delimiter, to make sure that column parsing was done correctly
+ cmd = Q_EXECUTABLE + ' -D , "select count(*),sum(aa),sum(bb),sum(cc) from %s"' % expected_cache_filename
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('10000,50005000,50005000,50005000'))
+
+ def test_analysis_of_qsql_direct_usage(self):
+ numbers = [[b(str(i)),b(str(i)),b(str(i))] for i in range(1,10001)]
+
+ file_data = self.arrays_to_csv_file_content(b('\t'),[b('aa'),b('bb'),b('cc')],numbers)
+
+ tmpfile = self.create_file_with_data(file_data)
+ tmpfile_folder = os.path.dirname(tmpfile.name)
+ tmpfile_filename = os.path.basename(tmpfile.name)
+ expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -t "select sum(aa),sum(bb),sum(cc) from %s" -H -C readwrite' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('50005000\t50005000\t50005000'))
+
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ self.cleanup(tmpfile)
+
+ cmd = Q_EXECUTABLE + ' "select * from %s" -A' % expected_cache_filename
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 7)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('Table: %s' % expected_cache_filename))
+ self.assertEqual(o[1],b(" Sources:"))
+ self.assertEqual(o[2],b(' source_type: qsql-file source: %s' % expected_cache_filename))
+ self.assertEqual(o[3],b(" Fields:"))
+ self.assertEqual(o[4],b(' `aa` - int'))
+ self.assertEqual(o[5],b(' `bb` - int'))
+ self.assertEqual(o[6],b(' `cc` - int'))
+
+ def test_analysis_of_qsql_direct_usage2(self):
+ numbers = [[b(str(i)),b(str(i)),b(str(i))] for i in range(1,10001)]
+
+ file_data = self.arrays_to_csv_file_content(b('\t'),[b('aa'),b('bb'),b('cc')],numbers)
+
+ tmpfile = self.create_file_with_data(file_data)
+ tmpfile_folder = os.path.dirname(tmpfile.name)
+ tmpfile_filename = os.path.basename(tmpfile.name)
+ expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
+
+ cmd = Q_EXECUTABLE + ' -H -t "select sum(aa),sum(bb),sum(cc) from %s" -H -C readwrite' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('50005000\t50005000\t50005000'))
+
+ self.assertTrue(os.path.exists(expected_cache_filename))
+
+ self.cleanup(tmpfile)
+
+ cmd = Q_EXECUTABLE + ' "select * from %s" -A' % expected_cache_filename
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 7)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(o[0],b('Table: %s' % expected_cache_filename))
+ self.assertEqual(o[1],b(" Sources:"))
+ self.assertEqual(o[2],b(' source_type: qsql-file source: %s' % expected_cache_filename))
+ self.assertEqual(o[3],b(" Fields:"))
+ self.assertEqual(o[4],b(' `aa` - int'))
+ self.assertEqual(o[5],b(' `bb` - int'))
+ self.assertEqual(o[6],b(' `cc` - int'))
+
+ def test_direct_qsql_usage_for_single_table_qsql_file(self):
+ disk_db_filename = self.random_tmp_filename('save-to-db','qsql')
+
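+ # Data read from stdin is stored under the fixed table name 'data_stream_stdin', which is referenced directly below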
+ cmd = 'seq 1 10000 | %s -t "select sum(aa),sum(bb),sum(cc) from -" -S %s' % (Q_EXECUTABLE,disk_db_filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+
+ cmd = '%s -D, "select count(*),sum(c1) from %s:::data_stream_stdin"' % (Q_EXECUTABLE,disk_db_filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('10000,50005000'))
+
+ def test_direct_qsql_usage_for_single_table_qsql_file__nonexistent_table(self):
+ disk_db_filename = self.random_tmp_filename('save-to-db','qsql')
+
+ cmd = 'seq 1 10000 | %s -t "select sum(aa),sum(bb),sum(cc) from -" -S %s' % (Q_EXECUTABLE,disk_db_filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+
+ cmd = '%s -D, "select count(*),sum(c1) from %s:::unknown_table_name"' % (Q_EXECUTABLE,disk_db_filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 85)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+ self.assertEqual(e[0],b('Table unknown_table_name could not be found in sqlite file %s . Existing table names: data_stream_stdin' % (disk_db_filename)))
+
+ def test_direct_qsql_usage_from_written_data_stream(self):
+ disk_db_filename = self.random_tmp_filename('save-to-db','qsql')
+
+ cmd = 'seq 1 10000 | %s -t "select sum(aa),sum(bb),sum(cc) from -" -S %s' % (Q_EXECUTABLE,disk_db_filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+
+ cmd = '%s -D, "select count(*),sum(c1) from %s:::data_stream_stdin"' % (Q_EXECUTABLE,disk_db_filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('10000,50005000'))
+
+ def test_direct_qsql_self_join(self):
+ disk_db_filename = self.random_tmp_filename('save-to-db','qsql')
+
+ N = 100
+ cmd = 'seq 1 %s | %s -t "select count(*),sum(c1) from -" -S %s' % (N,Q_EXECUTABLE,disk_db_filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+
+ cmd = '%s -D, "select count(*),sum(a.c1),sum(b.c1) from %s:::data_stream_stdin a left join %s:::data_stream_stdin b"' % (Q_EXECUTABLE,disk_db_filename,disk_db_filename)
+ retcode, o, e = run_command(cmd)
+
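+ # The left join has no ON clause, so it is effectively a cross join: N*N rows, with each value summed N times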
+ expected_sum = sum(range(1,N+1))*N
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o[0],b('10000,%s,%s' % (expected_sum,expected_sum)))
\ No newline at end of file
diff --git a/test/QuotingTests.py b/test/QuotingTests.py
new file mode 100644
index 00000000..dc929b34
--- /dev/null
+++ b/test/QuotingTests.py
@@ -0,0 +1,466 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command
+from test.test_data import sample_quoted_data
+from test.test_data import sample_quoted_data2
+from test.test_data import sample_quoted_data2_with_newline
+from test.test_data import double_double_quoted_data
+from test.test_data import escaped_double_quoted_data
+from test.test_data import combined_quoted_data
+
+from test.utils import DEBUG
+from test.utils import b
+
+class QuotingTests(AbstractQTestCase):
+ def test_non_quoted_values_in_quoted_data(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data)
+
+ cmd = Q_EXECUTABLE + ' -d " " "select c1 from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),4)
+
+ self.assertEqual(o[0],b('non_quoted'))
+ self.assertEqual(o[1],b('control-value-1'))
+ self.assertEqual(o[2],b('non-quoted-value'))
+ self.assertEqual(o[3],b('control-value-1'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_regular_quoted_values_in_quoted_data(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data)
+
+ cmd = Q_EXECUTABLE + ' -d " " "select c2 from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),4)
+
+ self.assertEqual(o[0],b('regular_double_quoted'))
+ self.assertEqual(o[1],b('control-value-2'))
+ self.assertEqual(o[2],b('this is a quoted value'))
+ self.assertEqual(o[3],b('control-value-2'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_double_double_quoted_values_in_quoted_data(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data)
+
+ cmd = Q_EXECUTABLE + ' -d " " "select c3 from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),4)
+
+ self.assertEqual(o[0],b('double_double_quoted'))
+ self.assertEqual(o[1],b('control-value-3'))
+ self.assertEqual(o[2],b('this is a "double double" quoted value'))
+ self.assertEqual(o[3],b('control-value-3'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_escaped_double_quoted_values_in_quoted_data(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data)
+
+ cmd = Q_EXECUTABLE + ' -d " " "select c4 from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),4)
+
+ self.assertEqual(o[0],b('escaped_double_quoted'))
+ self.assertEqual(o[1],b('control-value-4'))
+ self.assertEqual(o[2],b('this is an escaped "quoted value"'))
+ self.assertEqual(o[3],b('control-value-4'))
+
+ self.cleanup(tmp_data_file)
+
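+ # The tests below cover the quoting modes: -w controls input quoting (none/minimal/all)
+ # and -W controls output quoting (none/minimal/nonnumeric/all)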
+ def test_none_input_quoting_mode_in_relaxed_mode(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ cmd = Q_EXECUTABLE + ' -d " " -m relaxed -D , -w none -W none "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('"quoted,data",23'))
+ self.assertEqual(o[1],b('unquoted-data,54,'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_none_input_quoting_mode_in_strict_mode(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ cmd = Q_EXECUTABLE + ' -d " " -m strict -D , -w none "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode,0)
+ self.assertEqual(len(e),1)
+ self.assertEqual(len(o),0)
+
+ self.assertTrue(e[0].startswith(b('Strict mode. Column Count is expected to identical')))
+
+ self.cleanup(tmp_data_file)
+
+ def test_minimal_input_quoting_mode(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ cmd = Q_EXECUTABLE + ' -d " " -D , -w minimal "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('quoted data,23'))
+ self.assertEqual(o[1],b('unquoted-data,54'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_all_input_quoting_mode(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ cmd = Q_EXECUTABLE + ' -d " " -D , -w all "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('quoted data,23'))
+ self.assertEqual(o[1],b('unquoted-data,54'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_incorrect_input_quoting_mode(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ cmd = Q_EXECUTABLE + ' -d " " -D , -w unknown_wrapping_mode "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEqual(retcode,0)
+ self.assertEqual(len(e),1)
+ self.assertEqual(len(o),0)
+
+ self.assertTrue(e[0].startswith(b('Input quoting mode can only be one of all,minimal,none')))
+ self.assertTrue(b('unknown_wrapping_mode') in e[0])
+
+ self.cleanup(tmp_data_file)
+
+ def test_none_output_quoting_mode(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ cmd = Q_EXECUTABLE + ' -d " " -D , -w all -W none "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('quoted data,23'))
+ self.assertEqual(o[1],b('unquoted-data,54'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_minimal_output_quoting_mode__without_need_to_quote_in_output(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ cmd = Q_EXECUTABLE + ' -d " " -D , -w all -W minimal "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('quoted data,23'))
+ self.assertEqual(o[1],b('unquoted-data,54'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_minimal_output_quoting_mode__with_need_to_quote_in_output_due_to_delimiter(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ # The output delimiter is set to space, so output values that contain spaces will need to be quoted
+ cmd = Q_EXECUTABLE + ' -d " " -D " " -w all -W minimal "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('"quoted data" 23'))
+ self.assertEqual(o[1],b('unquoted-data 54'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_minimal_output_quoting_mode__with_need_to_quote_in_output_due_to_newline(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2_with_newline)
+
+ # Delimiter is set to colon (:), so it will not be inside the data values (this will make sure that the newline is the one causing the quoting)
+ cmd = Q_EXECUTABLE + " -d ':' -w all -W minimal \"select c1,c2,replace(c1,'with' || x'0a' || 'a new line inside it','NEWLINE-REMOVED') from %s\"" % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),3)
+
+ self.assertEqual(o[0],b('"quoted data with'))
+ # Notice that the third column here is not quoted, because we replaced the newline with something else
+ self.assertEqual(o[1],b('a new line inside it":23:quoted data NEWLINE-REMOVED'))
+ self.assertEqual(o[2],b('unquoted-data:54:unquoted-data'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_nonnumeric_output_quoting_mode(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ cmd = Q_EXECUTABLE + ' -d " " -D , -w all -W nonnumeric "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('"quoted data",23'))
+ self.assertEqual(o[1],b('"unquoted-data",54'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_all_output_quoting_mode(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+ cmd = Q_EXECUTABLE + ' -d " " -D , -w all -W all "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('"quoted data","23"'))
+ self.assertEqual(o[1],b('"unquoted-data","54"'))
+
+ self.cleanup(tmp_data_file)
+
+ def _internal_test_consistency_of_chaining_output_to_input(self,input_data,input_wrapping_mode,output_wrapping_mode):
+
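+ # Pipes the same q invocation three times in a row; if the output quoting mode round-trips cleanly through the input quoting mode, the final output must be identical to the original input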
+ tmp_data_file = self.create_file_with_data(input_data)
+
+ basic_cmd = Q_EXECUTABLE + ' -w %s -W %s "select * from -"' % (input_wrapping_mode,output_wrapping_mode)
+ chained_cmd = 'cat %s | %s | %s | %s' % (tmp_data_file.name,basic_cmd,basic_cmd,basic_cmd)
+
+ retcode, o, e = run_command(chained_cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(b("\n").join(o),input_data)
+
+ self.cleanup(tmp_data_file)
+
+ def test_consistency_of_chaining_minimal_wrapping_to_minimal_wrapping(self):
+ input_data = b('"quoted data" 23\nunquoted-data 54')
+ self._internal_test_consistency_of_chaining_output_to_input(input_data,'minimal','minimal')
+
+ def test_consistency_of_chaining_all_wrapping_to_all_wrapping(self):
+ input_data = b('"quoted data" "23"\n"unquoted-data" "54"')
+ self._internal_test_consistency_of_chaining_output_to_input(input_data,'all','all')
+
+ def test_input_field_quoting_and_data_types_with_encoding(self):
+ OUTPUT_ENCODING = 'utf-8'
+
+ # Checks the combination of minimal input field quoting with special characters that need to be decoded.
+ # Both the content and the detected data types are verified
+ data = b('111,22.22,"testing text with special characters - citt\xc3\xa0 ",http://somekindofurl.com,12.13.14.15,12.1\n')
+ tmp_data_file = self.create_file_with_data(data)
+
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s" -E %s' % (tmp_data_file.name,OUTPUT_ENCODING)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),1)
+
+ self.assertEqual(o[0].decode('utf-8'),u'111,22.22,testing text with special characters - citt\xe0 ,http://somekindofurl.com,12.13.14.15,12.1')
+
+ cmd = Q_EXECUTABLE + ' -d , "select * from %s" -A' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),10)
+
+ self.assertEqual(o[0],b('Table: %s' % tmp_data_file.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s' % tmp_data_file.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4],b(' `c1` - int'))
+ self.assertEqual(o[5],b(' `c2` - real'))
+ self.assertEqual(o[6],b(' `c3` - text'))
+ self.assertEqual(o[7],b(' `c4` - text'))
+ self.assertEqual(o[8],b(' `c5` - text'))
+ self.assertEqual(o[9],b(' `c6` - real'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_multiline_double_double_quoted_values_in_quoted_data(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data)
+
+ # FIXME Need to convert \x0a to an encoding suitable for the platform running the tests.
+ cmd = Q_EXECUTABLE + ' -d " " "select replace(c5,X\'0A\',\'::\') from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),4)
+
+ self.assertEqual(o[0],b('multiline_double_double_quoted'))
+ self.assertEqual(o[1],b('control-value-5'))
+ self.assertEqual(o[2],b('this is a double double quoted "multiline:: value".'))
+ self.assertEqual(o[3],b('control-value-5'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_multiline_escaped_double_quoted_values_in_quoted_data(self):
+ tmp_data_file = self.create_file_with_data(sample_quoted_data)
+
+ # FIXME Need to convert \x0a to an encoding suitable for the platform running the tests.
+ cmd = Q_EXECUTABLE + ' -d " " "select replace(c6,X\'0A\',\'::\') from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),4)
+
+ self.assertEqual(o[0],b('multiline_escaped_double_quoted'))
+ self.assertEqual(o[1],b('control-value-6'))
+ self.assertEqual(o[2],b('this is an escaped "multiline:: value".'))
+ self.assertEqual(o[3],b('control-value-6'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_disable_double_double_quoted_data_flag__values(self):
+ # This test (and flag) is meant to verify backward compatibility only. It is possible that
+ # this flag will be removed completely in the future
+
+ tmp_data_file = self.create_file_with_data(double_double_quoted_data)
+
+ cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('double_double_quoted'))
+ self.assertEqual(o[1],b('this is a quoted value with "double'))
+
+ cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting "select c3 from %s" -W none' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b(''))
+ self.assertEqual(o[1],b('double'))
+
+ cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting "select c4 from %s" -W none' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b(''))
+ self.assertEqual(o[1],b('quotes"""'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_disable_escaped_double_quoted_data_flag__values(self):
+ # This test (and flag) is meant to verify backward compatibility only. It is possible that
+ # this flag will be removed completely in the future
+
+ tmp_data_file = self.create_file_with_data(escaped_double_quoted_data)
+
+ cmd = Q_EXECUTABLE + ' -d " " --disable-escaped-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b('escaped_double_quoted'))
+ self.assertEqual(o[1],b('this is a quoted value with \\escaped'))
+
+ cmd = Q_EXECUTABLE + ' -d " " --disable-escaped-double-quoting "select c3 from %s" -W none' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b(''))
+ self.assertEqual(o[1],b('double'))
+
+ cmd = Q_EXECUTABLE + ' -d " " --disable-escaped-double-quoting "select c4 from %s" -W none' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),2)
+
+ self.assertEqual(o[0],b(''))
+ self.assertEqual(o[1],b('quotes\\""'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_combined_quoted_data_flags__number_of_columns_detected(self):
+ # This test (and flags) is meant to verify backward compatibility only. It is possible that
+ # these flags will be removed completely in the future
+ tmp_data_file = self.create_file_with_data(combined_quoted_data)
+
+ cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting --disable-escaped-double-quoting "select * from %s" -A' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ o = o[o.index(b(' Fields:'))+1:]
+
+ self.assertEqual(len(o),7) # found 7 fields
+
+ cmd = Q_EXECUTABLE + ' -d " " --disable-escaped-double-quoting "select * from %s" -A' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ o = o[o.index(b(' Fields:'))+1:]
+
+ self.assertEqual(len(o),5) # found 5 fields
+
+ cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting "select * from %s" -A' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ o = o[o.index(b(' Fields:'))+1:]
+
+ self.assertEqual(len(o),5) # found 5 fields
+
+ cmd = Q_EXECUTABLE + ' -d " " "select * from %s" -A' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(e),0)
+ o = o[o.index(b(' Fields:'))+1:]
+
+ self.assertEqual(len(o),3) # found only 3 fields, which is the correct count
+
+ self.cleanup(tmp_data_file)
\ No newline at end of file
diff --git a/test/SaveToSqliteTests.py b/test/SaveToSqliteTests.py
new file mode 100644
index 00000000..f1203f1e
--- /dev/null
+++ b/test/SaveToSqliteTests.py
@@ -0,0 +1,605 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, batch, get_sqlite_table_list, partition, run_command
+
+from test.utils import b
+from test.utils import DEBUG
+import collections
+import glob
+import os
+import sqlite3
+
+
+class SaveToSqliteTests(AbstractQTestCase):
+
+ # Creates a folder with file_count files; each file has a header line and a single column named 'a'
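+ # For example, batch_size=2 and file_count=2 produce file-0 containing 'a\n1\n2\n' and file-1 containing 'a\n3\n4\n'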
+ def generate_files_in_folder(self,batch_size, file_count):
+ numbers = list(range(1, 1 + batch_size * file_count))
+ numbers_as_text = batch([str(x) for x in numbers], n=batch_size)
+
+ content_list = list(map(b, ['a\n' + "\n".join(x) + '\n' for x in numbers_as_text]))
+
+ filename_list = list(map(lambda x: 'file-%s' % x, range(file_count)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d, 'split-files', 'sqlite-stuff')
+ return (tmpfolder,filename_list)
+
+ # 11074 3.8.2021 10:53 bin/q.py "select count(*) from xxxx/file-95 left join xxxx/file-96 left join xxxx/file-97 left join xxxx/file-97 left join xxxx/file-98 left join xxxx/*" -c 1 -C readwrite -A
+ # fails because the glob picks up qsql files as well
+
+ def test_save_glob_files_to_sqlite(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 5
+
+ tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
+
+ output_sqlite_file = self.random_tmp_filename("x","sqlite")
+
+ cmd = '%s -H "select count(*) from %s/*" -c 1 -S %s' % (Q_EXECUTABLE,tmpfolder,output_sqlite_file)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
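+ # q normalizes filenames into valid table names, so 'file-0' is expected to become table 'file_dash_0'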
+ c = sqlite3.connect(output_sqlite_file)
+ results = c.execute('select a from file_dash_0').fetchall()
+ self.assertEqual(len(results),BATCH_SIZE*FILE_COUNT)
+ self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1,BATCH_SIZE*FILE_COUNT+1)))
+ tables = get_sqlite_table_list(c)
+ self.assertEqual(len(tables),1)
+
+ c.close()
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_save_multiple_files_to_sqlite(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 5
+
+ tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
+
+ output_sqlite_file = self.random_tmp_filename("x","sqlite")
+
+ tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
+ cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
+ c = sqlite3.connect(output_sqlite_file)
+
+ tables = get_sqlite_table_list(c)
+ self.assertEqual(len(tables), FILE_COUNT)
+
+ for i in range(FILE_COUNT):
+ results = c.execute('select a from file_dash_%s' % i).fetchall()
+ self.assertEqual(len(results),BATCH_SIZE)
+ self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
+
+ c.close()
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_save_multiple_files_to_sqlite_without_duplicates(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 5
+
+ tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
+
+ output_sqlite_file = self.random_tmp_filename("x","sqlite")
+
+ tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
+
+ # duplicate the left-joins for all the files, so the query will contain each filename twice
+ tables_as_str = "%s left join %s" % (tables_as_str,tables_as_str)
+
+ cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
+ c = sqlite3.connect(output_sqlite_file)
+
+ tables = get_sqlite_table_list(c)
+ # total table count should still be FILE_COUNT, even with the duplications
+ self.assertEqual(len(tables), FILE_COUNT)
+
+ for i in range(FILE_COUNT):
+ results = c.execute('select a from file_dash_%s' % i).fetchall()
+ self.assertEqual(len(results),BATCH_SIZE)
+ self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
+
+ c.close()
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_sqlite_file_is_not_created_if_some_table_does_not_exist(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 5
+
+ tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
+
+ output_sqlite_file = self.random_tmp_filename("x","sqlite")
+
+ tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
+
+ tables_as_str = tables_as_str + ' left join %s/non_existent_table' % (tmpfolder)
+
+ cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 30)
+ self.assertEqual(len(e), 2)
+ self.assertEqual(e[0],b("Going to save data into a disk database: %s" % output_sqlite_file))
+ self.assertEqual(e[1],b("No files matching '%s/non_existent_table' have been found" % tmpfolder))
+
+ self.assertTrue(not os.path.exists(output_sqlite_file))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_recurring_glob_and_separate_files_in_same_query_when_writing_to_sqlite(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 5
+
+ tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
+
+ output_sqlite_file = self.random_tmp_filename("x","sqlite")
+
+ tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
+ # The same files are also left-joined via an additional glob ("left join <tmpfolder>/*"). This should create an additional table
+ # in the sqlite file, with all the data in it
+ cmd = '%s -H "select count(*) from %s left join %s/*" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,tmpfolder,output_sqlite_file)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
+ c = sqlite3.connect(output_sqlite_file)
+
+ tables = get_sqlite_table_list(c)
+ # plus the additional table from the glob
+ self.assertEqual(len(tables), FILE_COUNT+1)
+
+ # check all the per-file tables
+ for i in range(FILE_COUNT):
+ results = c.execute('select a from file_dash_%s' % i).fetchall()
+ self.assertEqual(len(results),BATCH_SIZE)
+ self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
+
+ # ensure the glob-based table exists, with an _2 added to the name, as the original "file_dash_0" already exists in the sqlite db
+ results = c.execute('select a from file_dash_0_2').fetchall()
+ self.assertEqual(len(results),FILE_COUNT*BATCH_SIZE)
+ self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1,1+FILE_COUNT*BATCH_SIZE)))
+ c.close()
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_empty_sqlite_handling(self):
+ fn = self.generate_tmpfile_name("empty",".sqlite")
+
+ c = sqlite3.connect(fn)
+ c.execute('create table x (a int)').fetchall()
+ c.execute('drop table x').fetchall()
+ c.close()
+
+ cmd = '%s "select * from %s"' % (Q_EXECUTABLE,fn)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,88)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),1)
+ self.assertEqual(e[0],b('sqlite file %s has no tables' % fn))
+
+ def test_storing_to_disk_too_many_qsql_files(self):
+ BATCH_SIZE = 10
+ MAX_ATTACHED_DBS = 5
+ FILE_COUNT = MAX_ATTACHED_DBS + 4
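+ # FILE_COUNT deliberately exceeds the attach limit, to verify that saving still works in that case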
+
+ numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)
+
+ content_list = map(b, ["\n".join(x) for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d, 'split-files', 'attach-limit')
+
+ for fn in filename_list:
+ cmd = '%s -c 1 "select count(*) from %s/%s" -C readwrite' % (Q_EXECUTABLE,tmpfolder, fn)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+
+ output_sqlite_file = self.generate_tmpfile_name("many-sqlites",".sqlite")
+
+ table_refs = list(['select * from %s/%s.qsql' % (tmpfolder,x) for x in filename_list])
+ table_refs_str = " UNION ALL ".join(table_refs)
+ # Limit max attached dbs according to the parameter (it must be below the hardcoded sqlite limit, which is 10 in a standard sqlite build)
+ cmd = '%s "select * from (%s)" -S %s --max-attached-sqlite-databases=%s' % (Q_EXECUTABLE,table_refs_str,output_sqlite_file,MAX_ATTACHED_DBS)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),4)
+
+ c = sqlite3.connect(output_sqlite_file)
+ tables_results = c.execute("select tbl_name from sqlite_master where type='table'").fetchall()
+ table_names = list(sorted([x[0] for x in tables_results]))
+ self.assertEqual(len(table_names),FILE_COUNT)
+
+ for i,tn in enumerate(table_names):
+ self.assertEqual(tn,'file_dash_%s' % i)
+
+ table_content = c.execute('select * from %s' % tn).fetchall()
+ self.assertEqual(len(table_content),BATCH_SIZE)
+
+ cmd = '%s "select * from %s:::%s"' % (Q_EXECUTABLE,output_sqlite_file,tn)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),BATCH_SIZE)
+ self.assertEqual(o,list([b(str(x)) for x in range(1 + i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)]))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_storing_to_disk_too_many_sqlite_files(self):
+ # a variation of test_storing_to_disk_too_many_qsql_files, which deletes the qcatalog file from the caches,
+ # so they'll be just regular sqlite files
+
+ BATCH_SIZE = 10
+ MAX_ATTACHED_DBS = 5
+ FILE_COUNT = MAX_ATTACHED_DBS + 4
+
+ numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)
+
+ content_list = map(b, ["\n".join(x) for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d, 'split-files', 'attach-limit')
+
+ for fn in filename_list:
+ cmd = '%s -c 1 "select count(*) from %s/%s" -C readwrite' % (Q_EXECUTABLE,tmpfolder, fn)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+
+ c = sqlite3.connect('%s/%s.qsql' % (tmpfolder,fn))
+ c.execute('drop table _qcatalog').fetchall()
+ c.close()
+ os.rename('%s/%s.qsql' % (tmpfolder,fn),'%s/%s.sqlite' % (tmpfolder,fn))
+
+ output_sqlite_file = self.generate_tmpfile_name("many-sqlites",".sqlite")
+
+ table_refs = list(['select * from %s/%s.sqlite' % (tmpfolder,x) for x in filename_list])
+ table_refs_str = " UNION ALL ".join(table_refs)
+ # Limit max attached dbs according to the parameter (it must be below the hardcoded sqlite limit, which is 10 in a standard sqlite build)
+ cmd = '%s "select * from (%s)" -S %s --max-attached-sqlite-databases=%s' % (Q_EXECUTABLE,table_refs_str,output_sqlite_file,MAX_ATTACHED_DBS)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),4)
+
+ c = sqlite3.connect(output_sqlite_file)
+ tables_results = c.execute("select tbl_name from sqlite_master where type='table'").fetchall()
+ table_names = list(sorted([x[0] for x in tables_results]))
+ self.assertEqual(len(table_names),FILE_COUNT)
+
+ for i,tn in enumerate(table_names):
+ self.assertEqual(tn,'file_dash_%s' % i)
+
+ table_content = c.execute('select * from %s' % tn).fetchall()
+ self.assertEqual(len(table_content),BATCH_SIZE)
+
+ cmd = '%s "select * from %s:::%s"' % (Q_EXECUTABLE,output_sqlite_file,tn)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e),0)
+ self.assertEqual(len(o),BATCH_SIZE)
+ self.assertEqual(o,list([b(str(x)) for x in range(1 + i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)]))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_storing_to_disk_too_many_sqlite_files__over_the_sqlite_limit(self):
+ # a variation of test_storing_to_disk_too_many_sqlite_files, but with a limit above the sqlite hardcoded limit
+ MAX_ATTACHED_DBS = 20 # standard sqlite limit is 10, so q should throw an error
+
+ BATCH_SIZE = 10
+ FILE_COUNT = MAX_ATTACHED_DBS + 4
+
+ numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)
+
+ content_list = map(b, ["\n".join(x) for x in numbers_as_text])
+
+ filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
+ d = collections.OrderedDict(zip(filename_list, content_list))
+
+ tmpfolder = self.create_folder_with_files(d, 'split-files', 'attach-limit')
+
+ for fn in filename_list:
+ cmd = '%s -c 1 "select count(*) from %s/%s" -C readwrite' % (Q_EXECUTABLE,tmpfolder, fn)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+
+ c = sqlite3.connect('%s/%s.qsql' % (tmpfolder,fn))
+ c.execute('drop table _qcatalog').fetchall()
+ c.close()
+ os.rename('%s/%s.qsql' % (tmpfolder,fn),'%s/%s.sqlite' % (tmpfolder,fn))
+
+ output_sqlite_file = self.generate_tmpfile_name("many-sqlites",".sqlite")
+
+ table_refs = list(['select * from %s/%s.sqlite' % (tmpfolder,x) for x in filename_list])
+ table_refs_str = " UNION ALL ".join(table_refs)
+ # Here the limit is set above the hardcoded sqlite limit (10 in a standard sqlite build), so q is expected to fail with an error
+ cmd = '%s "select * from (%s)" -S %s --max-attached-sqlite-databases=%s' % (Q_EXECUTABLE,table_refs_str,output_sqlite_file,MAX_ATTACHED_DBS)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode,89)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),2)
+ self.assertTrue(e[0].startswith(b('Going to save data into')))
+ self.assertTrue(e[1].startswith(b('There are too many attached databases. Use a proper --max-attached-sqlite-databases parameter which is below the maximum')))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_qtable_name_normalization__starting_with_a_digit(self):
+ numbers = [[b(str(i)), b(str(i)), b(str(i))] for i in range(1, 101)]
+
+ header = [b('aa'), b('bb'), b('cc')]
+
+ base_filename_with_digits = '010'
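+ # An unquoted sqlite table name cannot start with a digit, so q is expected to prefix the normalized name with 't_' (see the select below)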
+
+ new_tmp_folder = self.create_folder_with_files({
+ base_filename_with_digits : self.arrays_to_csv_file_content(b(','),header,numbers)
+ },prefix='xx',suffix='digits')
+
+ effective_filename = '%s/010' % new_tmp_folder
+
+ output_sqlite_filename = self.generate_tmpfile_name("starting-with-digit",".sqlite")
+ cmd = '%s -d , -H "select count(aa),count(bb),count(cc) from %s" -S %s' % (Q_EXECUTABLE,effective_filename,output_sqlite_filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),0)
+ self.assertEqual(len(e),4)
+
+ c = sqlite3.connect(output_sqlite_filename)
+ results = c.execute('select aa,bb,cc from t_%s' % base_filename_with_digits).fetchall()
+ self.assertEqual(results,list([(x,x,x) for x in range(1,101)]))
+ c.close()
+
+ self.cleanup_folder(new_tmp_folder)
+
+ def test_qtable_name_normalization(self):
+ x = [b(a) for a in map(str, range(1, 101))]
+ large_file_data = b("val\n") + b("\n").join(x)
+ tmpfile = self.create_file_with_data(large_file_data)
+
+ tmpfile_folder = os.path.dirname(tmpfile.name)
+ tmpfile_basename = os.path.basename(tmpfile.name)
+
+ cmd = 'cd %s && %s -c 1 -H -D , -O "select a.val,b.val from %s a cross join ./%s b on (a.val = b.val * 2)"' % (tmpfile_folder,Q_EXECUTABLE,tmpfile_basename,tmpfile_basename)
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 51)
+
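+ # Only even values of a.val can satisfy a.val = b.val * 2, giving 50 matching rows plus the header row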
+ evens = list(filter(lambda x: x%2 == 0,range(1,101)))
+ expected_result_rows = [b('val,val')] + [b('%d,%d' % (x,x / 2)) for x in evens]
+ self.assertEqual(o,expected_result_rows)
+
+ def test_qtable_name_normalization2(self):
+ cmd = '%s "select * from"' % Q_EXECUTABLE
+
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 118)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b('FROM/JOIN is missing a table name after it'))
+
+ def test_qtable_name_normalization3(self):
+ # with a space after the from
+ cmd = '%s "select * from "' % Q_EXECUTABLE
+
+ retcode, o, e = run_command(cmd)
+ self.assertEqual(retcode, 118)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b('FROM/JOIN is missing a table name after it'))
+
+ def test_save_multiple_files_to_sqlite_while_caching_them(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 5
+
+ tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
+
+ output_sqlite_file = self.random_tmp_filename("x","sqlite")
+
+ tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
+ cmd = '%s -H "select count(*) from %s" -c 1 -S %s -C readwrite' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
+ c = sqlite3.connect(output_sqlite_file)
+
+ tables = get_sqlite_table_list(c)
+ self.assertEqual(len(tables), FILE_COUNT)
+
+ for i,filename in enumerate(filename_list):
+ matching_table_name = 'file_dash_%s' % i
+
+ results = c.execute('select a from %s' % matching_table_name).fetchall()
+ self.assertEqual(len(results),BATCH_SIZE)
+ self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
+
+ # check actual resulting qsql file for the file
+ cmd = '%s -c 1 -H "select a from %s/%s"' % (Q_EXECUTABLE,tmpfolder,filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), BATCH_SIZE)
+ self.assertEqual(sum(map(int,o)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
+ self.assertEqual(len(e), 0)
+
+ # check that analysis returns the proper file-with-unused-qsql for each file, since the default is `-C none`,
+ # which means the cache is not read even if it exists
+ cmd = '%s -c 1 -H "select a from %s/%s" -A' % (Q_EXECUTABLE,tmpfolder,filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 5)
+ self.assertEqual(o,[
+ b('Table: %s/file-%s' % (tmpfolder,i)),
+ b(' Sources:'),
+ b(' source_type: file-with-unused-qsql source: %s/file-%s' % (tmpfolder,i)),
+ b(' Fields:'),
+ b(' `a` - int')
+ ])
+
+ cmd = '%s -c 1 -H "select a from %s/%s" -A -C read' % (Q_EXECUTABLE,tmpfolder,filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 5)
+ self.assertEqual(o,[
+ b('Table: %s/file-%s' % (tmpfolder,i)),
+ b(' Sources:'),
+ b(' source_type: qsql-file-with-original source: %s/file-%s.qsql' % (tmpfolder,i)),
+ b(' Fields:'),
+ b(' `a` - int')
+ ])
+
+ # check qsql file is readable directly through q
+ cmd = '%s -c 1 -H "select a from %s/%s.qsql"' % (Q_EXECUTABLE,tmpfolder,filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), BATCH_SIZE)
+ self.assertEqual(sum(map(int,o)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
+ self.assertEqual(len(e), 0)
+
+ # check analysis returns proper qsql-with-original for each file when running directly against the qsql file
+ cmd = '%s -c 1 -H "select a from %s/%s.qsql" -A' % (Q_EXECUTABLE,tmpfolder,filename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 5)
+ self.assertEqual(o,[
+ b('Table: %s/file-%s.qsql' % (tmpfolder,i)),
+ b(' Sources:'),
+ b(' source_type: qsql-file source: %s/file-%s.qsql' % (tmpfolder,i)),
+ b(' Fields:'),
+ b(' `a` - int')
+ ])
+ c.close()
+
+ filename_list_with_qsql = list(map(lambda x: x+'.qsql',filename_list))
+
+ files_in_folder = glob.glob('%s/*' % tmpfolder)
+ regular_files,qsql_files = partition(lambda x: x.endswith('.qsql'),files_in_folder)
+
+ self.assertEqual(len(files_in_folder),2*FILE_COUNT)
+ self.assertEqual(sorted(list(map(os.path.basename,regular_files))),sorted(list(map(os.path.basename,filename_list))))
+ self.assertEqual(sorted(list(map(os.path.basename,qsql_files))),sorted(list(map(os.path.basename,filename_list_with_qsql))))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_globs_ignore_matching_qsql_files(self):
+ BATCH_SIZE = 10
+ FILE_COUNT = 5
+
+ tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
+
+ tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
+ cmd = '%s -H "select count(*) from %s" -c 1 -C readwrite' % (Q_EXECUTABLE,tables_as_str)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
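+ # left-joining FILE_COUNT files of BATCH_SIZE rows each, with no ON clause, yields BATCH_SIZE^FILE_COUNT rows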
+ self.assertEqual(o[0],b(str(pow(BATCH_SIZE,FILE_COUNT))))
+
+ cmd = '%s -H "select a from %s/*" -c 1 -C read' % (Q_EXECUTABLE,tmpfolder)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), BATCH_SIZE*FILE_COUNT)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(sum(map(int,o)),sum(range(1,1+BATCH_SIZE*FILE_COUNT)))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_error_on_reading_from_multi_table_sqlite_without_explicit_table_name(self):
+ BATCH_SIZE = 50
+ FILE_COUNT = 5
+
+ tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
+
+ output_sqlite_file = self.random_tmp_filename("x","sqlite")
+
+ tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
+ cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 4)
+
+
+ cmd = '%s -H "select count(*) from %s"' % (Q_EXECUTABLE,output_sqlite_file)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 87)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b("Could not autodetect table name in sqlite file %s . Existing tables: file_dash_0,file_dash_1,file_dash_2,file_dash_3,file_dash_4" % output_sqlite_file))
+
+ self.cleanup_folder(tmpfolder)
+
+ def test_error_on_trying_to_specify_an_explicit_non_existent_qsql_file(self):
+ cmd = '%s -H "select count(*) from /non-existent-folder/non-existent.qsql:::mytable"' % (Q_EXECUTABLE)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 30)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b("Could not find file /non-existent-folder/non-existent.qsql"))
+
+ def test_error_on_providing_a_non_qsql_file_when_specifying_an_explicit_table(self):
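+ # Arbitrary binary content - the \x1f\x8b prefix marks it as gzip data, which is neither a qsql/sqlite file nor parseable text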
+ data = b("\x1f\x8b\x08\x00\tZ\x0ea\x00\x03\xed\x93\xdd\n\xc20\x0cF\xf3(}\x01ij\x93\xf6y:\xd9P\x10)\xb3\xbe\xbf\x9d\x1d\xbbQ\xc6\x06F\x10rn\xbe\x9b\xd0\xfc\x1c\x9a-\x88\x83\x88\x91\xd9\xbc2\xb4\xc4#\xb5\x9c1\x8e\x1czb\x8a\xd1\x19t\xdeS\x00\xc3\xf2\xa3\x01<\xee%\x8du\x94s\x1a\xfbk\xd7\xdf\x0e\xa9\x94Kz\xaf\xabe\xc3\xb0\xf2\xce\xbc\xc7\x92\x7fB\xb6\x1fv\xfd2\xf5\x1e\x81h\xa3\xff\x10'\xff\x8c\x04\x06\xc5'\x03\xf5oO\xe2=v\xf9o\xff\x9f\xd1\xa9\xff_\x90m'\xdec\x9f\x7f\x9c\xfc\xd7T\xff\x8a\xa2(\x92<\x01WY\x0c\x06\x00\x0c\x00\x00")
+ tmpfilename = self.random_tmp_filename('xx','yy')
+ f = open(tmpfilename,'wb')
+ f.write(data)
+ f.close()
+
+ cmd = '%s -H "select count(*) from %s:::mytable1"' % (Q_EXECUTABLE,tmpfilename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 95)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertEqual(e[0],b("Cannot detect the type of table %s:::mytable1" % tmpfilename))
+
+ def test_error_on_providing_a_non_qsql_file_when_not_specifying_an_explicit_table(self):
+ data = b("\x1f\x8b\x08\x00\tZ\x0ea\x00\x03\xed\x93\xdd\n\xc20\x0cF\xf3(}\x01ij\x93\xf6y:\xd9P\x10)\xb3\xbe\xbf\x9d\x1d\xbbQ\xc6\x06F\x10rn\xbe\x9b\xd0\xfc\x1c\x9a-\x88\x83\x88\x91\xd9\xbc2\xb4\xc4#\xb5\x9c1\x8e\x1czb\x8a\xd1\x19t\xdeS\x00\xc3\xf2\xa3\x01<\xee%\x8du\x94s\x1a\xfbk\xd7\xdf\x0e\xa9\x94Kz\xaf\xabe\xc3\xb0\xf2\xce\xbc\xc7\x92\x7fB\xb6\x1fv\xfd2\xf5\x1e\x81h\xa3\xff\x10'\xff\x8c\x04\x06\xc5'\x03\xf5oO\xe2=v\xf9o\xff\x9f\xd1\xa9\xff_\x90m'\xdec\x9f\x7f\x9c\xfc\xd7T\xff\x8a\xa2(\x92<\x01WY\x0c\x06\x00\x0c\x00\x00")
+ tmpfilename = self.random_tmp_filename('xx','yy')
+ f = open(tmpfilename,'wb')
+ f.write(data)
+ f.close()
+
+ cmd = '%s -H "select count(*) from %s"' % (Q_EXECUTABLE,tmpfilename)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 59)
+ self.assertEqual(len(o), 0)
+ self.assertEqual(len(e), 1)
+ self.assertTrue(e[0].startswith(b("Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error:")))
\ No newline at end of file
diff --git a/test/SqlTests.py b/test/SqlTests.py
new file mode 100644
index 00000000..55375122
--- /dev/null
+++ b/test/SqlTests.py
@@ -0,0 +1,169 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command
+from test.test_data import (uneven_ls_output,find_output,header_row,sample_data_rows,sample_data_rows_with_empty_string,sample_data_no_header,sample_data_with_empty_string_no_header,sample_data_with_header,sample_data_with_missing_header_names,generate_sample_data_with_header,sample_quoted_data,double_double_quoted_data,escaped_double_quoted_data,combined_quoted_data,sample_quoted_data2,sample_quoted_data2_with_newline,one_column_data,sample_data_rows_with_spaces,sample_data_with_spaces_no_header,header_row_with_spaces,sample_data_with_spaces_with_header,long_value1,int_value,sample_data_with_long_values,EXAMPLES)
+
+from test.utils import b
+from test.utils import DEBUG
+
+class SqlTests(AbstractQTestCase):
+
+ def test_find_example(self):
+ tmpfile = self.create_file_with_data(find_output)
+ cmd = Q_EXECUTABLE + ' "select c5,c6,sum(c7)/1024.0/1024 as total from %s group by c5,c6 order by total desc"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], b('mapred mapred 0.9389581680297852'))
+ self.assertEqual(o[1], b('root root 0.02734375'))
+ self.assertEqual(o[2], b('harel harel 0.010888099670410156'))
+
+ self.cleanup(tmpfile)
+
+ def test_join_example(self):
+ cmd = Q_EXECUTABLE + ' "select myfiles.c8,emails.c2 from {0}/exampledatafile myfiles join {0}/group-emails-example emails on (myfiles.c4 = emails.c1) where myfiles.c8 = \'ppp\'"'.format(EXAMPLES)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 2)
+
+ self.assertEqual(o[0], b('ppp dip.1@otherdomain.com'))
+ self.assertEqual(o[1], b('ppp dip.2@otherdomain.com'))
+
+ def test_join_example_with_output_header(self):
+ cmd = Q_EXECUTABLE + ' -O "select myfiles.c8 aaa,emails.c2 bbb from {0}/exampledatafile myfiles join {0}/group-emails-example emails on (myfiles.c4 = emails.c1) where myfiles.c8 = \'ppp\'"'.format(EXAMPLES)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+
+ self.assertEqual(o[0], b('aaa bbb'))
+ self.assertEqual(o[1], b('ppp dip.1@otherdomain.com'))
+ self.assertEqual(o[2], b('ppp dip.2@otherdomain.com'))
+
+ def test_self_join1(self):
+ tmpfile = self.create_file_with_data(b("\n").join([b("{} 9000".format(i)) for i in range(0,10)]))
+ cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c1 = a2.c1)"' % (tmpfile.name,tmpfile.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 10)
+
+ self.cleanup(tmpfile)
+
+ def test_self_join_reuses_table(self):
+ tmpfile = self.create_file_with_data(b("\n").join([b("{} 9000".format(i)) for i in range(0,10)]))
+ cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c1 = a2.c1)" -A' % (tmpfile.name,tmpfile.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 6)
+
+ self.assertEqual(o[0],b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s') % b(tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4],b(' `c1` - int'))
+ self.assertEqual(o[5],b(' `c2` - int'))
+
+ self.cleanup(tmpfile)
+
+ def test_self_join2(self):
+ tmpfile1 = self.create_file_with_data(b("\n").join([b("{} 9000".format(i)) for i in range(0,10)]))
+ cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c2 = a2.c2)"' % (tmpfile1.name,tmpfile1.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 10*10)
+
+ self.cleanup(tmpfile1)
+
+ tmpfile2 = self.create_file_with_data(b("\n").join([b("{} 9000".format(i)) for i in range(0,10)]))
+ cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c2 = a2.c2) join %s a3 on (a1.c2 = a3.c2)"' % (tmpfile2.name,tmpfile2.name,tmpfile2.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 10*10*10)
+
+ self.cleanup(tmpfile2)
+
+ def test_disable_column_type_detection(self):
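+        # text_with_digits1 contains a leading-zero value and text_with_digits2 is quoted; automatic type detection coerces both columns to int (dropping leading zeros), while --as-text keeps every column as text.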
+ tmpfile = self.create_file_with_data(b('''regular_text,text_with_digits1,text_with_digits2,float_number
+"regular text 1",67,"67",12.3
+"regular text 2",067,"067",22.3
+"regular text 3",123,"123",33.4
+"regular text 4",-123,"-123",0122.2
+'''))
+
+ # Check original column type detection
+ cmd = Q_EXECUTABLE + ' -A -d , -H "select * from %s"' % (tmpfile.name)
+
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 8)
+
+ self.assertEqual(o[0],b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1], b(' Sources:'))
+ self.assertEqual(o[2], b(' source_type: file source: %s') % b(tmpfile.name))
+ self.assertEqual(o[3], b(' Fields:'))
+ self.assertEqual(o[4], b(' `regular_text` - text'))
+ self.assertEqual(o[5], b(' `text_with_digits1` - int'))
+ self.assertEqual(o[6], b(' `text_with_digits2` - int'))
+ self.assertEqual(o[7], b(' `float_number` - real'))
+
+        # Check the column types reported when automatic detection is disabled (--as-text)
+ cmd = Q_EXECUTABLE + ' -A -d , -H --as-text "select * from %s"' % (tmpfile.name)
+
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 8)
+
+ self.assertEqual(o[0],b('Table: %s' % tmpfile.name))
+ self.assertEqual(o[1],b(' Sources:'))
+ self.assertEqual(o[2],b(' source_type: file source: %s') % b(tmpfile.name))
+ self.assertEqual(o[3],b(' Fields:'))
+ self.assertEqual(o[4],b(' `regular_text` - text'))
+ self.assertEqual(o[5],b(' `text_with_digits1` - text'))
+ self.assertEqual(o[6],b(' `text_with_digits2` - text'))
+ self.assertEqual(o[7],b(' `float_number` - text'))
+
+ # Get actual data with regular detection
+ cmd = Q_EXECUTABLE + ' -d , -H "select * from %s"' % (tmpfile.name)
+
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 4)
+
+ self.assertEqual(o[0],b("regular text 1,67,67,12.3"))
+ self.assertEqual(o[1],b("regular text 2,67,67,22.3"))
+ self.assertEqual(o[2],b("regular text 3,123,123,33.4"))
+ self.assertEqual(o[3],b("regular text 4,-123,-123,122.2"))
+
+ # Get actual data without detection
+ cmd = Q_EXECUTABLE + ' -d , -H --as-text "select * from %s"' % (tmpfile.name)
+
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(e), 0)
+ self.assertEqual(len(o), 4)
+
+ self.assertEqual(o[0],b("regular text 1,67,67,12.3"))
+ self.assertEqual(o[1],b("regular text 2,067,067,22.3"))
+ self.assertEqual(o[2],b("regular text 3,123,123,33.4"))
+ self.assertEqual(o[3],b("regular text 4,-123,-123,0122.2"))
+
+ self.cleanup(tmpfile)
\ No newline at end of file
diff --git a/test/StdInTests.py b/test/StdInTests.py
new file mode 100644
index 00000000..13a68e2e
--- /dev/null
+++ b/test/StdInTests.py
@@ -0,0 +1,36 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command, b
+from test.test_data import sample_data_no_header, sample_data_rows
+
+class StdInTests(AbstractQTestCase):
+
+ def test_stdin_input(self):
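+        # "-" as a table name reads the data stream from stdin.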
+ cmd = b('printf "%s" | ' + Q_EXECUTABLE + ' -d , "select c1,c2,c3 from -"') % sample_data_no_header
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 3)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0], sample_data_rows[0])
+ self.assertEqual(o[1], sample_data_rows[1])
+ self.assertEqual(o[2], sample_data_rows[2])
+
+ def test_attempt_to_unzip_stdin(self):
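+        # The payload below is a small gzip file (note the \x1f\x8b magic bytes); q refuses to decompress stdin, so piping it with -z must fail with a dedicated error.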
+ tmpfile = self.create_file_with_data(
+ b('\x1f\x8b\x08\x08\xf2\x18\x12S\x00\x03xxxxxx\x003\xe42\xe22\xe62\xe12\xe52\xe32\xe7\xb2\xe0\xb2\xe424\xe0\x02\x00\xeb\xbf\x8a\x13\x15\x00\x00\x00'))
+
+ cmd = 'cat %s | ' % tmpfile.name + Q_EXECUTABLE + ' -z "select sum(c1),avg(c1) from -"'
+
+ retcode, o, e = run_command(cmd)
+ self.assertTrue(retcode != 0)
+ self.assertTrue(len(o) == 0)
+ self.assertTrue(len(e) == 1)
+
+ self.assertEqual(e[0],b('Cannot decompress standard input. Pipe the input through zcat in order to decompress.'))
+
+ self.cleanup(tmpfile)
\ No newline at end of file
diff --git a/test/UserFunctionTests.py b/test/UserFunctionTests.py
new file mode 100644
index 00000000..3ef4220f
--- /dev/null
+++ b/test/UserFunctionTests.py
@@ -0,0 +1,182 @@
+from test.base import AbstractQTestCase
+from test.utils import Q_EXECUTABLE, run_command, b
+from test.test_data import sample_data_no_header
+
+
+class UserFunctionTests(AbstractQTestCase):
+ def test_regexp_int_data_handling(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+
+ cmd = Q_EXECUTABLE + ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b("1"))
+
+ self.cleanup(tmpfile)
+
+ def test_percentile_func(self):
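+        # seq 1000..1999 is grouped by the first two digits (in SQLite, substr(c1,0,3) yields characters 1-2), giving ten groups of 100 consecutive values; percentile at 0/0.5/1 should return each group's minimum, median (base + 49.5) and maximum (base + 99).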
+ cmd = 'seq 1000 1999 | %s "select substr(c1,0,3),percentile(c1,0),percentile(c1,0.5),percentile(c1,1) from - group by substr(c1,0,3)" -c 1' % Q_EXECUTABLE
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 10)
+ self.assertEqual(len(e), 0)
+
+ output_table = [l.split(b(" ")) for l in o]
+ group_labels = [int(row[0]) for row in output_table]
+ minimum_values = [float(row[1]) for row in output_table]
+ median_values = [float(row[2]) for row in output_table]
+ max_values = [float(row[3]) for row in output_table]
+
+ base_values = list(range(1000,2000,100))
+
+ self.assertEqual(group_labels,list(range(10,20)))
+ self.assertEqual(minimum_values,base_values)
+ self.assertEqual(median_values,list(map(lambda x: x + 49.5,base_values)))
+ self.assertEqual(max_values,list(map(lambda x: x + 99,base_values)))
+
+ def test_regexp_null_data_handling(self):
+ tmpfile = self.create_file_with_data(sample_data_no_header)
+
+ cmd = Q_EXECUTABLE + ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode, 0)
+ self.assertEqual(len(o), 1)
+ self.assertEqual(len(e), 0)
+
+ self.assertEqual(o[0],b("2"))
+
+ self.cleanup(tmpfile)
+
+ def test_md5_function(self):
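+        # md5(value, encoding) returns the hex digest; the expected values are the well-known MD5 digests of the ASCII strings '1' through '4'.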
+ cmd = 'seq 1 4 | %s -c 1 -d , "select c1,md5(c1,\'utf-8\') from -"' % Q_EXECUTABLE
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),4)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(tuple(o[0].split(b(','),1)),(b('1'),b('c4ca4238a0b923820dcc509a6f75849b')))
+ self.assertEqual(tuple(o[1].split(b(','),1)),(b('2'),b('c81e728d9d4c2f636f067f89cc14862c')))
+ self.assertEqual(tuple(o[2].split(b(','),1)),(b('3'),b('eccbc87e4b5ce2fe28308fd9f2a7baf3')))
+ self.assertEqual(tuple(o[3].split(b(','),1)),(b('4'),b('a87ff679a2f3e71d9181a67b7542122c')))
+
+ def test_stddev_functions(self):
+ tmpfile = self.create_file_with_data(b("\n".join(map(str,[234,354,3234,123,4234,234,634,56,65]))))
+
+ cmd = '%s -c 1 -d , "select round(stddev_pop(c1),10),round(stddev_sample(c1),10) from %s"' % (Q_EXECUTABLE,tmpfile.name)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(o[0],b('1479.7015464838,1569.4604964764'))
+
+ self.cleanup(tmpfile)
+
+ def test_sqrt_function(self):
+ cmd = 'seq 1 5 | %s -c 1 -d , "select round(sqrt(c1),10) from -"' % Q_EXECUTABLE
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),5)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(o[0],b('1.0'))
+ self.assertEqual(o[1],b('1.4142135624'))
+ self.assertEqual(o[2],b('1.7320508076'))
+ self.assertEqual(o[3],b('2.0'))
+ self.assertEqual(o[4],b('2.2360679775'))
+
+ def test_power_function(self):
+ cmd = 'seq 1 5 | %s -c 1 -d , "select round(power(c1,2.5),10) from -"' % Q_EXECUTABLE
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),5)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(o[0],b('1.0'))
+ self.assertEqual(o[1],b('5.6568542495'))
+ self.assertEqual(o[2],b('15.5884572681'))
+ self.assertEqual(o[3],b('32.0'))
+ self.assertEqual(o[4],b('55.9016994375'))
+
+ def test_file_functions(self):
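+        # Each UDF decomposes a path: file_folder gives the directory part, file_ext the extension (including the dot), file_basename the name with extension, and file_basename_no_ext the name without it; the empty input line checks that all four degrade to empty strings.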
+ filenames = [
+ "file1",
+ "file2.csv",
+ "/var/tmp/file3",
+ "/var/tmp/file4.gz",
+ ""
+ ]
+ data = "\n".join(filenames)
+
+ cmd = 'echo "%s" | %s -c 1 -d , "select file_folder(c1),file_ext(c1),file_basename(c1),file_basename_no_ext(c1) from -"' % (data,Q_EXECUTABLE)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),5)
+ self.assertEqual(len(e),0)
+ self.assertEqual(o,[
+ b',,file1,file1',
+ b',.csv,file2.csv,file2',
+ b'/var/tmp,,file3,file3',
+ b'/var/tmp,.gz,file4.gz,file4',
+ b',,,'
+ ])
+
+
+ def test_sha1_function(self):
+ cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha1(c1) from -"' % Q_EXECUTABLE
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),4)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(o[0],b('1,356a192b7913b04c54574d18c28d46e6395428ab'))
+ self.assertEqual(o[1],b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0'))
+ self.assertEqual(o[2],b('3,77de68daecd823babbb58edb1c8e14d7106e83bb'))
+ self.assertEqual(o[3],b('4,1b6453892473a467d07372d45eb05abc2031647a'))
+
+ def test_regexp_extract_function(self):
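+        # regexp_extract(pattern, value, group_index) returns the requested capture group (0-based); a pattern that does not match yields an empty string, as the third selected column asserts.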
+ query = """
+ select
+ regexp_extract('was ([0-9]+) seconds and ([0-9]+) ms',c1,0),
+ regexp_extract('was ([0-9]+) seconds and ([0-9]+) ms',c1,1),
+ regexp_extract('non-existent-(regexp)',c1,0)
+ from
+ -
+ """
+
+ cmd = 'echo "Duration was 322 seconds and 240 ms" | %s -c 1 -d , "%s"' % (Q_EXECUTABLE,query)
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),1)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(o[0],b('322,240,'))
+
+ def test_sha_function(self):
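+        # sha(value, bits, encoding) selects the digest width; the expected values are the standard SHA-1/SHA-224/SHA-256 digests of the ASCII strings '1' through '4'.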
+ cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha(c1,1,\'utf-8\') as sha1,sha(c1,224,\'utf-8\') as sha224,sha(c1,256,\'utf-8\') as sha256 from -"' % Q_EXECUTABLE
+ retcode, o, e = run_command(cmd)
+
+ self.assertEqual(retcode,0)
+ self.assertEqual(len(o),4)
+ self.assertEqual(len(e),0)
+
+ self.assertEqual(o[0],b('1,356a192b7913b04c54574d18c28d46e6395428ab,e25388fde8290dc286a6164fa2d97e551b53498dcbf7bc378eb1f178,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b'))
+ self.assertEqual(o[1],b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0,58b2aaa0bfae7acc021b3260e941117b529b2e69de878fd7d45c61a9,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35'))
+ self.assertEqual(o[2],b('3,77de68daecd823babbb58edb1c8e14d7106e83bb,4cfc3a1811fe40afa401b25ef7fa0379f1f7c1930a04f8755d678474,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce'))
+ self.assertEqual(o[3],b('4,1b6453892473a467d07372d45eb05abc2031647a,271f93f45e9b4067327ed5c8cd30a034730aaace4382803c3e1d6c2f,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a'))
\ No newline at end of file
diff --git a/test/base.py b/test/base.py
new file mode 100644
index 00000000..25264144
--- /dev/null
+++ b/test/base.py
@@ -0,0 +1,94 @@
+import os
+import random
+import unittest
+from tempfile import NamedTemporaryFile
+
+from test.utils import Q_EXECUTABLE, run_command, DEBUG, b
+
+class AbstractQTestCase(unittest.TestCase):
+
+ def create_file_with_data(self, data, encoding=None,prefix=None,suffix=None,use_real_path=True):
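+        # Helper: writes the given bytes to a NamedTemporaryFile and returns it; callers are expected to release it via cleanup().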
+ if encoding is not None:
+            raise Exception('Deprecated: encoding must be None')
+ tmpfile = NamedTemporaryFile(delete=False,prefix=prefix,suffix=suffix)
+ tmpfile.write(data)
+ tmpfile.close()
+ if use_real_path:
+ tmpfile.name = os.path.realpath(tmpfile.name)
+ return tmpfile
+
+ def generate_tmpfile_name(self,prefix=None,suffix=None):
+ tmpfile = NamedTemporaryFile(delete=False,prefix=prefix,suffix=suffix)
+ os.remove(tmpfile.name)
+ return os.path.realpath(tmpfile.name)
+
+ def arrays_to_csv_file_content(self,delimiter,header_row_list,cell_list):
+ all_rows = [delimiter.join(row) for row in [header_row_list] + cell_list]
+ return b("\n").join(all_rows)
+
+ def create_qsql_file_with_content_and_return_filename(self, header_row,cell_list):
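+        # Running a query with -C readwrite causes q to persist a .qsql cache next to the source file; this helper relies on that side effect and returns the cache filename.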
+ csv_content = self.arrays_to_csv_file_content(b(','),header_row,cell_list)
+ tmpfile = self.create_file_with_data(csv_content)
+
+ cmd = '%s -d , -H "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,tmpfile.name)
+ r, o, e = run_command(cmd)
+ self.assertEqual(r, 0)
+
+ created_qsql_filename = '%s.qsql' % tmpfile.name
+ self.assertTrue(os.path.exists(created_qsql_filename))
+
+ return created_qsql_filename
+
+ def arrays_to_qsql_file_content(self, header_row,cell_list):
+ csv_content = self.arrays_to_csv_file_content(b(','),header_row,cell_list)
+ tmpfile = self.create_file_with_data(csv_content)
+
+ cmd = '%s -d , -H "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,tmpfile.name)
+ r, o, e = run_command(cmd)
+ self.assertEqual(r, 0)
+
+ matching_qsql_filename = '%s.qsql' % tmpfile.name
+        with open(matching_qsql_filename,'rb') as f:
+            qsql_file_bytes = f.read()
+
+ self.assertEqual(matching_qsql_filename,'%s.qsql' % tmpfile.name)
+
+ return qsql_file_bytes
+
+    def write_file(self,filename,data):
+        with open(filename,'wb') as f:
+            f.write(data)
+
+ def create_folder_with_files(self,filename_to_content_dict,prefix, suffix):
+ name = self.random_tmp_filename(prefix,suffix)
+ os.makedirs(name)
+ for filename,content in filename_to_content_dict.items():
+            if os.path.sep in filename:
+                os.makedirs('%s/%s' % (name,os.path.split(filename)[0]),exist_ok=True)
+            with open(os.path.join(name,filename),'wb') as f:
+                f.write(content)
+ return name
+
+ def cleanup_folder(self,tmpfolder):
+ if not tmpfolder.startswith(os.path.realpath('/var/tmp')):
+ raise Exception('Guard against accidental folder deletions: %s' % tmpfolder)
+ global DEBUG
+ if not DEBUG:
+ print("should have removed tmpfolder %s. Not doing it for the sake of safety. # TODO re-add" % tmpfolder)
+ pass # os.remove(tmpfolder)
+
+ def cleanup(self, tmpfile):
+ global DEBUG
+ if not DEBUG:
+ os.remove(tmpfile.name)
+
+ def random_tmp_filename(self,prefix,postfix):
+ # TODO Use more robust method for this
+ path = '/var/tmp'
+ return os.path.realpath('%s/%s-%s.%s' % (path,prefix,random.randint(0,1000000000),postfix))
\ No newline at end of file
diff --git a/test/test_data.py b/test/test_data.py
new file mode 100644
index 00000000..a873117e
--- /dev/null
+++ b/test/test_data.py
@@ -0,0 +1,85 @@
+
+import os
+from test.utils import b
+
+EXAMPLES = os.path.abspath(os.path.join(os.getcwd(), 'examples'))
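+# Resolved relative to the current working directory, so the tests assume they are executed from the repository root (where the examples/ folder lives).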
+
+
+uneven_ls_output = b("""drwxr-xr-x 2 root root 4096 Jun 11 2012 /selinux
+drwxr-xr-x 2 root root 4096 Apr 19 2013 /mnt
+drwxr-xr-x 2 root root 4096 Apr 24 2013 /srv
+drwx------ 2 root root 16384 Jun 21 2013 /lost+found
+lrwxrwxrwx 1 root root 33 Jun 21 2013 /initrd.img.old -> /boot/initrd.img-3.8.0-19-generic
+drwxr-xr-x 2 root root 4096 Jun 21 2013 /cdrom
+drwxr-xr-x 3 root root 4096 Jun 21 2013 /home
+lrwxrwxrwx 1 root root 29 Jun 21 2013 /vmlinuz -> boot/vmlinuz-3.8.0-19-generic
+lrwxrwxrwx 1 root root 32 Jun 21 2013 /initrd.img -> boot/initrd.img-3.8.0-19-generic
+""")
+
+
+find_output = b("""8257537 32 drwxrwxrwt 218 root root 28672 Mar 1 11:00 /tmp
+8299123 4 drwxrwxr-x 2 harel harel 4096 Feb 27 10:06 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/stormdist/testTopology3fad644a-54c0-4def-b19e-77ca97941595-1-1393513576
+8263229 964 -rw-rw-r-- 1 mapred mapred 984569 Feb 27 10:06 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/stormdist/testTopology3fad644a-54c0-4def-b19e-77ca97941595-1-1393513576/stormcode.ser
+8263230 4 -rw-rw-r-- 1 harel harel 1223 Feb 27 10:06 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/stormdist/testTopology3fad644a-54c0-4def-b19e-77ca97941595-1-1393513576/stormconf.ser
+8299113 4 drwxrwxr-x 2 harel harel 4096 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate
+8263406 4 -rw-rw-r-- 1 harel harel 2002 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514168746
+8263476 0 -rw-rw-r-- 1 harel harel 0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514168746.version
+8263607 0 -rw-rw-r-- 1 harel harel 0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514169735.version
+8263533 0 -rw-rw-r-- 1 harel harel 0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514172733.version
+8263604 0 -rw-rw-r-- 1 harel harel 0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514175754.version
+""")
+
+
+header_row = b('name,value1,value2')
+sample_data_rows = [b('a,1,0'), b('b,2,0'), b('c,,0')]
+sample_data_rows_with_empty_string = [b('a,aaa,0'), b('b,bbb,0'), b('c,,0')]
+sample_data_no_header = b("\n").join(sample_data_rows) + b("\n")
+sample_data_with_empty_string_no_header = b("\n").join(
+ sample_data_rows_with_empty_string) + b("\n")
+sample_data_with_header = header_row + b("\n") + sample_data_no_header
+sample_data_with_missing_header_names = b("name,value1\n") + sample_data_no_header
+
+def generate_sample_data_with_header(header):
+ return header + b("\n") + sample_data_no_header
+
+sample_quoted_data = b('''non_quoted regular_double_quoted double_double_quoted escaped_double_quoted multiline_double_double_quoted multiline_escaped_double_quoted
+control-value-1 "control-value-2" control-value-3 "control-value-4" control-value-5 "control-value-6"
+non-quoted-value "this is a quoted value" "this is a ""double double"" quoted value" "this is an escaped \\"quoted value\\"" "this is a double double quoted ""multiline
+ value""." "this is an escaped \\"multiline
+ value\\"."
+control-value-1 "control-value-2" control-value-3 "control-value-4" control-value-5 "control-value-6"
+''')
+
+double_double_quoted_data = b('''regular_double_quoted double_double_quoted
+"this is a quoted value" "this is a quoted value with ""double double quotes"""
+''')
+
+escaped_double_quoted_data = b('''regular_double_quoted escaped_double_quoted
+"this is a quoted value" "this is a quoted value with \\"escaped double quotes\\""
+''')
+
+combined_quoted_data = b('''regular_double_quoted double_double_quoted escaped_double_quoted
+"this is a quoted value" "this is a quoted value with ""double double quotes""" "this is a quoted value with \\"escaped double quotes\\""
+''')
+
+sample_quoted_data2 = b('"quoted data" 23\nunquoted-data 54')
+
+sample_quoted_data2_with_newline = b('"quoted data with\na new line inside it":23\nunquoted-data:54')
+
+one_column_data = b('''data without commas 1
+data without commas 2
+''')
+
+# Values with leading whitespace
+sample_data_rows_with_spaces = [b('a,1,0'), b(' b, 2,0'), b('c,,0')]
+sample_data_with_spaces_no_header = b("\n").join(
+ sample_data_rows_with_spaces) + b("\n")
+
+header_row_with_spaces = b('name,value 1,value2')
+sample_data_with_spaces_with_header = header_row_with_spaces + \
+ b("\n") + sample_data_with_spaces_no_header
+
+long_value1 = "23683289372328372328373"
+int_value = "2328372328373"
+sample_data_with_long_values = "%s\n%s\n%s" % (long_value1,int_value,int_value)
+
diff --git a/test/test_suite.py b/test/test_suite.py
deleted file mode 100755
index aaa7476f..00000000
--- a/test/test_suite.py
+++ /dev/null
@@ -1,5764 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# test suite for q.
-#
-# Prefer end-to-end tests, running the actual q command and testing stdout/stderr, and the return code.
-# Some utilities are provided for making that easy, see other tests for examples.
-#
-# Q_EXECUTABLE env var can be used to inject the path of q. This allows full e2e testing of the resulting executable
-# instead of just testing the python code.
-#
-# Tests are compatible with Linux and OSX (path separators, tmp folder, etc.).
-
-from __future__ import print_function
-
-import collections
-import functools
-import tempfile
-import unittest
-import random
-import json
-import uuid
-from collections import OrderedDict
-from json import JSONEncoder
-from subprocess import PIPE, Popen, STDOUT
-import sys
-import os
-import time
-from tempfile import NamedTemporaryFile
-import locale
-import pprint
-import six
-from six.moves import range
-import codecs
-import itertools
-from gzip import GzipFile
-import pytest
-import uuid
-import sqlite3
-import re
-import collections
-
-sys.path.append(os.path.join(os.path.abspath(os.path.dirname(sys.argv[0])),'..','bin'))
-from bin.q import QTextAsData, QOutput, QOutputPrinter, QInputParams, DataStream, Sqlite3DB
-
-# q uses this encoding as the default output encoding. Some of the tests use it in order to
-# make sure that the output is correctly encoded
-SYSTEM_ENCODING = locale.getpreferredencoding()
-
-EXAMPLES = os.path.abspath(os.path.join(os.getcwd(), 'examples'))
-
-Q_EXECUTABLE = os.getenv('Q_EXECUTABLE', os.path.abspath('./bin/q.py'))
-Q_SKIP_EXECUTABLE_VALIDATION = os.getenv('Q_SKIP_EXECUTABLE_VALIDATION','false')
-
-if not Q_SKIP_EXECUTABLE_VALIDATION == 'true':
- Q_EXECUTABLE = os.path.abspath(Q_EXECUTABLE)
- if not os.path.exists(Q_EXECUTABLE):
- raise Exception("q executable must reside in {}".format(Q_EXECUTABLE))
-else:
- Q_EXECUTABLE = os.getenv('Q_EXECUTABLE')
- # Skip checking of executable (useful for testing that q is in the path)
- pass
-
-DEBUG = '-v' in sys.argv
-if os.environ.get('Q_DEBUG'):
- DEBUG = True
-
-def batch(iterable, n=1):
- r = []
- l = len(iterable)
- for ndx in range(0, l, n):
- r += [iterable[ndx:min(ndx + n, l)]]
- return r
-
-def partition(pred, iterable):
- t1, t2 = itertools.tee(iterable)
- return list(itertools.filterfalse(pred, t1)), list(filter(pred, t2))
-
-def run_command(cmd_to_run,env_to_inject=None):
- global DEBUG
- if DEBUG:
- print("CMD: {}".format(cmd_to_run))
-
- if env_to_inject is None:
- env_to_inject = os.environ
-
- env = env_to_inject
-
- p = Popen(cmd_to_run, stdout=PIPE, stderr=PIPE, shell=True,env=env)
- o, e = p.communicate()
- # remove last newline
- o = o.rstrip()
- e = e.strip()
- # split rows
- if o != six.b(''):
- o = o.split(six.b(os.linesep))
- else:
- o = []
- if e != six.b(''):
- e = e.split(six.b(os.linesep))
- else:
- e = []
-
- res = (p.returncode, o, e)
- if DEBUG:
- print("RESULT:{}".format(res))
- return res
-
-
-uneven_ls_output = six.b("""drwxr-xr-x 2 root root 4096 Jun 11 2012 /selinux
-drwxr-xr-x 2 root root 4096 Apr 19 2013 /mnt
-drwxr-xr-x 2 root root 4096 Apr 24 2013 /srv
-drwx------ 2 root root 16384 Jun 21 2013 /lost+found
-lrwxrwxrwx 1 root root 33 Jun 21 2013 /initrd.img.old -> /boot/initrd.img-3.8.0-19-generic
-drwxr-xr-x 2 root root 4096 Jun 21 2013 /cdrom
-drwxr-xr-x 3 root root 4096 Jun 21 2013 /home
-lrwxrwxrwx 1 root root 29 Jun 21 2013 /vmlinuz -> boot/vmlinuz-3.8.0-19-generic
-lrwxrwxrwx 1 root root 32 Jun 21 2013 /initrd.img -> boot/initrd.img-3.8.0-19-generic
-""")
-
-
-find_output = six.b("""8257537 32 drwxrwxrwt 218 root root 28672 Mar 1 11:00 /tmp
-8299123 4 drwxrwxr-x 2 harel harel 4096 Feb 27 10:06 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/stormdist/testTopology3fad644a-54c0-4def-b19e-77ca97941595-1-1393513576
-8263229 964 -rw-rw-r-- 1 mapred mapred 984569 Feb 27 10:06 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/stormdist/testTopology3fad644a-54c0-4def-b19e-77ca97941595-1-1393513576/stormcode.ser
-8263230 4 -rw-rw-r-- 1 harel harel 1223 Feb 27 10:06 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/stormdist/testTopology3fad644a-54c0-4def-b19e-77ca97941595-1-1393513576/stormconf.ser
-8299113 4 drwxrwxr-x 2 harel harel 4096 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate
-8263406 4 -rw-rw-r-- 1 harel harel 2002 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514168746
-8263476 0 -rw-rw-r-- 1 harel harel 0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514168746.version
-8263607 0 -rw-rw-r-- 1 harel harel 0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514169735.version
-8263533 0 -rw-rw-r-- 1 harel harel 0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514172733.version
-8263604 0 -rw-rw-r-- 1 harel harel 0 Feb 27 10:16 /tmp/1628a3fd-b9fe-4dd1-bcdc-7eb869fe7461/supervisor/localstate/1393514175754.version
-""")
-
-
-header_row = six.b('name,value1,value2')
-sample_data_rows = [six.b('a,1,0'), six.b('b,2,0'), six.b('c,,0')]
-sample_data_rows_with_empty_string = [six.b('a,aaa,0'), six.b('b,bbb,0'), six.b('c,,0')]
-sample_data_no_header = six.b("\n").join(sample_data_rows) + six.b("\n")
-sample_data_with_empty_string_no_header = six.b("\n").join(
- sample_data_rows_with_empty_string) + six.b("\n")
-sample_data_with_header = header_row + six.b("\n") + sample_data_no_header
-sample_data_with_missing_header_names = six.b("name,value1\n") + sample_data_no_header
-
-def generate_sample_data_with_header(header):
- return header + six.b("\n") + sample_data_no_header
-
-sample_quoted_data = six.b('''non_quoted regular_double_quoted double_double_quoted escaped_double_quoted multiline_double_double_quoted multiline_escaped_double_quoted
-control-value-1 "control-value-2" control-value-3 "control-value-4" control-value-5 "control-value-6"
-non-quoted-value "this is a quoted value" "this is a ""double double"" quoted value" "this is an escaped \\"quoted value\\"" "this is a double double quoted ""multiline
- value""." "this is an escaped \\"multiline
- value\\"."
-control-value-1 "control-value-2" control-value-3 "control-value-4" control-value-5 "control-value-6"
-''')
-
-double_double_quoted_data = six.b('''regular_double_quoted double_double_quoted
-"this is a quoted value" "this is a quoted value with ""double double quotes"""
-''')
-
-escaped_double_quoted_data = six.b('''regular_double_quoted escaped_double_quoted
-"this is a quoted value" "this is a quoted value with \\"escaped double quotes\\""
-''')
-
-combined_quoted_data = six.b('''regular_double_quoted double_double_quoted escaped_double_quoted
-"this is a quoted value" "this is a quoted value with ""double double quotes""" "this is a quoted value with \\"escaped double quotes\\""
-''')
-
-sample_quoted_data2 = six.b('"quoted data" 23\nunquoted-data 54')
-
-sample_quoted_data2_with_newline = six.b('"quoted data with\na new line inside it":23\nunquoted-data:54')
-
-one_column_data = six.b('''data without commas 1
-data without commas 2
-''')
-
-# Values with leading whitespace
-sample_data_rows_with_spaces = [six.b('a,1,0'), six.b(' b, 2,0'), six.b('c,,0')]
-sample_data_with_spaces_no_header = six.b("\n").join(
- sample_data_rows_with_spaces) + six.b("\n")
-
-header_row_with_spaces = six.b('name,value 1,value2')
-sample_data_with_spaces_with_header = header_row_with_spaces + \
- six.b("\n") + sample_data_with_spaces_no_header
-
-long_value1 = "23683289372328372328373"
-int_value = "2328372328373"
-sample_data_with_long_values = "%s\n%s\n%s" % (long_value1,int_value,int_value)
-
-
-def one_column_warning(e):
- return e[0].startswith(six.b('Warning: column count is one'))
-
-def sqlite_dict_factory(cursor, row):
- d = {}
- for idx, col in enumerate(cursor.description):
- d[col[0]] = row[idx]
- return d
-
-class AbstractQTestCase(unittest.TestCase):
-
- def create_file_with_data(self, data, encoding=None,prefix=None,suffix=None,use_real_path=True):
- if encoding is not None:
- raise Exception('Deprecated: Encoding must be none')
- tmpfile = NamedTemporaryFile(delete=False,prefix=prefix,suffix=suffix)
- tmpfile.write(data)
- tmpfile.close()
- if use_real_path:
- tmpfile.name = os.path.realpath(tmpfile.name)
- return tmpfile
-
- def generate_tmpfile_name(self,prefix=None,suffix=None):
- tmpfile = NamedTemporaryFile(delete=False,prefix=prefix,suffix=suffix)
- os.remove(tmpfile.name)
- return os.path.realpath(tmpfile.name)
-
- def arrays_to_csv_file_content(self,delimiter,header_row_list,cell_list):
- all_rows = [delimiter.join(row) for row in [header_row_list] + cell_list]
- return six.b("\n").join(all_rows)
-
- def create_qsql_file_with_content_and_return_filename(self, header_row,cell_list):
- csv_content = self.arrays_to_csv_file_content(six.b(','),header_row,cell_list)
- tmpfile = self.create_file_with_data(csv_content)
-
- cmd = '%s -d , -H "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,tmpfile.name)
- r, o, e = run_command(cmd)
- self.assertEqual(r, 0)
-
- created_qsql_filename = '%s.qsql' % tmpfile.name
- self.assertTrue(os.path.exists(created_qsql_filename))
-
- return created_qsql_filename
-
- def arrays_to_qsql_file_content(self, header_row,cell_list):
- csv_content = self.arrays_to_csv_file_content(six.b(','),header_row,cell_list)
- tmpfile = self.create_file_with_data(csv_content)
-
- cmd = '%s -d , -H "select count(*) from %s" -C readwrite' % (Q_EXECUTABLE,tmpfile.name)
- r, o, e = run_command(cmd)
- self.assertEqual(r, 0)
-
- matching_qsql_filename = '%s.qsql' % tmpfile.name
- f = open(matching_qsql_filename,'rb')
- qsql_file_bytes = f.read()
- f.close()
-
- self.assertEqual(matching_qsql_filename,'%s.qsql' % tmpfile.name)
-
- return qsql_file_bytes
-
- def write_file(self,filename,data):
- f = open(filename,'wb')
- f.write(data)
- f.close()
-
- def create_folder_with_files(self,filename_to_content_dict,prefix, suffix):
- name = self.random_tmp_filename(prefix,suffix)
- os.makedirs(name)
- for filename,content in six.iteritems(filename_to_content_dict):
- if os.path.sep in filename:
- os.makedirs('%s/%s' % (name,os.path.split(filename)[0]))
- f = open(os.path.join(name,filename),'wb')
- f.write(content)
- f.close()
- return name
-
- def cleanup_folder(self,tmpfolder):
- if not tmpfolder.startswith(os.path.realpath('/var/tmp')):
- raise Exception('Guard against accidental folder deletions: %s' % tmpfolder)
- global DEBUG
- if not DEBUG:
- print("should have removed tmpfolder %s. Not doing it for the sake of safety. # TODO re-add" % tmpfolder)
- pass # os.remove(tmpfolder)
-
- def cleanup(self, tmpfile):
- global DEBUG
- if not DEBUG:
- os.remove(tmpfile.name)
-
- def random_tmp_filename(self,prefix,postfix):
- # TODO Use more robust method for this
- path = '/var/tmp'
- return os.path.realpath('%s/%s-%s.%s' % (path,prefix,random.randint(0,1000000000),postfix))
-
-
-
-def get_sqlite_table_list(c,exclude_qcatalog=True):
- if exclude_qcatalog:
- r = c.execute("select tbl_name from sqlite_master where type='table' and tbl_name != '_qcatalog'").fetchall()
- else:
- r = c.execute("select tbl_name from sqlite_master where type='table'").fetchall()
-
- return r
-
-class SaveToSqliteTests(AbstractQTestCase):
-
- # Returns a folder with files and a header in each, one column named 'a'
- def generate_files_in_folder(self,batch_size, file_count):
- numbers = list(range(1, 1 + batch_size * file_count))
- numbers_as_text = batch([str(x) for x in numbers], n=batch_size)
-
- content_list = list(map(six.b, ['a\n' + "\n".join(x) + '\n' for x in numbers_as_text]))
-
- filename_list = list(map(lambda x: 'file-%s' % x, range(file_count)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d, 'split-files', 'sqlite-stuff')
- return (tmpfolder,filename_list)
-
- # 11074 3.8.2021 10:53 bin/q.py "select count(*) from xxxx/file-95 left join xxxx/file-96 left join xxxx/file-97 left join xxxx/file-97 left join xxxx/file-98 left join xxxx/*" -c 1 -C readwrite -A
- # # fails because it takes qsql files as well
-
- def test_save_glob_files_to_sqlite(self):
- BATCH_SIZE = 50
- FILE_COUNT = 5
-
- tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
-
- output_sqlite_file = self.random_tmp_filename("x","sqlite")
-
- cmd = '%s -H "select count(*) from %s/*" -c 1 -S %s' % (Q_EXECUTABLE,tmpfolder,output_sqlite_file)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
- c = sqlite3.connect(output_sqlite_file)
- results = c.execute('select a from file_dash_0').fetchall()
- self.assertEqual(len(results),BATCH_SIZE*FILE_COUNT)
- self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1,BATCH_SIZE*FILE_COUNT+1)))
- tables = get_sqlite_table_list(c)
- self.assertEqual(len(tables),1)
-
- c.close()
-
- self.cleanup_folder(tmpfolder)
-
- def test_save_multiple_files_to_sqlite(self):
- BATCH_SIZE = 50
- FILE_COUNT = 5
-
- tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
-
- output_sqlite_file = self.random_tmp_filename("x","sqlite")
-
- tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
- cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
- c = sqlite3.connect(output_sqlite_file)
-
- tables = get_sqlite_table_list(c)
- self.assertEqual(len(tables), FILE_COUNT)
-
- for i in range(FILE_COUNT):
- results = c.execute('select a from file_dash_%s' % i).fetchall()
- self.assertEqual(len(results),BATCH_SIZE)
- self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
-
- c.close()
-
- self.cleanup_folder(tmpfolder)
-
- def test_save_multiple_files_to_sqlite_without_duplicates(self):
- BATCH_SIZE = 50
- FILE_COUNT = 5
-
- tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
-
- output_sqlite_file = self.random_tmp_filename("x","sqlite")
-
- tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
-
- # duplicate the left-joins for all the files, so the query will contain each filename twice
- tables_as_str = "%s left join %s" % (tables_as_str,tables_as_str)
-
- cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
- c = sqlite3.connect(output_sqlite_file)
-
- tables = get_sqlite_table_list(c)
- # total table count should still be FILE_COUNT, even with the duplications
- self.assertEqual(len(tables), FILE_COUNT)
-
- for i in range(FILE_COUNT):
- results = c.execute('select a from file_dash_%s' % i).fetchall()
- self.assertEqual(len(results),BATCH_SIZE)
- self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
-
- c.close()
-
- self.cleanup_folder(tmpfolder)
-
- def test_sqlite_file_is_not_created_if_some_table_does_not_exist(self):
- BATCH_SIZE = 50
- FILE_COUNT = 5
-
- tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
-
- output_sqlite_file = self.random_tmp_filename("x","sqlite")
-
- tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
-
- tables_as_str = tables_as_str + ' left join %s/non_existent_table' % (tmpfolder)
-
- cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 30)
- self.assertEqual(len(e), 2)
- self.assertEqual(e[0],six.b("Going to save data into a disk database: %s" % output_sqlite_file))
- self.assertEqual(e[1],six.b("No files matching '%s/non_existent_table' have been found" % tmpfolder))
-
- self.assertTrue(not os.path.exists(output_sqlite_file))
-
- self.cleanup_folder(tmpfolder)
-
- def test_recurring_glob_and_separate_files_in_same_query_when_writing_to_sqlite(self):
- BATCH_SIZE = 50
- FILE_COUNT = 5
-
- tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
-
- output_sqlite_file = self.random_tmp_filename("x","sqlite")
-
- tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
- # The same files are left-joined in the query as an additional "left join /*". This should create an additional table
- # in the sqlite file, with all the data in it
- cmd = '%s -H "select count(*) from %s left join %s/*" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,tmpfolder,output_sqlite_file)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
- c = sqlite3.connect(output_sqlite_file)
-
- tables = get_sqlite_table_list(c)
- # plus the additional table from the glob
- self.assertEqual(len(tables), FILE_COUNT+1)
-
- # check all the per-file tables
- for i in range(FILE_COUNT):
- results = c.execute('select a from file_dash_%s' % i).fetchall()
- self.assertEqual(len(results),BATCH_SIZE)
- self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
-
- # ensure the glob-based table exists, with an _2 added to the name, as the original "file_dash_0" already exists in the sqlite db
- results = c.execute('select a from file_dash_0_2').fetchall()
- self.assertEqual(len(results),FILE_COUNT*BATCH_SIZE)
- self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1,1+FILE_COUNT*BATCH_SIZE)))
- c.close()
-
- self.cleanup_folder(tmpfolder)
-
- def test_empty_sqlite_handling(self):
- fn = self.generate_tmpfile_name("empty",".sqlite")
-
- c = sqlite3.connect(fn)
- c.execute('create table x (a int)').fetchall()
- c.execute('drop table x').fetchall()
- c.close()
-
- cmd = '%s "select * from %s"' % (Q_EXECUTABLE,fn)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,88)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
- self.assertEqual(e[0],six.b('sqlite file %s has no tables' % fn))
-
- def test_storing_to_disk_too_many_qsql_files(self):
- BATCH_SIZE = 10
- MAX_ATTACHED_DBS = 5
- FILE_COUNT = MAX_ATTACHED_DBS + 4
-
- numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)
-
- content_list = map(six.b, ["\n".join(x) for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d, 'split-files', 'attach-limit')
-
- for fn in filename_list:
- cmd = '%s -c 1 "select count(*) from %s/%s" -C readwrite' % (Q_EXECUTABLE,tmpfolder, fn)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
-
- output_sqlite_file = self.generate_tmpfile_name("many-sqlites",".sqlite")
-
- table_refs = list(['select * from %s/%s.qsql' % (tmpfolder,x) for x in filename_list])
- table_refs_str = " UNION ALL ".join(table_refs)
- # Limit max attached dbs according to the parameter (must be below the hardcoded sqlite limit, which is 10 when having a standard version compiled)
- cmd = '%s "select * from (%s)" -S %s --max-attached-sqlite-databases=%s' % (Q_EXECUTABLE,table_refs_str,output_sqlite_file,MAX_ATTACHED_DBS)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),4)
-
- c = sqlite3.connect(output_sqlite_file)
- tables_results = c.execute("select tbl_name from sqlite_master where type='table'").fetchall()
- table_names = list(sorted([x[0] for x in tables_results]))
- self.assertEqual(len(table_names),FILE_COUNT)
-
- for i,tn in enumerate(table_names):
- self.assertEqual(tn,'file_dash_%s' % i)
-
- table_content = c.execute('select * from %s' % tn).fetchall()
- self.assertEqual(len(table_content),BATCH_SIZE)
-
- cmd = '%s "select * from %s:::%s"' % (Q_EXECUTABLE,output_sqlite_file,tn)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),BATCH_SIZE)
- self.assertEqual(o,list([six.b(str(x)) for x in range(1 + i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)]))
-
- self.cleanup_folder(tmpfolder)
-
- def test_storing_to_disk_too_many_sqlite_files(self):
- # a variation of test_storing_to_disk_too_many_qsql_files, which deletes the qcatalog file from the caches,
- # so they'll be just regular sqlite files
-
- BATCH_SIZE = 10
- MAX_ATTACHED_DBS = 5
- FILE_COUNT = MAX_ATTACHED_DBS + 4
-
- numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)
-
- content_list = map(six.b, ["\n".join(x) for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d, 'split-files', 'attach-limit')
-
- for fn in filename_list:
- cmd = '%s -c 1 "select count(*) from %s/%s" -C readwrite' % (Q_EXECUTABLE,tmpfolder, fn)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
-
- c = sqlite3.connect('%s/%s.qsql' % (tmpfolder,fn))
- c.execute('drop table _qcatalog').fetchall()
- c.close()
- os.rename('%s/%s.qsql' % (tmpfolder,fn),'%s/%s.sqlite' % (tmpfolder,fn))
-
- output_sqlite_file = self.generate_tmpfile_name("many-sqlites",".sqlite")
-
- table_refs = list(['select * from %s/%s.sqlite' % (tmpfolder,x) for x in filename_list])
- table_refs_str = " UNION ALL ".join(table_refs)
- # Limit max attached dbs according to the parameter (must be below the hardcoded sqlite limit, which is 10 when having a standard version compiled)
- cmd = '%s "select * from (%s)" -S %s --max-attached-sqlite-databases=%s' % (Q_EXECUTABLE,table_refs_str,output_sqlite_file,MAX_ATTACHED_DBS)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),4)
-
- c = sqlite3.connect(output_sqlite_file)
- tables_results = c.execute("select tbl_name from sqlite_master where type='table'").fetchall()
- table_names = list(sorted([x[0] for x in tables_results]))
- self.assertEqual(len(table_names),FILE_COUNT)
-
- for i,tn in enumerate(table_names):
- self.assertEqual(tn,'file_dash_%s' % i)
-
- table_content = c.execute('select * from %s' % tn).fetchall()
- self.assertEqual(len(table_content),BATCH_SIZE)
-
- cmd = '%s "select * from %s:::%s"' % (Q_EXECUTABLE,output_sqlite_file,tn)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),BATCH_SIZE)
- self.assertEqual(o,list([six.b(str(x)) for x in range(1 + i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)]))
-
- self.cleanup_folder(tmpfolder)
-
- def test_storing_to_disk_too_many_sqlite_files__over_the_sqlite_limit(self):
- # a variation of test_storing_to_disk_too_many_sqlite_files, but with a limit above the sqlite hardcoded limit
- MAX_ATTACHED_DBS = 20 # standard sqlite limit is 10, so q should throw an error
-
- BATCH_SIZE = 10
- FILE_COUNT = MAX_ATTACHED_DBS + 4
-
- numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)
-
- content_list = map(six.b, ["\n".join(x) for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d, 'split-files', 'attach-limit')
-
- for fn in filename_list:
- cmd = '%s -c 1 "select count(*) from %s/%s" -C readwrite' % (Q_EXECUTABLE,tmpfolder, fn)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
-
- c = sqlite3.connect('%s/%s.qsql' % (tmpfolder,fn))
- c.execute('drop table _qcatalog').fetchall()
- c.close()
- os.rename('%s/%s.qsql' % (tmpfolder,fn),'%s/%s.sqlite' % (tmpfolder,fn))
-
- output_sqlite_file = self.generate_tmpfile_name("many-sqlites",".sqlite")
-
- table_refs = list(['select * from %s/%s.sqlite' % (tmpfolder,x) for x in filename_list])
- table_refs_str = " UNION ALL ".join(table_refs)
- # Limit max attached dbs according to the parameter (must be below the hardcoded sqlite limit, which is 10 when having a standard version compiled)
- cmd = '%s "select * from (%s)" -S %s --max-attached-sqlite-databases=%s' % (Q_EXECUTABLE,table_refs_str,output_sqlite_file,MAX_ATTACHED_DBS)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode,89)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),2)
- self.assertTrue(e[0].startswith(six.b('Going to save data into')))
- self.assertTrue(e[1].startswith(six.b('There are too many attached databases. Use a proper --max-attached-sqlite-databases parameter which is below the maximum')))
-
- self.cleanup_folder(tmpfolder)
-
- def test_qtable_name_normalization__starting_with_a_digit(self):
- numbers = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 101)]
-
- header = [six.b('aa'), six.b('bb'), six.b('cc')]
-
- base_filename_with_digits = '010'
-
- new_tmp_folder = self.create_folder_with_files({
- base_filename_with_digits : self.arrays_to_csv_file_content(six.b(','),header,numbers)
- },prefix='xx',suffix='digits')
-
- effective_filename = '%s/010' % new_tmp_folder
-
- output_sqlite_filename = self.generate_tmpfile_name("starting-with-digit",".sqlite")
- cmd = '%s -d , -H "select count(aa),count(bb),count(cc) from %s" -S %s' % (Q_EXECUTABLE,effective_filename,output_sqlite_filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),4)
-
- c = sqlite3.connect(output_sqlite_filename)
- results = c.execute('select aa,bb,cc from t_%s' % base_filename_with_digits).fetchall()
- self.assertEqual(results,list([(x,x,x) for x in range(1,101)]))
- c.close()
-
- self.cleanup_folder(new_tmp_folder)
-
- def test_qtable_name_normalization(self):
- x = [six.b(a) for a in map(str, range(1, 101))]
- large_file_data = six.b("val\n") + six.b("\n").join(x)
- tmpfile = self.create_file_with_data(large_file_data)
-
- tmpfile_folder = os.path.dirname(tmpfile.name)
- tmpfile_basename = os.path.basename(tmpfile.name)
-
- cmd = 'cd %s && %s -c 1 -H -D , -O "select a.val,b.val from %s a cross join ./%s b on (a.val = b.val * 2)"' % (tmpfile_folder,Q_EXECUTABLE,tmpfile_basename,tmpfile_basename)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 51)
-
- evens = list(filter(lambda x: x%2 == 0,range(1,101)))
- expected_result_rows = [six.b('val,val')] + [six.b('%d,%d' % (x,x / 2)) for x in evens]
- self.assertEqual(o,expected_result_rows)
-
- def test_qtable_name_normalization2(self):
- cmd = '%s "select * from"' % Q_EXECUTABLE
-
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 118)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b('FROM/JOIN is missing a table name after it'))
-
- def test_qtable_name_normalization3(self):
- # with a space after the from
- cmd = '%s "select * from "' % Q_EXECUTABLE
-
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 118)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b('FROM/JOIN is missing a table name after it'))
-
- def test_save_multiple_files_to_sqlite_while_caching_them(self):
- BATCH_SIZE = 50
- FILE_COUNT = 5
-
- tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
-
- output_sqlite_file = self.random_tmp_filename("x","sqlite")
-
- tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
- cmd = '%s -H "select count(*) from %s" -c 1 -S %s -C readwrite' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
- c = sqlite3.connect(output_sqlite_file)
-
- tables = get_sqlite_table_list(c)
- self.assertEqual(len(tables), FILE_COUNT)
-
- for i,filename in enumerate(filename_list):
- matching_table_name = 'file_dash_%s' % i
-
- results = c.execute('select a from %s' % matching_table_name).fetchall()
- self.assertEqual(len(results),BATCH_SIZE)
- self.assertEqual(sum(map(lambda x:x[0],results)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
-
- # check actual resulting qsql file for the file
- cmd = '%s -c 1 -H "select a from %s/%s"' % (Q_EXECUTABLE,tmpfolder,filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), BATCH_SIZE)
- self.assertEqual(sum(map(int,o)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
- self.assertEqual(len(e), 0)
-
- # check analysis returns proper file-with-unused-qsql for each file, since by default `-C none` which means don't read the cache
- # even if it exists
- cmd = '%s -c 1 -H "select a from %s/%s" -A' % (Q_EXECUTABLE,tmpfolder,filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 5)
- self.assertEqual(o,[
- six.b('Table: %s/file-%s' % (tmpfolder,i)),
- six.b(' Sources:'),
- six.b(' source_type: file-with-unused-qsql source: %s/file-%s' % (tmpfolder,i)),
- six.b(' Fields:'),
- six.b(' `a` - int')
- ])
-
- cmd = '%s -c 1 -H "select a from %s/%s" -A -C read' % (Q_EXECUTABLE,tmpfolder,filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 5)
- self.assertEqual(o,[
- six.b('Table: %s/file-%s' % (tmpfolder,i)),
- six.b(' Sources:'),
- six.b(' source_type: qsql-file-with-original source: %s/file-%s.qsql' % (tmpfolder,i)),
- six.b(' Fields:'),
- six.b(' `a` - int')
- ])
-
- # check qsql file is readable directly through q
- cmd = '%s -c 1 -H "select a from %s/%s.qsql"' % (Q_EXECUTABLE,tmpfolder,filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), BATCH_SIZE)
- self.assertEqual(sum(map(int,o)),sum(range(1+i*BATCH_SIZE,1+(i+1)*BATCH_SIZE)))
- self.assertEqual(len(e), 0)
-
- # check analysis returns proper qsql-with-original for each file when running directly against the qsql file
- cmd = '%s -c 1 -H "select a from %s/%s.qsql" -A' % (Q_EXECUTABLE,tmpfolder,filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 5)
- self.assertEqual(o,[
- six.b('Table: %s/file-%s.qsql' % (tmpfolder,i)),
- six.b(' Sources:'),
- six.b(' source_type: qsql-file source: %s/file-%s.qsql' % (tmpfolder,i)),
- six.b(' Fields:'),
- six.b(' `a` - int')
- ])
- c.close()
-
- import glob
- filename_list_with_qsql = list(map(lambda x: x+'.qsql',filename_list))
-
- files_in_folder = glob.glob('%s/*' % tmpfolder)
- regular_files,qsql_files = partition(lambda x: x.endswith('.qsql'),files_in_folder)
-
- self.assertEqual(len(files_in_folder),2*FILE_COUNT)
- self.assertEqual(sorted(list(map(os.path.basename,regular_files))),sorted(list(map(os.path.basename,filename_list))))
- self.assertEqual(sorted(list(map(os.path.basename,qsql_files))),sorted(list(map(os.path.basename,filename_list_with_qsql))))
-
- self.cleanup_folder(tmpfolder)
-
- def test_globs_ignore_matching_qsql_files(self):
- BATCH_SIZE = 10
- FILE_COUNT = 5
-
- tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
-
- tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
- cmd = '%s -H "select count(*) from %s" -c 1 -C readwrite' % (Q_EXECUTABLE,tables_as_str)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b(str(pow(BATCH_SIZE,FILE_COUNT))))
-
- cmd = '%s -H "select a from %s/*" -c 1 -C read' % (Q_EXECUTABLE,tmpfolder)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), BATCH_SIZE*FILE_COUNT)
- self.assertEqual(len(e), 0)
- self.assertEqual(sum(map(int,o)),sum(range(1,1+BATCH_SIZE*FILE_COUNT)))
-
- self.cleanup_folder(tmpfolder)
-
- def test_error_on_reading_from_multi_table_sqlite_without_explicit_table_name(self):
- BATCH_SIZE = 50
- FILE_COUNT = 5
-
- tmpfolder,filename_list = self.generate_files_in_folder(BATCH_SIZE,FILE_COUNT)
-
- output_sqlite_file = self.random_tmp_filename("x","sqlite")
-
- tables_as_str = " left join ".join(["%s/%s" % (tmpfolder,x) for x in filename_list])
- cmd = '%s -H "select count(*) from %s" -c 1 -S %s' % (Q_EXECUTABLE,tables_as_str,output_sqlite_file)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
-
- cmd = '%s -H "select count(*) from %s"' % (Q_EXECUTABLE,output_sqlite_file)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 87)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b("Could not autodetect table name in sqlite file %s . Existing tables: file_dash_0,file_dash_1,file_dash_2,file_dash_3,file_dash_4" % output_sqlite_file))
-
- self.cleanup_folder(tmpfolder)
-
- def test_error_on_trying_to_specify_an_explicit_non_existent_qsql_file(self):
- cmd = '%s -H "select count(*) from /non-existent-folder/non-existent.qsql:::mytable"' % (Q_EXECUTABLE)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 30)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b("Could not find file /non-existent-folder/non-existent.qsql"))
-
- def test_error_on_providing_a_non_qsql_file_when_specifying_an_explicit_table(self):
- data = six.b("\x1f\x8b\x08\x00\tZ\x0ea\x00\x03\xed\x93\xdd\n\xc20\x0cF\xf3(}\x01ij\x93\xf6y:\xd9P\x10)\xb3\xbe\xbf\x9d\x1d\xbbQ\xc6\x06F\x10rn\xbe\x9b\xd0\xfc\x1c\x9a-\x88\x83\x88\x91\xd9\xbc2\xb4\xc4#\xb5\x9c1\x8e\x1czb\x8a\xd1\x19t\xdeS\x00\xc3\xf2\xa3\x01<\xee%\x8du\x94s\x1a\xfbk\xd7\xdf\x0e\xa9\x94Kz\xaf\xabe\xc3\xb0\xf2\xce\xbc\xc7\x92\x7fB\xb6\x1fv\xfd2\xf5\x1e\x81h\xa3\xff\x10'\xff\x8c\x04\x06\xc5'\x03\xf5oO\xe2=v\xf9o\xff\x9f\xd1\xa9\xff_\x90m'\xdec\x9f\x7f\x9c\xfc\xd7T\xff\x8a\xa2(\x92<\x01WY\x0c\x06\x00\x0c\x00\x00")
- tmpfilename = self.random_tmp_filename('xx','yy')
- f = open(tmpfilename,'wb')
- f.write(data)
- f.close()
-
- cmd = '%s -H "select count(*) from %s:::mytable1"' % (Q_EXECUTABLE,tmpfilename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 95)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b("Cannot detect the type of table %s:::mytable1" % tmpfilename))
-
- def test_error_on_providing_a_non_qsql_file_when_not_specifying_an_explicit_table(self):
- data = six.b("\x1f\x8b\x08\x00\tZ\x0ea\x00\x03\xed\x93\xdd\n\xc20\x0cF\xf3(}\x01ij\x93\xf6y:\xd9P\x10)\xb3\xbe\xbf\x9d\x1d\xbbQ\xc6\x06F\x10rn\xbe\x9b\xd0\xfc\x1c\x9a-\x88\x83\x88\x91\xd9\xbc2\xb4\xc4#\xb5\x9c1\x8e\x1czb\x8a\xd1\x19t\xdeS\x00\xc3\xf2\xa3\x01<\xee%\x8du\x94s\x1a\xfbk\xd7\xdf\x0e\xa9\x94Kz\xaf\xabe\xc3\xb0\xf2\xce\xbc\xc7\x92\x7fB\xb6\x1fv\xfd2\xf5\x1e\x81h\xa3\xff\x10'\xff\x8c\x04\x06\xc5'\x03\xf5oO\xe2=v\xf9o\xff\x9f\xd1\xa9\xff_\x90m'\xdec\x9f\x7f\x9c\xfc\xd7T\xff\x8a\xa2(\x92<\x01WY\x0c\x06\x00\x0c\x00\x00")
- tmpfilename = self.random_tmp_filename('xx','yy')
- f = open(tmpfilename,'wb')
- f.write(data)
- f.close()
-
- cmd = '%s -H "select count(*) from %s"' % (Q_EXECUTABLE,tmpfilename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 59)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertTrue(e[0].startswith(six.b("Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error:")))
-
-class OldSaveDbToDiskTests(AbstractQTestCase):
-
- def test_join_with_stdin_and_save(self):
- x = [six.b(a) for a in map(str,range(1,101))]
- large_file_data = six.b("val\n") + six.b("\n").join(x)
- tmpfile = self.create_file_with_data(large_file_data)
- tmpfile_expected_table_name = os.path.basename(tmpfile.name)
-
- disk_db_filename = self.random_tmp_filename('save-to-db','sqlite')
-
- cmd = '(echo id ; seq 1 2 10) | ' + Q_EXECUTABLE + ' -c 1 -H -O "select stdin.*,f.* from - stdin left join %s f on (stdin.id * 10 = f.val)" -S %s' % \
- (tmpfile.name,disk_db_filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
- self.assertEqual(e[0],six.b('Going to save data into a disk database: %s' % disk_db_filename))
- self.assertTrue(e[1].startswith(six.b('Data has been saved into %s . Saving has taken ' % disk_db_filename)))
- self.assertEqual(e[2],six.b('Query to run on the database: select stdin.*,f.* from data_stream_stdin stdin left join %s f on (stdin.id * 10 = f.val);' % \
- tmpfile_expected_table_name))
- self.assertEqual(e[3],six.b('You can run the query directly from the command line using the following command: echo "select stdin.*,f.* from data_stream_stdin stdin left join %s f on (stdin.id * 10 = f.val)" | sqlite3 %s' %
- (tmpfile_expected_table_name,disk_db_filename)))
-
-        P = re.compile(six.b("^Query to run on the database: (?P<query_to_run_on_db>.*)$"))
- m = P.search(e[2])
- query_to_run_on_db = m.groupdict()['query_to_run_on_db']
-
- self.assertTrue(os.path.exists(disk_db_filename))
-
- # validate disk db content natively
- c = sqlite3.connect(disk_db_filename)
- c.row_factory = sqlite_dict_factory
- t0_results = c.execute('select * from data_stream_stdin').fetchall()
- self.assertEqual(len(t0_results),5)
- self.assertEqual(sorted(list(t0_results[0].keys())), ['id'])
- self.assertEqual(list(map(lambda x:x['id'],t0_results)),[1,3,5,7,9])
- t1_results = c.execute('select * from %s' % tmpfile_expected_table_name).fetchall()
- self.assertEqual(len(t1_results),100)
- self.assertEqual(sorted(list(t1_results[0].keys())), ['val'])
- self.assertEqual("\n".join(list(map(lambda x:str(x['val']),t1_results))),"\n".join(map(str,range(1,101))))
-
- query_results = c.execute(query_to_run_on_db.decode('utf-8')).fetchall()
-
- self.assertEqual(query_results[0],{ 'id': 1 , 'val': 10})
- self.assertEqual(query_results[1],{ 'id': 3 , 'val': 30})
- self.assertEqual(query_results[2],{ 'id': 5 , 'val': 50})
- self.assertEqual(query_results[3],{ 'id': 7 , 'val': 70})
- self.assertEqual(query_results[4],{ 'id': 9 , 'val': 90})
-
- self.cleanup(tmpfile)
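-
-    # (sqlite_dict_factory is assumed to be defined earlier in this suite as the
-    # standard sqlite3 dict row factory, roughly:
-    #
-    #   def sqlite_dict_factory(cursor, row):
-    #       return {col[0]: row[idx] for idx, col in enumerate(cursor.description)}
-    # )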
-
- def test_join_with_qsql_file(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
-
- header = [six.b('aa'), six.b('bb'), six.b('cc')]
-
- new_tmp_folder = self.create_folder_with_files({
- 'some_csv_file': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
- 'some_qsql_database.qsql' : self.arrays_to_qsql_file_content(header,numbers2)
- },prefix='xx',suffix='yy')
-
- effective_filename1 = '%s/some_csv_file' % new_tmp_folder
- effective_filename2 = '%s/some_qsql_database.qsql' % new_tmp_folder
-
- cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(small_file.aa) from %s large_file left join %s small_file on (small_file.aa == large_file.bb)"' % \
- (effective_filename1,effective_filename2)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('50005000,55'))
-
-    # TODO RLRL Check whether this commented-out test is still needed
-
- # def test_creation_of_qsql_database(self):
- # numbers = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
- # header = [six.b('aa'), six.b('bb'), six.b('cc')]
- #
- # qsql_filename = self.create_qsql_file_with_content_and_return_filename(header,numbers)
- #
- # conn = sqlite3.connect(qsql_filename)
- # qcatalog = conn.execute('select temp_table_name,source_type,source from _qcatalog').fetchall()
- # print(qcatalog)
- #
- # cmd = '%s "select count(*) from %s" -A' % (Q_EXECUTABLE,qsql_filename)
- # retcode, o, e = run_command(cmd)
- # print(o)
-
- def test_join_with_qsql_file_and_save(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
-
- header = [six.b('aa'), six.b('bb'), six.b('cc')]
-
- saved_qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
-
- new_tmp_folder = self.create_folder_with_files({
- 'some_csv_file': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
- 'some_qsql_database' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
- },prefix='xx',suffix='yy')
- cmd = '%s -d , -H "select count(*) from %s/some_qsql_database" -C readwrite' % (Q_EXECUTABLE,new_tmp_folder)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode,0)
- os.remove('%s/some_qsql_database' % new_tmp_folder)
-
- effective_filename1 = '%s/some_csv_file' % new_tmp_folder
- effective_filename2 = '%s/some_qsql_database.qsql' % new_tmp_folder
-
- cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(small_file.aa) from %s large_file left join %s small_file on (small_file.aa == large_file.bb)" -S %s' % \
- (effective_filename1,effective_filename2,saved_qsql_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
-
- conn = sqlite3.connect(saved_qsql_with_multiple_tables)
- c1 = conn.execute('select count(*) from some_csv_file').fetchall()
- c2 = conn.execute('select count(*) from some_qsql_database').fetchall()
-
- self.assertEqual(c1[0][0],10000)
- self.assertEqual(c2[0][0],10)
-
-
- def test_saving_to_db_with_same_basename_files(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
-
- header = [six.b('aa'), six.b('bb'), six.b('cc')]
-
- qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
-
- new_tmp_folder = self.create_folder_with_files({
- 'filename1': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
- 'otherfolder/filename1' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
- },prefix='xx',suffix='yy')
-
- effective_filename1 = '%s/filename1' % new_tmp_folder
- effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder
-
- expected_stored_table_name1 = 'filename1'
- expected_stored_table_name2 = 'filename1_2'
-
- cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
- (effective_filename1,effective_filename2,qsql_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
- self.assertEqual(e[0], six.b('Going to save data into a disk database: %s' % qsql_with_multiple_tables))
- self.assertTrue(e[1].startswith(six.b('Data has been saved into %s . Saving has taken' % qsql_with_multiple_tables)))
- self.assertEqual(e[2],six.b('Query to run on the database: select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb);' % \
- (expected_stored_table_name1,expected_stored_table_name2)))
- self.assertEqual(e[3],six.b('You can run the query directly from the command line using the following command: echo "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" | sqlite3 %s' % \
- (expected_stored_table_name1,expected_stored_table_name2,qsql_with_multiple_tables)))
-
-        # Validate reading back from the saved db file
- conn = sqlite3.connect(qsql_with_multiple_tables)
- c1 = conn.execute('select count(*) from filename1').fetchall()
- c2 = conn.execute('select count(*) from filename1_2').fetchall()
-
- self.assertEqual(c1[0][0],10000)
- self.assertEqual(c2[0][0],10)
-
-
- def test_error_when_not_specifying_table_name_in_multi_table_qsql(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
-
- header = [six.b('aa'), six.b('bb'), six.b('cc')]
-
- qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
-
- new_tmp_folder = self.create_folder_with_files({
- 'filename1': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
- 'otherfolder/filename1' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
- },prefix='xx',suffix='yy')
-
- effective_filename1 = '%s/filename1' % new_tmp_folder
- effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder
-
- expected_stored_table_name1 = 'filename1'
- expected_stored_table_name2 = 'filename1_2'
-
- cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
- (effective_filename1,effective_filename2,qsql_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
- # Actual tests
-
- cmd = '%s "select count(*) from %s"' % (Q_EXECUTABLE,qsql_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 87)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
- self.assertEqual(e[0],six.b('Could not autodetect table name in sqlite file %s . Existing tables: %s,%s' % (qsql_with_multiple_tables,expected_stored_table_name1,expected_stored_table_name2)))
-
- def test_error_when_not_specifying_table_name_in_multi_table_sqlite(self):
- sqlite_with_multiple_tables = self.generate_tmpfile_name(suffix='.sqlite')
-
- c = sqlite3.connect(sqlite_with_multiple_tables)
- c.execute('create table my_table_1 (x int, y int)').fetchall()
- c.execute('create table my_table_2 (x int, y int)').fetchall()
- c.close()
-
- cmd = '%s "select count(*) from %s"' % (Q_EXECUTABLE,sqlite_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 87)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b('Could not autodetect table name in sqlite file %s . Existing tables: my_table_1,my_table_2' % sqlite_with_multiple_tables))
-
- def test_querying_from_multi_table_sqlite_using_explicit_table_name(self):
- sqlite_with_multiple_tables = self.generate_tmpfile_name(suffix='.sqlite')
-
- c = sqlite3.connect(sqlite_with_multiple_tables)
- c.execute('create table my_table_1 (x int, y int)').fetchall()
- c.execute('insert into my_table_1 (x,y) values (100,200),(300,400)').fetchall()
- c.execute('commit').fetchall()
- c.execute('create table my_table_2 (x int, y int)').fetchall()
- c.close()
-
- cmd = '%s -d , "select * from %s:::my_table_1"' % (Q_EXECUTABLE,sqlite_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('100,200'))
- self.assertEqual(o[1],six.b('300,400'))
-
- # Check again, this time with a different output delimiter and with explicit column names
- cmd = '%s -t "select x,y from %s:::my_table_1"' % (Q_EXECUTABLE,sqlite_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('100\t200'))
- self.assertEqual(o[1],six.b('300\t400'))
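-
-    # For reference, the `file:::table` syntax used above is how an explicit
-    # table inside a sqlite/qsql file is selected, e.g.:
-    #
-    #   $ q "select * from mydb.sqlite:::my_table_1"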
-
-
- def test_error_when_specifying_nonexistent_table_name_in_multi_table_qsql(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
-
- header = [six.b('aa'), six.b('bb'), six.b('cc')]
-
- qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
-
- new_tmp_folder = self.create_folder_with_files({
- 'filename1': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
- 'otherfolder/filename1' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
- },prefix='xx',suffix='yy')
-
- effective_filename1 = '%s/filename1' % new_tmp_folder
- effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder
-
- expected_stored_table_name1 = 'filename1'
- expected_stored_table_name2 = 'filename1_2'
-
- cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
- (effective_filename1,effective_filename2,qsql_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
- # Actual tests
-
- cmd = '%s "select count(*) from %s:::non_existent_table"' % (Q_EXECUTABLE,qsql_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 85)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
- self.assertEqual(e[0],six.b('Table non_existent_table could not be found in sqlite file %s . Existing table names: %s,%s' % \
- (qsql_with_multiple_tables,expected_stored_table_name1,expected_stored_table_name2)))
-
- def test_querying_multi_table_qsql_file(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
-
- header = [six.b('aa'), six.b('bb'), six.b('cc')]
-
- qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
-
- new_tmp_folder = self.create_folder_with_files({
- 'filename1': self.arrays_to_csv_file_content(six.b(','),header,numbers1),
- 'otherfolder/filename1' : self.arrays_to_csv_file_content(six.b(','),header,numbers2)
- },prefix='xx',suffix='yy')
-
- effective_filename1 = '%s/filename1' % new_tmp_folder
- effective_filename2 = '%s/otherfolder/filename1' % new_tmp_folder
-
- expected_stored_table_name1 = 'filename1'
- expected_stored_table_name2 = 'filename1_2'
-
- cmd = Q_EXECUTABLE + ' -d , -H "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)" -S %s' % \
- (effective_filename1,effective_filename2,qsql_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
-
- # Actual tests
-
- cmd = '%s "select count(*) from %s:::%s"' % (Q_EXECUTABLE,qsql_with_multiple_tables,expected_stored_table_name1)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('10000'))
-
- cmd = '%s "select count(*) from %s:::%s"' % (Q_EXECUTABLE,qsql_with_multiple_tables,expected_stored_table_name2)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('10'))
-
- def test_preventing_db_overwrite(self):
- db_filename = self.random_tmp_filename('store-to-disk', 'db')
- self.assertFalse(os.path.exists(db_filename))
-
- retcode, o, e = run_command('seq 1 1000 | ' + Q_EXECUTABLE + ' "select count(*) from -" -c 1 -S %s' % db_filename)
-
- self.assertTrue(retcode == 0)
- self.assertTrue(os.path.exists(db_filename))
-
- retcode2, o2, e2 = run_command('seq 1 1000 | ' + Q_EXECUTABLE + ' "select count(*) from -" -c 1 -S %s' % db_filename)
- self.assertTrue(retcode2 != 0)
- self.assertTrue(e2[0].startswith(six.b('Going to save data into a disk database')))
- self.assertTrue(e2[1] == six.b('Disk database file {} already exists.'.format(db_filename)))
-
- os.remove(db_filename)
-
-
-class BasicTests(AbstractQTestCase):
-
- def test_basic_aggregation(self):
- retcode, o, e = run_command(
- 'seq 1 10 | ' + Q_EXECUTABLE + ' "select sum(c1),avg(c1) from -"')
- self.assertTrue(retcode == 0)
- self.assertTrue(len(o) == 1)
- self.assertTrue(len(e) == 0)
-
- s = sum(range(1, 11))
- self.assertTrue(o[0] == six.b('%s %s' % (s, s / 10.0)))
-
- def test_select_one_column(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
-
- cmd = Q_EXECUTABLE + ' -d , "select c1 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(six.b(" ").join(o), six.b('a b c'))
-
- self.cleanup(tmpfile)
-
- def test_column_separation(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], sample_data_rows[0])
- self.assertEqual(o[1], sample_data_rows[1])
- self.assertEqual(o[2], sample_data_rows[2])
-
- self.cleanup(tmpfile)
-
- def test_header_exception_on_numeric_header_data(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select * from %s" -A -H' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 3)
- self.assertTrue(
- six.b('Bad header row: Header must contain only strings') in e[0])
- self.assertTrue(six.b("Column name must be a string") in e[1])
- self.assertTrue(six.b("Column name must be a string") in e[2])
-
- self.cleanup(tmpfile)
-
- def test_different_header_in_second_file(self):
- folder_name = self.create_folder_with_files({
- 'file1': self.arrays_to_csv_file_content(six.b(','),[six.b('a'),six.b('b')],[[six.b(str(x)),six.b(str(x))] for x in range(1,6)]),
- 'file2': self.arrays_to_csv_file_content(six.b(','),[six.b('c'),six.b('d')],[[six.b(str(x)),six.b(str(x))] for x in range(1,6)])
- },prefix="xx",suffix="aa")
-
- cmd = Q_EXECUTABLE + ' -d , "select * from %s/*" -H' % (folder_name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 35)
- self.assertEqual(len(e),1)
- self.assertEqual(e[0],six.b("Bad header row: Extra header 'c,d' in file '%s/file2' mismatches original header 'a,b' from file '%s/file1'. Table name is '%s/*'" % (folder_name,folder_name,folder_name)))
-
- def test_data_with_header(self):
- tmpfile = self.create_file_with_data(sample_data_with_header)
- cmd = Q_EXECUTABLE + ' -d , "select name from %s" -H' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(six.b(" ").join(o), six.b("a b c"))
-
- self.cleanup(tmpfile)
-
- def test_output_header_when_input_header_exists(self):
- tmpfile = self.create_file_with_data(sample_data_with_header)
- cmd = Q_EXECUTABLE + ' -d , "select name from %s" -H -O' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 4)
- self.assertEqual(o[0],six.b('name'))
- self.assertEqual(o[1],six.b('a'))
- self.assertEqual(o[2],six.b('b'))
- self.assertEqual(o[3],six.b('c'))
-
- self.cleanup(tmpfile)
-
- def test_generated_column_name_warning_when_header_line_exists(self):
- tmpfile = self.create_file_with_data(sample_data_with_header)
- cmd = Q_EXECUTABLE + ' -d , "select c3 from %s" -H' % tmpfile.name
-
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 2)
- self.assertTrue(six.b('no such column: c3') in e[0])
- self.assertTrue(
- e[1].startswith(six.b('Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names')))
-
- self.cleanup(tmpfile)
-
- def test_empty_data(self):
- tmpfile = self.create_file_with_data(six.b(''))
- cmd = Q_EXECUTABLE + ' -d , "select * from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- self.assertTrue(six.b('Warning - data is empty') in e[0])
-
- self.cleanup(tmpfile)
-
- def test_empty_data_with_header_param(self):
- tmpfile = self.create_file_with_data(six.b(''))
- cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -H' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- m = six.b("Header line is expected but missing in file %s" % tmpfile.name)
- self.assertTrue(m in e[0])
-
- self.cleanup(tmpfile)
-
- def test_one_row_of_data_without_header_param(self):
- tmpfile = self.create_file_with_data(header_row)
- cmd = Q_EXECUTABLE + ' -d , "select c2 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('value1'))
-
- self.cleanup(tmpfile)
-
- def test_one_row_of_data_with_header_param(self):
- tmpfile = self.create_file_with_data(header_row)
- cmd = Q_EXECUTABLE + ' -d , "select name from %s" -H' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- self.assertTrue(six.b('Warning - data is empty') in e[0])
-
- self.cleanup(tmpfile)
-
-    def test_dont_keep_leading_whitespace_in_values(self):
- tmpfile = self.create_file_with_data(sample_data_with_spaces_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select c1 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 3)
-
- self.assertEqual(o[0], six.b('a'))
- self.assertEqual(o[1], six.b('b'))
- self.assertEqual(o[2], six.b('c'))
-
- self.cleanup(tmpfile)
-
- def test_keep_leading_whitespace_in_values(self):
- tmpfile = self.create_file_with_data(sample_data_with_spaces_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -k' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 3)
-
- self.assertEqual(o[0], six.b('a'))
- self.assertEqual(o[1], six.b(' b'))
- self.assertEqual(o[2], six.b('c'))
-
- self.cleanup(tmpfile)
-
- def test_no_impact_of_keeping_leading_whitespace_on_integers(self):
- tmpfile = self.create_file_with_data(sample_data_with_spaces_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select c2 from %s" -k -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 7)
-
-
- self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1], six.b(' Sources:'))
- self.assertEqual(o[2], six.b(' source_type: file source: %s') % six.b(tmpfile.name))
- self.assertEqual(o[3], six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `c1` - text'))
- self.assertEqual(o[5], six.b(' `c2` - int'))
- self.assertEqual(o[6], six.b(' `c3` - int'))
-
-
- self.cleanup(tmpfile)
-
- def test_spaces_in_header_row(self):
- tmpfile = self.create_file_with_data(
- header_row_with_spaces + six.b("\n") + sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select name,\\`value 1\\` from %s" -H' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 3)
-
- self.assertEqual(o[0], six.b('a,1'))
- self.assertEqual(o[1], six.b('b,2'))
- self.assertEqual(o[2], six.b('c,'))
-
- self.cleanup(tmpfile)
-
- def test_no_query_in_command_line(self):
- cmd = Q_EXECUTABLE + ' -d , ""'
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 1)
- self.assertEqual(len(e), 1)
- self.assertEqual(len(o), 0)
-
- self.assertEqual(e[0],six.b('Query cannot be empty (query number 1)'))
-
- def test_empty_query_in_command_line(self):
- cmd = Q_EXECUTABLE + ' -d , " "'
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 1)
- self.assertEqual(len(e), 1)
- self.assertEqual(len(o), 0)
-
- self.assertEqual(e[0],six.b('Query cannot be empty (query number 1)'))
-
- def test_failure_in_query_stops_processing_queries(self):
- cmd = Q_EXECUTABLE + ' -d , "select 500" "select 300" "wrong-query" "select 8000"'
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 1)
- self.assertEqual(len(e), 1)
- self.assertEqual(len(o), 2)
- self.assertEqual(o[0],six.b('500'))
- self.assertEqual(o[1],six.b('300'))
-
- def test_multiple_queries_in_command_line(self):
- cmd = Q_EXECUTABLE + ' -d , "select 500" "select 300+100" "select 300" "select 200"'
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 4)
-
- self.assertEqual(o[0],six.b('500'))
- self.assertEqual(o[1],six.b('400'))
- self.assertEqual(o[2],six.b('300'))
- self.assertEqual(o[3],six.b('200'))
-
- def test_literal_calculation_query(self):
- cmd = Q_EXECUTABLE + ' -d , "select 1+40/6"'
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 1)
-
- self.assertEqual(o[0],six.b('7'))
-
- def test_literal_calculation_query_float_result(self):
- cmd = Q_EXECUTABLE + ' -d , "select 1+40/6.0"'
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 1)
-
- self.assertEqual(o[0],six.b('7.666666666666667'))
-
- def test_use_query_file(self):
- tmp_data_file = self.create_file_with_data(sample_data_with_header)
- tmp_query_file = self.create_file_with_data(six.b("select name from %s" % tmp_data_file.name))
-
- cmd = Q_EXECUTABLE + ' -d , -q %s -H' % tmp_query_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 3)
-
- self.assertEqual(o[0], six.b('a'))
- self.assertEqual(o[1], six.b('b'))
- self.assertEqual(o[2], six.b('c'))
-
- self.cleanup(tmp_data_file)
- self.cleanup(tmp_query_file)
-
- def test_use_query_file_with_incorrect_query_encoding(self):
- tmp_data_file = self.create_file_with_data(sample_data_with_header)
- tmp_query_file = self.create_file_with_data(six.b("select name,'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)
-
- cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q ascii' % tmp_query_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,3)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
-
- self.assertTrue(e[0].startswith(six.b('Could not decode query number 1 using the provided query encoding (ascii)')))
-
- self.cleanup(tmp_data_file)
- self.cleanup(tmp_query_file)
-
- def test_output_header_with_non_ascii_names(self):
- OUTPUT_ENCODING = 'utf-8'
-
- tmp_data_file = self.create_file_with_data(sample_data_with_header)
- tmp_query_file = self.create_file_with_data(six.b("select name,'Hr\xc3\xa1\xc4\x8d' Hr\xc3\xa1\xc4\x8d from %s" % tmp_data_file.name),encoding=None)
-
- cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -O -E %s' % (tmp_query_file.name,OUTPUT_ENCODING)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),4)
- self.assertEqual(len(e),0)
-
- self.assertEqual(o[0].decode(OUTPUT_ENCODING), u'name,Hr\xe1\u010d')
- self.assertEqual(o[1].decode(OUTPUT_ENCODING), u'a,Hr\xe1\u010d')
- self.assertEqual(o[2].decode(OUTPUT_ENCODING), u'b,Hr\xe1\u010d')
- self.assertEqual(o[3].decode(OUTPUT_ENCODING), u'c,Hr\xe1\u010d')
-
- self.cleanup(tmp_data_file)
- self.cleanup(tmp_query_file)
-
- def test_use_query_file_with_query_encoding(self):
- OUTPUT_ENCODING = 'utf-8'
-
- tmp_data_file = self.create_file_with_data(sample_data_with_header)
- tmp_query_file = self.create_file_with_data(six.b("select name,'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)
-
- cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -E %s' % (tmp_query_file.name,OUTPUT_ENCODING)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 3)
-
- self.assertEqual(o[0].decode(OUTPUT_ENCODING), u'a,Hr\xe1\u010d')
- self.assertEqual(o[1].decode(OUTPUT_ENCODING), u'b,Hr\xe1\u010d')
- self.assertEqual(o[2].decode(OUTPUT_ENCODING), u'c,Hr\xe1\u010d')
-
- self.cleanup(tmp_data_file)
- self.cleanup(tmp_query_file)
-
- def test_use_query_file_and_command_line(self):
- tmp_data_file = self.create_file_with_data(sample_data_with_header)
- tmp_query_file = self.create_file_with_data(six.b("select name from %s" % tmp_data_file.name))
-
- cmd = Q_EXECUTABLE + ' -d , -q %s -H "select * from ppp"' % tmp_query_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 1)
- self.assertEqual(len(e), 1)
- self.assertEqual(len(o), 0)
-
- self.assertTrue(e[0].startswith(six.b("Can't provide both a query file and a query on the command line")))
-
- self.cleanup(tmp_data_file)
- self.cleanup(tmp_query_file)
-
- def test_select_output_encoding(self):
- tmp_data_file = self.create_file_with_data(sample_data_with_header)
- tmp_query_file = self.create_file_with_data(six.b("select 'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)
-
- for target_encoding in ['utf-8','ibm852']:
- cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -E %s' % (tmp_query_file.name,target_encoding)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 3)
-
- self.assertEqual(o[0].decode(target_encoding), u'Hr\xe1\u010d')
- self.assertEqual(o[1].decode(target_encoding), u'Hr\xe1\u010d')
- self.assertEqual(o[2].decode(target_encoding), u'Hr\xe1\u010d')
-
- self.cleanup(tmp_data_file)
- self.cleanup(tmp_query_file)
-
- def test_select_failed_output_encoding(self):
- tmp_data_file = self.create_file_with_data(sample_data_with_header)
- tmp_query_file = self.create_file_with_data(six.b("select 'Hr\xc3\xa1\xc4\x8d' from %s" % tmp_data_file.name),encoding=None)
-
- cmd = Q_EXECUTABLE + ' -d , -q %s -H -Q utf-8 -E ascii' % tmp_query_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 3)
- self.assertEqual(len(e), 1)
- self.assertEqual(len(o), 0)
-
- self.assertTrue(e[0].startswith(six.b('Cannot encode data')))
-
- self.cleanup(tmp_data_file)
- self.cleanup(tmp_query_file)
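-
-    # The two tests above match how these encodings actually behave:
-    #
-    #   >>> u'Hr\xe1\u010d'.encode('utf-8')   # ok
-    #   >>> u'Hr\xe1\u010d'.encode('ibm852')  # ok - both characters exist in cp852
-    #   >>> u'Hr\xe1\u010d'.encode('ascii')   # raises UnicodeEncodeError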
-
-
- def test_use_query_file_with_empty_query(self):
- tmp_query_file = self.create_file_with_data(six.b(" "))
-
- cmd = Q_EXECUTABLE + ' -d , -q %s -H' % tmp_query_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 1)
- self.assertEqual(len(e), 1)
- self.assertEqual(len(o), 0)
-
- self.assertTrue(e[0].startswith(six.b("Query cannot be empty")))
-
- self.cleanup(tmp_query_file)
-
- def test_use_non_existent_query_file(self):
- cmd = Q_EXECUTABLE + ' -d , -q non-existent-query-file -H'
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 1)
- self.assertEqual(len(e), 1)
- self.assertEqual(len(o), 0)
-
- self.assertTrue(e[0].startswith(six.b("Could not read query from file")))
-
- def test_nonexistent_file(self):
- cmd = Q_EXECUTABLE + ' "select * from non-existent-file"'
-
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode,0)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
-
- self.assertEqual(e[0],six.b("No files matching '%s/non-existent-file' have been found" % os.getcwd()))
-
- def test_default_column_max_length_parameter__short_enough(self):
-        # Keep the text as str here - formatting a bytes object into a str would
-        # embed its b'...' repr in the data
-        huge_text = "x" * 131000
-
-        file_data = six.b("a,b,c\n1,{},3\n".format(huge_text))
-
- tmpfile = self.create_file_with_data(file_data)
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b('1'))
-
- self.cleanup(tmpfile)
-
- def test_default_column_max_length_parameter__too_long(self):
-        huge_text = "x" * 132000
-
-        file_data = six.b("a,b,c\n1,{},3\n".format(huge_text))
-
- tmpfile = self.create_file_with_data(file_data)
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 31)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- self.assertTrue(e[0].startswith(six.b("Column length is larger than the maximum")))
- self.assertTrue(six.b("Offending file is '{}'".format(tmpfile.name)) in e[0])
- self.assertTrue(six.b('Line is 2') in e[0])
-
- self.cleanup(tmpfile)
-
- def test_column_max_length_parameter(self):
- file_data = six.b("a,b,c\nvery-long-text,2,3\n")
- tmpfile = self.create_file_with_data(file_data)
-
- cmd = Q_EXECUTABLE + ' -H -d , -M 3 "select a from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 31)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- self.assertTrue(e[0].startswith(six.b("Column length is larger than the maximum")))
- self.assertTrue((six.b("Offending file is '%s'" % tmpfile.name)) in e[0])
- self.assertTrue(six.b('Line is 2') in e[0])
-
-        cmd2 = Q_EXECUTABLE + ' -H -d , -M 300 "select a from %s"' % tmpfile.name
- retcode2, o2, e2 = run_command(cmd2)
-
- self.assertEqual(retcode2, 0)
- self.assertEqual(len(o2), 1)
- self.assertEqual(len(e2), 0)
-
- self.assertEqual(o2[0],six.b('very-long-text'))
-
- self.cleanup(tmpfile)
-
- def test_invalid_column_max_length_parameter(self):
- file_data = six.b("a,b,c\nvery-long-text,2,3\n")
- tmpfile = self.create_file_with_data(file_data)
-
- cmd = Q_EXECUTABLE + ' -H -d , -M xx "select a from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 31)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- self.assertEqual(e[0],six.b('Max column length limit must be an integer larger than 2 (xx)'))
-
- self.cleanup(tmpfile)
-
- def test_duplicate_column_name_detection(self):
- file_data = six.b("a,b,a\n10,20,30\n30,40,50")
- tmpfile = self.create_file_with_data(file_data)
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 35)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 2)
-
- self.assertTrue(e[0].startswith(six.b('Bad header row:')))
- self.assertEqual(e[1],six.b("'a': Column name is duplicated"))
-
- self.cleanup(tmpfile)
-
- def test_join_with_stdin(self):
- x = [six.b(a) for a in map(str,range(1,101))]
- large_file_data = six.b("val\n") + six.b("\n").join(x)
- tmpfile = self.create_file_with_data(large_file_data)
-
- cmd = '(echo id ; seq 1 2 10) | %s -c 1 -H -O "select stdin.*,f.* from - stdin left join %s f on (stdin.id * 10 = f.val)"' % (Q_EXECUTABLE,tmpfile.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 6)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b('id val'))
- self.assertEqual(o[1],six.b('1 10'))
- self.assertEqual(o[2],six.b('3 30'))
- self.assertEqual(o[3],six.b('5 50'))
- self.assertEqual(o[4],six.b('7 70'))
- self.assertEqual(o[5],six.b('9 90'))
-
- self.cleanup(tmpfile)
-
- def test_concatenated_files(self):
- file_data1 = six.b("a,b,c\n10,11,12\n20,21,22")
- tmpfile1 = self.create_file_with_data(file_data1)
- tmpfile1_folder = os.path.dirname(tmpfile1.name)
- tmpfile1_filename = os.path.basename(tmpfile1.name)
- expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
-
- file_data2 = six.b("a,b,c\n30,31,32\n40,41,42")
- tmpfile2 = self.create_file_with_data(file_data2)
- tmpfile2_folder = os.path.dirname(tmpfile2.name)
- tmpfile2_filename = os.path.basename(tmpfile2.name)
- expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -O -H -d , "select * from %s UNION ALL select * from %s" -C none' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 5)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('a,b,c'))
- self.assertEqual(o[1],six.b('10,11,12'))
- self.assertEqual(o[2],six.b('20,21,22'))
- self.assertEqual(o[3],six.b('30,31,32'))
- self.assertEqual(o[4],six.b('40,41,42'))
-
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
- def test_out_of_range_expected_column_count(self):
- cmd = '%s "select count(*) from some_table" -c -1' % Q_EXECUTABLE
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 90)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0], six.b('Column count must be between 1 and 131072'))
-
- def test_out_of_range_expected_column_count__with_explicit_limit(self):
- cmd = '%s "select count(*) from some_table" -c -1 -M 100' % Q_EXECUTABLE
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 90)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0], six.b('Column count must be between 1 and 100'))
-
- def test_other_out_of_range_expected_column_count__with_explicit_limit(self):
- cmd = '%s "select count(*) from some_table" -c 101 -M 100' % Q_EXECUTABLE
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 90)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0], six.b('Column count must be between 1 and 100'))
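-
-    # (As the three tests above show, an explicit -M value also caps the allowed
-    # -c column count, replacing the default bound of 131072.)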
-
- def test_explicit_limit_of_columns__data_is_ok(self):
- file_data1 = six.b("191\n192\n")
- tmpfile1 = self.create_file_with_data(file_data1)
-
- cmd = '%s "select count(*) from %s" -c 1 -M 3' % (Q_EXECUTABLE,tmpfile1.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], six.b('2'))
-
- self.cleanup(tmpfile1)
-
-class ManyOpenFilesTests(AbstractQTestCase):
-
-
- def test_multi_file_header_skipping(self):
- BATCH_SIZE = 50
- FILE_COUNT = 5
-
- numbers = list(range(1,1+BATCH_SIZE*FILE_COUNT))
- numbers_as_text = batch([str(x) for x in numbers],n=BATCH_SIZE)
-
- content_list = list(map(six.b,['a\n' + "\n".join(x)+'\n' for x in numbers_as_text]))
-
- filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d,'split-files','multi-header')
-
- cmd = '%s -d , -H -c 1 "select count(a),sum(a) from %s/*" -C none' % (Q_EXECUTABLE,tmpfolder)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b("%s,%s" % (BATCH_SIZE*FILE_COUNT,sum(numbers))))
-
- self.cleanup_folder(tmpfolder)
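-
-    # (batch() is a helper assumed to be defined earlier in this file; it chunks
-    # a list into fixed-size groups, roughly:
-    #
-    #   def batch(iterable, n=1):
-    #       for i in range(0, len(iterable), n):
-    #           yield iterable[i:i + n]
-    # )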
-
- def test_that_globs_dont_max_out_sqlite_attached_database_limits(self):
- BATCH_SIZE = 50
- FILE_COUNT = 40
-
- numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
-
- content_list = map(six.b,["\n".join(x)+'\n' for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
-
- cmd = 'cd %s && %s -c 1 "select count(*) from *" -C none --max-attached-sqlite-databases=10' % (tmpfolder,Q_EXECUTABLE)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b(str(BATCH_SIZE*FILE_COUNT)))
-
- self.cleanup_folder(tmpfolder)
-
- def test_maxing_out_max_attached_database_limits__regular_files(self):
- BATCH_SIZE = 50
- FILE_COUNT = 40
-
- numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
-
- content_list = map(six.b,["\n".join(x)+'\n' for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
-
- unioned_subquery = " UNION ALL ".join(["select * from %s/%s" % (tmpfolder,filename) for filename in filename_list])
- cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C none --max-attached-sqlite-databases=10' % (tmpfolder,Q_EXECUTABLE,unioned_subquery)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b(str(BATCH_SIZE*FILE_COUNT)))
-
- self.cleanup_folder(tmpfolder)
-
- def test_maxing_out_max_attached_database_limits__with_qsql_files_below_attached_limit(self):
- MAX_ATTACHED_SQLITE_DATABASES = 10
-
- BATCH_SIZE = 50
- FILE_COUNT = MAX_ATTACHED_SQLITE_DATABASES - 1
-
- numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
-
- content_list = map(six.b,["\n".join(x)+'\n' for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
-
- # Execute the query with -C readwrite, so all qsql files will be created
- unioned_subquery = " UNION ALL ".join(["select * from %s/%s" % (tmpfolder,filename) for filename in filename_list])
- cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite --max-attached-sqlite-databases=%s' % (tmpfolder,Q_EXECUTABLE,unioned_subquery,MAX_ATTACHED_SQLITE_DATABASES)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b(str(BATCH_SIZE*FILE_COUNT)))
-
- # Now execute the same query with -C readwrite, so all files will be read directly from the qsql files
- cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite' % (tmpfolder,Q_EXECUTABLE,unioned_subquery)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b(str(BATCH_SIZE*FILE_COUNT)))
-
- self.cleanup_folder(tmpfolder)
-
- def test_maxing_out_max_attached_database_limits__with_qsql_files_above_attached_limit(self):
- MAX_ATTACHED_SQLITE_DATABASES = 10
-
- BATCH_SIZE = 50
-        # The difference from test_maxing_out_max_attached_database_limits__with_qsql_files_below_attached_limit
-        # is that here we try to cache twice as many files as the number of databases
-        # that can be attached. The expectation is that only some of the files will be cached
- FILE_COUNT = MAX_ATTACHED_SQLITE_DATABASES * 2
-
- numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
-
- content_list = map(six.b,["\n".join(x)+'\n' for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
-
- # Execute the query with -C readwrite, so all qsql files will be created
- unioned_subquery = " UNION ALL ".join(["select * from %s/%s" % (tmpfolder,filename) for filename in filename_list])
- cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite --max-attached-sqlite-databases=%s' % (tmpfolder,Q_EXECUTABLE,unioned_subquery,MAX_ATTACHED_SQLITE_DATABASES)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b(str(BATCH_SIZE*FILE_COUNT)))
-
- # Now execute the same query with -C readwrite, so all files will be read directly from the qsql files
- cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite' % (tmpfolder,Q_EXECUTABLE,unioned_subquery)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b(str(BATCH_SIZE*FILE_COUNT)))
-
- from glob import glob
- files_in_folder = [os.path.basename(x) for x in glob('%s/*' % (tmpfolder))]
-
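-        # Only MAX_ATTACHED_SQLITE_DATABASES-2 of the source files are expected to
-        # have been cached as qsql files - presumably two attach slots are reserved
-        # by q itself (this offset is inferred from the observed behavior, not a
-        # documented guarantee)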
- expected_files_in_folder = filename_list + list(map(lambda x: 'file-%s.qsql' % x,range(MAX_ATTACHED_SQLITE_DATABASES-2)))
-
- self.assertEqual(sorted(files_in_folder),sorted(expected_files_in_folder))
-
- self.cleanup_folder(tmpfolder)
-
- def test_maxing_out_max_attached_database_limits__with_directly_using_qsql_files(self):
- MAX_ATTACHED_SQLITE_DATABASES = 10
-
- BATCH_SIZE = 50
- FILE_COUNT = MAX_ATTACHED_SQLITE_DATABASES * 2
-
- numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
-
- content_list = map(six.b,["\n".join(x)+'\n' for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
-
- # Prepare qsql for each of the files (separately, just for simplicity)
- for fn in filename_list:
- cmd = 'cd %s && %s -c 1 "select count(*) from %s" -C readwrite' % (tmpfolder,Q_EXECUTABLE,fn)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- # Now execute a big query which uses the created qsql files
- unioned_subquery = " UNION ALL ".join(["select * from %s/%s.qsql" % (tmpfolder,filename) for filename in filename_list])
-
- cmd = 'cd %s && %s -c 1 "select count(*) from (%s)" -C readwrite' % (tmpfolder,Q_EXECUTABLE,unioned_subquery)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b(str(BATCH_SIZE*FILE_COUNT)))
-
- self.cleanup_folder(tmpfolder)
-
- def test_too_many_open_files_for_one_table(self):
-        # Previously, files were opened in parallel, which caused too-many-open-files errors
-
- MAX_ALLOWED_FILES = 500
-
- BATCH_SIZE = 2
- FILE_COUNT = MAX_ALLOWED_FILES + 1
-
- numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
-
- content_list = map(six.b,["\n".join(x) for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
-
- cmd = 'cd %s && %s -c 1 "select count(*) from * where 1 = 1 or c1 != 2" -C none' % (tmpfolder,Q_EXECUTABLE)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 82)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- x = six.b('Maximum source files for table must be %s. Table is name is %s/* Number of actual files is %s' % (MAX_ALLOWED_FILES,os.path.realpath(tmpfolder),FILE_COUNT))
- self.assertEqual(e[0],x)
-
- self.cleanup_folder(tmpfolder)
-
- def test_many_open_files_for_one_table(self):
-        # Previously, files were opened in parallel, which caused too-many-open-files errors
-
- BATCH_SIZE = 2
- FILE_COUNT = 500
-
- numbers_as_text = batch([str(x) for x in range(1,1+BATCH_SIZE*FILE_COUNT)],n=BATCH_SIZE)
-
- content_list = map(six.b,["\n".join(x) for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x,range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder = self.create_folder_with_files(d,'split-files','attach-limit')
-
- cmd = 'cd %s && %s -c 1 "select count(*) from * where 1 = 1 or c1 != 2" -C none' % (tmpfolder,Q_EXECUTABLE)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b(str(BATCH_SIZE*FILE_COUNT)))
-
- self.cleanup_folder(tmpfolder)
-
- def test_many_open_files_for_two_tables(self):
- BATCH_SIZE = 2
- FILE_COUNT = 500
-
- numbers_as_text = batch([str(x) for x in range(1, 1 + BATCH_SIZE * FILE_COUNT)], n=BATCH_SIZE)
-
- content_list = map(six.b, ["\n".join(x) for x in numbers_as_text])
-
- filename_list = list(map(lambda x: 'file-%s' % x, range(FILE_COUNT)))
- d = collections.OrderedDict(zip(filename_list, content_list))
-
- tmpfolder1 = self.create_folder_with_files(d, 'split-files1', 'blah')
- tmpfolder2 = self.create_folder_with_files(d, 'split-files1', 'blah')
-
- cmd = '%s -c 1 "select count(*) from %s/* a left join %s/* b on (a.c1 = b.c1)" -C none' % (
- Q_EXECUTABLE,
- tmpfolder1,
- tmpfolder2)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b(str(BATCH_SIZE * FILE_COUNT)))
-
- self.cleanup_folder(tmpfolder1)
- self.cleanup_folder(tmpfolder2)
-
-
-class GzippingTests(AbstractQTestCase):
-
- def test_gzipped_file(self):
- tmpfile = self.create_file_with_data(
- six.b('\x1f\x8b\x08\x08\xf2\x18\x12S\x00\x03xxxxxx\x003\xe42\xe22\xe62\xe12\xe52\xe32\xe7\xb2\xe0\xb2\xe424\xe0\x02\x00\xeb\xbf\x8a\x13\x15\x00\x00\x00'))
-
- cmd = Q_EXECUTABLE + ' -z "select sum(c1),avg(c1) from %s"' % tmpfile.name
-
- retcode, o, e = run_command(cmd)
- self.assertTrue(retcode == 0)
- self.assertTrue(len(o) == 1)
- self.assertTrue(len(e) == 0)
-
- s = sum(range(1, 11))
- self.assertTrue(o[0] == six.b('%s %s' % (s, s / 10.0)))
-
- self.cleanup(tmpfile)
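-
-    # (Judging by the assertions above, the gzipped fixture presumably holds the
-    # numbers 1..10, one per line; running zcat on the file should print 1 through 10.)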
-
-
-class DelimiterTests(AbstractQTestCase):
-
-    def test_delimiter_mistake_with_header(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
-
- cmd = Q_EXECUTABLE + ' -d " " "select * from %s" -H' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 2)
-
- self.assertTrue(e[0].startswith(six.b("Bad header row")))
- self.assertTrue(six.b("Column name cannot contain commas") in e[1])
-
- self.cleanup(tmpfile)
-
-    def test_tab_delimiter_parameter(self):
- tmpfile = self.create_file_with_data(
- sample_data_no_header.replace(six.b(","), six.b("\t")))
- cmd = Q_EXECUTABLE + ' -t "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], sample_data_rows[0].replace(six.b(","), six.b("\t")))
- self.assertEqual(o[1], sample_data_rows[1].replace(six.b(","), six.b("\t")))
- self.assertEqual(o[2], sample_data_rows[2].replace(six.b(","), six.b("\t")))
-
- self.cleanup(tmpfile)
-
-    def test_pipe_delimiter_parameter(self):
- tmpfile = self.create_file_with_data(
- sample_data_no_header.replace(six.b(","), six.b("|")))
- cmd = Q_EXECUTABLE + ' -p "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], sample_data_rows[0].replace(six.b(","), six.b("|")))
- self.assertEqual(o[1], sample_data_rows[1].replace(six.b(","), six.b("|")))
- self.assertEqual(o[2], sample_data_rows[2].replace(six.b(","), six.b("|")))
-
- self.cleanup(tmpfile)
-
-    def test_tab_delimiter_parameter__with_manual_override_attempt(self):
- tmpfile = self.create_file_with_data(
- sample_data_no_header.replace(six.b(","), six.b("\t")))
- cmd = Q_EXECUTABLE + ' -t -d , "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 1)
- self.assertEqual(o[0], sample_data_rows[0].replace(six.b(","), six.b("\t")))
- self.assertEqual(o[1], sample_data_rows[1].replace(six.b(","), six.b("\t")))
- self.assertEqual(o[2], sample_data_rows[2].replace(six.b(","), six.b("\t")))
- self.assertEqual(e[0],six.b('Warning: -t parameter overrides -d parameter (,)'))
-
- self.cleanup(tmpfile)
-
-    def test_pipe_delimiter_parameter__with_manual_override_attempt(self):
- tmpfile = self.create_file_with_data(
- sample_data_no_header.replace(six.b(","), six.b("|")))
- cmd = Q_EXECUTABLE + ' -p -d , "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 1)
- self.assertEqual(o[0], sample_data_rows[0].replace(six.b(","), six.b("|")))
- self.assertEqual(o[1], sample_data_rows[1].replace(six.b(","), six.b("|")))
- self.assertEqual(o[2], sample_data_rows[2].replace(six.b(","), six.b("|")))
- self.assertEqual(e[0],six.b('Warning: -p parameter overrides -d parameter (,)'))
-
- self.cleanup(tmpfile)
-
- def test_output_delimiter(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , -D "|" "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], sample_data_rows[0].replace(six.b(","), six.b("|")))
- self.assertEqual(o[1], sample_data_rows[1].replace(six.b(","), six.b("|")))
- self.assertEqual(o[2], sample_data_rows[2].replace(six.b(","), six.b("|")))
-
- self.cleanup(tmpfile)
-
- def test_output_delimiter_tab_parameter(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , -T "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], sample_data_rows[0].replace(six.b(","), six.b("\t")))
- self.assertEqual(o[1], sample_data_rows[1].replace(six.b(","), six.b("\t")))
- self.assertEqual(o[2], sample_data_rows[2].replace(six.b(","), six.b("\t")))
-
- self.cleanup(tmpfile)
-
- def test_output_delimiter_pipe_parameter(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , -P "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], sample_data_rows[0].replace(six.b(","), six.b("|")))
- self.assertEqual(o[1], sample_data_rows[1].replace(six.b(","), six.b("|")))
- self.assertEqual(o[2], sample_data_rows[2].replace(six.b(","), six.b("|")))
-
- self.cleanup(tmpfile)
-
- def test_output_delimiter_tab_parameter__with_manual_override_attempt(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , -T -D "|" "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 1)
-
- self.assertEqual(o[0], sample_data_rows[0].replace(six.b(","), six.b("\t")))
- self.assertEqual(o[1], sample_data_rows[1].replace(six.b(","), six.b("\t")))
- self.assertEqual(o[2], sample_data_rows[2].replace(six.b(","), six.b("\t")))
- self.assertEqual(e[0], six.b('Warning: -T parameter overrides -D parameter (|)'))
-
- self.cleanup(tmpfile)
-
- def test_output_delimiter_pipe_parameter__with_manual_override_attempt(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , -P -D ":" "select c1,c2,c3 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 1)
-
- self.assertEqual(o[0], sample_data_rows[0].replace(six.b(","), six.b("|")))
- self.assertEqual(o[1], sample_data_rows[1].replace(six.b(","), six.b("|")))
- self.assertEqual(o[2], sample_data_rows[2].replace(six.b(","), six.b("|")))
- self.assertEqual(e[0],six.b('Warning: -P parameter overrides -D parameter (:)'))
-
- self.cleanup(tmpfile)
-
-
-class AnalysisTests(AbstractQTestCase):
-
- def test_analyze_result(self):
- d = "\n".join(['%s\t%s\t%s' % (x+1,x+1,x+1) for x in range(100)])
- tmpfile = self.create_file_with_data(six.b(d))
-
- cmd = Q_EXECUTABLE + ' -c 1 "select count(*) from %s" -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 5)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1], six.b(' Sources:'))
- self.assertEqual(o[2], six.b(' source_type: file source: %s' %(tmpfile.name)))
- self.assertEqual(o[3], six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `c1` - text'))
-
- self.cleanup(tmpfile)
-
- def test_analyze_result_with_data_stream(self):
- d = "\n".join(['%s\t%s\t%s' % (x+1,x+1,x+1) for x in range(100)])
- tmpfile = self.create_file_with_data(six.b(d))
-
- cmd = 'cat %s | %s -c 1 "select count(*) from -" -A' % (tmpfile.name,Q_EXECUTABLE)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 5)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('Table: -'))
- self.assertEqual(o[1], six.b(' Sources:'))
- self.assertEqual(o[2], six.b(' source_type: data-stream source: stdin'))
- self.assertEqual(o[3], six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `c1` - text'))
-
- self.cleanup(tmpfile)
-
- def test_column_analysis(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
-
- cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `c1` - text'))
- self.assertEqual(o[5], six.b(' `c2` - int'))
- self.assertEqual(o[6], six.b(' `c3` - int'))
-
- self.cleanup(tmpfile)
-
- def test_column_analysis_with_mixed_ints_and_floats(self):
- tmpfile = self.create_file_with_data(six.b("""planet_id,name,diameter_km,length_of_day_hours\n1000,Earth,12756,24\n2000,Mars,6792,24.7\n3000,Jupiter,142984,9.9"""))
-
- cmd = Q_EXECUTABLE + ' -d , -H "select * from %s" -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),8)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `planet_id` - int'))
- self.assertEqual(o[5], six.b(' `name` - text'))
- self.assertEqual(o[6], six.b(' `diameter_km` - int'))
- self.assertEqual(o[7], six.b(' `length_of_day_hours` - real'))
-
- self.cleanup(tmpfile)
-
- def test_column_analysis_with_mixed_ints_and_floats_and_nulls(self):
- tmpfile = self.create_file_with_data(six.b("""planet_id,name,diameter_km,length_of_day_hours\n1000,Earth,12756,24\n2000,Mars,6792,24.7\n2500,Venus,,\n3000,Jupiter,142984,9.9"""))
-
- cmd = Q_EXECUTABLE + ' -d , -H "select * from %s" -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),8)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `planet_id` - int'))
- self.assertEqual(o[5], six.b(' `name` - text'))
- self.assertEqual(o[6], six.b(' `diameter_km` - int'))
- self.assertEqual(o[7], six.b(' `length_of_day_hours` - real'))
-
- self.cleanup(tmpfile)
-
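-    # Editor's sketch (an assumption mirroring the two tests above, not q's
-    # code): the type-widening rule for columns - a column stays int until a
-    # float appears (widening it to real), any non-numeric value makes it
-    # text, and empty values never widen the type on their own.
-    @staticmethod
-    def _infer_column_type(values):
-        inferred = 'int'
-        for v in values:
-            if v == '':
-                continue  # nulls keep the current inference
-            try:
-                int(v)
-            except ValueError:
-                try:
-                    float(v)
-                    inferred = 'real'
-                except ValueError:
-                    return 'text'
-        return inferred
-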
- def test_column_analysis_no_header(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
-
- cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `c1` - text'))
- self.assertEqual(o[5], six.b(' `c2` - int'))
- self.assertEqual(o[6], six.b(' `c3` - int'))
-
-        self.cleanup(tmpfile)
-
- def test_column_analysis_with_unexpected_header(self):
- tmpfile = self.create_file_with_data(sample_data_with_header)
- cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 7)
- self.assertEqual(len(e), 1)
-
- self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4],six.b(' `c1` - text'))
- self.assertEqual(o[5],six.b(' `c2` - text'))
- self.assertEqual(o[6],six.b(' `c3` - text'))
-
- self.assertEqual(
- e[0], six.b('Warning - There seems to be header line in the file, but -H has not been specified. All fields will be detected as text fields, and the header line will appear as part of the data'))
-
- self.cleanup(tmpfile)
-
- def test_column_analysis_for_spaces_in_header_row(self):
- tmpfile = self.create_file_with_data(
- header_row_with_spaces + six.b("\n") + sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select name,\\`value 1\\` from %s" -H -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 7)
-
- self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `name` - text'))
- self.assertEqual(o[5], six.b(' `value 1` - int'))
- self.assertEqual(o[6], six.b(' `value2` - int'))
-
- self.cleanup(tmpfile)
-
- def test_column_analysis_with_header(self):
- tmpfile = self.create_file_with_data(sample_data_with_header)
- cmd = Q_EXECUTABLE + ' -d , "select c1 from %s" -A -H' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode, 0)
- self.assertEqual(len(o),7)
- self.assertEqual(len(e),2)
- self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `name` - text'))
- self.assertEqual(o[5], six.b(' `value1` - int'))
- self.assertEqual(o[6], six.b(' `value2` - int'))
-
- self.assertEqual(e[0],six.b('query error: no such column: c1'))
- self.assertTrue(e[1].startswith(six.b('Warning - There seems to be a ')))
-
- self.cleanup(tmpfile)
-
-
-class StdInTests(AbstractQTestCase):
-
- def test_stdin_input(self):
- cmd = six.b('printf "%s" | ' + Q_EXECUTABLE + ' -d , "select c1,c2,c3 from -"') % sample_data_no_header
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], sample_data_rows[0])
- self.assertEqual(o[1], sample_data_rows[1])
- self.assertEqual(o[2], sample_data_rows[2])
-
- def test_attempt_to_unzip_stdin(self):
- tmpfile = self.create_file_with_data(
- six.b('\x1f\x8b\x08\x08\xf2\x18\x12S\x00\x03xxxxxx\x003\xe42\xe22\xe62\xe12\xe52\xe32\xe7\xb2\xe0\xb2\xe424\xe0\x02\x00\xeb\xbf\x8a\x13\x15\x00\x00\x00'))
-
- cmd = 'cat %s | ' % tmpfile.name + Q_EXECUTABLE + ' -z "select sum(c1),avg(c1) from -"'
-
- retcode, o, e = run_command(cmd)
-        self.assertNotEqual(retcode, 0)
-        self.assertEqual(len(o), 0)
-        self.assertEqual(len(e), 1)
-
- self.assertEqual(e[0],six.b('Cannot decompress standard input. Pipe the input through zcat in order to decompress.'))
-
- self.cleanup(tmpfile)
-
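-# Editor's sketch (an assumption about the mechanism, not q's code): gzip
-# content is recognizable by its two magic bytes, which is all the stdin check
-# above needs before refusing to decompress a pipe, e.g.
-# looks_gzipped(six.b('\x1f\x8b\x08\x08')) is True.
-def looks_gzipped(first_bytes):
-    return first_bytes[:2] == six.b('\x1f\x8b')
-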
-class QuotingTests(AbstractQTestCase):
- def test_non_quoted_values_in_quoted_data(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
- cmd = Q_EXECUTABLE + ' -d " " "select c1 from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),4)
-
-        self.assertEqual(o[0],six.b('non_quoted'))
-        self.assertEqual(o[1],six.b('control-value-1'))
-        self.assertEqual(o[2],six.b('non-quoted-value'))
-        self.assertEqual(o[3],six.b('control-value-1'))
-
- self.cleanup(tmp_data_file)
-
- def test_regular_quoted_values_in_quoted_data(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
- cmd = Q_EXECUTABLE + ' -d " " "select c2 from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),4)
-
-        self.assertEqual(o[0],six.b('regular_double_quoted'))
-        self.assertEqual(o[1],six.b('control-value-2'))
-        self.assertEqual(o[2],six.b('this is a quoted value'))
-        self.assertEqual(o[3],six.b('control-value-2'))
-
- self.cleanup(tmp_data_file)
-
- def test_double_double_quoted_values_in_quoted_data(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
- cmd = Q_EXECUTABLE + ' -d " " "select c3 from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),4)
-
-        self.assertEqual(o[0],six.b('double_double_quoted'))
-        self.assertEqual(o[1],six.b('control-value-3'))
-        self.assertEqual(o[2],six.b('this is a "double double" quoted value'))
-        self.assertEqual(o[3],six.b('control-value-3'))
-
- self.cleanup(tmp_data_file)
-
- def test_escaped_double_quoted_values_in_quoted_data(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
- cmd = Q_EXECUTABLE + ' -d " " "select c4 from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),4)
-
-        self.assertEqual(o[0],six.b('escaped_double_quoted'))
-        self.assertEqual(o[1],six.b('control-value-4'))
-        self.assertEqual(o[2],six.b('this is an escaped "quoted value"'))
-        self.assertEqual(o[3],six.b('control-value-4'))
-
- self.cleanup(tmp_data_file)
-
- def test_none_input_quoting_mode_in_relaxed_mode(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
- cmd = Q_EXECUTABLE + ' -d " " -m relaxed -D , -w none -W none "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('"quoted,data",23'))
- self.assertEqual(o[1],six.b('unquoted-data,54,'))
-
- self.cleanup(tmp_data_file)
-
- def test_none_input_quoting_mode_in_strict_mode(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
- cmd = Q_EXECUTABLE + ' -d " " -m strict -D , -w none "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode,0)
- self.assertEqual(len(e),1)
- self.assertEqual(len(o),0)
-
- self.assertTrue(e[0].startswith(six.b('Strict mode. Column Count is expected to identical')))
-
- self.cleanup(tmp_data_file)
-
- def test_minimal_input_quoting_mode(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
- cmd = Q_EXECUTABLE + ' -d " " -D , -w minimal "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('quoted data,23'))
- self.assertEqual(o[1],six.b('unquoted-data,54'))
-
- self.cleanup(tmp_data_file)
-
- def test_all_input_quoting_mode(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
- cmd = Q_EXECUTABLE + ' -d " " -D , -w all "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('quoted data,23'))
- self.assertEqual(o[1],six.b('unquoted-data,54'))
-
- self.cleanup(tmp_data_file)
-
- def test_incorrect_input_quoting_mode(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
- cmd = Q_EXECUTABLE + ' -d " " -D , -w unknown_wrapping_mode "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode,0)
- self.assertEqual(len(e),1)
- self.assertEqual(len(o),0)
-
- self.assertTrue(e[0].startswith(six.b('Input quoting mode can only be one of all,minimal,none')))
- self.assertTrue(six.b('unknown_wrapping_mode') in e[0])
-
- self.cleanup(tmp_data_file)
-
- def test_none_output_quoting_mode(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
- cmd = Q_EXECUTABLE + ' -d " " -D , -w all -W none "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('quoted data,23'))
- self.assertEqual(o[1],six.b('unquoted-data,54'))
-
- self.cleanup(tmp_data_file)
-
- def test_minimal_output_quoting_mode__without_need_to_quote_in_output(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
- cmd = Q_EXECUTABLE + ' -d " " -D , -w all -W minimal "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('quoted data,23'))
- self.assertEqual(o[1],six.b('unquoted-data,54'))
-
- self.cleanup(tmp_data_file)
-
- def test_minimal_output_quoting_mode__with_need_to_quote_in_output_due_to_delimiter(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
-        # The output delimiter is set to space, so values that contain a space must be quoted in the output
- cmd = Q_EXECUTABLE + ' -d " " -D " " -w all -W minimal "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('"quoted data" 23'))
- self.assertEqual(o[1],six.b('unquoted-data 54'))
-
- self.cleanup(tmp_data_file)
-
- def test_minimal_output_quoting_mode__with_need_to_quote_in_output_due_to_newline(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2_with_newline)
-
- # Delimiter is set to colon (:), so it will not be inside the data values (this will make sure that the newline is the one causing the quoting)
- cmd = Q_EXECUTABLE + " -d ':' -w all -W minimal \"select c1,c2,replace(c1,'with' || x'0a' || 'a new line inside it','NEWLINE-REMOVED') from %s\"" % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),3)
-
- self.assertEqual(o[0],six.b('"quoted data with'))
- # Notice that the third column here is not quoted, because we replaced the newline with something else
- self.assertEqual(o[1],six.b('a new line inside it":23:quoted data NEWLINE-REMOVED'))
- self.assertEqual(o[2],six.b('unquoted-data:54:unquoted-data'))
-
- self.cleanup(tmp_data_file)
-
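-    # Editor's sketch of the "minimal" output quoting rule that the two tests
-    # above assert (an assumption mirroring csv.QUOTE_MINIMAL, not q's own
-    # writer): a field is wrapped in double quotes only when it contains the
-    # output delimiter, a double quote, or a newline.
-    @staticmethod
-    def _minimally_quote(field, output_delimiter):
-        if any(ch in field for ch in (output_delimiter, '"', '\n')):
-            return '"%s"' % field.replace('"', '""')
-        return field
-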
- def test_nonnumeric_output_quoting_mode(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
- cmd = Q_EXECUTABLE + ' -d " " -D , -w all -W nonnumeric "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('"quoted data",23'))
- self.assertEqual(o[1],six.b('"unquoted-data",54'))
-
- self.cleanup(tmp_data_file)
-
- def test_all_output_quoting_mode(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data2)
-
- cmd = Q_EXECUTABLE + ' -d " " -D , -w all -W all "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('"quoted data","23"'))
- self.assertEqual(o[1],six.b('"unquoted-data","54"'))
-
- self.cleanup(tmp_data_file)
-
- def _internal_test_consistency_of_chaining_output_to_input(self,input_data,input_wrapping_mode,output_wrapping_mode):
-
- tmp_data_file = self.create_file_with_data(input_data)
-
- basic_cmd = Q_EXECUTABLE + ' -w %s -W %s "select * from -"' % (input_wrapping_mode,output_wrapping_mode)
- chained_cmd = 'cat %s | %s | %s | %s' % (tmp_data_file.name,basic_cmd,basic_cmd,basic_cmd)
-
- retcode, o, e = run_command(chained_cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(six.b("\n").join(o),input_data)
-
- self.cleanup(tmp_data_file)
-
- def test_consistency_of_chaining_minimal_wrapping_to_minimal_wrapping(self):
- input_data = six.b('"quoted data" 23\nunquoted-data 54')
- self._internal_test_consistency_of_chaining_output_to_input(input_data,'minimal','minimal')
-
- def test_consistency_of_chaining_all_wrapping_to_all_wrapping(self):
- input_data = six.b('"quoted data" "23"\n"unquoted-data" "54"')
- self._internal_test_consistency_of_chaining_output_to_input(input_data,'all','all')
-
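-    # Editor's note, with a pure-python analogue (illustration, not q's code):
-    # the chaining tests above pass because parsing q's quoted output yields
-    # the original fields again, so piping q's output back into q is a fixed
-    # point. The stdlib csv module has the same round-trip property, e.g.
-    # _csv_round_trip(['a b', 'c']) == ['a b', 'c']:
-    @staticmethod
-    def _csv_round_trip(row):
-        import csv
-        import io
-        buf = io.StringIO()
-        csv.writer(buf, delimiter=' ').writerow(row)
-        return next(csv.reader(io.StringIO(buf.getvalue()), delimiter=' '))
-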
- def test_input_field_quoting_and_data_types_with_encoding(self):
- OUTPUT_ENCODING = 'utf-8'
-
-        # Checks the combination of minimal input field quoting with special characters that need to be decoded.
-        # Both the content and the detected data types are verified.
- data = six.b('111,22.22,"testing text with special characters - citt\xc3\xa0 ",http://somekindofurl.com,12.13.14.15,12.1\n')
- tmp_data_file = self.create_file_with_data(data)
-
- cmd = Q_EXECUTABLE + ' -d , "select * from %s" -E %s' % (tmp_data_file.name,OUTPUT_ENCODING)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),1)
-
- self.assertEqual(o[0].decode('utf-8'),u'111,22.22,testing text with special characters - citt\xe0 ,http://somekindofurl.com,12.13.14.15,12.1')
-
- cmd = Q_EXECUTABLE + ' -d , "select * from %s" -A' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),10)
-
- self.assertEqual(o[0],six.b('Table: %s' % tmp_data_file.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmp_data_file.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4],six.b(' `c1` - int'))
- self.assertEqual(o[5],six.b(' `c2` - real'))
- self.assertEqual(o[6],six.b(' `c3` - text'))
- self.assertEqual(o[7],six.b(' `c4` - text'))
- self.assertEqual(o[8],six.b(' `c5` - text'))
- self.assertEqual(o[9],six.b(' `c6` - real'))
-
- self.cleanup(tmp_data_file)
-
- def test_multiline_double_double_quoted_values_in_quoted_data(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
-        # FIXME Need to convert \x0a to an encoding suitable for the platform running the tests.
- cmd = Q_EXECUTABLE + ' -d " " "select replace(c5,X\'0A\',\'::\') from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),4)
-
-        self.assertEqual(o[0],six.b('multiline_double_double_quoted'))
-        self.assertEqual(o[1],six.b('control-value-5'))
-        self.assertEqual(o[2],six.b('this is a double double quoted "multiline:: value".'))
-        self.assertEqual(o[3],six.b('control-value-5'))
-
- self.cleanup(tmp_data_file)
-
- def test_multiline_escaped_double_quoted_values_in_quoted_data(self):
- tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
-        # FIXME Need to convert \x0a to an encoding suitable for the platform running the tests.
- cmd = Q_EXECUTABLE + ' -d " " "select replace(c6,X\'0A\',\'::\') from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),4)
-
-        self.assertEqual(o[0],six.b('multiline_escaped_double_quoted'))
-        self.assertEqual(o[1],six.b('control-value-6'))
-        self.assertEqual(o[2],six.b('this is an escaped "multiline:: value".'))
-        self.assertEqual(o[3],six.b('control-value-6'))
-
- self.cleanup(tmp_data_file)
-
- def test_disable_double_double_quoted_data_flag__values(self):
-        # This test (and flag) is meant to verify backward compatibility only. It is possible that
-        # this flag will be removed completely in the future
-
- tmp_data_file = self.create_file_with_data(double_double_quoted_data)
-
- cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('double_double_quoted'))
- self.assertEqual(o[1],six.b('this is a quoted value with "double'))
-
- cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting "select c3 from %s" -W none' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b(''))
- self.assertEqual(o[1],six.b('double'))
-
- cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting "select c4 from %s" -W none' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b(''))
- self.assertEqual(o[1],six.b('quotes"""'))
-
- self.cleanup(tmp_data_file)
-
- def test_disable_escaped_double_quoted_data_flag__values(self):
-        # This test (and flag) is meant to verify backward compatibility only. It is possible that
-        # this flag will be removed completely in the future
-
- tmp_data_file = self.create_file_with_data(escaped_double_quoted_data)
-
- cmd = Q_EXECUTABLE + ' -d " " --disable-escaped-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('escaped_double_quoted'))
- self.assertEqual(o[1],six.b('this is a quoted value with \\escaped'))
-
- cmd = Q_EXECUTABLE + ' -d " " --disable-escaped-double-quoting "select c3 from %s" -W none' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b(''))
- self.assertEqual(o[1],six.b('double'))
-
- cmd = Q_EXECUTABLE + ' -d " " --disable-escaped-double-quoting "select c4 from %s" -W none' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b(''))
- self.assertEqual(o[1],six.b('quotes\\""'))
-
- self.cleanup(tmp_data_file)
-
- def test_combined_quoted_data_flags__number_of_columns_detected(self):
-        # This test (and flags) is meant to verify backward compatibility only. It is possible that
-        # these flags will be removed completely in the future
- tmp_data_file = self.create_file_with_data(combined_quoted_data)
-
- cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting --disable-escaped-double-quoting "select * from %s" -A' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- o = o[o.index(six.b(' Fields:'))+1:]
-
- self.assertEqual(len(o),7) # found 7 fields
-
- cmd = Q_EXECUTABLE + ' -d " " --disable-escaped-double-quoting "select * from %s" -A' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- o = o[o.index(six.b(' Fields:'))+1:]
-
- self.assertEqual(len(o),5) # found 5 fields
-
- cmd = Q_EXECUTABLE + ' -d " " --disable-double-double-quoting "select * from %s" -A' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- o = o[o.index(six.b(' Fields:'))+1:]
-
- self.assertEqual(len(o),5) # found 5 fields
-
- cmd = Q_EXECUTABLE + ' -d " " "select * from %s" -A' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- o = o[o.index(six.b(' Fields:'))+1:]
-
- self.assertEqual(len(o),3) # found only 3 fields, which is the correct amount
-
- self.cleanup(tmp_data_file)
-
-
-class EncodingTests(AbstractQTestCase):
-
- def test_utf8_with_bom_encoding(self):
- utf_8_data_with_bom = six.b('\xef\xbb\xbf"typeid","limit","apcost","date","checkpointId"\n"1","2","5","1,2,3,4,5,6,7","3000,3001,3002"\n"2","2","5","1,2,3,4,5,6,7","3003,3004,3005"\n')
- tmp_data_file = self.create_file_with_data(utf_8_data_with_bom,encoding=None)
-
- cmd = Q_EXECUTABLE + ' -d , -H -O -e utf-8-sig "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(e),0)
- self.assertEqual(len(o),3)
-
- self.assertEqual(o[0],six.b('typeid,limit,apcost,date,checkpointId'))
- self.assertEqual(o[1],six.b('1,2,5,"1,2,3,4,5,6,7","3000,3001,3002"'))
- self.assertEqual(o[2],six.b('2,2,5,"1,2,3,4,5,6,7","3003,3004,3005"'))
-
- self.cleanup(tmp_data_file)
-
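-    # Editor's sketch: utf-8-sig is a stock python codec that strips a leading
-    # BOM (b'\xef\xbb\xbf') when present, which is what the -e utf-8-sig flag
-    # above relies on. Illustrative only:
-    @staticmethod
-    def _decode_possibly_bommed(data):
-        return data.decode('utf-8-sig')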
-
-class QrcTests(AbstractQTestCase):
-
- def test_explicit_qrc_filename_not_found(self):
- non_existent_filename = str(uuid.uuid4())
- env_to_inject = { 'QRC_FILENAME': non_existent_filename}
- cmd = Q_EXECUTABLE + ' "select 1"'
- retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)
-
- self.assertEqual(retcode, 244)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-        self.assertEqual(e[0], six.b('QRC_FILENAME env var exists, but cannot find qrc file at %s' % non_existent_filename))
-
- def test_explicit_qrc_filename_that_exists(self):
- tmp_qrc_file = self.create_file_with_data(six.b('''[options]
-output_delimiter=|
-'''))
- env_to_inject = { 'QRC_FILENAME': tmp_qrc_file.name}
- cmd = Q_EXECUTABLE + ' "select 1,2"'
- retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-        self.assertEqual(o[0], six.b('1|2'))
-
- self.cleanup(tmp_qrc_file)
-
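-    # Editor's sketch (the [options] ini layout is shown by the fixture above;
-    # the parsing helper is the editor's, not q's). Python 3's stdlib parser
-    # is assumed here:
-    @staticmethod
-    def _read_qrc_options(qrc_path):
-        import configparser
-        parser = configparser.ConfigParser()
-        parser.read(qrc_path)
-        return dict(parser.items('options'))  # e.g. {'output_delimiter': '|'}
-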
- def test_all_default_options(self):
-        # Create a qrc file that sets every option explicitly, using values that differ from the regular defaults
- tmp_qrc_file = self.create_file_with_data(six.b('''[options]
-analyze_only=True
-beautify=True
-caching_mode=readwrite
-column_count=32
-delimiter=,
-disable_column_type_detection=True
-disable_double_double_quoting=False
-disable_escaped_double_quoting=False
-encoding=ascii
-formatting=xxx
-gzipped=True
-input_quoting_mode=all
-keep_leading_whitespace_in_values=True
-list_user_functions=True
-max_attached_sqlite_databases=888
-max_column_length_limit=8888
-mode=strict
-output_delimiter=|
-output_encoding=utf-8
-output_header=True
-output_quoting_mode=all
-overwrite_qsql=False
-pipe_delimited=True
-pipe_delimited_output=True
-query_encoding=ascii
-query_filename=query-filename
-save_db_to_disk_filename=save-db-to-disk-filename
-skip_header=True
-tab_delimited=True
-tab_delimited_output=true
-verbose=True
-with_universal_newlines=True
-'''))
- env_to_inject = { 'QRC_FILENAME': tmp_qrc_file.name}
- cmd = Q_EXECUTABLE + ' --dump-defaults'
- retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 34)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b('[options]'))
- o = o[1:]
-
- m = {}
- for r in o:
- key,val = r.split(six.b("="),1)
- m[key] = val
-
- self.assertEqual(m[six.b('analyze_only')],six.b('True'))
- self.assertEqual(m[six.b('beautify')],six.b('True'))
- self.assertEqual(m[six.b('caching_mode')],six.b('readwrite'))
- self.assertEqual(m[six.b('column_count')],six.b('32'))
- self.assertEqual(m[six.b('delimiter')],six.b(','))
- self.assertEqual(m[six.b('disable_column_type_detection')],six.b('True'))
- self.assertEqual(m[six.b('disable_double_double_quoting')],six.b('False'))
- self.assertEqual(m[six.b('disable_escaped_double_quoting')],six.b('False'))
- self.assertEqual(m[six.b('encoding')],six.b('ascii'))
- self.assertEqual(m[six.b('formatting')],six.b('xxx'))
- self.assertEqual(m[six.b('gzipped')],six.b('True'))
- self.assertEqual(m[six.b('input_quoting_mode')],six.b('all'))
- self.assertEqual(m[six.b('keep_leading_whitespace_in_values')],six.b('True'))
- self.assertEqual(m[six.b('list_user_functions')],six.b('True'))
- self.assertEqual(m[six.b('max_attached_sqlite_databases')],six.b('888'))
- self.assertEqual(m[six.b('max_column_length_limit')],six.b('8888'))
- self.assertEqual(m[six.b('mode')],six.b('strict'))
- self.assertEqual(m[six.b('output_delimiter')],six.b('|'))
- self.assertEqual(m[six.b('output_encoding')],six.b('utf-8'))
- self.assertEqual(m[six.b('output_header')],six.b('True'))
- self.assertEqual(m[six.b('output_quoting_mode')],six.b('all'))
- self.assertEqual(m[six.b('overwrite_qsql')],six.b('False'))
- self.assertEqual(m[six.b('pipe_delimited')],six.b('True'))
- self.assertEqual(m[six.b('pipe_delimited_output')],six.b('True'))
- self.assertEqual(m[six.b('query_encoding')],six.b('ascii'))
- self.assertEqual(m[six.b('query_filename')],six.b('query-filename'))
- self.assertEqual(m[six.b('save_db_to_disk_filename')],six.b('save-db-to-disk-filename'))
- self.assertEqual(m[six.b('skip_header')],six.b('True'))
- self.assertEqual(m[six.b('tab_delimited')],six.b('True'))
- self.assertEqual(m[six.b('tab_delimited_output')],six.b('True'))
- self.assertEqual(m[six.b('verbose')],six.b('True'))
- self.assertEqual(m[six.b('with_universal_newlines')],six.b('True'))
-
- self.cleanup(tmp_qrc_file)
-
- def test_caching_readwrite_using_qrc_file(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- tmpfile_folder = os.path.dirname(tmpfile.name)
- tmpfile_filename = os.path.basename(tmpfile.name)
- expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -d , "select * from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),3)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('a,1,0'))
- self.assertEqual(o[1],six.b('b,2,0'))
- self.assertEqual(o[2],six.b('c,,0'))
-
- # Ensure default does not create a cache file
-        self.assertFalse(os.path.exists(expected_cache_filename))
-
- tmp_qrc_file = self.create_file_with_data(six.b('''[options]
-caching_mode=readwrite
-'''))
- env_to_inject = { 'QRC_FILENAME': tmp_qrc_file.name}
- cmd = Q_EXECUTABLE + ' -d , "select * from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd, env_to_inject=env_to_inject)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),3)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('a,1,0'))
- self.assertEqual(o[1],six.b('b,2,0'))
- self.assertEqual(o[2],six.b('c,,0'))
-
- # Ensure that qrc file caching is being used and caching is activated (cache file should exist)
- self.assertTrue(os.path.exists(expected_cache_filename))
-
- self.cleanup(tmp_qrc_file)
- self.cleanup(tmpfile)
-
-
-class QsqlUsageTests(AbstractQTestCase):
-
- def test_concatenate_same_qsql_file_with_single_table(self):
- numbers = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
-
- qsql_file_data = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], numbers)
-
- tmpfile = self.create_file_with_data(qsql_file_data,suffix='.qsql')
-
- cmd = Q_EXECUTABLE + ' -t "select count(*) from (select * from %s union all select * from %s)"' % (tmpfile.name,tmpfile.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('20000'))
-
- def test_query_qsql_with_single_table(self):
- numbers = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
-
- qsql_file_data = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], numbers)
-
- tmpfile = self.create_file_with_data(qsql_file_data)
-
- cmd = Q_EXECUTABLE + ' -t "select sum(aa),sum(bb),sum(cc) from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('50005000\t50005000\t50005000'))
-
- def test_query_qsql_with_single_table_with_explicit_non_existent_tablename(self):
- numbers = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
-
- qsql_file_data = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], numbers)
-
- tmpfile = self.create_file_with_data(qsql_file_data)
-
- c = sqlite3.connect(tmpfile.name)
- actual_table_name = c.execute('select temp_table_name from _qcatalog').fetchall()[0][0]
- c.close()
-
- cmd = '%s -t "select sum(aa),sum(bb),sum(cc) from %s:::non-existent"' % (Q_EXECUTABLE,tmpfile.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 84)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
- self.assertEqual(e[0],six.b('Table non-existent could not be found in qsql file %s . Existing table names: %s' % (tmpfile.name,actual_table_name)))
-
- def test_query_qsql_with_single_table_with_explicit_table_name(self):
- numbers = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
-
- qsql_file_data = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], numbers)
-
- tmpfile = self.create_file_with_data(qsql_file_data)
-
- c = sqlite3.connect(tmpfile.name)
- actual_table_name = c.execute('select temp_table_name from _qcatalog').fetchall()[0][0]
- c.close()
-
- cmd = '%s -t "select sum(aa),sum(bb),sum(cc) from %s:::%s"' % (Q_EXECUTABLE,tmpfile.name,actual_table_name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('50005000\t50005000\t50005000'))
-
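-    # Editor's sketch wrapping the catalog lookup used inline above: a qsql
-    # file is a sqlite database whose _qcatalog table lists the stored table
-    # names, which is what the filename:::tablename syntax resolves against.
-    @staticmethod
-    def _qsql_table_names(qsql_filename):
-        c = sqlite3.connect(qsql_filename)
-        try:
-            return [r[0] for r in c.execute('select temp_table_name from _qcatalog')]
-        finally:
-            c.close()
-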
- def test_query_multi_qsql_with_single_table(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- qsql_file_data1 = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], numbers1)
- tmpfile1 = self.create_file_with_data(qsql_file_data1,suffix='.qsql')
-
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
- qsql_file_data2 = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], numbers2)
- tmpfile2 = self.create_file_with_data(qsql_file_data2,suffix='.qsql')
-
- cmd = Q_EXECUTABLE + ' -t "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s small_file left join %s large_file on (large_file.aa == small_file.bb)"' % (tmpfile2.name,tmpfile1.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('55\t55\t55'))
-
- def test_query_concatenated_qsqls_each_with_single_table(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- qsql_file_data1 = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], numbers1)
- tmpfile1 = self.create_file_with_data(qsql_file_data1,suffix='.qsql')
-
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
- qsql_file_data2 = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], numbers2)
- tmpfile2 = self.create_file_with_data(qsql_file_data2,suffix='.qsql')
-
- cmd = Q_EXECUTABLE + ' -t "select sum(aa),sum(bb),sum(cc) from (select * from %s union all select * from %s)"' % (tmpfile2.name,tmpfile1.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('50005055\t50005055\t50005055'))
-
- def test_concatenated_qsql_and_data_stream__column_names_mismatch(self):
- N1 = 10000
- N2 = 100
-
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, N1 + 1)]
- csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('aa'), six.b('bb'), six.b('cc')], numbers1)
- tmpfile1 = self.create_file_with_data(csv_file_data1)
- expected_cache_filename1 = '%s.qsql' % tmpfile1.name
-
- cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- cmd = 'seq 1 %s | %s -c 1 "select count(*) from (select * from %s UNION ALL select * from -)"' % (N2, Q_EXECUTABLE,expected_cache_filename1)
-
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 1)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
- self.assertEqual(e[0],six.b('query error: SELECTs to the left and right of UNION ALL do not have the same number of result columns'))
-
- def test_concatenated_qsql_and_data_stream(self):
- N1 = 10000
- N2 = 100
-
- numbers1 = [[six.b(str(i))] for i in range(1, N1 + 1)]
- csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('c1')], numbers1)
- tmpfile1 = self.create_file_with_data(csv_file_data1)
- expected_cache_filename1 = '%s.qsql' % tmpfile1.name
-
- cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- cmd = 'seq 1 %s | %s -t -c 1 "select count(*),sum(c1) from (select * from %s UNION ALL select * from -)"' % (N2, Q_EXECUTABLE,expected_cache_filename1)
-
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('%s\t%s' % (N1+N2,sum(range(1,N1+1)) + sum(range(1,N2+1)))))
-
- def test_concatenated_qsql_and_data_stream__explicit_table_name(self):
- N1 = 10000
- N2 = 100
-
- numbers1 = [[six.b(str(i))] for i in range(1, N1 + 1)]
- csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('c1')], numbers1)
- tmpfile1 = self.create_file_with_data(csv_file_data1)
- tmpfile1_expected_table_name = os.path.basename(tmpfile1.name)
-
- expected_cache_filename1 = '%s.qsql' % tmpfile1.name
-
- cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- cmd = 'seq 1 %s | %s -t -c 1 "select count(*),sum(c1) from (select * from %s:::%s UNION ALL select * from -)"' % (N2, Q_EXECUTABLE,expected_cache_filename1,tmpfile1_expected_table_name)
-
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('%s\t%s' % (N1+N2,sum(range(1,N1+1)) + sum(range(1,N2+1)))))
-
- def test_write_to_qsql__check_chosen_table_name(self):
- numbers1 = [[six.b(str(i))] for i in range(1, 10001)]
- csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('c1')], numbers1)
- tmpfile1 = self.create_file_with_data(csv_file_data1)
- expected_cache_filename1 = '%s.qsql' % tmpfile1.name
-
- cmd = Q_EXECUTABLE + ' -c 1 -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- c = sqlite3.connect(expected_cache_filename1)
- qcatalog_entries = c.execute('select temp_table_name from _qcatalog').fetchall()
- self.assertEqual(len(qcatalog_entries),1)
- self.assertEqual(qcatalog_entries[0][0],os.path.basename(tmpfile1.name))
-
- def test_concatenated_mixes_qsql_with_single_table_and_csv(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('aa'), six.b('bb'), six.b('cc')], numbers1)
- tmpfile1 = self.create_file_with_data(csv_file_data1)
- expected_cache_filename1 = '%s.qsql' % tmpfile1.name
-
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
- csv_file_data2 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('aa'), six.b('bb'), six.b('cc')], numbers2)
- tmpfile2 = self.create_file_with_data(csv_file_data2)
- expected_cache_filename2 = '%s.qsql' % tmpfile2.name
-
- cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile2.name
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertTrue(os.path.exists(expected_cache_filename2))
-
-        # csv and qsql files prepared. Now test all four combinations
-
- cmd = Q_EXECUTABLE + ' -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s union all select * from %s)"' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),2)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('cnt\tsum_aa\tsum_bb\tsum_cc'))
- self.assertEqual(o[1],six.b('10010\t50005055\t50005055\t50005055'))
-
- cmd = Q_EXECUTABLE + ' -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s union all select * from %s.qsql)"' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),2)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('cnt\tsum_aa\tsum_bb\tsum_cc'))
- self.assertEqual(o[1],six.b('10010\t50005055\t50005055\t50005055'))
-
- cmd = Q_EXECUTABLE + ' -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s.qsql union all select * from %s)"' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),2)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('cnt\tsum_aa\tsum_bb\tsum_cc'))
- self.assertEqual(o[1],six.b('10010\t50005055\t50005055\t50005055'))
-
- cmd = Q_EXECUTABLE + ' -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s.qsql union all select * from %s.qsql)"' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),2)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('cnt\tsum_aa\tsum_bb\tsum_cc'))
- self.assertEqual(o[1],six.b('10010\t50005055\t50005055\t50005055'))
-
- def test_analysis_of_concatenated_mixes_qsql_with_single_table_and_csv(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- csv_file_data1 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('aa'), six.b('bb'), six.b('cc')], numbers1)
- tmpfile1 = self.create_file_with_data(csv_file_data1)
- expected_cache_filename1 = '%s.qsql' % tmpfile1.name
-
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
- csv_file_data2 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('aa'), six.b('bb'), six.b('cc')], numbers2)
- tmpfile2 = self.create_file_with_data(csv_file_data2)
- expected_cache_filename2 = '%s.qsql' % tmpfile2.name
-
- cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- cmd = Q_EXECUTABLE + ' -H -t "select count(*) from %s" -C readwrite' % tmpfile2.name
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertTrue(os.path.exists(expected_cache_filename2))
-
- # csv and qsql files prepared
-
-        # Test function, to be used multiple times below, each time with a different combination
-
- def do_check(caching_mode,
- file1_source_type,file1_table_postfix,file1_postfix,
- file2_source_type,file2_table_postfix,file2_postfix):
- cmd = '%s -C %s -O -H -t "select count(*) cnt,sum(aa) sum_aa,sum(bb) sum_bb,sum(cc) sum_cc from (select * from %s%s UNION ALL select * from %s%s)" -A' % (
- Q_EXECUTABLE,
- caching_mode,
- tmpfile1.name,
- file1_table_postfix,
- tmpfile2.name,
- file2_table_postfix)
-
- retcode, o, e = run_command(cmd)
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),14)
- self.assertEqual(len(e),0)
- self.assertEqual(o, [
- six.b('Table: %s%s' % (tmpfile1.name,file1_table_postfix)),
- six.b(' Sources:'),
- six.b(' source_type: %s source: %s%s' % (file1_source_type,tmpfile1.name,file1_postfix)),
- six.b(' Fields:'),
- six.b(' `aa` - int'),
- six.b(' `bb` - int'),
- six.b(' `cc` - int'),
- six.b('Table: %s%s' % (tmpfile2.name,file2_table_postfix)),
- six.b(' Sources:'),
- six.b(' source_type: %s source: %s%s' % (file2_source_type,tmpfile2.name,file2_postfix)),
- six.b(' Fields:'),
- six.b(' `aa` - int'),
- six.b(' `bb` - int'),
- six.b(' `cc` - int')])
-
-        # Now test *the analysis results* of all four combinations, adding `-C read` so the
-        # qsql will be used. Running with `-C none` would cause the qsql file not to be used even if it exists
-
- do_check(caching_mode='read',
- file1_source_type='qsql-file-with-original',file1_table_postfix='',file1_postfix='.qsql',
- file2_source_type='qsql-file-with-original',file2_table_postfix='',file2_postfix='.qsql')
- do_check('read',
- file1_source_type='qsql-file-with-original',file1_table_postfix='',file1_postfix='.qsql',
- file2_source_type='qsql-file',file2_table_postfix='.qsql',file2_postfix='.qsql')
- do_check('read',
- file1_source_type='qsql-file',file1_table_postfix='.qsql',file1_postfix='.qsql',
- file2_source_type='qsql-file-with-original',file2_table_postfix='',file2_postfix='.qsql')
- do_check('read',
- file1_source_type='qsql-file',file1_table_postfix='.qsql',file1_postfix='.qsql',
- file2_source_type='qsql-file',file2_table_postfix='.qsql',file2_postfix='.qsql')
-
-        # Now test all the combinations again, this time with `-C none`, to make sure that by
-        # default the qsql file is not used, and that -A shows that fact
-
- do_check(caching_mode='none',
- file1_source_type='file-with-unused-qsql',file1_table_postfix='',file1_postfix='',
- file2_source_type='file-with-unused-qsql',file2_table_postfix='',file2_postfix='')
- do_check('none',
- file1_source_type='file-with-unused-qsql',file1_table_postfix='',file1_postfix='',
- file2_source_type='qsql-file',file2_table_postfix='.qsql',file2_postfix='.qsql')
- do_check('none',
- file1_source_type='qsql-file',file1_table_postfix='.qsql',file1_postfix='.qsql',
- file2_source_type='file-with-unused-qsql',file2_table_postfix='',file2_postfix='')
- do_check('none',
- file1_source_type='qsql-file',file1_table_postfix='.qsql',file1_postfix='.qsql',
- file2_source_type='qsql-file',file2_table_postfix='.qsql',file2_postfix='.qsql')
-
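-    # Editor's note: the eight do_check calls above are the cartesian product
-    # of two per-file source kinds under each caching mode. A sketch of the
-    # same enumeration (editor's rewrite, equivalent coverage):
-    @staticmethod
-    def _source_kind_pairs(kinds):
-        import itertools
-        # each kind is a (source_type, table_postfix, file_postfix) tuple
-        return list(itertools.product(kinds, kinds))
-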
- def test_mixed_qsql_with_single_table_and_csv__missing_header_parameter_for_csv(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- qsql_file_data1 = self.arrays_to_qsql_file_content([six.b('aa'), six.b('bb'), six.b('cc')], numbers1)
- tmpfile1 = self.create_file_with_data(qsql_file_data1,suffix='.qsql')
-
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
- csv_file_data2 = self.arrays_to_csv_file_content(six.b('\t'),[six.b('aa'), six.b('bb'), six.b('cc')], numbers2)
- tmpfile2 = self.create_file_with_data(csv_file_data2)
-
- cmd = Q_EXECUTABLE + ' -t "select sum(aa),sum(bb),sum(cc) from (select * from %s union all select * from %s)"' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b('Warning - There seems to be header line in the file, but -H has not been specified. All fields will be detected as text fields, and the header line will appear as part of the data'))
- self.assertEqual(o[0],six.b('50005055.0\t50005055.0\t50005055.0'))
-
- def test_qsql_with_multiple_tables_direct_use(self):
- numbers1 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 10001)]
- qsql_filename1 = self.create_qsql_file_with_content_and_return_filename([six.b('aa'), six.b('bb'), six.b('cc')],numbers1)
- expected_stored_table_name1 = os.path.basename(qsql_filename1)[:-5]
-
- numbers2 = [[six.b(str(i)), six.b(str(i)), six.b(str(i))] for i in range(1, 11)]
- qsql_filename2 = self.create_qsql_file_with_content_and_return_filename([six.b('aa'), six.b('bb'), six.b('cc')],numbers2)
- expected_stored_table_name2 = os.path.basename(qsql_filename2)[:-5]
-
- qsql_with_multiple_tables = self.generate_tmpfile_name(suffix='.qsql')
-
- cmd = '%s -t "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s large_file left join %s small_file on (large_file.aa == small_file.bb)" -S %s' % \
- (Q_EXECUTABLE,qsql_filename1,qsql_filename2,qsql_with_multiple_tables)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 4)
- self.assertEqual(e[0], six.b('Going to save data into a disk database: %s' % qsql_with_multiple_tables))
- self.assertTrue(e[1].startswith(six.b('Data has been saved into %s . Saving has taken' % qsql_with_multiple_tables)))
- self.assertEqual(e[2],six.b('Query to run on the database: select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s large_file left join %s small_file on (large_file.aa == small_file.bb);' % \
- (expected_stored_table_name1,expected_stored_table_name2)))
- self.assertEqual(e[3],six.b('You can run the query directly from the command line using the following command: echo "select sum(large_file.aa),sum(large_file.bb),sum(large_file.cc) from %s large_file left join %s small_file on (large_file.aa == small_file.bb)" | sqlite3 %s' % \
- (expected_stored_table_name1,expected_stored_table_name2,qsql_with_multiple_tables)))
-
- cmd = '%s -d , "select count(*) cnt,sum(aa),sum(bb),sum(cc) from %s:::%s"' % (Q_EXECUTABLE,qsql_with_multiple_tables,expected_stored_table_name1)
- r, o, e = run_command(cmd)
-
- self.assertEqual(r,0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('10000,50005000,50005000,50005000'))
-
- def test_direct_use_of_sqlite_db_with_one_table(self):
- tmpfile = self.create_file_with_data(six.b(''),suffix='.sqlite')
- os.remove(tmpfile.name)
- c = sqlite3.connect(tmpfile.name)
- c.execute(' create table mytable (x int, y int)').fetchall()
- c.execute(' insert into mytable (x,y) values (100,200),(300,400)').fetchall()
- c.commit()
- c.close()
-
- cmd = Q_EXECUTABLE + ' -t "select sum(x),sum(y) from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('400\t600'))
-
- cmd = Q_EXECUTABLE + ' -t "select sum(x),sum(y) from %s:::mytable"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('400\t600'))
-
- def test_direct_use_of_sqlite_db_with_one_table__nonexistent_table(self):
- tmpfile = self.create_file_with_data(six.b(''),suffix='.sqlite')
- os.remove(tmpfile.name)
- c = sqlite3.connect(tmpfile.name)
- c.execute(' create table some_numbers (x int, y int)').fetchall()
- c.execute(' insert into some_numbers (x,y) values (100,200),(300,400)').fetchall()
- c.commit()
- c.close()
-
- cmd = Q_EXECUTABLE + ' -t "select sum(x),sum(y) from %s:::non_existent"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 85)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b('Table non_existent could not be found in sqlite file %s . Existing table names: some_numbers' % (tmpfile.name)))
-
- def test_qsql_creation_and_direct_use(self):
- numbers = [[six.b(str(i)),six.b(str(i)),six.b(str(i))] for i in range(1,10001)]
-
- file_data = self.arrays_to_csv_file_content(six.b('\t'),[six.b('aa'),six.b('bb'),six.b('cc')],numbers)
-
- tmpfile = self.create_file_with_data(file_data)
- tmpfile_folder = os.path.dirname(tmpfile.name)
- tmpfile_filename = os.path.basename(tmpfile.name)
- expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
-
-        cmd = Q_EXECUTABLE + ' -H -t "select sum(aa),sum(bb),sum(cc) from %s" -C readwrite' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('50005000\t50005000\t50005000'))
-
- self.assertTrue(os.path.exists(expected_cache_filename))
-
- self.cleanup(tmpfile)
-
-        # Get the data using a comma delimiter, to make sure that column parsing was done correctly
- cmd = Q_EXECUTABLE + ' -D , "select count(*),sum(aa),sum(bb),sum(cc) from %s"' % expected_cache_filename
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('10000,50005000,50005000,50005000'))
-
- def test_analysis_of_qsql_direct_usage(self):
- numbers = [[six.b(str(i)),six.b(str(i)),six.b(str(i))] for i in range(1,10001)]
-
- file_data = self.arrays_to_csv_file_content(six.b('\t'),[six.b('aa'),six.b('bb'),six.b('cc')],numbers)
-
- tmpfile = self.create_file_with_data(file_data)
- tmpfile_folder = os.path.dirname(tmpfile.name)
- tmpfile_filename = os.path.basename(tmpfile.name)
- expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
-
-        cmd = Q_EXECUTABLE + ' -H -t "select sum(aa),sum(bb),sum(cc) from %s" -C readwrite' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('50005000\t50005000\t50005000'))
-
- self.assertTrue(os.path.exists(expected_cache_filename))
-
- self.cleanup(tmpfile)
-
- cmd = Q_EXECUTABLE + ' "select * from %s" -A' % expected_cache_filename
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 7)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('Table: %s' % expected_cache_filename))
- self.assertEqual(o[1],six.b(" Sources:"))
- self.assertEqual(o[2],six.b(' source_type: qsql-file source: %s' % expected_cache_filename))
- self.assertEqual(o[3],six.b(" Fields:"))
- self.assertEqual(o[4],six.b(' `aa` - int'))
- self.assertEqual(o[5],six.b(' `bb` - int'))
- self.assertEqual(o[6],six.b(' `cc` - int'))
-
- def test_direct_qsql_usage_for_single_table_qsql_file(self):
- disk_db_filename = self.random_tmp_filename('save-to-db','qsql')
-
- cmd = 'seq 1 10000 | %s -t "select sum(aa),sum(bb),sum(cc) from -" -S %s' % (Q_EXECUTABLE,disk_db_filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
-
- cmd = '%s -D, "select count(*),sum(c1) from %s:::data_stream_stdin"' % (Q_EXECUTABLE,disk_db_filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('10000,50005000'))
-
- def test_direct_qsql_usage_for_single_table_qsql_file__nonexistent_table(self):
- disk_db_filename = self.random_tmp_filename('save-to-db','qsql')
-
- cmd = 'seq 1 10000 | %s -t "select sum(aa),sum(bb),sum(cc) from -" -S %s' % (Q_EXECUTABLE,disk_db_filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
-
- cmd = '%s -D, "select count(*),sum(c1) from %s:::unknown_table_name"' % (Q_EXECUTABLE,disk_db_filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 85)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
- self.assertEqual(e[0],six.b('Table unknown_table_name could not be found in sqlite file %s . Existing table names: data_stream_stdin' % (disk_db_filename)))
-
- def test_direct_qsql_usage_from_written_data_stream(self):
- disk_db_filename = self.random_tmp_filename('save-to-db','qsql')
-
- cmd = 'seq 1 10000 | %s -t "select sum(aa),sum(bb),sum(cc) from -" -S %s' % (Q_EXECUTABLE,disk_db_filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
-
- cmd = '%s -D, "select count(*),sum(c1) from %s:::data_stream_stdin"' % (Q_EXECUTABLE,disk_db_filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('10000,50005000'))
-
- def test_direct_qsql_self_join(self):
- disk_db_filename = self.random_tmp_filename('save-to-db','qsql')
-
- N = 100
- cmd = 'seq 1 %s | %s -t "select count(*),sum(c1) from -" -S %s' % (N,Q_EXECUTABLE,disk_db_filename)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
-
- cmd = '%s -D, "select count(*),sum(a.c1),sum(b.c1) from %s:::data_stream_stdin a left join %s:::data_stream_stdin b"' % (Q_EXECUTABLE,disk_db_filename,disk_db_filename)
- retcode, o, e = run_command(cmd)
-
- expected_sum = sum(range(1,N+1))*N
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
- self.assertEqual(o[0],six.b('%s,%s,%s' % (N*N,expected_sum,expected_sum)))
-
-
-class CachingTests(AbstractQTestCase):
-
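- # These tests exercise the -C caching flag: 'none' performs no caching, 'read'
- # only uses an existing <file>.qsql cache, and 'readwrite' creates the cache
- # next to the source file if it is missing. A minimal sketch of the flow being
- # tested (assuming a csv file with a header):
- #
- #   q -H -d , "select a from data.csv" -C readwrite   # creates data.csv.qsql
- #   q -H -d , "select a from data.csv" -C read        # now served from the cache
-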
- def test_cache_empty_file(self):
- file_data = six.b("a,b,c")
- tmpfile = self.create_file_with_data(file_data)
- tmpfile_folder = os.path.dirname(tmpfile.name)
- tmpfile_filename = os.path.basename(tmpfile.name)
- tmpfile_expected_table_name = tmpfile_filename
- expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C none' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b("Warning - data is empty"))
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0],six.b("Warning - data is empty"))
-
- # After readwrite caching has been activated, the cache file is expected to exist
- self.assertTrue(os.path.exists(expected_cache_filename))
-
- # Read the cache file directly, to make sure it's a valid sqlite file
- db = sqlite3.connect(expected_cache_filename)
- table_list = db.execute("select content_signature_key,temp_table_name,content_signature,creation_time,source_type,source from _qcatalog where temp_table_name == '%s'" % (tmpfile_expected_table_name)).fetchall()
- self.assertTrue(len(table_list) == 1)
- table_metadata = table_list[0]
- results = db.execute("select * from %s" % table_metadata[1]).fetchall()
- self.assertTrue(len(results) == 0)
-
- self.cleanup(tmpfile)
-
- def test_reading_the_wrong_cache__original_file_having_different_data(self):
- file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")
-
- tmpfile1 = self.create_file_with_data(file_data1)
- tmpfile1_folder = os.path.dirname(tmpfile1.name)
- tmpfile1_filename = os.path.basename(tmpfile1.name)
- expected_cache_filename = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], six.b('10'))
- self.assertEqual(o[1], six.b('30'))
-
- # Ensure cache has been created
- self.assertTrue(os.path.exists(expected_cache_filename))
-
- # Overwrite the original file
- file_data2 = six.b("a,b,c\n10,20,30\n30,40,50\n50,60,70")
- self.write_file(tmpfile1.name,file_data2)
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 81)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- self.assertEqual(e[0], six.b('%s vs %s.qsql: Content Signatures differ at inferer.rows (actual analysis data differs)' % \
- (tmpfile1.name,tmpfile1.name)))
-
-
- def test_reading_the_wrong_cache__original_file_having_different_delimiter(self):
- file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")
-
- tmpfile1 = self.create_file_with_data(file_data1)
- tmpfile1_folder = os.path.dirname(tmpfile1.name)
- tmpfile1_filename = os.path.basename(tmpfile1.name)
- expected_cache_filename = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], six.b('10'))
- self.assertEqual(o[1], six.b('30'))
-
- # Ensure cache has been created
- self.assertTrue(os.path.exists(expected_cache_filename))
-
- # Overwrite the original file
- file_data2 = six.b("a\tb\tc\n10\t20\t30\n30\t40\t50")
- self.write_file(tmpfile1.name,file_data2)
-
- cmd = Q_EXECUTABLE + ' -H -t "select a from %s" -C read' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 80)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- x = six.b("%s vs %s.qsql: Content Signatures for table %s differ at input_delimiter (source value '\t' disk signature value ',')" % \
- (tmpfile1.name,tmpfile1.name,tmpfile1.name))
- self.assertEqual(e[0], x)
-
- def test_rename_cache_and_read_from_it(self):
- # create a file, along with its qsql
- file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")
-
- tmpfile1 = self.create_file_with_data(file_data1)
- tmpfile1_folder = os.path.dirname(tmpfile1.name)
- tmpfile1_filename = os.path.basename(tmpfile1.name)
- expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], six.b('10'))
- self.assertEqual(o[1], six.b('30'))
- # Ensure cache has been created
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- tmp_fn = self.generate_tmpfile_name("aa","qsql")
- os.rename(expected_cache_filename1,tmp_fn)
-
- cmd = '%s "select a from %s"' % (Q_EXECUTABLE,tmp_fn)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], six.b('10'))
- self.assertEqual(o[1], six.b('30'))
-
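- # Note: the test above relies on a .qsql cache being a self-contained sqlite
- # file - it stays queryable even after being renamed away from its source file.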
-
- def test_reading_the_wrong_cache__qsql_file_not_having_a_matching_content_signature(self):
- # create a file, along with its qsql
- file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")
-
- tmpfile1 = self.create_file_with_data(file_data1)
- tmpfile1_folder = os.path.dirname(tmpfile1.name)
- tmpfile1_filename = os.path.basename(tmpfile1.name)
- expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], six.b('10'))
- self.assertEqual(o[1], six.b('30'))
- # Ensure cache has been created
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- file_data2 = six.b("c,d,e\n10,20,30\n30,40,50")
-
- # create another file with a different header, along with its qsql
- tmpfile2 = self.create_file_with_data(file_data2)
- tmpfile2_folder = os.path.dirname(tmpfile2.name)
- tmpfile2_filename = os.path.basename(tmpfile2.name)
- expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -H -d , "select c from %s" -C readwrite' % tmpfile2.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], six.b('10'))
- self.assertEqual(o[1], six.b('30'))
- # Ensure cache has been created
- self.assertTrue(os.path.exists(expected_cache_filename2))
-
- # Now use the second qsql file as if it were the first. Execution on file 1
- # should fail, since the qsql file does not actually contain the table we're after
-
- os.remove(expected_cache_filename1)
- os.rename(expected_cache_filename2,expected_cache_filename1)
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 80)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
- x = six.b("%s vs %s.qsql: Content Signatures for table %s differ at inferer.header_row (source value '['a', 'b', 'c']' disk signature value '['c', 'd', 'e']')" % (tmpfile1.name,tmpfile1.name,tmpfile1.name))
- self.assertEqual(e[0], x)
-
- def test_reading_the_wrong_cache__qsql_file_not_having_any_content_signature(self):
- # create a file, along with its qsql
- file_data1 = six.b("a,b,c\n10,20,30\n30,40,50")
-
- tmpfile1 = self.create_file_with_data(file_data1)
- tmpfile1_folder = os.path.dirname(tmpfile1.name)
- tmpfile1_filename = os.path.basename(tmpfile1.name)
- expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0], six.b('10'))
- self.assertEqual(o[1], six.b('30'))
- # Ensure cache has been created
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- # delete qcatalog content, so no entries will be available
- c = sqlite3.connect(expected_cache_filename1)
- c.execute('delete from _qcatalog').fetchall()
- c.commit()
- c.close()
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 97)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
- self.assertEqual(e[0],six.b("Could not autodetect table name in qsql file. File contains no record of a table"))
-
-
- def test_cache_full_flow(self):
- file_data = six.b("a,b,c\n10,20,30\n30,40,50")
- tmpfile = self.create_file_with_data(file_data)
- tmpfile_folder = os.path.dirname(tmpfile.name)
- tmpfile_filename = os.path.basename(tmpfile.name)
- expected_tmpfile_table_name = tmpfile_filename
- expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C none' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('10'))
- self.assertEqual(o[1],six.b('30'))
-
- # Ensure cache has not been created
- self.assertTrue(not os.path.exists(expected_cache_filename))
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('10'))
- self.assertEqual(o[1],six.b('30'))
-
- # Ensure cache has not been created, as cache mode is "read" only
- self.assertTrue(not os.path.exists(expected_cache_filename))
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('10'))
- self.assertEqual(o[1],six.b('30'))
-
- # After readwrite caching has been activated, the cache file is expected to exist
- self.assertTrue(os.path.exists(expected_cache_filename))
-
- # Read the cache file directly, to make sure it's a valid sqlite file
- db = sqlite3.connect(expected_cache_filename)
- table_list = db.execute("select content_signature_key,temp_table_name,content_signature,creation_time,source_type,source from _qcatalog where temp_table_name == '%s'" % expected_tmpfile_table_name).fetchall()
- self.assertTrue(len(table_list) == 1)
- table_metadata = table_list[0]
- results = db.execute("select * from %s" % table_metadata[1]).fetchall()
- self.assertEqual(results[0],(10,20,30))
- self.assertEqual(results[1],(30,40,50))
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('10'))
- self.assertEqual(o[1],six.b('30'))
-
- # The cache file is expected to still exist after the read-only run
- self.assertTrue(os.path.exists(expected_cache_filename))
-
- self.cleanup(tmpfile)
-
- def test_cache_full_flow_with_concatenated_files(self):
- file_data1 = six.b("a,b,c\n10,11,12\n20,21,22")
- tmpfile1 = self.create_file_with_data(file_data1)
- tmpfile1_folder = os.path.dirname(tmpfile1.name)
- tmpfile1_filename = os.path.basename(tmpfile1.name)
- expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
-
- file_data2 = six.b("a,b,c\n30,31,32\n40,41,42")
- tmpfile2 = self.create_file_with_data(file_data2)
- tmpfile2_folder = os.path.dirname(tmpfile2.name)
- tmpfile2_filename = os.path.basename(tmpfile2.name)
- expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')
-
- cmd = Q_EXECUTABLE + ' -O -H -d , "select * from (select * from %s UNION ALL select * from %s)" -C readwrite' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 5)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('a,b,c'))
- self.assertEqual(o[1],six.b('10,11,12'))
- self.assertEqual(o[2],six.b('20,21,22'))
- self.assertEqual(o[3],six.b('30,31,32'))
- self.assertEqual(o[4],six.b('40,41,42'))
-
- self.assertTrue(os.path.exists(expected_cache_filename1))
- self.assertTrue(os.path.exists(expected_cache_filename2))
-
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
-
- def test_analyze_result_with_cache_file(self):
- file_data = six.b("a,b,c\n10,20,30\n30,40,50")
- tmpfile = self.create_file_with_data(file_data)
- tmpfile_folder = os.path.dirname(tmpfile.name)
- tmpfile_filename = os.path.basename(tmpfile.name)
- expected_cache_filename = os.path.join(tmpfile_folder,tmpfile_filename + '.qsql')
-
- # Ensure cache has not been created yet
- self.assertTrue(not os.path.exists(expected_cache_filename))
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('10'))
- self.assertEqual(o[1],six.b('30'))
-
- # Ensure cache is now created
- self.assertTrue(os.path.exists(expected_cache_filename))
-
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),7)
- self.assertEqual(len(e),0)
-
- self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: qsql-file-with-original source: %s.qsql' % tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4],six.b(' `a` - int'))
- self.assertEqual(o[5],six.b(' `b` - int'))
- self.assertEqual(o[6],six.b(' `c` - int'))
-
- # delete the newly created cache
- os.remove(expected_cache_filename)
-
- # Now rerun the analysis without the cache file
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C read -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o),7)
- self.assertEqual(len(e),0)
-
- self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4],six.b(' `a` - int'))
- self.assertEqual(o[5],six.b(' `b` - int'))
- self.assertEqual(o[6],six.b(' `c` - int'))
-
- self.cleanup(tmpfile)
-
- def test_partial_caching_exists(self):
- file1_data = six.b("a,b,c\n10,20,30\n30,40,50\n60,70,80")
- tmpfile1 = self.create_file_with_data(file1_data)
- tmpfile1_folder = os.path.dirname(tmpfile1.name)
- tmpfile1_filename = os.path.basename(tmpfile1.name)
- expected_cache_filename1 = os.path.join(tmpfile1_folder,tmpfile1_filename + '.qsql')
-
- file2_data = six.b("b,x\n10,linewith10\n20,linewith20\n30,linewith30\n40,linewith40")
- tmpfile2 = self.create_file_with_data(file2_data)
- tmpfile2_folder = os.path.dirname(tmpfile2.name)
- tmpfile2_filename = os.path.basename(tmpfile2.name)
- expected_cache_filename2 = os.path.join(tmpfile2_folder,tmpfile2_filename + '.qsql')
-
- # Use only first file, and cache
- cmd = Q_EXECUTABLE + ' -H -d , "select a from %s" -C readwrite' % tmpfile1.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('10'))
- self.assertEqual(o[1],six.b('30'))
-
- # Ensure cache has been created for file 1
- self.assertTrue(os.path.exists(expected_cache_filename1))
-
- # Use both files with read caching, one should be read from cache, the other from the file
- cmd = Q_EXECUTABLE + ' -H -d , "select file1.a,file1.b,file1.c,file2.x from %s file1 left join %s file2 on (file1.b = file2.b)" -C read' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('10,20,30,linewith20'))
- self.assertEqual(o[1],six.b('30,40,50,linewith40'))
- self.assertEqual(o[2],six.b('60,70,80,'))
-
- # Ensure cache has NOT been created for file 2
- self.assertTrue(not os.path.exists(expected_cache_filename2))
-
- # Now rerun the query, this time with readwrite caching, so the second file cache will be written
- cmd = Q_EXECUTABLE + ' -H -d , "select file1.a,file1.b,file1.c,file2.x from %s file1 left join %s file2 on (file1.b = file2.b)" -C readwrite' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
- self.assertEqual(o[0],six.b('10,20,30,linewith20'))
- self.assertEqual(o[1],six.b('30,40,50,linewith40'))
- self.assertEqual(o[2],six.b('60,70,80,'))
-
- # Ensure cache has now been created for file 2
- self.assertTrue(os.path.exists(expected_cache_filename2))
-
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
-
-class UserFunctionTests(AbstractQTestCase):
- def test_regexp_int_data_handling(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
-
- cmd = Q_EXECUTABLE + ' -d , "select c2 from %s where regexp(\'^1\',c2)"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b("1"))
-
- self.cleanup(tmpfile)
-
- def test_percentile_func(self):
- cmd = 'seq 1000 1999 | %s "select substr(c1,0,3),percentile(c1,0),percentile(c1,0.5),percentile(c1,1) from - group by substr(c1,0,3)" -c 1' % Q_EXECUTABLE
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 10)
- self.assertEqual(len(e), 0)
-
- output_table = [l.split(six.b(" ")) for l in o]
- group_labels = [int(row[0]) for row in output_table]
- minimum_values = [float(row[1]) for row in output_table]
- median_values = [float(row[2]) for row in output_table]
- max_values = [float(row[3]) for row in output_table]
-
- base_values = list(range(1000,2000,100))
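- # Each group holds 100 consecutive integers starting at its base value, so the
- # expected median is base + (49+50)/2 = base + 49.5 and the maximum is base + 99.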
-
- self.assertEqual(group_labels,list(range(10,20)))
- self.assertEqual(minimum_values,base_values)
- self.assertEqual(median_values,list(map(lambda x: x + 49.5,base_values)))
- self.assertEqual(max_values,list(map(lambda x: x + 99,base_values)))
-
- def test_regexp_null_data_handling(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
-
- cmd = Q_EXECUTABLE + ' -d , "select count(*) from %s where regexp(\'^\',c2)"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b("2"))
-
- self.cleanup(tmpfile)
-
- def test_md5_function(self):
- cmd = 'seq 1 4 | %s -c 1 -d , "select c1,md5(c1,\'utf-8\') from -"' % Q_EXECUTABLE
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),4)
- self.assertEqual(len(e),0)
-
- self.assertEqual(tuple(o[0].split(six.b(','),1)),(six.b('1'),six.b('c4ca4238a0b923820dcc509a6f75849b')))
- self.assertEqual(tuple(o[1].split(six.b(','),1)),(six.b('2'),six.b('c81e728d9d4c2f636f067f89cc14862c')))
- self.assertEqual(tuple(o[2].split(six.b(','),1)),(six.b('3'),six.b('eccbc87e4b5ce2fe28308fd9f2a7baf3')))
- self.assertEqual(tuple(o[3].split(six.b(','),1)),(six.b('4'),six.b('a87ff679a2f3e71d9181a67b7542122c')))
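- # These digests can be double-checked in plain Python, e.g.:
- #   hashlib.md5(b'1').hexdigest() == 'c4ca4238a0b923820dcc509a6f75849b'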
-
- def test_stddev_functions(self):
- tmpfile = self.create_file_with_data(six.b("\n".join(map(str,[234,354,3234,123,4234,234,634,56,65]))))
-
- cmd = '%s -c 1 -d , "select round(stddev_pop(c1),10),round(stddev_sample(c1),10) from %s"' % (Q_EXECUTABLE,tmpfile.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
-
- self.assertEqual(o[0],six.b('1479.7015464838,1569.4604964764'))
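- # stddev_pop divides by N while stddev_sample divides by N-1, which is why the
- # sample value is the larger of the two here.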
-
- self.cleanup(tmpfile)
-
- def test_sqrt_function(self):
- cmd = 'seq 1 5 | %s -c 1 -d , "select round(sqrt(c1),10) from -"' % Q_EXECUTABLE
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),5)
- self.assertEqual(len(e),0)
-
- self.assertEqual(o[0],six.b('1.0'))
- self.assertEqual(o[1],six.b('1.4142135624'))
- self.assertEqual(o[2],six.b('1.7320508076'))
- self.assertEqual(o[3],six.b('2.0'))
- self.assertEqual(o[4],six.b('2.2360679775'))
-
- def test_power_function(self):
- cmd = 'seq 1 5 | %s -c 1 -d , "select round(power(c1,2.5),10) from -"' % Q_EXECUTABLE
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),5)
- self.assertEqual(len(e),0)
-
- self.assertEqual(o[0],six.b('1.0'))
- self.assertEqual(o[1],six.b('5.6568542495'))
- self.assertEqual(o[2],six.b('15.5884572681'))
- self.assertEqual(o[3],six.b('32.0'))
- self.assertEqual(o[4],six.b('55.9016994375'))
-
- def test_file_functions(self):
- filenames = [
- "file1",
- "file2.csv",
- "/var/tmp/file3",
- "/var/tmp/file4.gz",
- ""
- ]
- data = "\n".join(filenames)
-
- cmd = 'echo "%s" | %s -c 1 -d , "select file_folder(c1),file_ext(c1),file_basename(c1),file_basename_no_ext(c1) from -"' % (data,Q_EXECUTABLE)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),5)
- self.assertEqual(len(e),0)
- self.assertEqual(o,[
- b',,file1,file1',
- b',.csv,file2.csv,file2',
- b'/var/tmp,,file3,file3',
- b'/var/tmp,.gz,file4.gz,file4',
- b',,,'
- ])
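-
- # The expected values above suggest these functions roughly mirror Python's
- # os.path helpers: file_folder ~ os.path.dirname, file_ext ~
- # os.path.splitext(...)[1], file_basename ~ os.path.basename, and
- # file_basename_no_ext ~ the basename with its extension stripped.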
-
-
- def test_sha1_function(self):
- cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha1(c1) from -"' % Q_EXECUTABLE
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),4)
- self.assertEqual(len(e),0)
-
- self.assertEqual(o[0],six.b('1,356a192b7913b04c54574d18c28d46e6395428ab'))
- self.assertEqual(o[1],six.b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0'))
- self.assertEqual(o[2],six.b('3,77de68daecd823babbb58edb1c8e14d7106e83bb'))
- self.assertEqual(o[3],six.b('4,1b6453892473a467d07372d45eb05abc2031647a'))
-
- def test_regexp_extract_function(self):
- query = """
- select
- regexp_extract('was ([0-9]+) seconds and ([0-9]+) ms',c1,0),
- regexp_extract('was ([0-9]+) seconds and ([0-9]+) ms',c1,1),
- regexp_extract('non-existent-(regexp)',c1,0)
- from
- -
- """
-
- cmd = 'echo "Duration was 322 seconds and 240 ms" | %s -c 1 -d , "%s"' % (Q_EXECUTABLE,query)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),1)
- self.assertEqual(len(e),0)
-
- self.assertEqual(o[0],six.b('322,240,'))
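- # Judging by the expected output, the third argument is a zero-based capture
- # group index: 0 returns the first group ('322'), 1 the second ('240'), and a
- # non-matching pattern yields an empty value.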
-
- def test_sha_function(self):
- cmd = 'seq 1 4 | %s -c 1 -d , "select c1,sha(c1,1,\'utf-8\') as sha1,sha(c1,224,\'utf-8\') as sha224,sha(c1,256,\'utf-8\') as sha256 from -"' % Q_EXECUTABLE
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),4)
- self.assertEqual(len(e),0)
-
- self.assertEqual(o[0],six.b('1,356a192b7913b04c54574d18c28d46e6395428ab,e25388fde8290dc286a6164fa2d97e551b53498dcbf7bc378eb1f178,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b'))
- self.assertEqual(o[1],six.b('2,da4b9237bacccdf19c0760cab7aec4a8359010b0,58b2aaa0bfae7acc021b3260e941117b529b2e69de878fd7d45c61a9,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f90da3a666eec13ab35'))
- self.assertEqual(o[2],six.b('3,77de68daecd823babbb58edb1c8e14d7106e83bb,4cfc3a1811fe40afa401b25ef7fa0379f1f7c1930a04f8755d678474,4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce'))
- self.assertEqual(o[3],six.b('4,1b6453892473a467d07372d45eb05abc2031647a,271f93f45e9b4067327ed5c8cd30a034730aaace4382803c3e1d6c2f,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a'))
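-
- # As with md5 above, these digests can be verified directly with hashlib, e.g.:
- #   hashlib.sha256(b'1').hexdigest() ==
- #     '6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d49c01e52ddb7875b4b'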
-
-
-class MultiHeaderTests(AbstractQTestCase):
- def test_output_header_when_multiple_input_headers_exist(self):
- TMPFILE_COUNT = 5
- tmpfiles = [self.create_file_with_data(sample_data_with_header) for x in range(TMPFILE_COUNT)]
-
- tmpfilenames = " UNION ALL ".join(map(lambda x:"select * from %s" % x.name, tmpfiles))
-
- cmd = Q_EXECUTABLE + ' -d , "select name,value1,value2 from (%s) order by name" -H -O' % tmpfilenames
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), TMPFILE_COUNT*3+1)
- self.assertEqual(o[0], six.b("name,value1,value2"))
-
- for i in range(TMPFILE_COUNT):
- self.assertEqual(o[1+i],sample_data_rows[0])
- for i in range(TMPFILE_COUNT):
- self.assertEqual(o[TMPFILE_COUNT+1+i],sample_data_rows[1])
- for i in range(TMPFILE_COUNT):
- self.assertEqual(o[TMPFILE_COUNT*2+1+i],sample_data_rows[2])
-
- for oi in o[1:]:
- self.assertTrue(six.b('name') not in oi)
-
- for i in range(TMPFILE_COUNT):
- self.cleanup(tmpfiles[i])
-
- def test_output_header_when_extra_header_column_names_are_different__concatenation_replacement(self):
- tmpfile1 = self.create_file_with_data(sample_data_with_header)
- tmpfile2 = self.create_file_with_data(generate_sample_data_with_header(six.b('othername,value1,value2')))
-
- cmd = Q_EXECUTABLE + ' -d , "select name,value1,value2 from (select * from %s union all select * from %s) order by name" -H -O' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 7)
- self.assertEqual(len(e), 0)
- self.assertEqual(o, [
- six.b('name,value1,value2'),
- six.b('a,1,0'),
- six.b('a,1,0'),
- six.b('b,2,0'),
- six.b('b,2,0'),
- six.b('c,,0'),
- six.b('c,,0')
- ])
-
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
- def test_output_header_when_extra_header_has_different_number_of_columns(self):
- tmpfile1 = self.create_file_with_data(sample_data_with_header)
- tmpfile2 = self.create_file_with_data(generate_sample_data_with_header(six.b('name,value1')))
-
- cmd = Q_EXECUTABLE + ' -d , "select name,value1,value2 from (select * from %s UNION ALL select * from %s) order by name" -H -O' % (tmpfile1.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 7)
- self.assertEqual(len(e), 0)
- self.assertEqual(o, [
- six.b('name,value1,value2'),
- six.b('a,1,0'),
- six.b('a,1,0'),
- six.b('b,2,0'),
- six.b('b,2,0'),
- six.b('c,,0'),
- six.b('c,,0')
- ])
-
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
-
-class ParsingModeTests(AbstractQTestCase):
-
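- # These tests cover the -m parsing modes: 'strict' fails whenever a row's column
- # count differs from the expected one, while 'relaxed' tolerates mismatches -
- # extra trailing values are folded into the last column (see
- # test_relaxed_mode_column_count_mismatch__was_previously_fluffy_mode_test below).
-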
- def test_strict_mode_column_count_mismatch_error(self):
- tmpfile = self.create_file_with_data(uneven_ls_output)
- cmd = Q_EXECUTABLE + ' -m strict "select count(*) from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- self.assertTrue(six.b("Column Count is expected to identical") in e[0])
-
- self.cleanup(tmpfile)
-
- def test_strict_mode_too_large_specific_column_count(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , -m strict -c 4 "select count(*) from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- self.assertEqual(
- e[0], six.b("Strict mode. Column count is expected to be 4 but is 3"))
-
- self.cleanup(tmpfile)
-
- def test_strict_mode_too_small_specific_column_count(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , -m strict -c 2 "select count(*) from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- self.assertEqual(
- e[0], six.b("Strict mode. Column count is expected to be 2 but is 3"))
-
- self.cleanup(tmpfile)
-
- def test_relaxed_mode_missing_columns_in_header(self):
- tmpfile = self.create_file_with_data(
- sample_data_with_missing_header_names)
- cmd = Q_EXECUTABLE + ' -d , -m relaxed "select count(*) from %s" -H -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 7)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s') % six.b(tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4],six.b(' `name` - text'))
- self.assertEqual(o[5],six.b(' `value1` - int'))
- self.assertEqual(o[6],six.b(' `c3` - int'))
-
- self.cleanup(tmpfile)
-
- def test_strict_mode_missing_columns_in_header(self):
- tmpfile = self.create_file_with_data(
- sample_data_with_missing_header_names)
- cmd = Q_EXECUTABLE + ' -d , -m strict "select count(*) from %s" -H -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode, 0)
- self.assertEqual(len(o), 0)
- self.assertEqual(len(e), 1)
-
- self.assertEqual(
- e[0], six.b('Strict mode. Header row contains less columns than expected column count(2 vs 3)'))
-
- self.cleanup(tmpfile)
-
- def test_output_delimiter_with_missing_fields(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select * from %s" -D ";"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('a;1;0'))
- self.assertEqual(o[1], six.b('b;2;0'))
- self.assertEqual(o[2], six.b('c;;0'))
-
- self.cleanup(tmpfile)
-
- def test_handling_of_null_integers(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select avg(c2) from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('1.5'))
-
- self.cleanup(tmpfile)
-
- def test_empty_integer_values_converted_to_null(self):
- tmpfile = self.create_file_with_data(sample_data_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select * from %s where c2 is null"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('c,,0'))
-
- self.cleanup(tmpfile)
-
- def test_empty_string_values_not_converted_to_null(self):
- tmpfile = self.create_file_with_data(
- sample_data_with_empty_string_no_header)
- cmd = Q_EXECUTABLE + ' -d , "select * from %s where c2 == %s"' % (
- tmpfile.name, "''")
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('c,,0'))
-
- self.cleanup(tmpfile)
-
- def test_relaxed_mode_detected_columns(self):
- tmpfile = self.create_file_with_data(uneven_ls_output)
- cmd = Q_EXECUTABLE + ' -m relaxed "select count(*) from %s" -A' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
-
- column_rows = o[o.index(six.b(' Fields:'))+1:]
-
- self.assertEqual(len(column_rows), 11)
-
- column_tuples = [x.strip().split(six.b(" ")) for x in column_rows]
- column_names = [x[0] for x in column_tuples]
- column_types = [x[2] for x in column_tuples]
-
- self.assertEqual(column_names, [six.b('`c{}`'.format(x)) for x in range(1, 12)])
- self.assertEqual(column_types, list(map(lambda x:six.b(x),[
- 'text', 'int', 'text', 'text', 'int', 'text', 'int', 'int', 'text', 'text', 'text'])))
-
- self.cleanup(tmpfile)
-
- def test_relaxed_mode_detected_columns_with_specific_column_count(self):
- tmpfile = self.create_file_with_data(uneven_ls_output)
- cmd = Q_EXECUTABLE + ' -m relaxed "select count(*) from %s" -A -c 9' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
-
- column_rows = o[o.index(six.b(' Fields:'))+1:]
-
- self.assertEqual(len(column_rows), 9)
-
- column_tuples = [x.strip().split(six.b(" ")) for x in column_rows]
- column_names = [x[0] for x in column_tuples]
- column_types = [x[2] for x in column_tuples]
-
- self.assertEqual(column_names, [six.b('`c{}`'.format(x)) for x in range(1, 10)])
- self.assertEqual(
- column_types, list(map(lambda x:six.b(x),['text', 'int', 'text', 'text', 'int', 'text', 'int', 'int', 'text'])))
-
- self.cleanup(tmpfile)
-
- def test_relaxed_mode_last_column_data_with_specific_column_count(self):
- tmpfile = self.create_file_with_data(uneven_ls_output)
- cmd = Q_EXECUTABLE + ' -m relaxed "select c9 from %s" -c 9' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 9)
- self.assertEqual(len(e), 0)
-
- expected_output = list(map(lambda x:six.b(x),["/selinux", "/mnt", "/srv", "/lost+found", '"/initrd.img.old -> /boot/initrd.img-3.8.0-19-generic"',
- "/cdrom", "/home", '"/vmlinuz -> boot/vmlinuz-3.8.0-19-generic"', '"/initrd.img -> boot/initrd.img-3.8.0-19-generic"']))
-
- self.assertEqual(o, expected_output)
-
- self.cleanup(tmpfile)
-
- def test_1_column_warning_in_relaxed_mode(self):
- tmpfile = self.create_file_with_data(one_column_data)
- cmd = Q_EXECUTABLE + ' -m relaxed "select c1 from %s" -d ,' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('data without commas 1'))
- self.assertEqual(o[1],six.b('data without commas 2'))
-
- self.cleanup(tmpfile)
-
- def test_1_column_warning_in_strict_mode(self):
- tmpfile = self.create_file_with_data(one_column_data)
- cmd = Q_EXECUTABLE + ' "select c1 from %s" -d , -m strict' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('data without commas 1'))
- self.assertEqual(o[1],six.b('data without commas 2'))
-
- self.cleanup(tmpfile)
-
-
- def test_1_column_warning_suppression_in_relaxed_mode_when_column_count_is_specific(self):
- tmpfile = self.create_file_with_data(one_column_data)
- cmd = Q_EXECUTABLE + ' "select c1 from %s" -d , -m relaxed -c 1' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('data without commas 1'))
- self.assertEqual(o[1],six.b('data without commas 2'))
-
- self.cleanup(tmpfile)
-
- def test_1_column_warning_suppression_in_strict_mode_when_column_count_is_specific(self):
- tmpfile = self.create_file_with_data(one_column_data)
- cmd = Q_EXECUTABLE + ' "select c1 from %s" -d , -m strict -c 1' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o),2)
-
- self.assertEqual(o[0],six.b('data without commas 1'))
- self.assertEqual(o[1],six.b('data without commas 2'))
-
- self.cleanup(tmpfile)
-
- def test_fluffy_mode__as_relaxed_mode(self):
- tmpfile = self.create_file_with_data(uneven_ls_output)
- cmd = Q_EXECUTABLE + ' -m relaxed "select c9 from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 9)
- self.assertEqual(len(e), 0)
-
- expected_output = list(map(lambda x:six.b(x),["/selinux", "/mnt", "/srv", "/lost+found",
- "/initrd.img.old", "/cdrom", "/home", "/vmlinuz", "/initrd.img"]))
-
- self.assertEqual(o, expected_output)
-
- self.cleanup(tmpfile)
-
- def test_relaxed_mode_column_count_mismatch__was_previously_fluffy_mode_test(self):
- data_row = six.b("column1 column2 column3 column4")
- data_list = [data_row] * 1000
- data_list[950] = six.b("column1 column2 column3 column4 column5")
- tmpfile = self.create_file_with_data(six.b("\n").join(data_list))
-
- cmd = Q_EXECUTABLE + ' -m relaxed "select * from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
- self.assertEqual(len(o),1000)
- self.assertEqual(len(e),0)
- self.assertEqual(o[950],six.b('column1 column2 column3 "column4 column5"'))
-
- self.cleanup(tmpfile)
-
- def test_strict_mode_column_count_mismatch__less_columns(self):
- data_row = six.b("column1 column2 column3 column4")
- data_list = [data_row] * 1000
- data_list[750] = six.b("column1 column3 column4")
- tmpfile = self.create_file_with_data(six.b("\n").join(data_list))
-
- cmd = Q_EXECUTABLE + ' -m strict "select * from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode,0)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
- self.assertTrue(e[0].startswith(six.b("Strict mode - Expected 4 columns instead of 3 columns")))
- self.assertTrue(six.b(' row 751.') in e[0])
-
- self.cleanup(tmpfile)
-
- def test_strict_mode_column_count_mismatch__more_columns(self):
- data_row = six.b("column1 column2 column3 column4")
- data_list = [data_row] * 1000
- data_list[750] = six.b("column1 column2 column3 column4 column5")
- tmpfile = self.create_file_with_data(six.b("\n").join(data_list))
-
- cmd = Q_EXECUTABLE + ' -m strict "select * from %s"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertNotEqual(retcode,0)
- self.assertEqual(len(o),0)
- self.assertEqual(len(e),1)
- self.assertTrue(e[0].startswith(six.b("Strict mode - Expected 4 columns instead of 5 columns")))
- self.assertTrue(six.b(' row 751.') in e[0])
-
- self.cleanup(tmpfile)
-
-
-class FormattingTests(AbstractQTestCase):
-
- def test_column_formatting(self):
- # TODO Decide if this breaking change is reasonable
- #cmd = 'seq 1 10 | ' + Q_EXECUTABLE + ' -f 1=%4.3f,2=%4.3f "select sum(c1),avg(c1) from -" -c 1'
- cmd = 'seq 1 10 | ' + Q_EXECUTABLE + ' -f 1={:4.3f},2={:4.3f} "select sum(c1),avg(c1) from -" -c 1'
-
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 1)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('55.000 5.500'))
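-
- # The -f option maps a 1-based column number to a Python str.format spec, so
- # '{:4.3f}'.format(55.0) yields '55.000'.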
-
- def test_column_formatting_with_output_header(self):
- perl_regex = "'s/1\n/column_name\n1\n/;'"
- # TODO Decide if this breaking change is reasonable
- #cmd = 'seq 1 10 | perl -pe ' + perl_regex + ' | ' + Q_EXECUTABLE + ' -f 1=%4.3f,2=%4.3f "select sum(column_name) mysum,avg(column_name) myavg from -" -c 1 -H -O'
- cmd = 'seq 1 10 | LANG=C perl -pe ' + perl_regex + ' | ' + Q_EXECUTABLE + ' -f 1={:4.3f},2={:4.3f} "select sum(column_name) mysum,avg(column_name) myavg from -" -c 1 -H -O'
-
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('mysum myavg'))
- self.assertEqual(o[1], six.b('55.000 5.500'))
-
- def py3_test_successfully_parse_universal_newlines_without_explicit_flag(self):
- def list_as_byte_list(l):
- return list(map(lambda x:six.b(x),l))
-
- expected_output = list(map(lambda x:list_as_byte_list(x),[['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-May-07', '6850000', 'USD', 'b'],
- ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Oct-06', '6000000', 'USD', 'a'],
- ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Jan-08', '25000000', 'USD', 'c'],
- ['mycityfaces', 'MyCityFaces', '7', 'web', 'Scottsdale', 'AZ', '1-Jan-08', '50000', 'USD', 'seed'],
- ['flypaper', 'Flypaper', '', 'web', 'Phoenix', 'AZ', '1-Feb-08', '3000000', 'USD', 'a'],
- ['infusionsoft', 'Infusionsoft', '105', 'software', 'Gilbert', 'AZ', '1-Oct-07', '9000000', 'USD', 'a']]))
-
- data = six.b('permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round\rlifelock,LifeLock,,web,Tempe,AZ,1-May-07,6850000,USD,b\rlifelock,LifeLock,,web,Tempe,AZ,1-Oct-06,6000000,USD,a\rlifelock,LifeLock,,web,Tempe,AZ,1-Jan-08,25000000,USD,c\rmycityfaces,MyCityFaces,7,web,Scottsdale,AZ,1-Jan-08,50000,USD,seed\rflypaper,Flypaper,,web,Phoenix,AZ,1-Feb-08,3000000,USD,a\rinfusionsoft,Infusionsoft,105,software,Gilbert,AZ,1-Oct-07,9000000,USD,a')
- tmp_data_file = self.create_file_with_data(data)
-
- cmd = Q_EXECUTABLE + ' -d , -H "select * from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 6)
-
- actual_output = list(map(lambda row: row.split(six.b(",")),o))
-
- self.assertEqual(actual_output,expected_output)
-
- self.cleanup(tmp_data_file)
-
- test_parsing_universal_newlines_without_explicit_flag = py3_test_successfully_parse_universal_newlines_without_explicit_flag
-
- def test_universal_newlines_parsing_flag(self):
- def list_as_byte_list(l):
- return list(map(lambda x:six.b(x),l))
-
- expected_output = list(map(lambda x:list_as_byte_list(x),[['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-May-07', '6850000', 'USD', 'b'],
- ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Oct-06', '6000000', 'USD', 'a'],
- ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Jan-08', '25000000', 'USD', 'c'],
- ['mycityfaces', 'MyCityFaces', '7', 'web', 'Scottsdale', 'AZ', '1-Jan-08', '50000', 'USD', 'seed'],
- ['flypaper', 'Flypaper', '', 'web', 'Phoenix', 'AZ', '1-Feb-08', '3000000', 'USD', 'a'],
- ['infusionsoft', 'Infusionsoft', '105', 'software', 'Gilbert', 'AZ', '1-Oct-07', '9000000', 'USD', 'a']]))
-
- data = six.b('permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round\rlifelock,LifeLock,,web,Tempe,AZ,1-May-07,6850000,USD,b\rlifelock,LifeLock,,web,Tempe,AZ,1-Oct-06,6000000,USD,a\rlifelock,LifeLock,,web,Tempe,AZ,1-Jan-08,25000000,USD,c\rmycityfaces,MyCityFaces,7,web,Scottsdale,AZ,1-Jan-08,50000,USD,seed\rflypaper,Flypaper,,web,Phoenix,AZ,1-Feb-08,3000000,USD,a\rinfusionsoft,Infusionsoft,105,software,Gilbert,AZ,1-Oct-07,9000000,USD,a')
- tmp_data_file = self.create_file_with_data(data)
-
- cmd = Q_EXECUTABLE + ' -d , -H -U "select permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round from %s"' % tmp_data_file.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode,0)
-
- if len(e) in (1, 2):
- # In python 3.7+, the 'U' file opening mode triggers a deprecation warning, which is ok for now
- self.assertTrue(b"DeprecationWarning: 'U' mode is deprecated" in e[0])
- elif len(e) != 0:
- # Nothing should be output to stderr in other versions
- self.assertTrue(False, msg='Unidentified output in stderr')
-
- self.assertEqual(len(o), 6)
-
- actual_output = list(map(lambda row: row.split(six.b(",")),o))
-
- self.assertEqual(actual_output,expected_output)
-
- self.cleanup(tmp_data_file)
-
-
-
-class SqlTests(AbstractQTestCase):
-
- def test_find_example(self):
- tmpfile = self.create_file_with_data(find_output)
- cmd = Q_EXECUTABLE + ' "select c5,c6,sum(c7)/1024.0/1024 as total from %s group by c5,c6 order by total desc"' % tmpfile.name
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
- self.assertEqual(len(e), 0)
-
- self.assertEqual(o[0], six.b('mapred mapred 0.9389581680297852'))
- self.assertEqual(o[1], six.b('root root 0.02734375'))
- self.assertEqual(o[2], six.b('harel harel 0.010888099670410156'))
-
- self.cleanup(tmpfile)
-
- def test_join_example(self):
- cmd = Q_EXECUTABLE + ' "select myfiles.c8,emails.c2 from {0}/exampledatafile myfiles join {0}/group-emails-example emails on (myfiles.c4 = emails.c1) where myfiles.c8 = \'ppp\'"'.format(EXAMPLES)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 2)
-
- self.assertEqual(o[0], six.b('ppp dip.1@otherdomain.com'))
- self.assertEqual(o[1], six.b('ppp dip.2@otherdomain.com'))
-
- def test_join_example_with_output_header(self):
- cmd = Q_EXECUTABLE + ' -O "select myfiles.c8 aaa,emails.c2 bbb from {0}/exampledatafile myfiles join {0}/group-emails-example emails on (myfiles.c4 = emails.c1) where myfiles.c8 = \'ppp\'"'.format(EXAMPLES)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(o), 3)
-
- self.assertEqual(o[0], six.b('aaa bbb'))
- self.assertEqual(o[1], six.b('ppp dip.1@otherdomain.com'))
- self.assertEqual(o[2], six.b('ppp dip.2@otherdomain.com'))
-
- def test_self_join1(self):
- tmpfile = self.create_file_with_data(six.b("\n").join([six.b("{} 9000".format(i)) for i in range(0,10)]))
- cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c1 = a2.c1)"' % (tmpfile.name,tmpfile.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 10)
-
- self.cleanup(tmpfile)
-
- def test_self_join_reuses_table(self):
- tmpfile = self.create_file_with_data(six.b("\n").join([six.b("{} 9000".format(i)) for i in range(0,10)]))
- cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c1 = a2.c1)" -A' % (tmpfile.name,tmpfile.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 6)
-
- self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s') % six.b(tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4],six.b(' `c1` - int'))
- self.assertEqual(o[5],six.b(' `c2` - int'))
-
- self.cleanup(tmpfile)
-
- def test_self_join2(self):
- tmpfile1 = self.create_file_with_data(six.b("\n").join([six.b("{} 9000".format(i)) for i in range(0,10)]))
- cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c2 = a2.c2)"' % (tmpfile1.name,tmpfile1.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 10*10)
-
- self.cleanup(tmpfile1)
-
- tmpfile2 = self.create_file_with_data(six.b("\n").join([six.b("{} 9000".format(i)) for i in range(0,10)]))
- cmd = Q_EXECUTABLE + ' "select * from %s a1 join %s a2 on (a1.c2 = a2.c2) join %s a3 on (a1.c2 = a3.c2)"' % (tmpfile2.name,tmpfile2.name,tmpfile2.name)
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 10*10*10)
-
- self.cleanup(tmpfile2)
-
- def test_disable_column_type_detection(self):
- tmpfile = self.create_file_with_data(six.b('''regular_text,text_with_digits1,text_with_digits2,float_number
-"regular text 1",67,"67",12.3
-"regular text 2",067,"067",22.3
-"regular text 3",123,"123",33.4
-"regular text 4",-123,"-123",0122.2
-'''))
-
- # Check original column type detection
- cmd = Q_EXECUTABLE + ' -A -d , -H "select * from %s"' % (tmpfile.name)
-
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 8)
-
- self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1], six.b(' Sources:'))
- self.assertEqual(o[2], six.b(' source_type: file source: %s') % six.b(tmpfile.name))
- self.assertEqual(o[3], six.b(' Fields:'))
- self.assertEqual(o[4], six.b(' `regular_text` - text'))
- self.assertEqual(o[5], six.b(' `text_with_digits1` - int'))
- self.assertEqual(o[6], six.b(' `text_with_digits2` - int'))
- self.assertEqual(o[7], six.b(' `float_number` - real'))
-
- # Check column types detected when actual detection is disabled
- cmd = Q_EXECUTABLE + ' -A -d , -H --as-text "select * from %s"' % (tmpfile.name)
-
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 8)
-
- self.assertEqual(o[0],six.b('Table: %s' % tmpfile.name))
- self.assertEqual(o[1],six.b(' Sources:'))
- self.assertEqual(o[2],six.b(' source_type: file source: %s') % six.b(tmpfile.name))
- self.assertEqual(o[3],six.b(' Fields:'))
- self.assertEqual(o[4],six.b(' `regular_text` - text'))
- self.assertEqual(o[5],six.b(' `text_with_digits1` - text'))
- self.assertEqual(o[6],six.b(' `text_with_digits2` - text'))
- self.assertEqual(o[7],six.b(' `float_number` - text'))
-
- # Get actual data with regular detection
- cmd = Q_EXECUTABLE + ' -d , -H "select * from %s"' % (tmpfile.name)
-
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 4)
-
- self.assertEqual(o[0],six.b("regular text 1,67,67,12.3"))
- self.assertEqual(o[1],six.b("regular text 2,67,67,22.3"))
- self.assertEqual(o[2],six.b("regular text 3,123,123,33.4"))
- self.assertEqual(o[3],six.b("regular text 4,-123,-123,122.2"))
-
- # Get actual data without detection
- cmd = Q_EXECUTABLE + ' -d , -H --as-text "select * from %s"' % (tmpfile.name)
-
- retcode, o, e = run_command(cmd)
-
- self.assertEqual(retcode, 0)
- self.assertEqual(len(e), 0)
- self.assertEqual(len(o), 4)
-
- self.assertEqual(o[0],six.b("regular text 1,67,67,12.3"))
- self.assertEqual(o[1],six.b("regular text 2,067,067,22.3"))
- self.assertEqual(o[2],six.b("regular text 3,123,123,33.4"))
- self.assertEqual(o[3],six.b("regular text 4,-123,-123,0122.2"))
-
- self.cleanup(tmpfile)
-
-
-class BasicModuleTests(AbstractQTestCase):
-
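- # These tests use q's module-level API. A minimal usage sketch, based only on
- # the calls exercised below:
- #
- #   q_engine = QTextAsData(QInputParams(skip_header=True, delimiter=' '))
- #   result = q_engine.execute('select * from some_file')
- #   if result.status == 'ok':
- #       print(result.data)
- #   q_engine.done()
-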
- def test_engine_isolation(self):
- tmpfile1 = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
- tmpfile2 = self.create_file_with_data(six.b("d e f\n10 20 30\n40 50 60"))
-
- # Run file 1 on engine 1
- q1 = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
- r = q1.execute('select * from %s' % tmpfile1.name)
- print("QueryQuery",file=sys.stdout)
-
- self.assertTrue(r.status == 'ok')
- self.assertEqual(len(r.warnings),0)
- self.assertEqual(len(r.data),2)
- self.assertEqual(r.metadata.output_column_name_list,['a','b','c'])
- self.assertEqual(r.data,[(1,2,3),(4,5,6)])
- self.assertTrue(tmpfile1.name in r.metadata.table_structures)
- self.assertTrue(tmpfile1.name in r.metadata.new_table_structures)
- self.assertEqual(r.metadata.table_structures[tmpfile1.name].atomic_fns,[tmpfile1.name])
- self.assertEqual(r.metadata.table_structures[tmpfile1.name].source_type,'file')
- self.assertEqual(r.metadata.table_structures[tmpfile1.name].source,tmpfile1.name)
-
- # run file 1 on engine 2
- q2 = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
- r2 = q2.execute('select * from %s' % tmpfile1.name)
- print("QueryQuery",file=sys.stdout)
-
- self.assertTrue(r2.status == 'ok')
- self.assertEqual(len(r2.warnings),0)
- self.assertEqual(len(r2.data),2)
- self.assertEqual(r2.metadata.output_column_name_list,['a','b','c'])
- self.assertEqual(r2.data,[(1,2,3),(4,5,6)])
- self.assertTrue(tmpfile1.name in r2.metadata.table_structures)
- self.assertTrue(tmpfile1.name in r2.metadata.new_table_structures)
- self.assertEqual(r2.metadata.table_structures[tmpfile1.name].atomic_fns,[tmpfile1.name])
- self.assertEqual(r2.metadata.table_structures[tmpfile1.name].source_type,'file')
- self.assertEqual(r2.metadata.table_structures[tmpfile1.name].source,tmpfile1.name)
-
- # run file 2 on engine 1
- r3 = q1.execute('select * from %s' % tmpfile2.name)
- print("QueryQuery",file=sys.stdout)
-
- print(r3)
- self.assertTrue(r3.status == 'ok')
- self.assertEqual(len(r3.warnings),0)
- self.assertEqual(len(r3.data),2)
- self.assertEqual(r3.metadata.output_column_name_list,['d','e','f'])
- self.assertEqual(r3.data,[(10,20,30),(40,50,60)])
- self.assertTrue(tmpfile2.name in r3.metadata.table_structures)
- self.assertTrue(tmpfile2.name in r3.metadata.new_table_structures)
- self.assertEqual(r3.metadata.table_structures[tmpfile2.name].atomic_fns,[tmpfile2.name])
- self.assertEqual(r3.metadata.table_structures[tmpfile2.name].source,tmpfile2.name)
- self.assertEqual(r3.metadata.table_structures[tmpfile2.name].source_type,'file')
-
- q1.done()
- q2.done()
-
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
- def test_simple_query(self):
- tmpfile = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
-
- q = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
- r = q.execute('select * from %s' % tmpfile.name)
-
- self.assertTrue(r.status == 'ok')
- self.assertEqual(len(r.warnings),0)
- self.assertEqual(len(r.data),2)
- self.assertEqual(r.metadata.output_column_name_list,['a','b','c'])
- self.assertEqual(r.data,[(1,2,3),(4,5,6)])
- self.assertTrue(tmpfile.name in r.metadata.table_structures)
- self.assertTrue(tmpfile.name in r.metadata.new_table_structures)
- self.assertEqual(r.metadata.table_structures[tmpfile.name].atomic_fns,[tmpfile.name])
- self.assertEqual(r.metadata.table_structures[tmpfile.name].source_type,'file')
- self.assertEqual(r.metadata.table_structures[tmpfile.name].source,tmpfile.name)
-
- q.done()
- self.cleanup(tmpfile)
-
- def test_loaded_data_reuse(self):
- tmpfile = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
-
- q = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
- r1 = q.execute('select * from %s' % tmpfile.name)
-
- r2 = q.execute('select * from %s' % tmpfile.name)
-
- self.assertTrue(r1.status == 'ok')
- self.assertEqual(len(r1.warnings),0)
- self.assertEqual(len(r1.data),2)
- self.assertEqual(r1.metadata.output_column_name_list,['a','b','c'])
- self.assertEqual(r1.data,[(1,2,3),(4,5,6)])
- self.assertTrue(tmpfile.name in r1.metadata.table_structures)
- self.assertTrue(tmpfile.name in r1.metadata.new_table_structures)
- self.assertEqual(r1.metadata.table_structures[tmpfile.name].atomic_fns,[tmpfile.name])
- self.assertEqual(r1.metadata.table_structures[tmpfile.name].source_type,'file')
- self.assertEqual(r1.metadata.table_structures[tmpfile.name].source,tmpfile.name)
-
- self.assertTrue(r2.status == 'ok')
- self.assertTrue(tmpfile.name in r2.metadata.table_structures)
- self.assertTrue(tmpfile.name not in r2.metadata.new_table_structures)
- self.assertEqual(r2.data,r1.data)
- self.assertEqual(r2.metadata.output_column_name_list,r1.metadata.output_column_name_list)
- self.assertEqual(len(r2.warnings),0)
-
- q.done()
-
- self.cleanup(tmpfile)
-
- def test_stdin_injection(self):
- tmpfile = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
-
- data_streams_dict = {
- '-': DataStream('stdin','-',codecs.open(tmpfile.name,'rb',encoding='utf-8'))
- }
- q = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=data_streams_dict)
- r = q.execute('select * from -')
-
- self.assertTrue(r.status == 'ok')
- self.assertEqual(len(r.warnings),0)
- self.assertEqual(len(r.data),2)
- self.assertEqual(r.metadata.output_column_name_list,['a','b','c'])
- self.assertEqual(r.data,[(1,2,3),(4,5,6)])
- self.assertEqual(r.metadata.new_table_structures['-'],r.metadata.table_structures['-'])
- self.assertEqual(r.metadata.table_structures['-'].column_names,['a','b','c'])
- self.assertEqual(r.metadata.table_structures['-'].python_column_types,[int,int,int])
- self.assertEqual(r.metadata.table_structures['-'].sqlite_column_types,['int','int','int'])
- self.assertEqual(r.metadata.table_structures['-'].source_type,'data-stream')
- self.assertEqual(r.metadata.table_structures['-'].source,'stdin')
-
- q.done()
- self.cleanup(tmpfile)
-
- def test_named_stdin_injection(self):
- tmpfile = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
-
- data_streams_dict = {
- 'my_stdin_data': DataStream('my_stdin_data','my_stdin_data',codecs.open(tmpfile.name,'rb',encoding='utf-8'))
- }
-
- q = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=data_streams_dict)
- r = q.execute('select a from my_stdin_data')
-
- self.assertTrue(r.status == 'ok')
- self.assertEqual(len(r.warnings),0)
- self.assertEqual(len(r.data),2)
- self.assertEqual(r.metadata.output_column_name_list,['a'])
- self.assertEqual(r.data,[(1,),(4,)])
- self.assertTrue('my_stdin_data' in r.metadata.table_structures)
- self.assertTrue('my_stdin_data' in r.metadata.new_table_structures)
- self.assertEqual(r.metadata.table_structures['my_stdin_data'].qtable_name,'my_stdin_data')
-
- q.done()
- self.cleanup(tmpfile)
-
- def test_data_stream_isolation(self):
- tmpfile1 = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
- tmpfile2 = self.create_file_with_data(six.b("d e f\n7 8 9\n10 11 12"))
-
- data_streams_dict = {
- 'a-': DataStream('a-','a-',codecs.open(tmpfile1.name, 'rb', encoding='utf-8')),
- 'b-': DataStream('b-','b-',codecs.open(tmpfile2.name, 'rb', encoding='utf-8'))
- }
-
- q = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=data_streams_dict)
- r1 = q.execute('select * from a-')
-
- self.assertTrue(r1.status == 'ok')
- self.assertEqual(len(r1.warnings),0)
- self.assertEqual(len(r1.data),2)
- self.assertEqual(r1.metadata.output_column_name_list,['a','b','c'])
- self.assertEqual(r1.data,[(1,2,3),(4,5,6)])
- self.assertTrue('a-' in r1.metadata.table_structures)
- self.assertEqual(len(r1.metadata.table_structures),1)
- self.assertEqual(r1.metadata.table_structures['a-'].source_type, 'data-stream')
- self.assertEqual(r1.metadata.table_structures['a-'].source, 'a-')
- self.assertEqual(r1.metadata.table_structures['a-'].column_names, ['a','b','c'])
- self.assertEqual(r1.metadata.table_structures['a-'].python_column_types, [int,int,int])
- self.assertEqual(r1.metadata.table_structures['a-'].sqlite_column_types, ['int','int','int'])
-
- r2 = q.execute('select * from b-')
-
- self.assertTrue(r2.status == 'ok')
- self.assertEqual(len(r2.warnings),0)
- self.assertEqual(len(r2.data),2)
- self.assertEqual(r2.metadata.output_column_name_list,['d','e','f'])
- self.assertEqual(r2.data,[(7,8,9),(10,11,12)])
-
- self.assertEqual(len(r1.metadata.table_structures),2)
- self.assertTrue('b-' in r1.metadata.table_structures)
- self.assertEqual(r1.metadata.table_structures['b-'].source_type, 'data-stream')
- self.assertEqual(r1.metadata.table_structures['b-'].source, 'b-')
- self.assertEqual(r1.metadata.table_structures['b-'].column_names, ['d','e','f'])
- self.assertEqual(r1.metadata.table_structures['b-'].python_column_types, [int,int,int])
- self.assertEqual(r1.metadata.table_structures['b-'].sqlite_column_types, ['int','int','int'])
-
- q.done()
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
- def test_multiple_stdin_injection(self):
- tmpfile1 = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
- tmpfile2 = self.create_file_with_data(six.b("d e f\n7 8 9\n10 11 12"))
-
- data_streams_dict = {
- 'my_stdin_data1': DataStream('my_stdin_data1','my_stdin_data1',codecs.open(tmpfile1.name,'rb',encoding='utf-8')),
- 'my_stdin_data2': DataStream('my_stdin_data2','my_stdin_data2',codecs.open(tmpfile2.name,'rb',encoding='utf-8'))
- }
- q = QTextAsData(QInputParams(skip_header=True,delimiter=' '),data_streams_dict=data_streams_dict)
- r1 = q.execute('select * from my_stdin_data1')
-
- self.assertTrue(r1.status == 'ok')
- self.assertEqual(len(r1.warnings),0)
- self.assertEqual(len(r1.data),2)
- self.assertEqual(r1.metadata.output_column_name_list,['a','b','c'])
- self.assertEqual(r1.data,[(1,2,3),(4,5,6)])
- self.assertTrue('my_stdin_data1' in r1.metadata.table_structures)
- self.assertTrue('my_stdin_data1' in r1.metadata.new_table_structures)
- self.assertEqual(r1.metadata.table_structures['my_stdin_data1'].qtable_name,'my_stdin_data1')
-
- r2 = q.execute('select * from my_stdin_data2')
-
- self.assertTrue(r2.status == 'ok')
- self.assertEqual(len(r2.warnings),0)
- self.assertEqual(len(r2.data),2)
- self.assertEqual(r2.metadata.output_column_name_list,['d','e','f'])
- self.assertEqual(r2.data,[(7,8,9),(10,11,12)])
-        # A second data load should have occurred for the second, differently-named stream
- self.assertTrue('my_stdin_data2' in r2.metadata.table_structures)
- self.assertTrue('my_stdin_data2' in r2.metadata.new_table_structures)
- self.assertEqual(r2.metadata.table_structures['my_stdin_data2'].qtable_name,'my_stdin_data2')
-
- r3 = q.execute('select aa.*,bb.* from my_stdin_data1 aa join my_stdin_data2 bb')
-
- self.assertTrue(r3.status == 'ok')
- self.assertEqual(len(r3.warnings),0)
- self.assertEqual(len(r3.data),4)
- self.assertEqual(r3.metadata.output_column_name_list,['a','b','c','d','e','f'])
- self.assertEqual(r3.data,[(1,2,3,7,8,9),(1,2,3,10,11,12),(4,5,6,7,8,9),(4,5,6,10,11,12)])
- self.assertTrue('my_stdin_data1' in r3.metadata.table_structures)
- self.assertTrue('my_stdin_data1' not in r3.metadata.new_table_structures)
-
- q.done()
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
- def test_different_input_params_for_different_files(self):
- tmpfile1 = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
- tmpfile2 = self.create_file_with_data(six.b("7\t8\t9\n10\t11\t12"))
-
- q = QTextAsData(QInputParams(skip_header=True,delimiter=' '))
-
- q.load_data(tmpfile1.name,QInputParams(skip_header=True,delimiter=' '))
- q.load_data(tmpfile2.name,QInputParams(skip_header=False,delimiter='\t'))
-
- r = q.execute('select aa.*,bb.* from %s aa join %s bb' % (tmpfile1.name,tmpfile2.name))
-
- self.assertTrue(r.status == 'ok')
- self.assertEqual(len(r.warnings),0)
- self.assertEqual(len(r.data),4)
- self.assertEqual(r.metadata.output_column_name_list,['a','b','c','c1','c2','c3'])
- self.assertEqual(r.data,[(1,2,3,7,8,9),(1,2,3,10,11,12),(4,5,6,7,8,9),(4,5,6,10,11,12)])
- self.assertTrue(tmpfile1.name not in r.metadata.new_table_structures)
- self.assertTrue(tmpfile2.name not in r.metadata.new_table_structures)
-
- q.done()
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
- def test_different_input_params_for_different_files_2(self):
- tmpfile1 = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
- tmpfile2 = self.create_file_with_data(six.b("7\t8\t9\n10\t11\t12"))
-
- q = QTextAsData()
-
- q.load_data(tmpfile1.name,QInputParams(skip_header=True,delimiter=' '))
- q.load_data(tmpfile2.name,QInputParams(skip_header=False,delimiter='\t'))
-
- r = q.execute('select aa.*,bb.* from %s aa join %s bb' % (tmpfile1.name,tmpfile2.name))
-
- self.assertTrue(r.status == 'ok')
- self.assertEqual(len(r.warnings),0)
- self.assertEqual(len(r.data),4)
- self.assertEqual(r.metadata.output_column_name_list,['a','b','c','c1','c2','c3'])
- self.assertEqual(r.data,[(1,2,3,7,8,9),(1,2,3,10,11,12),(4,5,6,7,8,9),(4,5,6,10,11,12)])
- self.assertTrue(tmpfile1.name not in r.metadata.new_table_structures)
- self.assertTrue(tmpfile2.name not in r.metadata.new_table_structures)
-
- q.done()
- self.cleanup(tmpfile1)
- self.cleanup(tmpfile2)
-
- def test_input_params_override(self):
- tmpfile = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
-
- default_input_params = QInputParams()
-
- for k in default_input_params.__dict__.keys():
- setattr(default_input_params,k,'GARBAGE')
-
- q = QTextAsData(default_input_params)
-
- r = q.execute('select * from %s' % tmpfile.name)
-
- self.assertTrue(r.status == 'error')
-
- overwriting_input_params = QInputParams(skip_header=True,delimiter=' ')
-
- r2 = q.execute('select * from %s' % tmpfile.name,input_params=overwriting_input_params)
-
- self.assertTrue(r2.status == 'ok')
- self.assertEqual(len(r2.warnings),0)
- self.assertEqual(len(r2.data),2)
- self.assertEqual(r2.metadata.output_column_name_list,['a','b','c'])
- self.assertEqual(r2.data,[(1,2,3),(4,5,6)])
- self.assertTrue(tmpfile.name in r2.metadata.table_structures)
- self.assertTrue(tmpfile.name in r2.metadata.new_table_structures)
- self.assertEqual(r2.metadata.table_structures[tmpfile.name].atomic_fns,[tmpfile.name])
- self.assertEqual(r2.metadata.table_structures[tmpfile.name].source,tmpfile.name)
- self.assertEqual(r2.metadata.table_structures[tmpfile.name].source_type,'file')
-
- q.done()
- self.cleanup(tmpfile)
-
- def test_input_params_merge(self):
- input_params = QInputParams()
-
- for k in input_params.__dict__.keys():
- setattr(input_params,k,'GARBAGE')
-
- merged_input_params = input_params.merged_with(QInputParams())
-
- for k in merged_input_params.__dict__.keys():
- self.assertTrue(getattr(merged_input_params,k) != 'GARBAGE')
-
- for k in input_params.__dict__.keys():
- self.assertTrue(getattr(merged_input_params,k) != 'GARBAGE')
-
- def test_table_analysis_with_syntax_error(self):
-
- q = QTextAsData()
-
- q_output = q.analyze("bad syntax")
-
- q.done()
- self.assertTrue(q_output.status == 'error')
- self.assertTrue(q_output.error.msg.startswith('query error'))
-
- def test_execute_response(self):
- tmpfile = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
-
- q = QTextAsData()
-
- q_output = q.execute("select a,c from %s" % tmpfile.name,QInputParams(skip_header=True))
-
- self.assertTrue(q_output.status == 'ok')
- self.assertTrue(q_output.error is None)
- self.assertEqual(len(q_output.warnings),0)
- self.assertEqual(len(q_output.data),2)
- self.assertEqual(q_output.data,[ (1,3),(4,6) ])
- self.assertTrue(q_output.metadata is not None)
-
- metadata = q_output.metadata
-
- self.assertEqual(metadata.output_column_name_list, [ 'a','c'])
- self.assertTrue(tmpfile.name in metadata.new_table_structures)
- self.assertEqual(len(metadata.table_structures),1)
-
- table_structure = metadata.new_table_structures[tmpfile.name]
-
- self.assertEqual(table_structure.column_names,[ 'a','b','c'])
- self.assertEqual(table_structure.python_column_types,[ int,int,int])
- self.assertEqual(table_structure.sqlite_column_types,[ 'int','int','int'])
- self.assertEqual(table_structure.qtable_name, tmpfile.name)
- self.assertEqual(table_structure.atomic_fns,[tmpfile.name])
- self.assertEqual(table_structure.source_type,'file')
- self.assertEqual(table_structure.source,tmpfile.name)
-
- q.done()
- self.cleanup(tmpfile)
-
- def test_analyze_response(self):
- tmpfile = self.create_file_with_data(six.b("a b c\n1 2 3\n4 5 6"))
-
- q = QTextAsData()
-
- q_output = q.analyze("select a,c from %s" % tmpfile.name,QInputParams(skip_header=True))
-
- self.assertTrue(q_output.status == 'ok')
- self.assertTrue(q_output.error is None)
- self.assertEqual(len(q_output.warnings),0)
- self.assertEqual(len(q_output.data),2)
- self.assertEqual(q_output.data,[ (1,3),(4,6) ])
- self.assertTrue(q_output.metadata is not None)
-
- metadata = q_output.metadata
-
- self.assertEqual(metadata.output_column_name_list, [ 'a','c'])
- self.assertEqual(len(metadata.table_structures),1)
- self.assertTrue(tmpfile.name in metadata.new_table_structures)
-
- table_structure = metadata.table_structures[tmpfile.name]
-
- self.assertEqual(table_structure.column_names,[ 'a','b','c'])
- self.assertEqual(table_structure.python_column_types,[ int,int,int])
- self.assertEqual(table_structure.sqlite_column_types,[ 'int','int','int'])
- self.assertEqual(table_structure.qtable_name, tmpfile.name)
- self.assertEqual(table_structure.atomic_fns,[tmpfile.name])
- self.assertEqual(table_structure.source_type,'file')
- self.assertEqual(table_structure.source,tmpfile.name)
-
- q.done()
- self.cleanup(tmpfile)
-
- def test_load_data_from_string_without_previous_data_load(self):
- input_str = six.u('column1,column2,column3\n') + six.u('\n').join([six.u('value1,2.5,value3')] * 1000)
-
- data_streams_dict = {
- 'my_data': DataStream('my_data_stream_id','my_data',six.StringIO(input_str))
- }
- q = QTextAsData(default_input_params=QInputParams(skip_header=True,delimiter=','),data_streams_dict=data_streams_dict)
-
- q_output = q.execute('select column2,column3 from my_data')
-
- self.assertTrue(q_output.status == 'ok')
- self.assertTrue(q_output.error is None)
- self.assertEqual(len(q_output.warnings),0)
-        self.assertEqual(len(q_output.data),1000)
- self.assertEqual(len(set(q_output.data)),1)
- self.assertEqual(list(set(q_output.data))[0],(2.5,'value3'))
-
- metadata = q_output.metadata
-
-        self.assertEqual(metadata.output_column_name_list,['column2','column3'])
- self.assertTrue('my_data' in metadata.new_table_structures)
- self.assertEqual(len(metadata.table_structures),1)
-
- table_structure = metadata.table_structures['my_data']
-
- self.assertEqual(table_structure.column_names,['column1','column2','column3'])
- self.assertEqual(table_structure.sqlite_column_types,['text','real','text'])
- self.assertEqual(table_structure.python_column_types,[str,float,str])
- self.assertEqual(table_structure.qtable_name, 'my_data')
- self.assertEqual(table_structure.source_type, 'data-stream')
- self.assertEqual(table_structure.source, 'my_data_stream_id')
-
- q.done()
-
- def test_load_data_from_string_with_previous_data_load(self):
- input_str = six.u('column1,column2,column3\n') + six.u('\n').join([six.u('value1,2.5,value3')] * 1000)
-
- data_streams_dict = {
- 'my_data': DataStream('a','my_data',six.StringIO(input_str))
- }
- q = QTextAsData(default_input_params=QInputParams(skip_header=True,delimiter=','),data_streams_dict=data_streams_dict)
-
- dl = q.load_data('my_data',QInputParams(skip_header=True,delimiter=','))
-
- q_output = q.execute('select column2,column3 from my_data')
-
- self.assertTrue(q_output.status == 'ok')
- self.assertTrue(q_output.error is None)
- self.assertEqual(len(q_output.warnings),0)
-        self.assertEqual(len(q_output.data),1000)
- self.assertEqual(len(set(q_output.data)),1)
- self.assertEqual(list(set(q_output.data))[0],(2.5,'value3'))
-
- metadata = q_output.metadata
-
-        self.assertEqual(metadata.output_column_name_list,['column2','column3'])
- self.assertTrue('my_data' not in metadata.new_table_structures)
- self.assertEqual(len(metadata.table_structures),1)
-
- table_structure = metadata.table_structures['my_data']
-
- self.assertEqual(table_structure.column_names,['column1','column2','column3'])
- self.assertEqual(table_structure.sqlite_column_types,['text','real','text'])
- self.assertEqual(table_structure.python_column_types,[str,float,str])
- self.assertEqual(table_structure.qtable_name, 'my_data')
-
- q.done()
-
-
-
-class BenchmarkAttemptResults(object):
-    def __init__(self, attempt, lines, columns, duration, return_code):
- self.attempt = attempt
- self.lines = lines
- self.columns = columns
- self.duration = duration
- self.return_code = return_code
-
- def __str__(self):
- return "{}".format(self.__dict__)
- __repr__ = __str__
-
-class BenchmarkResults(object):
- def __init__(self, lines, columns, attempt_results, mean, stddev):
- self.lines = lines
- self.columns = columns
- self.attempt_results = attempt_results
- self.mean = mean
- self.stddev = stddev
-
- def __str__(self):
- return "{}".format(self.__dict__)
- __repr__ = __str__
-
-@pytest.mark.benchmark
-class BenchmarkTests(AbstractQTestCase):
-
- BENCHMARK_DIR = os.environ.get('Q_BENCHMARK_DATA_DIR')
-
-    def _ensure_benchmark_data_dir_exists(self):
-        if BenchmarkTests.BENCHMARK_DIR is None:
-            raise Exception('Q_BENCHMARK_DATA_DIR must be provided as an environment variable')
-        try:
-            os.mkdir(BenchmarkTests.BENCHMARK_DIR)
-        except OSError:
-            # The directory may already exist
-            pass
-
- def _create_benchmark_file_if_needed(self):
- self._ensure_benchmark_data_dir_exists()
-
-        if os.path.exists('{}/benchmark-file.csv'.format(BenchmarkTests.BENCHMARK_DIR)):
- return
-
- g = GzipFile('unit-file.csv.gz')
- d = g.read().decode('utf-8')
- f = open('{}/benchmark-file.csv'.format(BenchmarkTests.BENCHMARK_DIR), 'w')
- for i in range(100):
- f.write(d)
- f.close()
-
- def _prepare_test_file(self, lines, columns):
-
- filename = '{}/_benchmark_data__lines_{}_columns_{}.csv'.format(BenchmarkTests.BENCHMARK_DIR,lines, columns)
-
- if os.path.exists(filename):
- return filename
-
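-        # Generate the data file by piping the first rows of the base benchmark file
-        # through q itself, selecting the requested number of columns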
- c = ['c{}'.format(x + 1) for x in range(columns)]
-
- # write a header line
- ff = open(filename,'w')
- ff.write(",".join(c))
- ff.write('\n')
- ff.close()
-
-        r, o, e = run_command(('head -{} {}/benchmark-file.csv | ' + Q_EXECUTABLE + ' -d , "select {} from -" >> {}').format(lines, BenchmarkTests.BENCHMARK_DIR, ','.join(c), filename))
- self.assertEqual(r, 0)
- # Create file cache as part of preparation
- r, o, e = run_command(Q_EXECUTABLE + ' -C readwrite -d , "select count(*) from %s"' % filename)
-        self.assertEqual(r, 0)
- return filename
-
- def _decide_result(self,attempt_results):
-
- failed = list(filter(lambda a: a.return_code != 0,attempt_results))
-
- if len(failed) == 0:
- mean = sum([x.duration for x in attempt_results]) / len(attempt_results)
- sum_squared = sum([(x.duration - mean)**2 for x in attempt_results])
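-            # ddof=0 yields the population (rather than sample) standard deviation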
- ddof = 0
- pvar = sum_squared / (len(attempt_results) - ddof)
- stddev = pvar ** 0.5
- else:
- mean = None
- stddev = None
-
- return BenchmarkResults(
- attempt_results[0].lines,
- attempt_results[0].columns,
- attempt_results,
- mean,
- stddev
- )
-
- def _perform_test_performance_matrix(self,name,generate_cmd_function):
- results = []
-
- benchmark_results_folder = os.environ.get("Q_BENCHMARK_RESULTS_FOLDER",'')
- if benchmark_results_folder == "":
- raise Exception("Q_BENCHMARK_RESULTS_FOLDER must be provided as an environment variable")
-
- self._create_benchmark_file_if_needed()
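-        # Run 10 timed attempts for each (columns, lines) combination and aggregate them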
- for columns in [1, 5, 10, 20, 50, 100]:
- for lines in [1, 10, 100, 1000, 10000, 100000, 1000000]:
- attempt_results = []
- for attempt in range(10):
- filename = self._prepare_test_file(lines, columns)
- if DEBUG:
- print("Testing {}".format(filename))
- t0 = time.time()
- r, o, e = run_command(generate_cmd_function(filename,lines,columns))
- duration = time.time() - t0
- attempt_result = BenchmarkAttemptResults(attempt, lines, columns, duration, r)
- attempt_results += [attempt_result]
- if DEBUG:
- print("Results: {}".format(attempt_result.__dict__))
- final_result = self._decide_result(attempt_results)
- results += [final_result]
-
- series_fields = [six.u('lines'),six.u('columns')]
- value_fields = [six.u('mean'),six.u('stddev')]
-
- all_fields = series_fields + value_fields
-
- output_filename = '{}/{}.benchmark-results'.format(benchmark_results_folder,name)
- output_file = open(output_filename,'w')
- for columns,g in itertools.groupby(sorted(results,key=lambda x:x.columns),key=lambda x:x.columns):
- x = six.u("\t").join(series_fields + [six.u('{}_{}').format(name, f) for f in value_fields])
- print(x,file = output_file)
- for result in g:
- print(six.u("\t").join(map(str,[getattr(result,f) for f in all_fields])),file=output_file)
- output_file.close()
-
- print("results have been written to : {}".format(output_filename))
- if DEBUG:
- print("RESULTS FOR {}".format(name))
- print(open(output_filename,'r').read())
-
- def test_q_matrix(self):
- Q_BENCHMARK_NAME = os.environ.get('Q_BENCHMARK_NAME')
- if Q_BENCHMARK_NAME is None:
-            raise Exception('Q_BENCHMARK_NAME must be provided as an environment variable')
-
- def generate_q_cmd(data_filename, line_count, column_count):
- Q_BENCHMARK_ADDITIONAL_PARAMS = os.environ.get('Q_BENCHMARK_ADDITIONAL_PARAMS') or ''
-            additional_params = ' ' + Q_BENCHMARK_ADDITIONAL_PARAMS
- return '{} -d , {} "select count(*) from {}"'.format(Q_EXECUTABLE,additional_params, data_filename)
- self._perform_test_performance_matrix(Q_BENCHMARK_NAME,generate_q_cmd)
-
- def _get_textql_version(self):
- r,o,e = run_command("textql --version")
- if r != 0:
- raise Exception("Could not find textql")
- if len(e) != 0:
- raise Exception("Errors while getting textql version")
- return o[0]
-
- def _get_octosql_version(self):
- r,o,e = run_command("octosql --version")
- if r != 0:
- raise Exception("Could not find octosql")
- if len(e) != 0:
- raise Exception("Errors while getting octosql version")
- version = re.findall('v[0-9]+\\.[0-9]+\\.[0-9]+',str(o[0],encoding='utf-8'))[0]
- return version
-
- def test_textql_matrix(self):
- def generate_textql_cmd(data_filename,line_count,column_count):
- return 'textql -dlm , -sql "select count(*)" {}'.format(data_filename)
-
- name = 'textql_%s' % self._get_textql_version()
- self._perform_test_performance_matrix(name,generate_textql_cmd)
-
- def test_octosql_matrix(self):
- config_fn = self.random_tmp_filename('octosql', 'config')
- def generate_octosql_cmd(data_filename,line_count,column_count):
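-            # Write a fresh octosql YAML config pointing at the current data file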
- j = """
-dataSources:
- - name: bmdata
- type: csv
- config:
- path: "{}"
- headerRow: false
- batchSize: 10000
-""".format(data_filename)[1:]
- f = open(config_fn,'w')
- f.write(j)
- f.close()
- return 'octosql -c {} -o batch-csv "select count(*) from bmdata a"'.format(config_fn)
-
- name = 'octosql_%s' % self._get_octosql_version()
- self._perform_test_performance_matrix(name,generate_octosql_cmd)
-
-def suite():
- tl = unittest.TestLoader()
- basic_stuff = tl.loadTestsFromTestCase(BasicTests)
- parsing_mode = tl.loadTestsFromTestCase(ParsingModeTests)
- sql = tl.loadTestsFromTestCase(SqlTests)
- formatting = tl.loadTestsFromTestCase(FormattingTests)
- basic_module_stuff = tl.loadTestsFromTestCase(BasicModuleTests)
- save_db_to_disk_tests = tl.loadTestsFromTestCase(SaveDbToDiskTests)
- user_functions_tests = tl.loadTestsFromTestCase(UserFunctionTests)
- multi_header_tests = tl.loadTestsFromTestCase(MultiHeaderTests)
- return unittest.TestSuite([basic_module_stuff, basic_stuff, parsing_mode, sql, formatting,save_db_to_disk_tests,multi_header_tests,user_functions_tests])
-
-if __name__ == '__main__':
- if len(sys.argv) > 1:
- suite = unittest.TestSuite()
- if '.' in sys.argv[1]:
- c,m = sys.argv[1].split(".")
- suite.addTest(globals()[c](m))
- else:
- tl = unittest.TestLoader()
- tc = tl.loadTestsFromTestCase(globals()[sys.argv[1]])
- suite = unittest.TestSuite([tc])
- else:
- suite = suite()
-
- test_runner = unittest.TextTestRunner(verbosity=2)
- result = test_runner.run(suite)
- sys.exit(not result.wasSuccessful())
diff --git a/test/utils.py b/test/utils.py
new file mode 100644
index 00000000..b015ba24
--- /dev/null
+++ b/test/utils.py
@@ -0,0 +1,83 @@
+import itertools
+import locale
+import os
+from subprocess import PIPE, Popen
+import sys
+
+
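+# Encode a string to bytes using latin-1, which maps code points 0-255 directly to
+# single bytes; the tests compare command output at the byte level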
+def b(s):
+ return s.encode("latin-1")
+
+# q uses this encoding as the default output encoding. Some of the tests use it in order to
+# make sure that the output is correctly encoded
+SYSTEM_ENCODING = locale.getpreferredencoding()
+Q_EXECUTABLE = os.getenv('Q_EXECUTABLE', 'q')
+
+DEBUG = '-v' in sys.argv
+if os.environ.get('Q_DEBUG'):
+ DEBUG = True
+
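+# Split an iterable into consecutive chunks of at most n items each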
+def batch(iterable, n=1):
+ r = []
+ l = len(iterable)
+ for ndx in range(0, l, n):
+ r += [iterable[ndx:min(ndx + n, l)]]
+ return r
+
+
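+# Split an iterable into two lists: (items failing pred, items passing pred)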
+def partition(pred, iterable):
+ t1, t2 = itertools.tee(iterable)
+ return list(itertools.filterfalse(pred, t1)), list(filter(pred, t2))
+
+
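+# Run a shell command, returning (return_code, stdout_lines, stderr_lines); stdout and
+# stderr are kept as bytes and split into lines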
+def run_command(cmd_to_run,env_to_inject=None):
+ global DEBUG
+ if DEBUG:
+ print("CMD: {}".format(cmd_to_run))
+
+ if env_to_inject is None:
+ env = os.environ.copy()
+ else:
+ env = os.environ.copy()
+ env.update(env_to_inject)
+
+ p = Popen(cmd_to_run, stdout=PIPE, stderr=PIPE, shell=True,env=env)
+ o, e = p.communicate()
+ # remove last newline
+ o = o.rstrip()
+ e = e.strip()
+ # split rows
+ if o != b(''):
+ o = o.split(b(os.linesep))
+ else:
+ o = []
+ if e != b(''):
+ e = e.split(b(os.linesep))
+ else:
+ e = []
+
+ res = (p.returncode, o, e)
+ if DEBUG:
+ print("RESULT:{}".format(res))
+ return res
+
+
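+# Check whether the first stderr line is q's single-column warning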
+def one_column_warning(e):
+ return e[0].startswith(b('Warning: column count is one'))
+
+
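+# sqlite3 row factory that returns each row as a {column_name: value} dict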
+def sqlite_dict_factory(cursor, row):
+ d = {}
+ for idx, col in enumerate(cursor.description):
+ d[col[0]] = row[idx]
+ return d
+
+
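+# Return the names of all tables in the given sqlite connection, optionally excluding
+# q's internal _qcatalog table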
+def get_sqlite_table_list(c,exclude_qcatalog=True):
+ if exclude_qcatalog:
+ r = c.execute("select tbl_name from sqlite_master where type='table' and tbl_name != '_qcatalog'").fetchall()
+ else:
+ r = c.execute("select tbl_name from sqlite_master where type='table'").fetchall()
+
+ return r
+