Skip to content

Commit 3131b7e

Browse files
committed
Temporary commit, please rebase and remove loggers [skip ci]
1 parent d1fac6d commit 3131b7e

File tree

2 files changed

+85
-6
lines changed

2 files changed

+85
-6
lines changed

mfr/extensions/tabular/libs/stdlib_tools.py

Lines changed: 82 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,47 @@
11
import re
22
import csv
3+
import logging
34

4-
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
55
from mfr.extensions.tabular import utilities
6+
from mfr.extensions.tabular.settings import MAX_FILE_SIZE, TABULAR_INIT_SNIFF_SIZE
7+
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
8+
9+
logger = logging.getLogger(__name__)
610

711

812
def csv_stdlib(fp):
913
"""Read and convert a csv file to JSON format using the python standard library
10-
:param fp: File pointer object
11-
:return: tuple of table headers and data
14+
15+
Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able to
16+
effectively detect the correct dialect of the file.
17+
18+
:param fp: the file pointer object
19+
:return: a tuple of table headers and data
1220
"""
13-
data = fp.read(2048)
21+
22+
# Prepare the first row for sniffing
23+
data = fp.read(TABULAR_INIT_SNIFF_SIZE)
24+
data = _trim_or_append_data(fp, data, TABULAR_INIT_SNIFF_SIZE, 0)
25+
26+
# Reset the file pointer
1427
fp.seek(0)
1528

29+
# Sniff the first row to find a matching format
1630
try:
1731
dialect = csv.Sniffer().sniff(data)
1832
except csv.Error:
1933
dialect = csv.excel
2034
else:
2135
_set_dialect_quote_attrs(dialect, data)
2236

37+
# Explicitly delete data when it is no longer used.
2338
del data
39+
40+
# Create the CSV reader with the detected dialect
2441
reader = csv.DictReader(fp, dialect=dialect)
42+
43+
# Update the reader field names to avoid duplicate column names when performing row extraction
2544
columns = []
26-
# update the reader field names to avoid duplicate column names when performing row extraction
2745
for idx, fieldname in enumerate(reader.fieldnames or []):
2846
column_count = sum(1 for column in columns if fieldname == column['name'])
2947
if column_count:
@@ -92,3 +110,62 @@ def _set_dialect_quote_attrs(dialect, data):
92110
dialect.quotechar = '"'
93111
if re.search('"""[[({]\'.+\',', data):
94112
dialect.doublequote = True
113+
114+
115+
def _trim_or_append_data(fp, text, read_size, sniff_size):
    """Recursively read data from a file and return its first row. The file starts with ``text``
    and the file pointer points to the next character immediately after ``text``.

    :param fp: the file pointer from which data is read
    :param text: the current text chunk in which to look for a new line character
    :param read_size: the last read size when ``fp.read()`` was called
    :param sniff_size: the accumulated size of the text to sniff
    :return: the first row of the file as a string
    :raises TabularRendererError: when the accumulated text exceeds ``MAX_FILE_SIZE`` before a
        new line character is found
    """

    # Try to find a new line character in the current text chunk
    index = _find_new_line(text)
    # If found, return the text trimmed at the new line character
    if index != -1:
        return text[:index]

    # Otherwise, account for the size already sniffed and read more text, doubling the read size
    # each round so the number of reads stays logarithmic in the first row's length.
    sniff_size += read_size
    read_size *= 2
    more_text = fp.read(read_size)

    # If the text to sniff now goes over the max file size limit, raise the renderer error since
    # there is no need to sniff when the file is already too large to be rendered.
    if sniff_size + len(more_text) >= MAX_FILE_SIZE:
        raise TabularRendererError(
            'The first row of this file is too large for the sniffer to detect the dialect. '
            'Please download and view it locally.',
            code=400,
            extension='csv'
        )
    # The size is still within the limit: recursively check ``more_text``
    return text + _trim_or_append_data(fp, more_text, read_size, sniff_size)
154+
155+
156+
def _find_new_line(text):
157+
"""Check the text string for any type of new line character.
158+
159+
:param text: the text string to check
160+
:return: the index of the new line character if found. Otherwise, return -1.
161+
"""
162+
163+
index = text.rfind('\r\n')
164+
if index == -1:
165+
index = text.rfind('\n')
166+
if index == -1:
167+
index = text.rfind('\r')
168+
169+
logger.info('>>> ??? ### new line index = {}'.format(index))
170+
171+
return index

mfr/extensions/tabular/settings.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44

55
config = settings.child('TABULAR_EXTENSION_CONFIG')
66

7-
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10Mb
7+
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10MB
88
MAX_SIZE = int(config.get('MAX_SIZE', 10000)) # max number of rows or columns allowed.
99
TABLE_WIDTH = int(config.get('TABLE_WIDTH', 700))
1010
TABLE_HEIGHT = int(config.get('TABLE_HEIGHT', 600))
1111

12+
TABULAR_INIT_SNIFF_SIZE = int(config.get('TABULAR_SNIFF_SIZE', 128))  # initial sniff size in bytes; NOTE: default is 128, not 4KB as previously commented — confirm intended value; config key 'TABULAR_SNIFF_SIZE' also differs from the setting name
13+
1214
LIBS = config.get_object('LIBS', {
1315
'.csv': [libs.csv_stdlib],
1416
'.tsv': [libs.csv_stdlib],

0 commit comments

Comments
 (0)