|
1 | 1 | import re
|
2 | 2 | import csv
|
| 3 | +import logging |
3 | 4 |
|
4 |
| -from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError |
5 | 5 | from mfr.extensions.tabular import utilities
|
| 6 | +from mfr.extensions.tabular.settings import MAX_FILE_SIZE, TABULAR_INIT_SNIFF_SIZE |
| 7 | +from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError |
| 8 | + |
| 9 | +logger = logging.getLogger(__name__) |
6 | 10 |
|
7 | 11 |
|
8 | 12 | def csv_stdlib(fp):
|
9 | 13 | """Read and convert a csv file to JSON format using the python standard library
|
10 |
| - :param fp: File pointer object |
11 |
| - :return: tuple of table headers and data |
| 14 | +
|
| 15 | + Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able to |
| 16 | + effectively detect the correct dialect of the file. |
| 17 | +
|
| 18 | + :param fp: the file pointer object |
| 19 | + :return: a tuple of table headers and data |
12 | 20 | """
|
13 |
| - data = fp.read(2048) |
| 21 | + |
| 22 | + # Prepare the first row for sniffing |
| 23 | + data = fp.read(TABULAR_INIT_SNIFF_SIZE) |
| 24 | + data = _trim_or_append_data(fp, data, TABULAR_INIT_SNIFF_SIZE, 0) |
| 25 | + |
| 26 | + # Reset the file pointer |
14 | 27 | fp.seek(0)
|
15 | 28 |
|
| 29 | + # Sniff the first row to find a matching format |
16 | 30 | try:
|
17 | 31 | dialect = csv.Sniffer().sniff(data)
|
18 | 32 | except csv.Error:
|
19 | 33 | dialect = csv.excel
|
20 | 34 | else:
|
21 | 35 | _set_dialect_quote_attrs(dialect, data)
|
22 | 36 |
|
| 37 | +    # Explicitly delete data when it is no longer used. |
23 | 38 | del data
|
| 39 | + |
| 40 | + # Create the CSV reader with the detected dialect |
24 | 41 | reader = csv.DictReader(fp, dialect=dialect)
|
| 42 | + |
| 43 | + # Update the reader field names to avoid duplicate column names when performing row extraction |
25 | 44 | columns = []
|
26 |
| - # update the reader field names to avoid duplicate column names when performing row extraction |
27 | 45 | for idx, fieldname in enumerate(reader.fieldnames or []):
|
28 | 46 | column_count = sum(1 for column in columns if fieldname == column['name'])
|
29 | 47 | if column_count:
|
@@ -92,3 +110,62 @@ def _set_dialect_quote_attrs(dialect, data):
|
92 | 110 | dialect.quotechar = '"'
|
93 | 111 | if re.search('"""[[({]\'.+\',', data):
|
94 | 112 | dialect.doublequote = True
|
| 113 | + |
| 114 | + |
def _trim_or_append_data(fp, text, read_size, sniff_size):
    """Recursively read data from a file until a complete first row is available to sniff.

    The accumulated data starts with ``text`` and the file pointer points to the next
    character immediately after ``text``.  The returned string is trimmed at its last
    new line character (see ``_find_new_line``), so the sniffer never sees a partial row.

    :param fp: the file pointer from which data is read
    :param text: the current text chunk to check for a new line character
    :param read_size: the size used for the last ``fp.read()`` call
    :param sniff_size: the accumulated size of the text to sniff
    :return: the accumulated text trimmed at its last new line character
    :raise TabularRendererError: if the text to sniff reaches ``MAX_FILE_SIZE``
    """

    # If the chunk already contains a new line character, trim the text there and stop.
    index = _find_new_line(text)
    if index != -1:
        return text[:index]

    # Otherwise, update `sniff_size` and read more text (doubling the read size each
    # round so the number of recursive calls stays logarithmic in the row length).
    sniff_size += read_size
    read_size *= 2
    more_text = fp.read(read_size)

    # If text to sniff now goes over the max file size limit, raise the renderer error since there
    # is no need to sniff when the file is already too large to be rendered.
    if sniff_size + len(more_text) >= MAX_FILE_SIZE:
        raise TabularRendererError(
            'The first row of this file is too large for the sniffer to detect the dialect. '
            'Please download and view it locally.',
            code=400,
            extension='csv'
        )

    # The size is still within the limit: recursively check `more_text`.
    return text + _trim_or_append_data(fp, more_text, read_size, sniff_size)
| 154 | + |
| 155 | + |
| 156 | +def _find_new_line(text): |
| 157 | + """Check the text string for any type of new line character. |
| 158 | +
|
| 159 | + :param text: the text string to check |
| 160 | + :return: the index of the new line character if found. Otherwise, return -1. |
| 161 | + """ |
| 162 | + |
| 163 | + index = text.rfind('\r\n') |
| 164 | + if index == -1: |
| 165 | + index = text.rfind('\n') |
| 166 | + if index == -1: |
| 167 | + index = text.rfind('\r') |
| 168 | + |
| 169 | + logger.info('>>> ??? ### new line index = {}'.format(index)) |
| 170 | + |
| 171 | + return index |
0 commit comments