From 988b484dcd6571a076910b0da60303645a398ab6 Mon Sep 17 00:00:00 2001
From: Tim Gu <2013tim.g@gmail.com>
Date: Mon, 23 Sep 2024 14:05:23 -0400
Subject: [PATCH] Fix undecodable byte 0x92 in non-UTF-8 source files.

Some files created on Windows contain the byte 0x92 (the cp1252 right
single quotation mark), which is neither ASCII nor valid UTF-8, so
reading them as UTF-8 raises UnicodeDecodeError. The solution is to fall
back to the 'cp1252' encoding when UTF-8 decoding fails
(https://stackoverflow.com/questions/46000191/utf-8-codec-cant-decode-byte-0x92-in-position-18-invalid-start-byte)

Only the decode step is retried, and only on UnicodeDecodeError. Byte
values that cp1252 leaves undefined are replaced rather than aborting
the scan, and the parser is always given UTF-8 bytes.
---
Note: this hunk was revised by hand during review; the post-image blob
id on the "index" line below was not recomputed.

 src/semantic_code_search/embed.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/semantic_code_search/embed.py b/src/semantic_code_search/embed.py
index d82d030..eb98e19 100644
--- a/src/semantic_code_search/embed.py
+++ b/src/semantic_code_search/embed.py
@@ -65,13 +65,22 @@ def _get_repo_functions(root, supported_file_extensions, relevant_node_types):
     for fp in tqdm([root + '/' + f for f in os.popen('git -C {} ls-files'.format(root)).read().split('\n')]):
         if not os.path.isfile(fp):
             continue
-        with open(fp, 'r') as f:
-            lang = supported_file_extensions.get(fp[fp.rfind('.'):])
-            if lang:
-                parser = get_parser(lang)
-                file_content = f.read()
-                tree = parser.parse(bytes(file_content, 'utf8'))
-                all_nodes = list(_traverse_tree(tree.root_node))
-                functions.extend(_extract_functions(
-                    all_nodes, fp, file_content, relevant_node_types))
+        lang = supported_file_extensions.get(fp[fp.rfind('.'):])
+        if not lang:
+            continue
+        try:
+            with open(fp, 'r', encoding='utf8') as f:
+                file_content = f.read()
+        except UnicodeDecodeError:
+            # Files saved on Windows may contain cp1252 bytes such as 0x92
+            # that are not valid UTF-8. cp1252 leaves a few byte values
+            # undefined, so replace those instead of failing again.
+            with open(fp, 'r', encoding='cp1252', errors='replace') as f:
+                file_content = f.read()
+        parser = get_parser(lang)
+        # Always hand the parser UTF-8, whatever the on-disk encoding was.
+        tree = parser.parse(bytes(file_content, 'utf8'))
+        all_nodes = list(_traverse_tree(tree.root_node))
+        functions.extend(_extract_functions(
+            all_nodes, fp, file_content, relevant_node_types))
     return functions