From 66529448231d72c58b4a1591dc9b8b83b372868a Mon Sep 17 00:00:00 2001 From: John Steel Date: Tue, 18 Jun 2024 13:04:09 -0400 Subject: [PATCH] Add more encodings --- httpbin/core.py | 87 +++++++++++++- httpbin/helpers.py | 25 ++++ httpbin/templates/encoding/big5.txt | 6 + httpbin/templates/encoding/demo.html.j2 | 8 ++ httpbin/templates/encoding/euc-jp.txt | 6 + httpbin/templates/encoding/gb2312.txt | 6 + httpbin/templates/encoding/iso-8859-1.txt | 15 +++ httpbin/templates/encoding/iso-8859-2.txt | 15 +++ httpbin/templates/encoding/iso-8859-3.txt | 15 +++ httpbin/templates/encoding/iso-8859-4.txt | 15 +++ httpbin/templates/encoding/shift_jis.txt | 6 + .../{UTF-8-demo.txt => encoding/utf-8.txt} | 7 -- httpbin/templates/encoding/windows-1252.txt | 16 +++ tests/test_httpbin.py | 111 ++++++++++++++++++ 14 files changed, 326 insertions(+), 12 deletions(-) create mode 100644 httpbin/templates/encoding/big5.txt create mode 100644 httpbin/templates/encoding/demo.html.j2 create mode 100644 httpbin/templates/encoding/euc-jp.txt create mode 100644 httpbin/templates/encoding/gb2312.txt create mode 100644 httpbin/templates/encoding/iso-8859-1.txt create mode 100644 httpbin/templates/encoding/iso-8859-2.txt create mode 100644 httpbin/templates/encoding/iso-8859-3.txt create mode 100644 httpbin/templates/encoding/iso-8859-4.txt create mode 100644 httpbin/templates/encoding/shift_jis.txt rename httpbin/templates/{UTF-8-demo.txt => encoding/utf-8.txt} (98%) create mode 100644 httpbin/templates/encoding/windows-1252.txt diff --git a/httpbin/core.py b/httpbin/core.py index e23cc86b..82c37b1a 100644 --- a/httpbin/core.py +++ b/httpbin/core.py @@ -51,6 +51,7 @@ parse_multi_value_header, next_stale_after_value, digest_challenge_response, + normalize_charset, ) from .utils import weighted_choice from .structures import CaseInsensitiveDict @@ -1407,20 +1408,96 @@ def cache_control(value): return response -@app.route("/encoding/utf8") -def encoding(): - """Returns a UTF-8 encoded body. 
+@app.route("/encoding/<charset>") +def encoding(charset): + """Returns a demo document encoded in the requested charset. --- tags: - Response formats + parameters: + - in: path + name: charset + type: string + default: 'utf8' + - in: query + name: content-type + type: string + description: The content type of the response. If unset, the first media type of the request "Accept" header is used, falling back to text/html. + default: '' + produces: + - text/html + - text/plain + - '*/*' + responses: + 200: + description: Content with the requested encoding and content type. + """ + return encoding_generic(charset, None) + + +@app.route("/encoding/<charset>/<body>") +def encoding_generic(charset, body): + """Returns the given base64url body, or a demo document, labelled with the requested charset. + --- + tags: + - Response formats + parameters: + - in: path + name: charset + type: string + default: 'utf8' + - in: query + name: content-type + type: string + description: The content type of the response. If unset, the first media type of the request "Accept" header is used, falling back to text/html. + default: '' + - in: path + name: body + type: string + default: SFRUUEJJTiDjga_mnIDpq5jjgafjgZk= produces: - text/html + - text/plain + - '*/*' responses: 200: - description: Encoded UTF-8 content.
""" + response = make_response() + + charset = charset or request.headers.get("accept-charset", "utf-8") + accept_header = request.headers.get("accept") + if accept_header is not None: + accept_header = accept_header.split(";")[0].split(",")[0] + response.content_type = (request.args.get("content-type", accept_header) or "text/html") + "; charset=" + charset + normalized_charset = (normalize_charset(charset) or "utf-8").lower() - return render_template("UTF-8-demo.txt") + if body: + response.data = base64.urlsafe_b64decode(body) + return response + elif normalized_charset in ["utf-8", "utf-16", "utf-32"]: + template_data = { + "title": "Unicode Demo", + "citation_url": "http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt", + "body_template": "encoding/utf-8.txt", + "citation_prefix": ("Taken from" if normalized_charset == "utf-8" + else f"Re-encoded to {normalized_charset} from the utf-8 taken from") + } + else: + template_data = { + "title": f"{normalized_charset} Demo", + "citation_url": "", + "body_template": f"encoding/{normalized_charset}.txt", + "citation_prefix": "" + } + + if response.content_type.startswith("text/html"): + template_name = "encoding/demo.html.j2" + else: + template_name = template_data["body_template"] + response.data = render_template(template_name, **template_data).encode(normalized_charset) + + return response @app.route("/bytes/") diff --git a/httpbin/helpers.py b/httpbin/helpers.py index 836c8026..60ba17da 100644 --- a/httpbin/helpers.py +++ b/httpbin/helpers.py @@ -483,3 +483,28 @@ def digest_challenge_response(app, qop, algorithm, stale = False): auth = WWWAuthenticate("digest", values=values) response.headers['WWW-Authenticate'] = auth.to_header() return response + + +def normalize_charset(charset): + charset = charset.lower() + charset_aliases = { + "utf[-_]?8": "UTF-8", + "utf[-_]?16": "UTF-16", + "utf[-_]?32": "UTF-32", + "iso-ir-6|ansi_x3.4-1968|ansi_x3.4-1986|iso_646.irv:1991|ascii|iso646-us|us|csascii": "US-ASCII", + 
"iso[-_]?8859[-_]?2|iso-ir-101|csisolatin2|latin[-_]?2|l2|ibm912|cp912": "ISO-8859-2", + "iso[-_]?8859[-_]?3|iso-ir-109|csisolatin3|latin[-_]?3|l3|ibm913|cp913": "ISO-8859-3", + "iso[-_]?8859[-_]?4|iso-ir-110|csisolatin4|latin[-_]?4|l4|ibm914|cp914": "ISO-8859-4", + "iso[-_]?8859[-_]?1?|iso-ir-100|csisolatin1|latin[-_]?1|l1|ibm819|cp819": "ISO-8859-1", + "big5|csbig5|cn-big5": "Big5", + "gb2312|csgb2312|chinese": "GB2312", + "euc-jp|.*japanese": "EUC-JP", + "shift_jis|csshiftjis|ms_kanji|x-sjis": "Shift_JIS", + "windows-1252|windows1252|cp1252|ms-ee": "Windows-1252", + } + + for pattern, normalized in charset_aliases.items(): + if re.match(pattern, charset): + return normalized + + return None diff --git a/httpbin/templates/encoding/big5.txt b/httpbin/templates/encoding/big5.txt new file mode 100644 index 00000000..ed58cacc --- /dev/null +++ b/httpbin/templates/encoding/big5.txt @@ -0,0 +1,6 @@ +這是一個中文文本的範例。這段文本是以Big5編碼的。Big5是用來處理中文字符的編碼之一。以下是一段中文的文章。 + +中文中有很多不同的字符,包括漢字、標點符號等。漢字是從古代流傳下來的字符,每個字符都有其獨特的意義。例如,“中文”這個詞是由兩個漢字組成的:“中”和“文”。 + +今天的天氣非常好。天空湛藍,微風習習。公園裡有許多人在散步。孩子們在遊樂場玩耍,大人們坐在長椅上聊天。很多人帶著狗在公園裡散步。在自然環境中度過時光,讓人感到非常放鬆。 + diff --git a/httpbin/templates/encoding/demo.html.j2 b/httpbin/templates/encoding/demo.html.j2 new file mode 100644 index 00000000..d85a656a --- /dev/null +++ b/httpbin/templates/encoding/demo.html.j2 @@ -0,0 +1,8 @@ +

{{ title }}

+ +

{{ citation_prefix }} {{ citation_url }}

+ +
+{% include body_template ignore missing %}
+
diff --git a/httpbin/templates/encoding/euc-jp.txt b/httpbin/templates/encoding/euc-jp.txt new file mode 100644 index 00000000..41ca7832 --- /dev/null +++ b/httpbin/templates/encoding/euc-jp.txt @@ -0,0 +1,6 @@ +これは日本語のテキストの例です。このテキストは、EUC-JPエンコーディングで符号化されています。EUC-JPは、日本語をコンピュータで扱うための文字エンコーディングの一つです。以下に、日本語の文章を続けます。 + +日本語には、漢字、ひらがな、カタカナの三種類の文字があります。漢字は中国から伝わった文字で、意味を持つ文字です。ひらがなとカタカナは、日本独自の音節文字で、発音を表します。例えば、「日本語」という単語は、漢字で「日本」と書き、ひらがなで「ご」と書きます。 + +今日の天気は晴れです。青い空が広がっており、風も心地よいです。公園では、多くの人々が散歩を楽しんでいます。子供たちは遊具で遊び、大人たちはベンチに座って話をしています。犬を連れた人も多く見かけます。自然の中で過ごす時間は、とてもリフレッシュできます。 + diff --git a/httpbin/templates/encoding/gb2312.txt b/httpbin/templates/encoding/gb2312.txt new file mode 100644 index 00000000..ab3285a3 --- /dev/null +++ b/httpbin/templates/encoding/gb2312.txt @@ -0,0 +1,6 @@ +这是一个中文文本的范例。这段文本是以GB2312编码的。GB2312是用来处理中文字符的编码之一。以下是一段中文的文章。 + +中文中有很多不同的字符,包括汉字、标点符号等。汉字是从古代流传下来的字符,每个字符都有其独特的意义。例如,“中文”这个词是由两个汉字组成的:“中”和“文”。 + +今天的天气非常好。天空湛蓝,微风习习。公园里有许多人在散步。孩子们在游乐场玩耍,大人们坐在长椅上聊天。很多人带着狗在公园里散步。在自然环境中度过时光,让人感到非常放松。 + diff --git a/httpbin/templates/encoding/iso-8859-1.txt b/httpbin/templates/encoding/iso-8859-1.txt new file mode 100644 index 00000000..af71d026 --- /dev/null +++ b/httpbin/templates/encoding/iso-8859-1.txt @@ -0,0 +1,15 @@ +https://en.wikipedia.org/wiki/ISO/IEC_8859-1 + + ! " # $ % & ' ( ) * + , - . / +0 1 2 3 4 5 6 7 8 9 : ; < = > ? +@ A B C D E F G H I J K L M N O +P Q R S T U V W X Y Z [ \ ] ^ _ +` a b c d e f g h i j k l m n o +p q r s t u v w x y z { | } ~ +NBSP¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ SHY ® ¯ +° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿ +À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï +Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß +à á â ã ä å æ ç è é ê ë ì í î ï +ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ + diff --git a/httpbin/templates/encoding/iso-8859-2.txt b/httpbin/templates/encoding/iso-8859-2.txt new file mode 100644 index 00000000..7dee3cc0 --- /dev/null +++ b/httpbin/templates/encoding/iso-8859-2.txt @@ -0,0 +1,15 @@ +https://en.wikipedia.org/wiki/ISO/IEC_8859-2 + + ! " # $ % & ' ( ) * + , - . 
/ +0 1 2 3 4 5 6 7 8 9 : ; < = > ? +@ A B C D E F G H I J K L M N O +P Q R S T U V W X Y Z [ \ ] ^ _ +` a b c d e f g h i j k l m n o +p q r s t u v w x y z { | } ~ +NBSPĄ ˘ Ł ¤ Ľ Ś § ¨ Š Ş Ť Ź SHY Ž Ż +° ą ˛ ł ´ ľ ś ˇ ¸ š ş ť ź ˝ ž ż +Ŕ Á Â Ă Ä Ĺ Ć Ç Č É Ę Ë Ě Í Î Ď +Đ Ń Ň Ó Ô Ő Ö × Ř Ů Ú Ű Ü Ý Ţ ß +ŕ á â ă ä ĺ ć ç č é ę ë ě í î ď +đ ń ň ó ô ő ö ÷ ř ů ú ű ü ý ţ ˙ + diff --git a/httpbin/templates/encoding/iso-8859-3.txt b/httpbin/templates/encoding/iso-8859-3.txt new file mode 100644 index 00000000..3463d820 --- /dev/null +++ b/httpbin/templates/encoding/iso-8859-3.txt @@ -0,0 +1,15 @@ +https://en.wikipedia.org/wiki/ISO/IEC_8859-3 + + ! " # $ % & ' ( ) * + , - . / +0 1 2 3 4 5 6 7 8 9 : ; < = > ? +@ A B C D E F G H I J K L M N O +P Q R S T U V W X Y Z [ \ ] ^ _ +` a b c d e f g h i j k l m n o +p q r s t u v w x y z { | } ~ +NBSPĦ ˘ £ ¤ Ĥ § ¨ İ Ş Ğ Ĵ SHY Ż +° ħ ² ³ ´ µ ĥ · ¸ ı ş ğ ĵ ½ ż +À Á Â Ä Ċ Ĉ Ç È É Ê Ë Ì Í Î Ï + Ñ Ò Ó Ô Ġ Ö × Ĝ Ù Ú Û Ü Ŭ Ŝ ß +à á â ä ċ ĉ ç è é ê ë ì í î ï + ñ ò ó ô ġ ö ÷ ĝ ù ú û ü ŭ ŝ ˙ + diff --git a/httpbin/templates/encoding/iso-8859-4.txt b/httpbin/templates/encoding/iso-8859-4.txt new file mode 100644 index 00000000..b4f1633e --- /dev/null +++ b/httpbin/templates/encoding/iso-8859-4.txt @@ -0,0 +1,15 @@ +https://en.wikipedia.org/wiki/ISO/IEC_8859-4 + + ! " # $ % & ' ( ) * + , - . / +0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
+@ A B C D E F G H I J K L M N O +P Q R S T U V W X Y Z [ \ ] ^ _ +` a b c d e f g h i j k l m n o +p q r s t u v w x y z { | } ~ +NBSPĄ ĸ Ŗ ¤ Ĩ Ļ § ¨ Š Ē Ģ Ŧ SHY Ž ¯ +° ą ˛ ŗ ´ ĩ ļ ˇ ¸ š ē ģ ŧ Ŋ ž ŋ +Ā Á Â Ã Ä Å Æ Į Č É Ę Ë Ė Í Î Ī +Đ Ņ Ō Ķ Ô Õ Ö × Ø Ų Ú Û Ü Ũ Ū ß +ā á â ã ä å æ į č é ę ë ė í î ī +đ ņ ō ķ ô õ ö ÷ ø ų ú û ü ũ ū ˙ + diff --git a/httpbin/templates/encoding/shift_jis.txt b/httpbin/templates/encoding/shift_jis.txt new file mode 100644 index 00000000..b746ec63 --- /dev/null +++ b/httpbin/templates/encoding/shift_jis.txt @@ -0,0 +1,6 @@ +これは日本語のテキストの例です。このテキストは、Shift_JISエンコーディングで符号化されています。Shift_JISは、日本語をコンピュータで扱うための文字エンコーディングの一つです。以下に、日本語の文章を続けます。 + +日本語には、漢字、ひらがな、カタカナの三種類の文字があります。漢字は中国から伝わった文字で、意味を持つ文字です。ひらがなとカタカナは、日本独自の音節文字で、発音を表します。例えば、「日本語」という単語は、漢字で「日本」と書き、ひらがなで「ご」と書きます。 + +今日の天気は晴れです。青い空が広がっており、風も心地よいです。公園では、多くの人々が散歩を楽しんでいます。子供たちは遊具で遊び、大人たちはベンチに座って話をしています。犬を連れた人も多く見かけます。自然の中で過ごす時間は、とてもリフレッシュできます。 + diff --git a/httpbin/templates/UTF-8-demo.txt b/httpbin/templates/encoding/utf-8.txt similarity index 98% rename from httpbin/templates/UTF-8-demo.txt rename to httpbin/templates/encoding/utf-8.txt index 726dd626..111d7b61 100644 --- a/httpbin/templates/UTF-8-demo.txt +++ b/httpbin/templates/encoding/utf-8.txt @@ -1,9 +1,3 @@ -

Unicode Demo

- -

Taken from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt

- -
 
 UTF-8 encoded sample plain-text file
 ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
@@ -217,4 +211,3 @@ Box drawing alignment tests:                                          █
   ╚══╩══╝  └──┴──┘  ╰──┴──╯  ╰──┴──╯  ┗━━┻━━┛  ▗▄▖▛▀▜   └╌╌┘ ╎ ┗╍╍┛ ┋  ▁▂▃▄▅▆▇█
                                                ▝▀▘▙▄▟
 
-
diff --git a/httpbin/templates/encoding/windows-1252.txt b/httpbin/templates/encoding/windows-1252.txt new file mode 100644 index 00000000..b76b43f6 --- /dev/null +++ b/httpbin/templates/encoding/windows-1252.txt @@ -0,0 +1,16 @@ +https://en.wikipedia.org/wiki/Windows-1252 + + ! " # $ % & ' ( ) * + , - . / +0 1 2 3 4 5 6 7 8 9 : ; < = > ? +@ A B C D E F G H I J K L M N O +P Q R S T U V W X Y Z [ \ ] ^ _ +` a b c d e f g h i j k l m n o +p q r s t u v w x y z { | } ~ +€ ‚ ƒ „ … † ‡ ˆ ‰ Š ‹ Œ Ž +NBSP¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ SHY ® ¯ +° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿ +À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï +Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß +à á â ã ä å æ ç è é ê ë ì í î ï +ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ + diff --git a/tests/test_httpbin.py b/tests/test_httpbin.py index 6b751245..de2f4728 100755 --- a/tests/test_httpbin.py +++ b/tests/test_httpbin.py @@ -11,6 +11,7 @@ import httpbin from httpbin.helpers import parse_multi_value_header +from httpbin.helpers import normalize_charset @contextlib.contextmanager @@ -790,3 +791,113 @@ def test_parse_multi_value_header(self): self.assertEqual(parse_multi_value_header('"xyzzy", "r2d2xxxx", "c3piozzzz"'), [ "xyzzy", "r2d2xxxx", "c3piozzzz" ]) self.assertEqual(parse_multi_value_header('W/"xyzzy", W/"r2d2xxxx", W/"c3piozzzz"'), [ "xyzzy", "r2d2xxxx", "c3piozzzz" ]) self.assertEqual(parse_multi_value_header('*'), [ "*" ]) + + def test_encoding_endpoint(self): + codec = 'utf-8' + response = self.app.get(f'/encoding/{codec}') + + # Check that the request was successful. + self.assertEqual(response.status_code, 200) + + # Check that the response headers indicate the correct content type. + self.assertEqual(response.headers['Content-Type'], 'text/html; charset=utf-8') + + # Check that the response body is not empty. + self.assertTrue(len(response.text) > 0) + + def test_encoding_endpoint_iso(self): + response = self.app.get('/encoding/ISO-8859-1?content-type=application/json') + + # Check that the request was successful. 
+ self.assertEqual(response.status_code, 200) + + # Check that the response headers indicate the correct content type. + self.assertEqual("application/json; charset=ISO-8859-1", response.headers['Content-Type']) + + # Check that the response body is not empty. + self.assertTrue(len(response.data) > 0) + + def test_swagger_spec(self): + response = self.app.get('/spec.json') + + # Check that the request was successful. + self.assertEqual(response.status_code, 200) + + # Check that the response body is not empty. + self.assertTrue(len(response.text) > 0) + + def test_normalize_charset(self): + test_cases = [ + ("UTF8", "UTF-8"), + ("utf-8", "UTF-8"), + ("UTF16", "UTF-16"), + ("utf-16", "UTF-16"), + ("UTF32", "UTF-32"), + ("utf-32", "UTF-32"), + ("utf-64", None), + ("iso-ir-100", "ISO-8859-1"), + ("csISOLatin1", "ISO-8859-1"), + ("latin1", "ISO-8859-1"), + ("l1", "ISO-8859-1"), + ("IBM819", "ISO-8859-1"), + ("CP819", "ISO-8859-1"), + ("iso-ir-6", "US-ASCII"), + ("ANSI_X3.4-1968", "US-ASCII"), + ("ANSI_X3.4-1986", "US-ASCII"), + ("ISO_646.irv:1991", "US-ASCII"), + ("ASCII", "US-ASCII"), + ("ISO646-US", "US-ASCII"), + ("us", "US-ASCII"), + ("csASCII", "US-ASCII"), + ("ISO-8859-1", "ISO-8859-1"), + ("iso8859-1", "ISO-8859-1"), + ("iso88591", "ISO-8859-1"), + ("latin1", "ISO-8859-1"), + ("latin-1", "ISO-8859-1"), + ("ISO_8859-1:1987", "ISO-8859-1"), + ("ISO_8859-1", "ISO-8859-1"), + ("ISO-8859-2", "ISO-8859-2"), + ("iso-ir-101", "ISO-8859-2"), + ("csISOLatin2", "ISO-8859-2"), + ("latin2", "ISO-8859-2"), + ("l2", "ISO-8859-2"), + ("IBM912", "ISO-8859-2"), + ("CP912", "ISO-8859-2"), + ("ISO-8859-3", "ISO-8859-3"), + ("iso-ir-109", "ISO-8859-3"), + ("csISOLatin3", "ISO-8859-3"), + ("latin3", "ISO-8859-3"), + ("l3", "ISO-8859-3"), + ("IBM913", "ISO-8859-3"), + ("CP913", "ISO-8859-3"), + ("ISO-8859-4", "ISO-8859-4"), + ("iso-ir-110", "ISO-8859-4"), + ("csISOLatin4", "ISO-8859-4"), + ("latin4", "ISO-8859-4"), + ("l4", "ISO-8859-4"), + ("IBM914", "ISO-8859-4"), + ("CP914", 
"ISO-8859-4"), + ("big5", "Big5"), + ("csbig5", "Big5"), + ("cn-big5", "Big5"), + ("euc-jp", "EUC-JP"), + ("japanese", "EUC-JP"), + ("cseucpkdfmtjapanese", "EUC-JP"), + ("extended_unix_code_packed_format_for_japanese", "EUC-JP"), + ("shift_jis", "Shift_JIS"), + ("csshiftjis", "Shift_JIS"), + ("ms_kanji", "Shift_JIS"), + ("x-sjis", "Shift_JIS"), + ("gb2312", "GB2312"), + ("csGB2312", "GB2312"), + ("chinese", "GB2312"), + ("windows-1252", "Windows-1252"), + ("windows1252", "Windows-1252"), + ("cp1252", "Windows-1252"), + ("ms-ee", "Windows-1252"), + ("unknown-charset", None), + ] + + for charset, expected in test_cases: + with self.subTest(charset=charset): + self.assertEqual(normalize_charset(charset), expected)