Skip to content

Draft: Add more encodings #45

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 82 additions & 5 deletions httpbin/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
parse_multi_value_header,
next_stale_after_value,
digest_challenge_response,
normalize_charset,
)
from .utils import weighted_choice
from .structures import CaseInsensitiveDict
Expand Down Expand Up @@ -1407,20 +1408,96 @@ def cache_control(value):
return response


@app.route("/encoding/<charset>")
def encoding(charset):
    """Returns the requested charset and encoding.
    ---
    tags:
      - Response formats
    parameters:
      - in: path
        name: charset
        type: string
        default: 'utf8'
      - in: query
        name: content-type
        type: string
        description: The content type of the response. If unset will use response content type ("accept" header).
        default: ''
    produces:
      - text/html
      - text/plain
      - '*/*'
    responses:
      200:
        description: Content with the requested encoding and content type.
    """
    # No explicit body on this route: delegate with body=None so the
    # generic handler renders the bundled demo document for the charset.
    return encoding_generic(charset, None)


@app.route("/encoding/<charset>/<body>")
def encoding_generic(charset, body):
    """Returns the requested charset and encoding.
    ---
    tags:
      - Response formats
    parameters:
      - in: path
        name: charset
        type: string
        default: 'utf8'
      - in: query
        name: content-type
        type: string
        description: The content type of the response. If unset will use response content type ("accept" header).
        default: ''
      - in: path
        name: body
        type: string
        default: SFRUUEJJTiDjga_mnIDpq5jjgafjgZk=
    produces:
      - text/html
      - text/plain
      - '*/*'
    responses:
      200:
        description: Content with the requested encoding and content type and body.
      400:
        description: The body is not valid URL-safe base64.
    """
    response = make_response()

    # A falsy charset (only possible for direct callers) falls back to the
    # request's Accept-Charset header, then to utf-8.
    charset = charset or request.headers.get("accept-charset", "utf-8")

    # Keep only the first media type of the Accept header, dropping any
    # parameters (";q=...") and lower-priority alternatives.
    accept_header = request.headers.get("accept")
    if accept_header is not None:
        accept_header = accept_header.split(";")[0].split(",")[0]

    # The Content-Type echoes the charset exactly as the caller spelled it;
    # the explicit ?content-type= query argument wins over the Accept header.
    media_type = request.args.get("content-type", accept_header) or "text/html"
    response.content_type = media_type + "; charset=" + charset

    # Unknown labels are served as utf-8 (the header above is left untouched).
    normalized_charset = (normalize_charset(charset) or "utf-8").lower()

    if body:
        # The caller supplied the payload: send the decoded bytes verbatim.
        try:
            response.data = base64.urlsafe_b64decode(body)
        except ValueError:
            # binascii.Error (bad padding / bad characters) subclasses
            # ValueError; report a client error instead of a 500.
            response.status_code = 400
            response.data = b"Invalid base64 body"
        return response

    if normalized_charset in ("utf-8", "utf-16", "utf-32"):
        # All Unicode encodings share the utf-8 demo text, re-encoded below.
        if normalized_charset == "utf-8":
            citation_prefix = "Taken from"
        else:
            citation_prefix = f"Re-encoded to {normalized_charset} from the utf-8 taken from"
        template_data = {
            "title": "Unicode Demo",
            "citation_url": "http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt",
            "body_template": "encoding/utf-8.txt",
            "citation_prefix": citation_prefix,
        }
    else:
        # NOTE(review): no encoding/us-ascii.txt template is visible in this
        # change, so a non-HTML request for US-ASCII presumably fails with a
        # template-not-found error -- confirm.
        template_data = {
            "title": f"{normalized_charset} Demo",
            "citation_url": "",
            "body_template": f"encoding/{normalized_charset}.txt",
            "citation_prefix": "",
        }

    # HTML responses wrap the sample text in the shared page template; every
    # other content type gets the raw sample text.
    if response.content_type.startswith("text/html"):
        template_name = "encoding/demo.html.j2"
    else:
        template_name = template_data["body_template"]
    response.data = render_template(template_name, **template_data).encode(normalized_charset)

    return response


@app.route("/bytes/<int:n>")
Expand Down
25 changes: 25 additions & 0 deletions httpbin/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,3 +483,28 @@ def digest_challenge_response(app, qop, algorithm, stale = False):
auth = WWWAuthenticate("digest", values=values)
response.headers['WWW-Authenticate'] = auth.to_header()
return response


# (pattern, canonical name) pairs. Patterns are written in lower case and are
# matched against the WHOLE lower-cased label (re.fullmatch), so a label that
# merely starts with a known alias is not accepted. Compiled once at import.
_CHARSET_ALIASES = tuple(
    (re.compile(pattern), canonical)
    for pattern, canonical in (
        (r"utf[-_]?8", "UTF-8"),
        (r"utf[-_]?16", "UTF-16"),
        (r"utf[-_]?32", "UTF-32"),
        (
            r"us-ascii|iso-ir-6|ansi_x3\.4-1968|ansi_x3\.4-1986"
            r"|iso_646\.irv:1991|ascii|iso646-us|us|csascii",
            "US-ASCII",
        ),
        (
            r"iso[-_]?8859[-_]?2(?::\d{4})?|iso-ir-101|csisolatin2"
            r"|latin[-_]?2|l2|ibm912|cp912",
            "ISO-8859-2",
        ),
        (
            r"iso[-_]?8859[-_]?3(?::\d{4})?|iso-ir-109|csisolatin3"
            r"|latin[-_]?3|l3|ibm913|cp913",
            "ISO-8859-3",
        ),
        (
            r"iso[-_]?8859[-_]?4(?::\d{4})?|iso-ir-110|csisolatin4"
            r"|latin[-_]?4|l4|ibm914|cp914",
            "ISO-8859-4",
        ),
        (
            # The part number stays optional so that a bare "iso-8859"
            # keeps resolving to part 1, as it did before.
            r"iso[-_]?8859(?:[-_]?1)?(?::\d{4})?|iso-ir-100|csisolatin1"
            r"|latin[-_]?1|l1|ibm819|cp819",
            "ISO-8859-1",
        ),
        (r"big5|csbig5|cn-big5", "Big5"),
        (r"gb2312|csgb2312|chinese", "GB2312"),
        (r"euc[-_]?jp|.*japanese", "EUC-JP"),
        (r"shift[-_]jis|sjis|csshiftjis|ms_kanji|x-sjis", "Shift_JIS"),
        (r"windows[-_]?1252|cp1252|ms-ee", "Windows-1252"),
    )
)


def normalize_charset(charset):
    """Map a charset label or alias to its canonical name.

    Matching is case-insensitive and ignores surrounding whitespace. The
    whole label must match a known alias: ``"utf-8000"`` or
    ``"user-defined"`` are not mistaken for UTF-8 / US-ASCII just because
    they start with one of their aliases.

    :param charset: charset label such as ``"latin1"`` or ``"UTF8"``;
        ``None`` and the empty string are accepted.
    :return: the canonical name (e.g. ``"ISO-8859-1"``), or ``None`` when
        the label is empty or not recognised.
    """
    if not charset:
        return None

    label = charset.strip().lower()
    for pattern, canonical in _CHARSET_ALIASES:
        if pattern.fullmatch(label):
            return canonical

    return None
6 changes: 6 additions & 0 deletions httpbin/templates/encoding/big5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
這是一個中文文本的範例。這段文本是以Big5編碼的。Big5是用來處理中文字符的編碼之一。以下是一段中文的文章。

中文中有很多不同的字符,包括漢字、標點符號等。漢字是從古代流傳下來的字符,每個字符都有其獨特的意義。例如,“中文”這個詞是由兩個漢字組成的:“中”和“文”。

今天的天氣非常好。天空湛藍,微風習習。公園裡有許多人在散步。孩子們在遊樂場玩耍,大人們坐在長椅上聊天。很多人帶著狗在公園裡散步。在自然環境中度過時光,讓人感到非常放鬆。

8 changes: 8 additions & 0 deletions httpbin/templates/encoding/demo.html.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<h1>{{ title }}</h1>

<p>{{ citation_prefix }} <a
href="{{ citation_url }}">{{ citation_url }}</a></p>

<pre>
{% include body_template ignore missing %}
</pre>
6 changes: 6 additions & 0 deletions httpbin/templates/encoding/euc-jp.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
これは日本語のテキストの例です。このテキストは、EUC-JPエンコーディングで符号化されています。EUC-JPは、日本語をコンピュータで扱うための文字エンコーディングの一つです。以下に、日本語の文章を続けます。

日本語には、漢字、ひらがな、カタカナの三種類の文字があります。漢字は中国から伝わった文字で、意味を持つ文字です。ひらがなとカタカナは、日本独自の音節文字で、発音を表します。例えば、「日本語」という単語は、漢字で「日本」と書き、ひらがなで「ご」と書きます。

今日の天気は晴れです。青い空が広がっており、風も心地よいです。公園では、多くの人々が散歩を楽しんでいます。子供たちは遊具で遊び、大人たちはベンチに座って話をしています。犬を連れた人も多く見かけます。自然の中で過ごす時間は、とてもリフレッシュできます。

6 changes: 6 additions & 0 deletions httpbin/templates/encoding/gb2312.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
这是一个中文文本的范例。这段文本是以GB2312编码的。GB2312是用来处理中文字符的编码之一。以下是一段中文的文章。

中文中有很多不同的字符,包括汉字、标点符号等。汉字是从古代流传下来的字符,每个字符都有其独特的意义。例如,“中文”这个词是由两个汉字组成的:“中”和“文”。

今天的天气非常好。天空湛蓝,微风习习。公园里有许多人在散步。孩子们在游乐场玩耍,大人们坐在长椅上聊天。很多人带着狗在公园里散步。在自然环境中度过时光,让人感到非常放松。

15 changes: 15 additions & 0 deletions httpbin/templates/encoding/iso-8859-1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
https://en.wikipedia.org/wiki/ISO/IEC_8859-1

! " # $ % & ' ( ) * + , - . /
0 1 2 3 4 5 6 7 8 9 : ; < = > ?
@ A B C D E F G H I J K L M N O
P Q R S T U V W X Y Z [ \ ] ^ _
` a b c d e f g h i j k l m n o
p q r s t u v w x y z { | } ~
NBSP¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ SHY ® ¯
° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿
À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï
Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß
à á â ã ä å æ ç è é ê ë ì í î ï
ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ

15 changes: 15 additions & 0 deletions httpbin/templates/encoding/iso-8859-2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
https://en.wikipedia.org/wiki/ISO/IEC_8859-2

! " # $ % & ' ( ) * + , - . /
0 1 2 3 4 5 6 7 8 9 : ; < = > ?
@ A B C D E F G H I J K L M N O
P Q R S T U V W X Y Z [ \ ] ^ _
` a b c d e f g h i j k l m n o
p q r s t u v w x y z { | } ~
NBSPĄ ˘ Ł ¤ Ľ Ś § ¨ Š Ş Ť Ź SHY Ž Ż
° ą ˛ ł ´ ľ ś ˇ ¸ š ş ť ź ˝ ž ż
Ŕ Á Â Ă Ä Ĺ Ć Ç Č É Ę Ë Ě Í Î Ď
Đ Ń Ň Ó Ô Ő Ö × Ř Ů Ú Ű Ü Ý Ţ ß
ŕ á â ă ä ĺ ć ç č é ę ë ě í î ď
đ ń ň ó ô ő ö ÷ ř ů ú ű ü ý ţ ˙

15 changes: 15 additions & 0 deletions httpbin/templates/encoding/iso-8859-3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
https://en.wikipedia.org/wiki/ISO/IEC_8859-3

! " # $ % & ' ( ) * + , - . /
0 1 2 3 4 5 6 7 8 9 : ; < = > ?
@ A B C D E F G H I J K L M N O
P Q R S T U V W X Y Z [ \ ] ^ _
` a b c d e f g h i j k l m n o
p q r s t u v w x y z { | } ~
NBSPĦ ˘ £ ¤ Ĥ § ¨ İ Ş Ğ Ĵ SHY Ż
° ħ ² ³ ´ µ ĥ · ¸ ı ş ğ ĵ ½ ż
À Á Â Ä Ċ Ĉ Ç È É Ê Ë Ì Í Î Ï
Ñ Ò Ó Ô Ġ Ö × Ĝ Ù Ú Û Ü Ŭ Ŝ ß
à á â ä ċ ĉ ç è é ê ë ì í î ï
ñ ò ó ô ġ ö ÷ ĝ ù ú û ü ŭ ŝ ˙

15 changes: 15 additions & 0 deletions httpbin/templates/encoding/iso-8859-4.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
https://en.wikipedia.org/wiki/ISO/IEC_8859-4

! " # $ % & ' ( ) * + , - . /
0 1 2 3 4 5 6 7 8 9 : ; < = > ?
@ A B C D E F G H I J K L M N O
P Q R S T U V W X Y Z [ \ ] ^ _
` a b c d e f g h i j k l m n o
p q r s t u v w x y z { | } ~
NBSPĄ ĸ Ŗ ¤ Ĩ Ļ § ¨ Š Ē Ģ Ŧ SHY Ž ¯
° ą ˛ ŗ ´ ĩ ļ ˇ ¸ š ē ģ ŧ Ŋ ž ŋ
Ā Á Â Ã Ä Å Æ Į Č É Ę Ë Ė Í Î Ī
Đ Ņ Ō Ķ Ô Õ Ö × Ø Ų Ú Û Ü Ũ Ū ß
ā á â ã ä å æ į č é ę ë ė í î ī
đ ņ ō ķ ô õ ö ÷ ø ų ú û ü ũ ū ˙

6 changes: 6 additions & 0 deletions httpbin/templates/encoding/shift_jis.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
これは日本語のテキストの例です。このテキストは、Shift_JISエンコーディングで符号化されています。Shift_JISは、日本語をコンピュータで扱うための文字エンコーディングの一つです。以下に、日本語の文章を続けます。

日本語には、漢字、ひらがな、カタカナの三種類の文字があります。漢字は中国から伝わった文字で、意味を持つ文字です。ひらがなとカタカナは、日本独自の音節文字で、発音を表します。例えば、「日本語」という単語は、漢字で「日本」と書き、ひらがなで「ご」と書きます。

今日の天気は晴れです。青い空が広がっており、風も心地よいです。公園では、多くの人々が散歩を楽しんでいます。子供たちは遊具で遊び、大人たちはベンチに座って話をしています。犬を連れた人も多く見かけます。自然の中で過ごす時間は、とてもリフレッシュできます。

Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
<h1>Unicode Demo</h1>

<p>Taken from <a
href="http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt">http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt</a></p>

<pre>

UTF-8 encoded sample plain-text file
‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
Expand Down Expand Up @@ -217,4 +211,3 @@ Box drawing alignment tests: █
╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
▝▀▘▙▄▟

</pre>
16 changes: 16 additions & 0 deletions httpbin/templates/encoding/windows-1252.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
https://en.wikipedia.org/wiki/Windows-1252

! " # $ % & ' ( ) * + , - . /
0 1 2 3 4 5 6 7 8 9 : ; < = > ?
@ A B C D E F G H I J K L M N O
P Q R S T U V W X Y Z [ \ ] ^ _
` a b c d e f g h i j k l m n o
p q r s t u v w x y z { | } ~
€ ‚ ƒ „ … † ‡ ˆ ‰ Š ‹ Œ Ž
NBSP¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ SHY ® ¯
° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿
À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï
Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß
à á â ã ä å æ ç è é ê ë ì í î ï
ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ

111 changes: 111 additions & 0 deletions tests/test_httpbin.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import httpbin
from httpbin.helpers import parse_multi_value_header
from httpbin.helpers import normalize_charset


@contextlib.contextmanager
Expand Down Expand Up @@ -790,3 +791,113 @@ def test_parse_multi_value_header(self):
self.assertEqual(parse_multi_value_header('"xyzzy", "r2d2xxxx", "c3piozzzz"'), [ "xyzzy", "r2d2xxxx", "c3piozzzz" ])
self.assertEqual(parse_multi_value_header('W/"xyzzy", W/"r2d2xxxx", W/"c3piozzzz"'), [ "xyzzy", "r2d2xxxx", "c3piozzzz" ])
self.assertEqual(parse_multi_value_header('*'), [ "*" ])

def test_encoding_endpoint(self):
    """The utf-8 demo is served as HTML, labelled utf-8, with a body."""
    charset_name = 'utf-8'
    url = '/encoding/' + charset_name
    resp = self.app.get(url)

    # Successful request ...
    self.assertEqual(resp.status_code, 200)

    # ... advertising the default HTML media type and the requested charset ...
    content_type = resp.headers['Content-Type']
    self.assertEqual(content_type, 'text/html; charset=utf-8')

    # ... and carrying a non-empty body.
    self.assertTrue(len(resp.text) > 0)

def test_encoding_endpoint_iso(self):
    """An explicit content-type query argument is echoed with the charset."""
    url = '/encoding/ISO-8859-1?content-type=application/json'
    resp = self.app.get(url)

    # Successful request ...
    self.assertEqual(resp.status_code, 200)

    # ... whose Content-Type combines the query argument and path charset ...
    expected_content_type = "application/json; charset=ISO-8859-1"
    self.assertEqual(expected_content_type, resp.headers['Content-Type'])

    # ... and whose raw body is not empty.
    self.assertTrue(len(resp.data) > 0)

def test_swagger_spec(self):
    """The generated spec at /spec.json is served with a non-empty body."""
    spec_response = self.app.get('/spec.json')

    # The spec must be served successfully ...
    self.assertEqual(spec_response.status_code, 200)

    # ... and must actually contain something.
    body_length = len(spec_response.text)
    self.assertTrue(body_length > 0)

def test_normalize_charset(self):
    """Every listed alias resolves to its canonical charset name.

    Cases are grouped by the expected canonical name; the ``None`` group
    holds labels that must not be recognised.
    """
    aliases_by_canonical = {
        "UTF-8": ["UTF8", "utf-8"],
        "UTF-16": ["UTF16", "utf-16"],
        "UTF-32": ["UTF32", "utf-32"],
        "US-ASCII": [
            "iso-ir-6",
            "ANSI_X3.4-1968",
            "ANSI_X3.4-1986",
            "ISO_646.irv:1991",
            "ASCII",
            "ISO646-US",
            "us",
            "csASCII",
        ],
        "ISO-8859-1": [
            "iso-ir-100",
            "csISOLatin1",
            "latin1",
            "l1",
            "IBM819",
            "CP819",
            "ISO-8859-1",
            "iso8859-1",
            "iso88591",
            "latin-1",
            "ISO_8859-1:1987",
            "ISO_8859-1",
        ],
        "ISO-8859-2": [
            "ISO-8859-2",
            "iso-ir-101",
            "csISOLatin2",
            "latin2",
            "l2",
            "IBM912",
            "CP912",
        ],
        "ISO-8859-3": [
            "ISO-8859-3",
            "iso-ir-109",
            "csISOLatin3",
            "latin3",
            "l3",
            "IBM913",
            "CP913",
        ],
        "ISO-8859-4": [
            "ISO-8859-4",
            "iso-ir-110",
            "csISOLatin4",
            "latin4",
            "l4",
            "IBM914",
            "CP914",
        ],
        "Big5": ["big5", "csbig5", "cn-big5"],
        "EUC-JP": [
            "euc-jp",
            "japanese",
            "cseucpkdfmtjapanese",
            "extended_unix_code_packed_format_for_japanese",
        ],
        "Shift_JIS": ["shift_jis", "csshiftjis", "ms_kanji", "x-sjis"],
        "GB2312": ["gb2312", "csGB2312", "chinese"],
        "Windows-1252": ["windows-1252", "windows1252", "cp1252", "ms-ee"],
        # Labels that must be rejected.
        None: ["utf-64", "unknown-charset"],
    }

    for expected, aliases in aliases_by_canonical.items():
        for charset in aliases:
            # subTest keeps going after a failure and names the bad alias.
            with self.subTest(charset=charset):
                self.assertEqual(normalize_charset(charset), expected)