jenkins-bot submitted this change.

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
Add get_charset_from_content_type

Used to extract the charset from the content-type response header.

Change-Id: I7216488c9582f6de92034378b1d588e8dbfbc717
---
M pywikibot/comms/http.py
M scripts/reflinks.py
M tests/http_tests.py
3 files changed, 49 insertions(+), 32 deletions(-)
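
A usage sketch for the new helper (a reader's illustration, not part of the change; the expected results follow from the code in the diff below, and the header values are made up):

    from pywikibot.comms.http import get_charset_from_content_type

    # plain and quoted values work; later parameters no longer leak in
    get_charset_from_content_type('text/html; charset=UTF-8')  # -> 'utf-8'
    get_charset_from_content_type(
        'text/html; charset=utf-8; profile="..."')  # -> 'utf-8'

    # vendor spellings are normalized to Python codec names
    get_charset_from_content_type('text/html; charset=x-euc-jp')  # -> 'euc_jp'
    get_charset_from_content_type('text/html; charset=cp-1252')  # -> 'cp1252' (T304830)

    # no charset parameter at all
    get_charset_from_content_type('application/json')  # -> None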

diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index d683dfe..792a44d 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -395,6 +395,31 @@
     return response
 
 
+# Extract the charset from a content-type header
+CHARSET_RE = re.compile(
+    r'charset\s*=\s*(?P<q>[\'"]?)(?P<charset>[^\'",;>/]+)(?P=q)',
+    flags=re.I,
+)
+
+
+def get_charset_from_content_type(content_type: str) -> Optional[str]:
+    """Get charset from the content-type header.
+
+    .. versionadded:: 7.3
+    """
+    m = CHARSET_RE.search(content_type)
+    if not m:
+        return None
+    charset = m.group('charset').strip('"\' ').lower()
+    # Convert to the correct Python encoding name
+    if re.sub(r'[ _\-]', '', charset) == 'xeucjp':
+        charset = 'euc_jp'
+    else:
+        # fix cp encodings (T304830)
+        charset = re.sub(r'\Acp[ _\-](\d{3,4})', r'cp\1', charset)
+    return charset
+
+
 def _get_encoding_from_response_headers(response) -> Optional[str]:
     """Return charset given by the response header."""
     content_type = response.headers.get('content-type')
@@ -402,9 +427,9 @@
     if not content_type:
         return None
 
-    m = re.search('charset=(?P<charset>.*?$)', content_type)
-    if m:
-        header_encoding = m.group('charset')
+    charset = get_charset_from_content_type(content_type)
+    if charset:
+        header_encoding = charset
     elif 'json' in content_type:
         # application/json | application/sparql-results+json
         header_encoding = 'utf-8'
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 67283a7..d8e571c 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -58,12 +58,12 @@
 from functools import partial
 from http import HTTPStatus
 from textwrap import shorten
-from typing import Optional
 
 import pywikibot
 from pywikibot import comms, config, i18n, pagegenerators, textlib
-from pywikibot.backports import Match, removeprefix
+from pywikibot.backports import removeprefix
 from pywikibot.bot import ConfigParserBot, ExistingPageBot, SingleSiteBot
+from pywikibot.comms.http import get_charset_from_content_type
 from pywikibot.exceptions import (
     FatalServerError,
     Server414Error,
@@ -474,9 +474,6 @@
         # Regex to grasp content-type meta HTML tag in HTML source
         self.META_CONTENT = re.compile(
             br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>')
-        # Extract the encoding from a charset property (from content-type !)
-        self.CHARSET = re.compile(
-            r'(?i)charset\s*=\s*(?P<enc>(?P<q>[\'"]?)[^\'",;>/]*(?P=q))')
         # Extract html title from page
         self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
         # Matches content inside <script>/<style>/HTML comments
@@ -549,21 +546,6 @@
             return True
         return super().skip_page(page)
 
-    @staticmethod
-    def charset(enc: Match) -> Optional[str]:
-        """Find an encoding type."""
-        if enc:
-            # Use encoding if found. Else use chardet apparent encoding
-            encoding = enc.group('enc').strip('"\' ').lower()
-            # Convert to python correct encoding names
-            if re.sub(r'[ _\-]', '', encoding) == 'xeucjp':
-                encoding = 'euc_jp'
-            else:
-                # fix cp encodings (T304830)
-                encoding = re.sub(r'\Acp[ _\-](\d{3,4})', r'cp\1', encoding)
-            return encoding
-        return None
-
     def treat(self, page) -> None:
         """Process one page."""
         # Load the page's text from the wiki
@@ -664,14 +646,12 @@
             linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
 
             meta_content = self.META_CONTENT.search(linkedpagetext)
-            s = None
+            encoding = None
             if content_type:
-                # use charset from http header
-                s = self.CHARSET.search(content_type)
+                encoding = get_charset_from_content_type(content_type)
 
             if meta_content:
                 tag = None
-                encoding = self.charset(s)
                 encodings = [encoding] if encoding else []
                 encodings += list(page.site.encodings())
                 for enc in encodings:
@@ -679,14 +659,12 @@
                         tag = meta_content.group().decode(enc)
                         break
 
-                # Prefer the contentType from the HTTP header :
+                # Prefer the content-type from the HTTP header
                 if not content_type and tag:
                     content_type = tag
-                if not s:
-                    # use charset from html
-                    s = self.CHARSET.search(tag)
+                if not encoding:
+                    encoding = get_charset_from_content_type(tag)
 
-            encoding = self.charset(s)
             if encoding:
                 r.encoding = encoding
 
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 384bd91..491eea5 100755
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -403,6 +403,20 @@
         resp.encoding = http._decide_encoding(resp, charset)
         self.assertEqual('latin1', resp.encoding)
 
+    def test_charset_not_last(self):
+        """Test a charset that is not the last part of the header."""
+        charset = None
+        resp = CharsetTestCase._create_response(
+            headers={
+                'content-type': (
+                    'text/html; charset=utf-8; profile='
+                    '"https://www.mediawiki.org/wiki/Specs/HTML/2.4.0"'
+                )
+            },
+            data=CharsetTestCase.UTF8_BYTES)
+        resp.encoding = http._decide_encoding(resp, charset)
+        self.assertEqual('utf-8', resp.encoding)
+
     def test_server_charset(self):
         """Test decoding with server explicit charset."""
         charset = None

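For reference, the new test exercises the bug being fixed: the old pattern was anchored at the end of the line, so trailing parameters leaked into the encoding name. An illustrative comparison (the old behavior is inferred from the removed regex in this change):

    import re

    header = ('text/html; charset=utf-8; profile='
              '"https://www.mediawiki.org/wiki/Specs/HTML/2.4.0"')

    # old pattern: captures everything after 'charset=' up to end of line
    re.search('charset=(?P<charset>.*?$)', header).group('charset')
    # -> 'utf-8; profile="https://www.mediawiki.org/wiki/Specs/HTML/2.4.0"'

    # new helper: stops at the parameter delimiter
    from pywikibot.comms.http import get_charset_from_content_type
    get_charset_from_content_type(header)  # -> 'utf-8'
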

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I7216488c9582f6de92034378b1d588e8dbfbc717
Gerrit-Change-Number: 783432
Gerrit-PatchSet: 6
Gerrit-Owner: JJMC89 <JJMC89.Wikimedia@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged