jenkins-bot submitted this change.

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
Add get_charset_from_content_type

Used to extract the charset from the content-type response header.

Change-Id: I7216488c9582f6de92034378b1d588e8dbfbc717
---
M pywikibot/comms/http.py
M scripts/reflinks.py
M tests/http_tests.py
3 files changed, 49 insertions(+), 32 deletions(-)
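
A usage sketch for the new helper (a reader's illustration, not part of the change; the expected results follow from the code in the diff below, and the header values are made up):

    from pywikibot.comms.http import get_charset_from_content_type

    # plain and quoted values work; later parameters no longer leak in
    get_charset_from_content_type('text/html; charset=UTF-8')  # -> 'utf-8'
    get_charset_from_content_type(
        'text/html; charset=utf-8; profile="..."')  # -> 'utf-8'

    # vendor spellings are normalized to Python codec names
    get_charset_from_content_type('text/html; charset=x-euc-jp')  # -> 'euc_jp'
    get_charset_from_content_type('text/html; charset=cp-1252')  # -> 'cp1252' (T304830)

    # no charset parameter at all
    get_charset_from_content_type('application/json')  # -> None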

diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index d683dfe..792a44d 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -395,6 +395,31 @@
     return response
 
 
+# Extract the charset from a content-type header
+CHARSET_RE = re.compile(
+    r'charset\s*=\s*(?P<q>[\'"]?)(?P<charset>[^\'",;>/]+)(?P=q)',
+    flags=re.I,
+)
+
+
+def get_charset_from_content_type(content_type: str) -> Optional[str]:
+    """Get charset from the content-type header.
+
+    .. versionadded:: 7.3
+    """
+    m = CHARSET_RE.search(content_type)
+    if not m:
+        return None
+    charset = m.group('charset').strip('"\' ').lower()
+    # Convert to the correct Python encoding name
+    if re.sub(r'[ _\-]', '', charset) == 'xeucjp':
+        charset = 'euc_jp'
+    else:
+        # fix cp encodings (T304830)
+        charset = re.sub(r'\Acp[ _\-](\d{3,4})', r'cp\1', charset)
+    return charset
+
+
 def _get_encoding_from_response_headers(response) -> Optional[str]:
     """Return charset given by the response header."""
     content_type = response.headers.get('content-type')
@@ -402,9 +427,9 @@
     if not content_type:
         return None
 
-    m = re.search('charset=(?P<charset>.*?$)', content_type)
-    if m:
-        header_encoding = m.group('charset')
+    charset = get_charset_from_content_type(content_type)
+    if charset:
+        header_encoding = charset
     elif 'json' in content_type:
         # application/json | application/sparql-results+json
         header_encoding = 'utf-8'
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 67283a7..d8e571c 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -58,12 +58,12 @@
 from functools import partial
 from http import HTTPStatus
 from textwrap import shorten
-from typing import Optional
 
 import pywikibot
 from pywikibot import comms, config, i18n, pagegenerators, textlib
-from pywikibot.backports import Match, removeprefix
+from pywikibot.backports import removeprefix
 from pywikibot.bot import ConfigParserBot, ExistingPageBot, SingleSiteBot
+from pywikibot.comms.http import get_charset_from_content_type
 from pywikibot.exceptions import (
     FatalServerError,
     Server414Error,
@@ -474,9 +474,6 @@
         # Regex to grasp content-type meta HTML tag in HTML source
         self.META_CONTENT = re.compile(
             br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>')
-        # Extract the encoding from a charset property (from content-type !)
-        self.CHARSET = re.compile(
-            r'(?i)charset\s*=\s*(?P<enc>(?P<q>[\'"]?)[^\'",;>/]*(?P=q))')
         # Extract html title from page
         self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
         # Matches content inside <script>/<style>/HTML comments
@@ -549,21 +546,6 @@
             return True
         return super().skip_page(page)
 
-    @staticmethod
-    def charset(enc: Match) -> Optional[str]:
-        """Find an encoding type."""
-        if enc:
-            # Use encoding if found. Else use chardet apparent encoding
-            encoding = enc.group('enc').strip('"\' ').lower()
-            # Convert to python correct encoding names
-            if re.sub(r'[ _\-]', '', encoding) == 'xeucjp':
-                encoding = 'euc_jp'
-            else:
-                # fix cp encodings (T304830)
-                encoding = re.sub(r'\Acp[ _\-](\d{3,4})', r'cp\1', encoding)
-            return encoding
-        return None
-
     def treat(self, page) -> None:
         """Process one page."""
         # Load the page's text from the wiki
@@ -664,14 +646,12 @@
             linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
 
             meta_content = self.META_CONTENT.search(linkedpagetext)
-            s = None
+            encoding = None
             if content_type:
-                # use charset from http header
-                s = self.CHARSET.search(content_type)
+                encoding = get_charset_from_content_type(content_type)
 
             if meta_content:
                 tag = None
-                encoding = self.charset(s)
                 encodings = [encoding] if encoding else []
                 encodings += list(page.site.encodings())
                 for enc in encodings:
@@ -679,14 +659,12 @@
                         tag = meta_content.group().decode(enc)
                         break
 
-                # Prefer the contentType from the HTTP header :
+                # Prefer the content-type from the HTTP header
                 if not content_type and tag:
                     content_type = tag
-                if not s:
-                    # use charset from html
-                    s = self.CHARSET.search(tag)
+                if not encoding:
+                    encoding = get_charset_from_content_type(tag)
 
-            encoding = self.charset(s)
             if encoding:
                 r.encoding = encoding
 
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 384bd91..491eea5 100755
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -403,6 +403,20 @@
         resp.encoding = http._decide_encoding(resp, charset)
         self.assertEqual('latin1', resp.encoding)
 
+    def test_charset_not_last(self):
+        """Test a charset that is not the last part of the header."""
+        charset = None
+        resp = CharsetTestCase._create_response(
+            headers={
+                'content-type': (
+                    'text/html; charset=utf-8; profile='
+                    '"https://www.mediawiki.org/wiki/Specs/HTML/2.4.0"'
+                )
+            },
+            data=CharsetTestCase.UTF8_BYTES)
+        resp.encoding = http._decide_encoding(resp, charset)
+        self.assertEqual('utf-8', resp.encoding)
+
     def test_server_charset(self):
         """Test decoding with server explicit charset."""
         charset = None

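For reference, the new test exercises the bug being fixed: the old pattern was anchored at the end of the line, so trailing parameters leaked into the encoding name. An illustrative comparison (the old behavior is inferred from the removed regex in this change):

    import re

    header = ('text/html; charset=utf-8; profile='
              '"https://www.mediawiki.org/wiki/Specs/HTML/2.4.0"')

    # old pattern: captures everything after 'charset=' up to end of line
    re.search('charset=(?P<charset>.*?$)', header).group('charset')
    # -> 'utf-8; profile="https://www.mediawiki.org/wiki/Specs/HTML/2.4.0"'

    # new helper: stops at the parameter delimiter
    from pywikibot.comms.http import get_charset_from_content_type
    get_charset_from_content_type(header)  # -> 'utf-8'
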

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I7216488c9582f6de92034378b1d588e8dbfbc717
Gerrit-Change-Number: 783432
Gerrit-PatchSet: 6
Gerrit-Owner: JJMC89 <JJMC89.Wikimedia@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged