jenkins-bot has submitted this change and it was merged.
Change subject: timestripper: prevent recognizing components too far from each other
......................................................................
timestripper: prevent recognizing components too far from each other
timestripper should not be too flexible about the locations
of the components of a timestamp. The added test demonstrates
a false positive, incorrectly recognized as a timestamp.
This patch places a limit on the distance between neighboring
components of a timestamp. Tentatively, the limit is set to 10 characters.
Change-Id: I8ef86e21f08248d6abb7d1b78252029d2ce0c017
---
M pywikibot/textlib.py
M tests/timestripper_tests.py
2 files changed, 44 insertions(+), 14 deletions(-)
Approvals:
Dalba: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index fc7c4b1..5a0e80e 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -128,6 +128,11 @@
'or': u'୦୧୨୩୪୫୬୭୮୯',
}
+# Used in TimeStripper. When a timestamp-like line has gaps longer
+# than this between year, month, etc. in it, then the line will not be
+# considered to contain a timestamp.
+TIMESTAMP_GAP_LIMIT = 10
+
def to_local_digits(phrase, lang):
"""
@@ -1972,17 +1977,19 @@
return (txt, None)
@staticmethod
- def _valid_date_dict_order(dateDict):
+ def _valid_date_dict_positions(dateDict):
"""Check consistency of reasonable positions for groups."""
- day_pos = dateDict['day']['pos']
- month_pos = dateDict['month']['pos']
- year_pos = dateDict['year']['pos']
- time_pos = dateDict['time']['pos']
- tzinfo_pos = dateDict['tzinfo']['pos']
+ time_pos = dateDict['time']['start']
+ tzinfo_pos = dateDict['tzinfo']['start']
+ date_pos = sorted(
+ (dateDict['day'], dateDict['month'], dateDict['year']),
+ key=lambda x: x['start'])
+ min_pos, max_pos = date_pos[0]['start'], date_pos[-1]['start']
+ max_gap = max(x[1]['start'] - x[0]['end']
+ for x in zip(date_pos, date_pos[1:]))
- date_pos = sorted((day_pos, month_pos, year_pos))
- min_pos, max_pos = date_pos[0], date_pos[-1]
-
+ if max_gap > TIMESTAMP_GAP_LIMIT:
+ return False
if tzinfo_pos < min_pos or tzinfo_pos < time_pos:
return False
if min_pos < tzinfo_pos < max_pos:
@@ -2023,15 +2030,16 @@
line, match_obj = self._last_match_and_replace(line, pat)
if match_obj:
for group, value in match_obj.groupdict().items():
- pos = match_obj.start(group)
- # Store also match pos in line, for later order check.
- matchDict = {group: {'value': value, 'pos': pos}}
- dateDict.update(matchDict)
+ start, end = (match_obj.start(group), match_obj.end(group))
+ # The positions are stored for later validation
+ dateDict[group] = {
+ 'value': value, 'start': start, 'end': end
+ }
# all fields matched -> date valid
# groups are in a reasonable order.
if (all(g in dateDict for g in self.groups) and
- self._valid_date_dict_order(dateDict)):
+ self._valid_date_dict_positions(dateDict)):
# remove 'time' key, now split in hour/minute and not needed
# by datetime.
del dateDict['time']
diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py
index 90eca23..1fdd938 100644
--- a/tests/timestripper_tests.py
+++ b/tests/timestripper_tests.py
@@ -142,6 +142,28 @@
)
+class TestTimeStripperNumberAndDate(TestTimeStripperCase):
+
+ """Test cases for lines with (non-year) numbers and timestamps."""
+
+ family = 'wikipedia'
+ code = 'en'
+
+ def test_four_digit_is_not_year_with_no_timestamp(self):
+ """A 4-digit number should not be mistaken as year (w/o timestamp)."""
+ self.assertIsNone(
+ self.ts.timestripper(
+ '2000 people will meet on 16 December at 22:00 (UTC).'))
+
+ def test_four_digit_is_not_year_with_timestamp(self):
+ """A 4-digit number should not be mistaken as year (w/ timestamp)."""
+ self.assertEqual(
+ self.ts.timestripper(
+ '2000 people will attend. --12:12, 14 December 2015 (UTC)'),
+ datetime.datetime(
+ 2015, 12, 14, 12, 12, tzinfo=tzoneFixedOffset(0, 'UTC')))
+
+
class TestTimeStripperLanguage(TestCase):
"""Test cases for English language."""
--
To view, visit
https://gerrit.wikimedia.org/r/321862
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8ef86e21f08248d6abb7d1b78252029d2ce0c017
Gerrit-PatchSet: 10
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Whym <whym(a)whym.org>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>