jenkins-bot has submitted this change and it was merged.
Change subject: site.py: yield preloaded pages in the same order as requested
......................................................................
site.py: yield preloaded pages in the same order as requested
Change-Id: I3d1400b27fd61fe14b13dbe6138e310cdbe3048c
---
M pywikibot/site.py
M tests/pagegenerators_tests.py
M tests/site_tests.py
3 files changed, 89 insertions(+), 26 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/site.py b/pywikibot/site.py
index 2800514..a43aa0f 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -19,6 +19,7 @@
import datetime
import functools
import hashlib
+import heapq
import itertools
import json
import mimetypes
@@ -3091,38 +3092,55 @@
langlinks=False, pageprops=False):
"""Return a generator to a list of preloaded pages.
- Note that [at least in current implementation] pages may be iterated
- in a different order than in the underlying pagelist.
+ Pages are iterated in the same order as in the underlying pagelist.
+ In case of duplicates in a groupsize batch, return the first entry.
@param pagelist: an iterable that returns Page objects
@param groupsize: how many Pages to query at a time
@type groupsize: int
- @param templates: preload list of templates in the pages
- @param langlinks: preload list of language links found in the pages
+ @param templates: preload pages (typically templates) transcluded in
+ the provided pages
+ @type templates: bool
+ @param langlinks: preload all language links from the provided pages
+ to other languages
+ @type langlinks: bool
+ @param pageprops: preload various properties defined in the page content
+ @type pageprops: bool
"""
+ props = 'revisions|info|categoryinfo'
+ if templates:
+ props += '|templates'
+ if langlinks:
+ props += '|langlinks'
+ if pageprops:
+ props += '|pageprops'
+
+ rvprop = ['ids', 'flags', 'timestamp', 'user', 'comment', 'content']
+
for sublist in itergroup(pagelist, groupsize):
+ # Do not use p.pageid property as it will force page loading.
pageids = [str(p._pageid) for p in sublist
if hasattr(p, "_pageid") and p._pageid > 0]
- cache = dict((p.title(withSection=False), p) for p in sublist)
+ cache = {}
+ # In case of duplicates, return the first entry.
+ for priority, page in enumerate(sublist):
+ cache.setdefault(page.title(withSection=False),
+ (priority, page))
- props = "revisions|info|categoryinfo"
- if templates:
- props += '|templates'
- if langlinks:
- props += '|langlinks'
- if pageprops:
- props += '|pageprops'
+ prio_queue = []
+ next_prio = 0
rvgen = api.PropertyGenerator(props, site=self)
rvgen.set_maximum_items(-1) # suppress use of "rvlimit" parameter
if len(pageids) == len(sublist):
# only use pageids if all pages have them
- rvgen.request["pageids"] = "|".join(pageids)
+ rvgen.request['pageids'] = set(pageids)
else:
- rvgen.request["titles"] = "|".join(list(cache.keys()))
- rvgen.request[u"rvprop"] = u"ids|flags|timestamp|user|comment|content"
+ rvgen.request['titles'] = list(cache.keys())
+ rvgen.request['rvprop'] = rvprop
pywikibot.output(u"Retrieving %s pages from %s."
% (len(cache), self))
+
for pagedata in rvgen:
pywikibot.debug(u"Preloading %s" % pagedata, _logger)
try:
@@ -3148,8 +3166,20 @@
pywikibot.debug(u"pageids=%s" % pageids, _logger)
pywikibot.debug(u"titles=%s" % list(cache.keys()),
_logger)
continue
- page = cache[pagedata['title']]
+ priority, page = cache[pagedata['title']]
api.update_page(page, pagedata, rvgen.props)
+ priority, page = heapq.heappushpop(prio_queue, (priority, page))
+ # Smallest priority matches expected one; yield.
+ if priority == next_prio:
+ yield page
+ next_prio += 1
+ else:
+ # Push back onto the heap.
+ heapq.heappush(prio_queue, (priority, page))
+
+ # Empty the heap.
+ while prio_queue:
+ priority, page = heapq.heappop(prio_queue)
yield page
def validate_tokens(self, types):
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index b5b8f57..5148716 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -481,30 +481,46 @@
def test_basic(self):
"""Test PreloadingGenerator with a list of pages."""
mainpage = self.get_mainpage()
- links = list(self.site.pagelinks(mainpage, total=10))
+ links = [page for page in self.site.pagelinks(mainpage, total=20)
+ if page.exists()]
count = 0
for page in PreloadingGenerator(links, groupsize=20):
self.assertIsInstance(page, pywikibot.Page)
self.assertIsInstance(page.exists(), bool)
- if page.exists():
- self.assertEqual(len(page._revisions), 1)
- self.assertIsNotNone(page._revisions[page._revid].text)
- self.assertFalse(hasattr(page, '_pageprops'))
+ self.assertEqual(len(page._revisions), 1)
+ self.assertIsNotNone(page._revisions[page._revid].text)
+ self.assertFalse(hasattr(page, '_pageprops'))
count += 1
self.assertEqual(len(links), count)
def test_low_step(self):
"""Test PreloadingGenerator with a list of pages."""
mainpage = self.get_mainpage()
- links = list(self.site.pagelinks(mainpage, total=20))
+ links = [page for page in self.site.pagelinks(mainpage, total=20)
+ if page.exists()]
count = 0
for page in PreloadingGenerator(links, groupsize=10):
self.assertIsInstance(page, pywikibot.Page)
self.assertIsInstance(page.exists(), bool)
- if page.exists():
- self.assertEqual(len(page._revisions), 1)
- self.assertIsNotNone(page._revisions[page._revid].text)
- self.assertFalse(hasattr(page, '_pageprops'))
+ self.assertEqual(len(page._revisions), 1)
+ self.assertIsNotNone(page._revisions[page._revid].text)
+ self.assertFalse(hasattr(page, '_pageprops'))
+ count += 1
+ self.assertEqual(len(links), count)
+
+ def test_order(self):
+ """Test that the outcome follows the same order as the input."""
+ mainpage = self.get_mainpage()
+ links = [page for page in self.site.pagelinks(mainpage, total=20)
+ if page.exists()]
+ count = 0
+ for page in PreloadingGenerator(links, groupsize=10):
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertIsInstance(page.exists(), bool)
+ self.assertEqual(len(page._revisions), 1)
+ self.assertIsNotNone(page._revisions[page._revid].text)
+ self.assertFalse(hasattr(page, '_pageprops'))
+ self.assertEqual(page, links[count])
count += 1
self.assertEqual(len(links), count)
diff --git a/tests/site_tests.py b/tests/site_tests.py
index 677e32d..22b59de 100644
--- a/tests/site_tests.py
+++ b/tests/site_tests.py
@@ -2463,6 +2463,23 @@
"""Test site.preloadpages()."""
+ def test_order(self):
+ """Test that the outcome follows the same order as the input."""
+ mainpage = self.get_mainpage()
+ links = [page for page in self.site.pagelinks(mainpage, total=20)
+ if page.exists()]
+ pages = list(self.site.preloadpages(links, groupsize=5))
+ self.assertEqual(pages, links)
+
+ def test_duplicates(self):
+ """Test that duplicate pages are yielded only once, preserving input order."""
+ mainpage = self.get_mainpage()
+ links = [page for page in self.site.pagelinks(mainpage, total=20)
+ if page.exists()]
+ dupl_links = links + links[::-1]
+ pages = list(self.site.preloadpages(dupl_links, groupsize=40))
+ self.assertEqual(pages, links)
+
def test_pageids(self):
"""Test basic preloading with pageids."""
mysite = self.get_site()
--
To view, visit
https://gerrit.wikimedia.org/r/285047
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I3d1400b27fd61fe14b13dbe6138e310cdbe3048c
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>