python python-requests wikipedia-api mediawiki-api batching

Understanding Wikipedia titles batching API

With the MediaWiki API we can query the Wikipedia API. One of the fields is titles where one or more titles can be queried at the same time. Batching them together is recommended in high load scenarios to avoid multiple consecutive requests. Multiple titles should be separated by a pipe | character.

I am using the Wikipedia API to find "translations" of categories. Let's say I have an English category "Antiquity", I want to find the corresponding category in a different language. That is possible by querying the API for the prop langlinks.

I find that I can indeed retrieve such one-to-one mappings for an English category if I do not use batching, but if I do use batching, I do not always get all of the results back. To illustrate, I have a list of English categories, and at each iteration I process one more item than before (starting with only one). With batching, it becomes clear that with larger lists (still well within the maximum of 50 titles imposed by the API), the earlier categories are lost and no longer included. When not using batching (batch size = 1), this issue does not occur.

import requests

def get_translated_category(category_titles: str | list[str], target_lang: str, batch_size: int = 50) -> dict[str, str]:
    """Look up the name of English Wikipedia categories in another language.

    Sends one ``action=query&prop=langlinks`` request per batch of titles to
    the English Wikipedia API and collects, for every returned page, the
    language link whose ``lang`` equals ``target_lang``.

    NOTE: only the first response of each batch is read. A ``continue`` key
    in the response is never inspected, so any language links the API holds
    back for a follow-up request are not seen by this function.

    Args:
        category_titles: One category name or a list of names, given without
            the ``Category:`` prefix (the prefix is added here).
        target_lang: Language code compared against each link's ``lang``
            field (the demo script passes ``"la"``).
        batch_size: Number of titles joined with ``|`` into one request.

    Returns:
        A dict mapping each category name (prefix stripped) to its name in
        ``target_lang``. Categories without a matching link in the response
        are absent from the dict.
    """
    
    endpoint = "https://en.wikipedia.org/w/api.php"
    # Accept a single title as a convenience by wrapping it in a list.
    if isinstance(category_titles, str):
        category_titles = [category_titles]

    # The API is queried with fully qualified page titles.
    category_titles = [f"Category:{title}" for title in category_titles]

    translated_categories = {}
    # API is limited to 50 titles per request
    for start_idx in range(0, len(category_titles), batch_size):
        end_idx = start_idx + batch_size
        batch_titles = category_titles[start_idx:end_idx]
        params = {
            "action": "query",
            "format": "json",
            "prop": "langlinks",
            "titles": "|".join(batch_titles),
            "lllimit": "max"
        }

        # Exactly one request per batch; the response is not checked for an
        # HTTP error status or for a "continue" key.
        response = requests.get(endpoint, params=params)
        data = response.json()

        # A response without "query"/"pages" is treated as having no pages.
        pages = data.get("query", {}).get("pages", {})
        for page_data in pages.values():
            # Keeps only the text after the LAST colon, so a category name
            # that itself contains a colon is truncated here.
            title = page_data["title"].split(":")[-1]
            if title in translated_categories:
                # Diagnostic only: processing continues and a later match
                # overwrites the earlier entry.
                print("We already found this category title!")
            langlinks = page_data.get("langlinks", [])
            for link in langlinks:
                if link["lang"] == target_lang:
                    # "*" holds the linked page title; strip its namespace
                    # prefix the same way as above.
                    translated_categories[title] = link["*"].split(":")[-1]

    return translated_categories




if __name__ == "__main__":
    # Category names are given without the "Category:" prefix.
    english_categories: list[str] = [
        "Classical antiquity",
        "Late antiquity",
        "Latin-language literature",
        "Roman Kingdom",
        "Roman Republic",
        "Roman Empire",
        "Byzantine Empire",
        "Latin language",
        "Ancient Greek",
        "Ancient Greece",
        "Ancient Greek literature",
        "Medieval history of Greece",
    ]

    # Every run queries a growing prefix of the list: the first title only,
    # then the first two, and so on up to the full list.
    prefix_lengths = range(1, len(english_categories) + 1)

    print("Batch size 50 (default)")
    for n_titles in prefix_lengths:
        print(get_translated_category(english_categories[:n_titles], "la"))

    print()
    print("Batch size 1 (no batching)")
    for n_titles in prefix_lengths:
        print(get_translated_category(english_categories[:n_titles], "la", batch_size=1))

The output of the code above is:

Batch size 50 (default)
{'Classical antiquity': 'Res classicae'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae', 'Roman Empire': 'Imperium Romanum'}
{'Byzantine Empire': 'Imperium Byzantinum', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae', 'Roman Empire': 'Imperium Romanum'}
{'Byzantine Empire': 'Imperium Byzantinum', 'Late antiquity': 'Antiquitas Posterior', 'Latin language': 'Lingua Latina', 'Roman Empire': 'Imperium Romanum'}
{'Byzantine Empire': 'Imperium Byzantinum', 'Late antiquity': 'Antiquitas Posterior', 'Latin language': 'Lingua Latina', 'Roman Empire': 'Imperium Romanum'}
{'Ancient Greece': 'Graecia antiqua', 'Byzantine Empire': 'Imperium Byzantinum', 'Late antiquity': 'Antiquitas Posterior', 'Roman Empire': 'Imperium Romanum'}
{'Ancient Greece': 'Graecia antiqua', 'Byzantine Empire': 'Imperium Byzantinum', 'Late antiquity': 'Antiquitas Posterior', 'Roman Empire': 'Imperium Romanum'}
{'Ancient Greece': 'Graecia antiqua', 'Byzantine Empire': 'Imperium Byzantinum', 'Late antiquity': 'Antiquitas Posterior', 'Roman Empire': 'Imperium Romanum'}

Batch size 1 (no batching)
{'Classical antiquity': 'Res classicae'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae', 'Roman Empire': 'Imperium Romanum'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae', 'Roman Empire': 'Imperium Romanum', 'Byzantine Empire': 'Imperium Byzantinum'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae', 'Roman Empire': 'Imperium Romanum', 'Byzantine Empire': 'Imperium Byzantinum', 'Latin language': 'Lingua Latina'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae', 'Roman Empire': 'Imperium Romanum', 'Byzantine Empire': 'Imperium Byzantinum', 'Latin language': 'Lingua Latina', 'Ancient Greek': 'Lingua Graeca antiqua'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae', 'Roman Empire': 'Imperium Romanum', 'Byzantine Empire': 'Imperium Byzantinum', 'Latin language': 'Lingua Latina', 'Ancient Greek': 'Lingua Graeca antiqua', 'Ancient Greece': 'Graecia antiqua'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae', 'Roman Empire': 'Imperium Romanum', 'Byzantine Empire': 'Imperium Byzantinum', 'Latin language': 'Lingua Latina', 'Ancient Greek': 'Lingua Graeca antiqua', 'Ancient Greece': 'Graecia antiqua', 'Ancient Greek literature': 'Litterae Graecae antiquae'}
{'Classical antiquity': 'Res classicae', 'Late antiquity': 'Antiquitas Posterior', 'Latin-language literature': 'Litterae Latinae', 'Roman Empire': 'Imperium Romanum', 'Byzantine Empire': 'Imperium Byzantinum', 'Latin language': 'Lingua Latina', 'Ancient Greek': 'Lingua Graeca antiqua', 'Ancient Greece': 'Graecia antiqua', 'Ancient Greek literature': 'Litterae Graecae antiquae'}

It should be immediately clear that there is a difference between batching and not using batching and, more worrisome, that using batching leads to some items being discarded. I thought that perhaps this would be the case if categories are merged in Latin and have the same name, so the API resolves to only returning one of them, but as far as I can tell that is not the case.

How can I ensure that, when batching my requests (titles) together, I get the same results as when firing individual requests at the Wikipedia API?

EDIT: after further investigation it would seem that the API does return results for all categories (the pages variable) but for some reason the corresponding languages (langlinks) are not the same.

Solution

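Lucas Werkmeister solved the issue on Meta Wikimedia: I was missing the paging mechanism, i.e. the ability to "continue" retrieving data. The lllimit cap applies to the response as a whole rather than to each title, so a batched request can hit the cap before the language links of every page have been returned; the response then contains a "continue" object whose values must be sent back in a follow-up request to retrieve the rest.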

Updated working code:

import requests


def get_translated_category(category_titles: str | list[str], target_lang: str, batch_size: int = 50) -> dict:
    """Fetch the translated equivalent of a Wikipedia category using paging."""
    
    endpoint = "https://en.wikipedia.org/w/api.php"
    if isinstance(category_titles, str):
        category_titles = [category_titles]

    # Prepend 'Category:' to each title
    category_titles = [f"Category:{title}" for title in category_titles]

    translated_categories = {}
    # API is limited to 50 titles per request
    for start_idx in range(0, len(category_titles), batch_size):
        batch_titles = category_titles[start_idx: start_idx + batch_size]
        # Setup initial parameters
        params = {
            "action": "query",
            "format": "json",
            "prop": "langlinks",
            "titles": "|".join(batch_titles),
            "lllimit": "max"
        }
        
        while True:
            response = requests.get(endpoint, params=params)
            data = response.json()

            pages = data.get("query", {}).get("pages", {})
            for page in pages.values():
                # Remove the "Category:" prefix for the key
                cat_title = page["title"].split(":", 1)[-1]
                # Only update if we haven't already found a translation
                if cat_title in translated_categories:
                    continue
                for link in page.get("langlinks", []):
                    if link["lang"] == target_lang:
                        # Remove the prefix if present in the translated title as well
                        translated_categories[cat_title] = link["*"].split(":", 1)[-1]
                        break

            # Check if the API response has a continue token for more langlinks.
            if "continue" in data and "llcontinue" in data["continue"]:
                params["llcontinue"] = data["continue"]["llcontinue"]
            else:
                break

    return translated_categories