python python-3.x web-scraping python-requests

Failed to parse all of the results from a webpage; my existing script can only parse one-third of them


I've created a script that issues an HTTP POST request with the appropriate parameters to fetch the town, continent, country, and inner_link from this webpage. The script can parse 69 containers, but there are 162 items in total. How can I fetch the rest?

import requests

link = 'https://wenomad.so/elasticsearch/search'
inner_link = 'https://wenomad.so/city/{}'

payload = {
    "z":"nZ9g0AdFBj7cLRX5v2wSWjjGf2Q5KPpss9DS4wZGh9pvfC4xcJvnTebBg+npAqWaQvdVUFxVD1NZ88siTRUfPo8gB70CGoJG/2MPv9Gu9kC+48KwvV4COpsB3HmER0Mgx0bz2G9pSpw6veTnEUnNR78xonQmhuvL3eztB+ikZaI3OTeuVfRVNetmdX4iDgOkKrM6kLt/2SuRKKwT2aAZHJbdhlTV1I65zj1jD7VBwrm+lJDNh7pZug0/gKCWUDQz4CgmrAdQdnxyJDde2ewzudcsGDimhnWB56bcejoli4LLvevtMB4RUMhmM6FIYn0Tl4sclUD7YLQ8gZQOMmBndDkGctxeq74bpDAwBMOG74qu9gb4WLUFxgB/lWCQ9OnJsfkT0J/kUShhQPoRVr72qUx8f8ldkliIGINoBy9i+lm1RYM3L/NfOJ0kBZ+fbKndVJk2owAZ1kLMupja4iPmpxszQlFGTstpAlF5pTckhL+QYIc6vYbslWqXVs8XrzKs955DHPe1WpWmI714MsJfHhd3XHDsuMy9lfY6mE+cfc0434amFJC5gCgoEhGIQsFQD/kGRaWvqCcMfPYiW/o++nQ017bAKzlg7qb0EfPpy/EMG+u4i7QEU/vvC9mUnVCN0ZzFpxP8HWiTTCF0djuB+UnfUaHKtXciPwwZUTV4o8PtI6v6QdrC4PvtAKSJ9CpIccW+A3SSvOgCgEwOtniCdLxezWaP1Dq3fv9G56HCOvsOGRlQ0RgzNgq/+pCwkvyqFYcs/VtX9NPuaCAAXLi+SFM0xRuI4Sq6nHQr7qs6R2C4gAVHm9bZHfByKZ5x03KJp74IGlGSd1GL9/z9CySVZw==",
    "y":"oht3SrBVqLvR2lXJSwtwWw==",
    "x":"dmpOxF/FB13c+GGFmDW4Y4SPz6jEItrcjegm/WNbqFk="
}

headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-language': 'en-US,en;q=0.9',
    'origin': 'https://wenomad.so',
    'referer': 'https://wenomad.so/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}

res = requests.post(link, json=payload, headers=headers)
print(res.status_code)
for item in res.json()['hits']['hits']:
    print((
        item['_source']['town_text'],
        item['_source']['continent__text__text'],
        item['_source']['country__text__text'],
        inner_link.format(item['_source']['Slug'])
    ))

Solution

  • You need to replicate the requests to the /elasticsearch/search endpoint, which requires three params: x, y, and z. These params are generated by the encode3 function of run.js, which encrypts the request data with AES.

    First install PyCryptodome:

    pip install pycryptodome
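
    If you want to sanity-check the replication of encode3 first, you can run the scheme in reverse on the x/y/z values captured in the question. This is a minimal sketch assuming the same PBKDF2/AES-CBC derivation used in the script below: y decrypts to the timestamp (which completes the key for z) and x decrypts to the random IV.

    from Crypto.Cipher import AES
    from Crypto.Protocol.KDF import PBKDF2
    from Crypto.Hash import MD5
    from Crypto.Util.Padding import unpad
    import base64

    def decode(key, iv, ciphertext_b64, appname='fie'):
        # Inverse of the encode function below: same key/IV derivation, then AES-CBC decrypt
        derived_key = PBKDF2(key, appname.encode(), dkLen=32, count=7, hmac_hash_module=MD5)
        derived_iv = PBKDF2(iv, appname.encode(), dkLen=16, count=7, hmac_hash_module=MD5)
        cipher = AES.new(derived_key, AES.MODE_CBC, iv=derived_iv)
        return unpad(cipher.decrypt(base64.b64decode(ciphertext_b64)), AES.block_size).decode()

    # payload is the x/y/z dict captured in the question
    timestamp_version = decode('fie', 'po9', payload['y'])  # "<timestamp>_<version>"
    iv = decode('fie', 'fl1', payload['x'])                 # random IV used for z
    key = 'fie' + timestamp_version.split('_')[0]
    print(decode(key, iv, payload['z']))                    # the plaintext search request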
    

    Then you can use this script to get all (162) results:

    from Crypto.Cipher import AES
    from Crypto.Protocol.KDF import PBKDF2
    from Crypto.Hash import MD5
    from Crypto.Util.Padding import pad
    import base64
    import json
    import random
    import time
    import requests
    
    
    def encode(key, iv, text, appname):
        # Derive the AES key and IV with PBKDF2-HMAC-MD5 (7 rounds, salted with
        # the app name), mirroring the site's encode3 function in run.js
        derived_key = PBKDF2(key, appname.encode(), dkLen=32, count=7, hmac_hash_module=MD5)
        derived_iv = PBKDF2(iv, appname.encode(), dkLen=16, count=7, hmac_hash_module=MD5)

        # AES-CBC encrypt the padded plaintext and return it base64-encoded
        cipher = AES.new(derived_key, AES.MODE_CBC, iv=derived_iv)
        encrypted_text = cipher.encrypt(pad(text.encode(), AES.block_size))
        return base64.b64encode(encrypted_text).decode()
    
    
    def generate_payload(data):
        v = "1"
        appname = 'fie'
        cur_timestamp = str(int(time.time() * 1000))
        timestamp_version = f'{cur_timestamp}_{v}'
        key = appname + cur_timestamp
        iv = str(random.random())

        # z carries the encrypted search request (keyed by appname + timestamp),
        # y the timestamp/version pair, and x the random IV used for z
        text = json.dumps(data, separators=(',', ':'))
        encoded = {
            'z': encode(key, iv, text, appname),
            'y': encode(appname, "po9", timestamp_version, appname),
            'x': encode(appname, "fl1", iv, appname)
        }

        return encoded
    
    
    def fetch_all_search_results(data):
        headers = {
            'x-requested-with': 'XMLHttpRequest',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'
        }

        results = []
        while True:
            # Re-encrypt the payload on every request (the key is timestamp-based)
            payload = generate_payload(data)
            response = requests.post('https://wenomad.so/elasticsearch/search', headers=headers, json=payload)
            res_json = response.json()

            # Advance the 'from' offset by however many hits this page returned
            hits = res_json.get('hits', {}).get('hits', [])
            results.extend(hits)
            data['from'] += len(hits)

            # Stop when the server signals the last page (or a page comes back empty)
            if res_json.get('at_end') or not hits:
                break

        return results
    
    
    data = {
        "appname": "fie",
        "app_version": "live",
        "type": "custom.town",
        "constraints": [
            {
                "key": "active_boolean",
                "value": True,
                "constraint_type": "equals"
            }
        ],
        "sorts_list": [
            {
                "sort_field": "overall_rating_number",
                "descending": True
            },
            {
                "sort_field": "overall_rating_number",
                "descending": True
            }
        ],
        "from": 0,
        "n": 9999,
        "search_path": "{\"constructor_name\":\"DataSource\",\"args\":[{\"type\":\"json\",\"value\":\"%p3.AAV.%el.cmQus.%el.cmSJO0.%p.%ds\"},{\"type\":\"node\",\"value\":{\"constructor_name\":\"Element\",\"args\":[{\"type\":\"json\",\"value\":\"%p3.AAV.%el.cmQus.%el.cmSJO0\"}]}},{\"type\":\"raw\",\"value\":\"Search\"}]}",
        "situation": "unknown"
    }
    
    results = fetch_all_search_results(data)
    print(f'{len(results) = }')
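
    The hits have the same shape as in your original script, so you can print the same fields from the combined results:

    inner_link = 'https://wenomad.so/city/{}'
    for item in results:
        src = item['_source']
        print((
            src['town_text'],
            src['continent__text__text'],
            src['country__text__text'],
            inner_link.format(src['Slug'])
        ))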