python json beautifulsoup urllib tiktok

How do I get JSON from this page source? I've tried countless different methods, and none of them have worked.


Currently here's my code:

from bs4 import BeautifulSoup as bs4
from urllib.request import urlopen

# Username of the TikTok profile whose page source is fetched.
user = "khaby.lame"

# Build the profile URL, then download the raw page body.
profile_url = "https://tiktok.com/@" + user
page_source = urlopen(profile_url).read()

# Parse the downloaded markup; printing the soup dumps the entire page source.
soup = bs4(page_source, "html.parser")
print(soup)

This code returns the entire page source, as you'd probably expect, which is good. But now that I have the page source (the text in the variable "soup"), I want to extract the "UserModule" JSON data from it, and only that. It's kind of hard to explain, but I hope I've made it clear. One of the things I want to obtain from "UserModule" is the user ID.


Solution

  • TikTok's UserModule data is in the JSON contained in the innerHTML of script#SIGI_STATE.

    To get this data, use soup.select to find the script#SIGI_STATE element and .encode_contents() to get its innerHTML, which is a JSON document containing the UserModule data. Then parse that JSON and read the fields you need from it.

    from bs4 import BeautifulSoup as bs4
    from urllib.request import urlopen
    import json
    
    # Username of the TikTok profile to look up; also the key under
    # UserModule['users'] in the embedded JSON.
    user = "khaby.lame"

    # Use the response as a context manager so the connection is closed
    # deterministically instead of whenever the object is garbage-collected.
    with urlopen("https://tiktok.com/@" + user) as response:
        u = response.read()
    soup = bs4(u, "html.parser")

    # The page state is embedded as JSON inside <script id="SIGI_STATE">.
    # Check that the tag exists so a missing tag gives a readable error
    # rather than a bare "list index out of range".
    state_tags = soup.select('script#SIGI_STATE')
    if not state_tags:
        raise LookupError("script#SIGI_STATE was not found in the page source")

    # encode_contents() returns the tag's inner HTML as bytes; decode it and
    # parse the result as JSON.
    allData = json.loads(state_tags[0].encode_contents().decode('utf8'))
    UserModule = allData['UserModule']

    # Named user_id (not id) to avoid shadowing the builtin id().
    user_id = UserModule['users'][user]['id']
    print(json.dumps(UserModule, indent=4))
    print(user_id)
    

    The output will be:

    {
        "users": {
            "khaby.lame": {
                "id": "127905465618821121",
                "shortId": "",
                "uniqueId": "khaby.lame",
                "nickname": "Khabane lame",
                "avatarLarger": "https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/793e825b068a0ef5444e86e55812965a~c5_1080x1080.jpeg?x-expires=1682535600&x-signature=Q53PyPjeWtXAYXvtRT3kJjhrLMk%3D",
                "avatarMedium": "https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/793e825b068a0ef5444e86e55812965a~c5_720x720.jpeg?x-expires=1682535600&x-signature=xG2MLx%2FQP%2FsE8pnQBtPngdY%2F6KI%3D",
                "avatarThumb": "https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/793e825b068a0ef5444e86e55812965a~c5_100x100.jpeg?x-expires=1682535600&x-signature=Di%2FC22QjiIBm0Ors%2FrnkNWFEGgg%3D",
                "signature": "Se vuoi ridere sei nel posto giusto\ud83d\ude0e \nIf u wanna laugh u r in the right place\ud83d\ude0e",
                "createTime": 0,
                "verified": true,
                "secUid": "MS4wLjABAAAAwAg0rSzO65WQfz4RzQgGv2Xdv108BgPXhRrrmNVIHQZ9PO8-flwwRtEppYTS0OjA",
                "ftc": false,
                "relation": 0,
                "openFavorite": false,
                "commentSetting": 0,
                "commerceUserInfo": {
                    "commerceUser": false
                },
                "duetSetting": 0,
                "stitchSetting": 0,
                "privateAccount": false,
                "secret": false,
                "isADVirtual": false,
                "roomId": "",
                "uniqueIdModifyTime": 0,
                "ttSeller": false,
                "region": "IT",
                "downloadSetting": 0,
                "profileTab": {
                    "showMusicTab": false,
                    "showQuestionTab": false,
                    "showPlayListTab": false
                },
                "followingVisibility": 1,
                "recommendReason": "",
                "nowInvitationCardUrl": "",
                "nickNameModifyTime": 0,
                "isEmbedBanned": false,
                "canExpPlaylist": true,
                "profileEmbedPermission": 1,
                "extraInfo": {
                    "statusCode": 0
                }
            }
        },
        "stats": {
            "khaby.lame": {
                "followerCount": 156900000,
                "followingCount": 78,
                "heart": 2300000000,
                "heartCount": -2033648397,
                "videoCount": 1085,
                "diggCount": 0,
                "needFix": true
            }
        }
    }
    127905465618821121