Currently here's my code:
from bs4 import BeautifulSoup as bs4
from urllib.request import urlopen
# Fetch a TikTok profile page and print its parsed HTML source.
user = "khaby.lame"

# Use the response as a context manager so the HTTP connection is closed
# as soon as the body has been read, instead of leaking it.
with urlopen("https://tiktok.com/@" + user) as response:
    page_html = response.read()

# Parse the raw response bytes into a navigable document tree.
soup = bs4(page_html, "html.parser")
print(soup)
This code returns the entire page source, which is good. Now that I have the page source (the text in the variable "soup"), I want to extract the "UserModule" JSON data from it, and only that. It's a bit hard to explain, but I hope it's clear enough to understand. One of the things I want to obtain from "UserModule" is the User ID.
TikTok's `UserModule` data is in the JSON contained in the innerHTML of `script#SIGI_STATE`.
To get this data, you can use `soup.select` to get the `script#SIGI_STATE` element and `.encode_contents()` to get the innerHTML of that element, which is a JSON document containing the `UserModule` data. Then retrieve the fields you need from that JSON.
from bs4 import BeautifulSoup as bs4
from urllib.request import urlopen
import json
# Fetch a TikTok profile page, extract the JSON state embedded in the
# <script id="SIGI_STATE"> element, and print its "UserModule" section
# followed by the user's ID.
user = "khaby.lame"

# Use the response as a context manager so the HTTP connection is closed
# as soon as the body has been read, instead of leaking it.
with urlopen("https://tiktok.com/@" + user) as response:
    page_html = response.read()

soup = bs4(page_html, "html.parser")

# select() returns a list; an empty list means the page did not contain the
# state script, so fail with a clear message instead of a bare IndexError.
state_scripts = soup.select('script#SIGI_STATE')
if not state_scripts:
    raise RuntimeError(
        "script#SIGI_STATE not found in the page for user " + repr(user)
    )

# encode_contents() yields the element's inner HTML as bytes; decode it to
# text before parsing it as JSON.
all_data = json.loads(state_scripts[0].encode_contents().decode('utf8'))
user_module = all_data['UserModule']

# Named user_id rather than id so the built-in id() is not shadowed.
user_id = user_module['users'][user]['id']

print(json.dumps(user_module, indent=4))
print(user_id)
The output will be:
{
"users": {
"khaby.lame": {
"id": "127905465618821121",
"shortId": "",
"uniqueId": "khaby.lame",
"nickname": "Khabane lame",
"avatarLarger": "https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/793e825b068a0ef5444e86e55812965a~c5_1080x1080.jpeg?x-expires=1682535600&x-signature=Q53PyPjeWtXAYXvtRT3kJjhrLMk%3D",
"avatarMedium": "https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/793e825b068a0ef5444e86e55812965a~c5_720x720.jpeg?x-expires=1682535600&x-signature=xG2MLx%2FQP%2FsE8pnQBtPngdY%2F6KI%3D",
"avatarThumb": "https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/793e825b068a0ef5444e86e55812965a~c5_100x100.jpeg?x-expires=1682535600&x-signature=Di%2FC22QjiIBm0Ors%2FrnkNWFEGgg%3D",
"signature": "Se vuoi ridere sei nel posto giusto\ud83d\ude0e \nIf u wanna laugh u r in the right place\ud83d\ude0e",
"createTime": 0,
"verified": true,
"secUid": "MS4wLjABAAAAwAg0rSzO65WQfz4RzQgGv2Xdv108BgPXhRrrmNVIHQZ9PO8-flwwRtEppYTS0OjA",
"ftc": false,
"relation": 0,
"openFavorite": false,
"commentSetting": 0,
"commerceUserInfo": {
"commerceUser": false
},
"duetSetting": 0,
"stitchSetting": 0,
"privateAccount": false,
"secret": false,
"isADVirtual": false,
"roomId": "",
"uniqueIdModifyTime": 0,
"ttSeller": false,
"region": "IT",
"downloadSetting": 0,
"profileTab": {
"showMusicTab": false,
"showQuestionTab": false,
"showPlayListTab": false
},
"followingVisibility": 1,
"recommendReason": "",
"nowInvitationCardUrl": "",
"nickNameModifyTime": 0,
"isEmbedBanned": false,
"canExpPlaylist": true,
"profileEmbedPermission": 1,
"extraInfo": {
"statusCode": 0
}
}
},
"stats": {
"khaby.lame": {
"followerCount": 156900000,
"followingCount": 78,
"heart": 2300000000,
"heartCount": -2033648397,
"videoCount": 1085,
"diggCount": 0,
"needFix": true
}
}
}
127905465618821121