I am trying to scrape certain open-source files from GitHub, but I'm having an issue with their new format. This is an example link: https://github.com/xavierLowmiller/xcodegen-action/blob/main/action.yml that leads to a YML file. I am trying to download this file through web scraping (it needs to be done automatically) and then save it on my local machine.
What I did previously was to get the link to the raw file from this page and then save the content of the raw page as a YML file.
code to save the yml file:
def saveFile(raw_link, file_name):
    """Download the page at ``raw_link`` and save its text as ``<file_name>.yml``.

    The response body is run through BeautifulSoup and only its text content
    is written, encoded as UTF-8, into the current working directory.

    Args:
        raw_link: URL of the raw file to download.
        file_name: Output file name without the ``.yml`` extension.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(raw_link)
    # Without this check an error page (e.g. a 404 body) would be written
    # to disk and reported as a successful save.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    name = file_name + ".yml"
    # The original snippet had a stray ``else:`` here with no matching ``if``,
    # which is a syntax error; the write is unconditional.
    with open(name, "w", encoding='utf-8') as file:
        file.write(str(soup.getText()))
    print(name + " is saved!")
code to get the raw link:
def get_raw_link(action_file_page_link):
    """Scrape a GitHub file page and return the URL behind its "Raw" button.

    Args:
        action_file_page_link: URL of the GitHub page that displays the file.

    Returns:
        The absolute raw-file URL, or ``None`` when the page has no matching
        anchor or the anchor carries no ``href``.
    """
    from urllib.parse import urljoin

    response = requests.get(action_file_page_link)
    soup = BeautifulSoup(response.text, 'html.parser')
    raw = soup.find('a', class_='js-permalink-replaceable-link Button--secondary Button--small Button')
    # find() returns None when the anchor is absent from the HTML; guard
    # against it instead of failing with AttributeError on ``raw.has_attr``.
    if raw is None or not raw.has_attr('href'):
        return None
    # urljoin avoids the double slash that plain concatenation produces when
    # the href is root-relative, and leaves absolute hrefs untouched.
    return urljoin("https://github.com/", raw["href"])
Then the functions are called as follows:
# Page URL of the file as shown in the GitHub web UI.
link = "https://github.com/xavierLowmiller/xcodegen-action/blob/main/action.yml"
# Resolve the raw-content URL, then download it to example.yml.
raw_link = get_raw_link(link)
# The download function above is defined as ``saveFile``; calling
# ``save_file`` here would raise NameError.
saveFile(raw_link, "example")
This was working until a while ago, but recently the soup from get_raw_link returns the following, so I don't know how to extract the raw link:
{"payload":{"allShortcutsEnabled":false,"fileTree":{"":{"items":[{"name":"node_modules","path":"node_modules","contentType":"directory"},{"name":".gitignore","path":".gitignore","contentType":"file"},{"name":"README.md","path":"README.md","contentType":"file"},{"name":"action.yml","path":"action.yml","contentType":"file"},{"name":"index.js","path":"index.js","contentType":"file"},{"name":"package-lock.json","path":"package-lock.json","contentType":"file"},{"name":"package.json","path":"package.json","contentType":"file"}],"totalCount":7}},"fileTreeProcessingTime":2.21865,"foldersToFetch":[],"reducedMotionEnabled":null,"repo":{"id":257517315,"defaultBranch":"main","name":"xcodegen-action","ownerLogin":"xavierLowmiller","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2020-04-21T07:40:50.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/16212751?v=4","public":true,"private":false,"isOrgOwned":false},"refInfo":{"name":"main","listCacheKey":"v0:1660850753.387054","canEdit":false,"refType":"branch","currentOid":"c4315213072a4ad0930970f5c739b7733cee603d"},"path":"action.yml","currentUser":null,"blob":{"rawLines":["name: 'xcodegen'","description: 'Runs the `xcodegen generate` command'","inputs:"," cache-path:"," description: 'Where the cache file will be loaded from and save to. Defaults to ~/.xcodegen/cache/{SPEC_PATH_HASH}'"," no-env:"," description: 'Disable environment variable expansions'"," only-plists:"," description: 'Generate only plist files'"," project:"," description: 'The path to the directory where the project should be generated. Defaults to the directory the spec is in. The filename is defined in the project spec'"," quiet:"," description: 'Suppress all informational and success output'"," spec:"," description: 'The path to the project spec file. Defaults to project.yml'"," use-cache:"," description: 'Use a cache for the xcodegen spec. 
This will prevent unnecessarily generating the project if nothing has changed'"," version:"," description: 'The version of xcodegen to be used. Check https://github.com/yonaskolb/XcodeGen/releases for valid options.'"," default: latest","runs:"," using: 'node12'"," main: 'index.js'"],"stylingDirectives":[[{"start":0,"end":4,"cssClass":"pl-ent"},{"start":6,"end":16,"cssClass":"pl-s"},{"start":6,"end":7,"cssClass":"pl-pds"},{"start":15,"end":16,"cssClass":"pl-pds"}],[{"start":0,"end":11,"cssClass":"pl-ent"},{"start":13,"end":51,"cssClass":"pl-s"},{"start":13,"end":14,"cssClass":"pl-pds"},{"start":50,"end":51,"cssClass":"pl-pds"}],[{"start":0,"end":6,"cssClass":"pl-ent"}],[{"start":2,"end":12,"cssClass":"pl-ent"}],[{"start":4,"end":15,"cssClass":"pl-ent"},{"start":17,"end":119,"cssClass":"pl-s"},{"start":17,"end":18,"cssClass":"pl-pds"},{"start":118,"end":119,"cssClass":"pl-pds"}],[{"start":2,"end":8,"cssClass":"pl-ent"}],[{"start":4,"end":15,"cssClass":"pl-ent"},{"start":17,"end":58,"cssClass":"pl-s"},{"start":17,"end":18,"cssClass":"pl-pds"},{"start":57,"end":58,"cssClass":"pl-pds"}],[{"start":2,"end":13,"cssClass":"pl-ent"}],[{"start":4,"end":15,"cssClass":"pl-ent"},{"start":17,"end":44,"cssClass":"pl-s"},{"start":17,"end":18,"cssClass":"pl-pds"},{"start":43,"end":44,"cssClass":"pl-pds"}],[{"start":2,"end":9,"cssClass":"pl-ent"}],[{"start":4,"end":15,"cssClass":"pl-ent"},{"start":17,"end":169,"cssClass":"pl-s"},{"start":17,"end":18,"cssClass":"pl-pds"},{"start":168,"end":169,"cssClass":"pl-pds"}],[{"start":2,"end":7,"cssClass":"pl-ent"}],[{"start":4,"end":15,"cssClass":"pl-ent"},{"start":17,"end":64,"cssClass":"pl-s"},{"start":17,"end":18,"cssClass":"pl-pds"},{"start":63,"end":64,"cssClass":"pl-pds"}],[{"start":2,"end":6,"cssClass":"pl-ent"}],[{"start":4,"end":15,"cssClass":"pl-ent"},{"start":17,"end":77,"cssClass":"pl-s"},{"start":17,"end":18,"cssClass":"pl-pds"},{"start":76,"end":77,"cssClass":"pl-pds"}],[{"start":2,"end":11,"cssClass":"pl-ent"}],[{"start":4,"end"
:15,"cssClass":"pl-ent"},{"start":17,"end":131,"cssClass":"pl-s"},{"start":17,"end":18,"cssClass":"pl-pds"},{"start":130,"end":131,"cssClass":"pl-pds"}],[{"start":2,"end":9,"cssClass":"pl-ent"}],[{"start":4,"end":15,"cssClass":"pl-ent"},{"start":17,"end":126,"cssClass":"pl-s"},{"start":17,"end":18,"cssClass":"pl-pds"},{"start":125,"end":126,"cssClass":"pl-pds"}],[{"start":4,"end":11,"cssClass":"pl-ent"},{"start":13,"end":19,"cssClass":"pl-s"}],[{"start":0,"end":4,"cssClass":"pl-ent"}],[{"start":2,"end":7,"cssClass":"pl-ent"},{"start":9,"end":17,"cssClass":"pl-s"},{"start":9,"end":10,"cssClass":"pl-pds"},{"start":16,"end":17,"cssClass":"pl-pds"}],[{"start":2,"end":6,"cssClass":"pl-ent"},{"start":8,"end":18,"cssClass":"pl-s"},{"start":8,"end":9,"cssClass":"pl-pds"},{"start":17,"end":18,"cssClass":"pl-pds"}]],"csv":null,"csvError":null,"dependabotInfo":{"showConfigurationBanner":false,"configFilePath":null,"networkDependabotPath":"/xavierLowmiller/xcodegen-action/network/updates","dismissConfigurationNoticePath":"/settings/dismiss-notice/dependabot_configuration_notice","configurationNoticeDismissed":null,"repoAlertsPath":"/xavierLowmiller/xcodegen-action/security/dependabot","repoSecurityAndAnalysisPath":"/xavierLowmiller/xcodegen-action/settings/security_analysis","repoOwnerIsOrg":false,"currentUserCanAdminRepo":false},"displayName":"action.yml","displayUrl":"https://github.com/xavierLowmiller/xcodegen-action/blob/main/action.yml?raw=true","headerInfo":{"blobSize":"1 KB","deleteInfo":{"deletePath":null,"deleteTooltip":"You must be signed in to make or propose changes"},"editInfo":{"editTooltip":"You must be signed in to make or propose 
changes"},"ghDesktopPath":"https://desktop.github.com","gitLfsPath":null,"onBranch":true,"shortPath":"8b2c2b2","siteNavLoginPath":"/login?return_to=https%3A%2F%2Fgithub.com%2FxavierLowmiller%2Fxcodegen-action%2Fblob%2Fmain%2Faction.yml","isCSV":false,"isRichtext":false,"toc":null,"lineInfo":{"truncatedLoc":"23","truncatedSloc":"23"},"mode":"file"},"image":false,"isCodeownersFile":null,"isValidLegacyIssueTemplate":false,"issueTemplateHelpUrl":"https://docs.github.com/articles/about-issue-and-pull-request-templates","issueTemplate":null,"discussionTemplate":null,"language":"YAML","large":false,"loggedIn":false,"newDiscussionPath":"/xavierLowmiller/xcodegen-action/discussions/new","newIssuePath":"/xavierLowmiller/xcodegen-action/issues/new","planSupportInfo":{"repoIsFork":null,"repoOwnedByCurrentUser":null,"requestFullPath":"/xavierLowmiller/xcodegen-action/blob/main/action.yml","showFreeOrgGatedFeatureMessage":null,"showPlanSupportBanner":null,"upgradeDataAttributes":null,"upgradePath":null},"publishBannersInfo":{"dismissActionNoticePath":"/settings/dismiss-notice/publish_action_from_dockerfile","dismissStackNoticePath":"/settings/dismiss-notice/publish_stack_from_file","releasePath":"/xavierLowmiller/xcodegen-action/releases/new?marketplace=true","showPublishActionBanner":false,"showPublishStackBanner":false},"renderImageOrRaw":false,"richText":null,"renderedFileInfo":null,"tabSize":8,"topBannersInfo":{"overridingGlobalFundingFile":false,"globalPreferredFundingPath":null,"repoOwner":"xavierLowmiller","repoName":"xcodegen-action","showInvalidCitationWarning":false,"citationHelpUrl":"https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-citation-files","showDependabotConfigurationBanner":false,"actionsOnboardingTip":null},"truncated":false,"viewable":true,"workflowRedirectUrl":null,"symbols":{"timedOut":false,"notAnalyzed":true,"symbols":[]}},"copilotUserAccess":null,"csrf_tokens":{"/xavierLowmiller/xcodegen
-action/branches":{"post":"VDenyVH_nquI0sxAm0XoqPcds5whGFNI20OOv8KEJEb-F0Yvw5Pp58gfj5plUnygbPV_bBrmhBaqyvZegoqn8g"}}},"title":"xcodegen-action/action.yml at main · xavierLowmiller/xcodegen-action","locale":"en"}
I'd appreciate any advice you can give me :D
GitHub probably updated its app structure, so your code cannot find the raw button link. I recommend using the URL for the raw file instead. This will hopefully be less likely to change on you as well.
Raw File URL Format:
https://raw.githubusercontent.com/{username}/{repository}/{branch}/{path_to_file}
Standard URL Format:
https://github.com/{username}/{repository}/blob/{branch}/{path_to_file}
Knowing this, we can convert the standard URL to the raw file URL like this:
def get_raw_link(github_url):
    """Convert a GitHub "blob" page URL into its raw-content URL.

    ``https://github.com/{user}/{repo}/blob/{branch}/{path}`` becomes
    ``https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}``.

    Args:
        github_url: Standard GitHub URL of a file.

    Returns:
        The URL with the host swapped and the ``/blob/`` segment removed.
    """
    # Replace only the first occurrence of each marker: later occurrences
    # belong to the repository name or file path (e.g. a ``user.github.com``
    # repo or a ``src/blob/`` directory) and must be preserved.
    raw_url = github_url.replace("github.com", "raw.githubusercontent.com", 1)
    raw_url = raw_url.replace("/blob/", "/", 1)
    return raw_url
And we can update saveFile() (named save_file() here, to match the call below) accordingly:
def save_file(raw_link, file_name):
    """Download ``raw_link`` and write its body to ``<file_name>.yml``.

    The file is written as UTF-8 text into the current working directory.

    Args:
        raw_link: URL of the raw file to download.
        file_name: Output file name without the ``.yml`` extension.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(raw_link)
    # Fail loudly on HTTP errors; otherwise an error body such as
    # "404: Not Found" would be saved as the YAML file and reported as saved.
    response.raise_for_status()
    name = file_name + ".yml"
    with open(name, "w", encoding='utf-8') as file:
        file.write(response.text)
    print(name + " is saved!")
Then when you invoke the function, you can do the same as you were doing before:
# GitHub page ("blob") URL of the file to fetch.
link = "https://github.com/xavierLowmiller/xcodegen-action/blob/main/action.yml"
# Translate the page URL into its raw-content counterpart.
raw_link = get_raw_link(github_url=link)
# Download the raw content; the file is written as example.yml.
save_file(raw_link=raw_link, file_name="example")