I am trying to write a Python script that extracts all pages within all sections of a OneNote notebook and transfers the content of each page into Azure Blob Storage. The program works for the first hundred or so pages, then abruptly stops after reaching section 21 and produces the following error message:
HttpResponseError Traceback (most recent call last)
Cell In[39], line 72
70 print(p_title)
71 try:
---> 72 container_client.upload_blob(name=p_title,data='p_content', overwrite=True)
73 except KeyError as ere:
74 print(ere)
File .\tester\Lib\site-packages\azure\core\tracing\decorator.py:78, in distributed_trace.<locals>.decorator.<locals>.wrapper_use_tracer(*args, **kwargs)  (tester is my kernel)
76 span_impl_type = settings.tracing_implementation()
77 if span_impl_type is None:
---> 78 return func(*args, **kwargs)
80 # Merge span is parameter is set, but only if no explicit parent are passed
81 if merge_span and not passed_in_parent:
File .\tester\Lib\site-packages\azure\storage\blob\_container_client.py:1125, in ContainerClient.upload_blob(self, name, data, blob_type, length, metadata, **kwargs)
1123 timeout = kwargs.pop('timeout', None)
1124 encoding = kwargs.pop('encoding', 'UTF-8')
-> 1125 blob.upload_blob(
1126 data,
1127 blob_type=blob_type,
1128 length=length,
1129 metadata=metadata,
1130 timeout=timeout,
...
ErrorCode:InvalidUri
Content: <?xml version="1.0" encoding="utf-8"?>
<Error><Code>InvalidUri</Code><Message>The requested URI does not represent any resource on the server.
RequestId:4fcbf4ad-501e-0059-1d3e-c2ae87000000
Time:2024-06-19T11:46:07.8701879Z</Message></Error>
...
This is the code I wrote:
import time
import requests
from bs4 import BeautifulSoup

# curr_access_token, __site_id, list_sections, container_client and cnt_settings
# are all set up earlier in the notebook

request_rate = 250
# specify headers that will be sent with each Graph API request
_headers = {
    'Authorization': 'Bearer ' + curr_access_token,
    'Content-Type': 'application/json'
}

def get_pages_within_section(site_id, section_id):
    # specify url listing the pages within a specific section
    url_pages_within_section = f"https://graph.microsoft.com/v1.0/sites/{site_id}/onenote/sections/{section_id}/pages"
    response = requests.get(url_pages_within_section, headers=_headers).json()
    return response['value']

def get_page_content(site_id, page_id):
    # specify endpoint (url of a specific page from a specific section)
    url_page = f"https://graph.microsoft.com/v1.0/sites/{site_id}/onenote/pages/{page_id}/content"
    # crude busy-wait throttle before each call to that page
    for i in range(request_rate):
        pass
    page_response = requests.get(url_page, headers=_headers)
    print(page_response)
    # if everything went well
    if page_response.status_code == 200:
        soup = BeautifulSoup(page_response.text, 'html.parser')
    else:
        # otherwise retry in the dumbest, easiest way possible
        time.sleep(300)
        page_response = requests.get(url_page, headers=_headers)
        page_response.raise_for_status()
        if page_response.status_code == 200:
            soup = BeautifulSoup(page_response.text, 'html.parser')
        else:
            # if it failed again, just give up ;/
            page_title = None
            page_content = None
            return page_title, page_content
    # if this is truthy then go on and give us the good stuff
    if soup.title.string:
        page_title = soup.title.string
        if page_title.endswith('.'):
            # keep only the part before the first dot so the title cannot end with one
            page_title = page_title.split('.')[0]
        page_content = page_response.text
    else:
        # just in case something STILL goes wrong return the following
        page_title = None
        page_content = None
    return page_title, page_content
i = 1
# for each section within the notebook
for sec_num, section in enumerate(list_sections['value']):
    # get the pages for that section
    print(sec_num)
    section_info = get_pages_within_section(site_id=__site_id, section_id=section['id'])
    # print the page listing for the first section only
    if i == 1:
        i = 0
        print(section_info)
    # for each page within that section
    for page in section_info:
        # get the page title and its .html content
        p_title, p_content = get_page_content(site_id=__site_id, page_id=page['id'])
        # then upload a new blob into our previously established container
        if p_title:
            # container_client.upload_blob(name=p_title, data=p_content, overwrite=True, content_settings=cnt_settings)
            print(p_title)
            try:
                container_client.upload_blob(name=p_title, data=p_content, overwrite=True)
            except KeyError as ere:
                print(ere)
        else:
            print(f'came across an empty one! {p_title}')
I expect the program to print the response status code and page title of each retrieved page and, obviously, to upload the content into Blob Storage correctly.
ErrorCode: InvalidUri
The requested URI does not represent any resource on the server.
RequestId: 4fcbf4ad-501e-0059-1d3e-c2ae87000000
Time: 2024-06-19T11:46:07.8701879Z
The above error is not in your code itself; it is caused by an invalid path, name, or characters supplied in the blob name parameter.
According to this MS document:
Note: Avoid blob names that end with a dot (.), a forward slash (/), a backslash (\), or a sequence or combination of the two. No path segments should end with a dot (.).
A OneNote page title that ends with a slash, or that contains a path segment ending in a dot, would therefore trigger exactly this response.
In your code, try to modify page_title like below (this needs import re):
page_title = re.sub(r'[^a-zA-Z0-9\-_]', '_', page_title)
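For instance, a hypothetical title such as 'Meeting notes 19.06.2024' would become:
re.sub(r'[^a-zA-Z0-9\-_]', '_', 'Meeting notes 19.06.2024')  # -> 'Meeting_notes_19_06_2024'
Keep in mind that two titles differing only in punctuation will collapse to the same blob name, so with overwrite=True one page can silently replace another.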
If page_title is good, then change p_title like below (this needs import urllib.parse):
if p_title:
    safe_p_title = urllib.parse.quote(p_title, safe='')
    print(safe_p_title)
    container_client.upload_blob(name=safe_p_title, data=p_content, overwrite=True)
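Be aware that urllib.parse.quote(p_title, safe='') percent-encodes reserved characters such as spaces and slashes (so a slash in a title can no longer create an unintended path segment), but it leaves unreserved characters, including dots, untouched; a title that still ends with a dot would keep violating the rule above, so the re.sub approach is the more robust of the two for this particular error.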
Make sure to follow the rules for naming and referencing containers, blobs, and metadata to avoid InvalidUri.
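If you would rather detect bad titles than rewrite them, a minimal validator along the lines of the rules quoted above might look like this (a sketch; is_valid_blob_name is a hypothetical helper that checks only those rules plus the documented 1-1024 character length limit):

def is_valid_blob_name(name: str) -> bool:
    # blob names must be between 1 and 1024 characters long
    if not 1 <= len(name) <= 1024:
        return False
    # avoid names ending with a dot, forward slash, or backslash
    if name.endswith(('.', '/', '\\')):
        return False
    # no path segment should end with a dot
    if any(segment.endswith('.') for segment in name.split('/')):
        return False
    return True

You could then call is_valid_blob_name(p_title) before each upload and log or sanitize the offenders.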