I need to scrape links to the patents from my search results on Espacenet. Since Espacenet is a dynamic website, the simple approach with Beautiful Soup and Requests doesn't work. I tried to use Selenium together with Beautiful Soup, but this didn't work either.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Search: (pa all "imperial" AND pa all "innovations") AND pd within "2014"
url = 'https://worldwide.espacenet.com/patent/search?q=%28pa%20all%20%22imperial%22%20AND%20pa%20all%20%22innovations%22%29%20AND%20pd%20within%20%222014%22'

driver = webdriver.Chrome()
try:
    driver.get(url)
    # Espacenet renders its result list client-side, so page_source grabbed
    # immediately after get() contains no patent links yet.  Wait until the
    # page has produced more than one anchor before parsing.
    WebDriverWait(driver, 15).until(
        lambda d: len(d.find_elements(By.CSS_SELECTOR, 'a[href]')) > 1
    )
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Actually fill `urls` (it was previously declared but never used) and
    # skip anchors that carry no href instead of printing None.
    urls = [a['href'] for a in soup.find_all('a', href=True)]
    for href in urls:
        print(href)
finally:
    # Release the browser even if the wait times out.
    driver.quit()
The output gives me only one link (instead of the 103 expected), and this link is not a patent link. Here it is. Probably the links to the patents are hidden somehow and don't get found using the 'a' tag.
I also tried this approach, described here, however it wasn't helpful either.
import requests

# Search page URL — note this serves HTML for a browser, not JSON.
main_url = "https://worldwide.espacenet.com/patent/search?q=%28pa%20all%20%22imperial%22%20AND%20pa%20all%20%22innovations%22%29%20AND%20pd%20within%20%222014%22"

res = requests.get(main_url)  # reuse main_url instead of duplicating the literal
# Guard before .json(): Cloudflare answers this unauthenticated request with
# an HTML challenge page, which is what produced the opaque JSONDecodeError.
res.raise_for_status()
main_data = res.json()

data = main_data['results']['cluster']
# Iterate the results directly rather than indexing via range(len(...)).
for result in data[0]['result']:
    num = result['patent']['publication_number']
    print(num)
    # BUG FIX: `params` was never defined (NameError).  Build the per-patent
    # link from the publication number alone.
    print("https://worldwide.espacenet.com/patent/search?q=pn%3D" + num)
It gave me JSONDecoder Error:
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Is it even possible to scrape these links?
I need to get a df or a dictionary with Patent title and corresponding link. For example:
First of all your target app is protected by Cloudflare and you have to use all the necessary headers, and cookies to resolve the 403 Forbidden
error. After analyzing the request I found out that Cookie
, User-Agent
, Referer
and Epo-Trace-Id: wie4us-etqwuf-AAA-000001
headers are mandatory to prove to Cloudflare that you are human, so don't forget to add your cookie
and user-agent
in the script. (I included my cookies and user-agent header in the script)
I used this https://worldwide.espacenet.com/3.2/rest-services/search?lang=en%2Cde%2Cfr&q={query}
API endpoint in my script to load all the 103 results, here is the code:
# requests + URL-quoting helper; the InsecureRequestWarning import exists only
# so it can be silenced below (the API call uses verify=False).
import requests
from urllib.parse import quote
# NOTE(review): requests.packages.urllib3 is a deprecated re-export alias of
# the urllib3 package — consider `import urllib3` directly on newer versions.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning) #disable urllib HTTP warning
# Headers captured from a live browser session.  The __cf_bm / cf_clearance
# Cloudflare cookies expire quickly, so replace 'Cookie' with your own fresh
# value and keep 'User-Agent' consistent with the browser that produced it.
headers = {
    'Cookie': '__cf_bm=1_YwWsJT2LrzpGQ1DHVcB06fQKbJk3ZpZRT_GRZVDvY-1728909016-1.0.1.1-PaoOD9MzAIc9pwsgeUJoMzTcmfEr9I8wqQiF4cIuRoLu79v4tY5FIzI0KncO71jEx48cgWOUUVPXLznqYzNCew; _cfuvid=iSuLFpm4o2QijuDGvZWS.0lYYQJ5FfOYsQNf.JCDWrg-1728909017605-0.0.1.1-604800000; cf_chl_rc_m=1; cf_clearance=aQbYNLPcghnkgkGiajNDA4LOvbqsgJ5Yo0B9qzi5n7Q-1728909103-1.2.1.1-LF.LzgQxDh8c53fe4AzGnYEp7kbLV8QDs6y5H.7tXuWM.v6XzJo5Op8GXAJE2.4IBgAfGC_BbmOIZrG57udQsBUaN3RGuiHGDk3RmB19bFq6AMZp5ooi00relCmJSZvxoz2CbgDhe1csvTrwwANQYZVgDwivvCDWR0Rsz168mLCWgFe3DEShZsTjCqnNSvW8hOsxSaZjM_yFg_2o1PFh7qc0JCKNnLvrYE7msKJDh3rzW6FZNr54FVkRVtQLiJJEOe5oUaK3UtDWdHpZeb9K_onFGKlhXnm3ktGHKGMP8gS58nq_OI1AqOafZxF4stGXa05JRJ7sgmQwxrOwl3RBf2WggkPSLz98Kq890cKCKBUjep9OaxGtzP497UfIJn6UO3vnoIeX_MerVMnaSkIB_A',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
    'Epo-Trace-Id': 'wie4us-etqwuf-AAA-000001',
    'Referer': 'https://worldwide.espacenet.com/patent/search?q=%28pa%20all%20%22imperial%22%20AND%20pa%20all%20%22innovations%22%29%20AND%20pd%20within%20%222014%22'
}
def sendRequest(query):
    """Query Espacenet's internal search API and print every family hit.

    Parameters
    ----------
    query : str
        A CQL query, e.g. '(pa all "imperial" AND pa all "innovations") AND
        pd within "2014"'.

    Returns
    -------
    list[dict]
        One dict per result with 'title' and 'url' keys, so the caller can
        build the requested DataFrame / dictionary directly (the original
        implementation only printed and returned None; the print output is
        unchanged).

    Raises
    ------
    requests.HTTPError
        If the API answers with a non-2xx status (e.g. a Cloudflare 403).
    """
    q = quote(query, safe='')
    # Request payload captured from the browser's network tab; size=200 is
    # large enough to return all 103 expected hits in one request.
    data = {"query":{"fields":["publications.ti_*","publications.abs_*","publications.pn_docdb","publications.pd","publications.in_patents","publications.inc_patents","publications.pa_patents","publications.pac_patents","publications.pr_docdb","publications.app_fdate.untouched","oprid_full.untouched","opubd_full.untouched","publications.ipc_ic","publications.ipc_icci","publications.ipc_iccn","publications.ipc_icai","publications.ipc_ican","publications.ci_cpci","publications.ca_cpci","publications.cl_cpci","biblio:pa;in;in_patents;pa_patents;inc_patents;pac_patents;in_orig_patents;pa_orig_patents;in_unstd_patents;pa_unstd_patents;pd;pn_docdb;allKindCodes;"],"from":0,"size":200,"highlighting":[{"field":"publications.ti_en","fragment_words_number":20,"number_of_fragments":3,"hits_only":True},{"field":"publications.abs_en","fragment_words_number":20,"number_of_fragments":3,"hits_only":True},{"field":"publications.ti_de","fragment_words_number":20,"number_of_fragments":3,"hits_only":True},{"field":"publications.abs_de","fragment_words_number":20,"number_of_fragments":3,"hits_only":True},{"field":"publications.ti_fr","fragment_words_number":20,"number_of_fragments":3,"hits_only":True},{"field":"publications.abs_fr","fragment_words_number":20,"number_of_fragments":3,"hits_only":True},{"field":"publications.pn_docdb","fragment_words_number":20,"number_of_fragments":3,"hits_only":True},{"field":"publications.pa_patents","fragment_words_number":20,"number_of_fragments":3,"hits_only":True}]},"filters":{"publications.patent":[{"value":["true"]}]},"widgets":{}}
    url = f"https://worldwide.espacenet.com/3.2/rest-services/search?lang=en%2Cde%2Cfr&q={q}&qlang=cql&"
    # verify=False matches the original script (warnings disabled at import
    # time); timeout keeps a hung connection from blocking forever.
    resp = requests.post(url, headers=headers, json=data, verify=False, timeout=30)
    # Fail loudly on a Cloudflare 403 instead of an opaque JSONDecodeError.
    resp.raise_for_status()
    main_data = resp.json()['hits']

    results = []
    for n, hit in enumerate(main_data, start=1):
        try:
            fields = hit['hits'][0]['fields']
            pub_num = fields['publications.pn_docdb'][0]
            pub_name = fields['publications.ti_en'][0]
            family_num = hit['familyNumber']
        except (KeyError, IndexError):
            # Some hits lack an English title or a docdb number — skip only
            # those, instead of the previous bare `except Exception: pass`
            # which also hid genuine bugs.
            continue
        pub_url = f"https://worldwide.espacenet.com/patent/search/family/{family_num}/publication/{pub_num}?q={q}"
        results.append({'title': pub_name, 'url': pub_url})
        print(f"================\nPublication No: {n}\nPublication Title: {pub_name}\nPublication URL: {pub_url}")
    return results

sendRequest('(pa all "imperial" AND pa all "innovations") AND pd within "2014"') # your query which fetches the data
Let me know if this works on your end!