First of all, I know there are a bunch of similar questions, but none of them works for me...
I'm a newbie at Python, HTML, and web scraping. I'm trying to scrape user information from a website that requires logging in first. As a test case, I'm scraping my own email settings from GitHub. The login page is 'https://github.com/login' and the target page is 'https://github.com/settings/emails'
Here is a list of the methods I've tried:
##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling its session
br.open('https://github.com/login')
for f in br.forms():
    print f
br.select_form(nr=0)
# User credentials
br.form['login'] = 'myusername'
br.form['password'] = 'mypwd'
# Login
br.submit()
br.open('https://github.com/settings/emails').read()
################ Method 2
import urllib, urllib2, cookielib
username = 'myusername'
password = 'mypwd'
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
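# the cookie-aware opener keeps the session cookie from the login response and sends it on later requests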
login_data = urllib.urlencode({'username' : username, 'j_password' : password})
opener.open('https://github.com/login', login_data)
resp = opener.open('https://github.com/settings/emails')
print resp.read()
############# Method 3
import urllib
opener = urllib.FancyURLopener()
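# credentials embedded in the URL are sent as HTTP Basic auth, not through the login form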
print opener.open('http://myusername:mypwd@github.com/settings/emails').read()
########## Method 4
import mechanize
import cookielib
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
br.addheaders = [('User-agent', 'Chrome')]
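# register the credentials for HTTP auth on that URL (no form submission happens here)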
br.add_password('https://github.com/settings/emails', 'myusername', 'mypwd')
br.open('https://github.com/settings/emails')
print br.response().read()
############ Methods 5
from requests import session
payload = {
    'action': 'login',
    'username': 'myusername',
    'password': 'mypwd'
}
with session() as c:
    c.post('https://github.com/login', data=payload)
    request = c.get('https://github.com/settings/emails')
    print request.headers
    print request.text
########### Method 6
import requests
from requests.packages.urllib3 import add_stderr_logger
import sys
from bs4 import BeautifulSoup as bs
add_stderr_logger()
s = requests.Session()
s.headers['User-Agent'] = 'Chrome'
username = 'myusername'
password = 'mypwd'
url = 'https://github.com/login'
# after examining the HTML of the login page, set the dictionary keys below
# to the 'name' attributes of the form's username and password inputs
# (on GitHub they are 'login' and 'password')
login = {'login': username, 'password': password}
login_response = s.post(url, data=login)
for r in login_response.history:
    if r.status_code == 401:  # 401 means authentication failed
        print 'error!'
        sys.exit(1)  # abort
pdf_response = s.get('https://github.com/settings/emails') # Your cookies and headers are automatically included
soup = bs(pdf_response.content)
I've also read some discussions about the differences between HTTP Authentication and cookies. Still, none of these approaches worked.
Any help would be appreciated. Thank you very much.
This works for me:
##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling its session
br.open('https://github.com/login')
# View available forms
for f in br.forms():
    print f
# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)
# User credentials
br.form['login'] = 'mylogin'
br.form['password'] = 'mypass'
# Login
br.submit()
print(br.open('https://github.com/settings/emails').read())
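If you want the addresses themselves rather than the raw HTML, you can feed that response into BeautifulSoup (the bs4 version) and pick them out. A rough sketch follows; the 'li' tag and 'email' class are only guesses, so inspect the source of /settings/emails to find the element that actually wraps each address:
from bs4 import BeautifulSoup
html = br.open('https://github.com/settings/emails').read()
soup = BeautifulSoup(html, 'html.parser')
# 'li' with class 'email' is an assumption -- check the real markup first
for item in soup.findAll('li', attrs={'class': 'email'}):
    print item.get_text(strip=True)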
You were not far off at all!
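If you would rather get one of your requests-based attempts (Methods 5 and 6) working instead, the piece that is most likely missing is the hidden CSRF token GitHub embeds in its login form. The sketch below assumes the form posts to /session and uses fields named 'login', 'password' and 'authenticity_token' (plus a 'commit' button value); check the form's HTML if any of those have changed:
import requests
from bs4 import BeautifulSoup
username = 'myusername'
password = 'mypwd'
with requests.Session() as s:
    s.headers['User-Agent'] = 'Chrome'
    # GET the login page first: this sets the session cookie and
    # exposes the hidden CSRF token that has to be sent back
    login_page = s.get('https://github.com/login')
    soup = BeautifulSoup(login_page.content, 'html.parser')
    token = soup.find('input', attrs={'name': 'authenticity_token'})['value']
    payload = {
        'login': username,
        'password': password,
        'authenticity_token': token,
        'commit': 'Sign in',
    }
    # POST to the form's action URL (assumed to be /session), not to /login
    s.post('https://github.com/session', data=payload)
    print s.get('https://github.com/settings/emails').text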