第十二章 避开采集陷阱

12.2 让网络机器人看起来像人类用户

12.2.1 修改请求头

  • HTTP 请求头是你每次向网络服务器发送请求时传递的一组属性和配置信息。
下面的程序用 requests 设置自定义请求头,然后访问 whatismybrowser.com,打印服务器实际收到的请求头:
import requests
from bs4 import BeautifulSoup

# Send one GET request with browser-like headers and print the header table
# that the echo page at `url` renders back.
session = requests.Session()

# Browser-like User-Agent and Accept values sent instead of the library defaults.
headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}
url = "https://www.whatismybrowser.com/developers/what-http-headers-is-my-browser-sending"

# requests applies no timeout by default; set one so a stalled server cannot
# block the script forever.
req = session.get(url, headers=headers, timeout=10)
# Surface HTTP errors (4xx/5xx) here instead of parsing an error page below.
req.raise_for_status()

bsObj = BeautifulSoup(req.text, 'lxml')
header_table = bsObj.find('table', {'class': 'table-striped'})
if header_table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on .get_text().
    raise RuntimeError("No 'table-striped' table found at " + url)
print(header_table.get_text())
ACCEPT
text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8


ACCEPT_ENCODING
gzip, deflate


CONNECTION
keep-alive


HOST
www.whatismybrowser.com


USER_AGENT
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome
下面的程序用 Selenium 和 PhantomJS 访问 http://pythonscraping.com,并打印浏览器收到的 cookie:
from selenium import webdriver
# Load the site in a headless PhantomJS browser and print the cookies it set.
driver = webdriver.PhantomJS(executable_path='/usr/local/phantomjs/bin/phantomjs')
try:
    driver.get("http://pythonscraping.com")
    driver.implicitly_wait(1)
    print(driver.get_cookies())
finally:
    # Always shut the browser down, even if the page load fails; otherwise
    # the PhantomJS process is left running after the script exits.
    driver.quit()
[{'domain': '.pythonscraping.com', 'expires': '周四, 23 11月 2017 08:38:53 GMT', 'expiry': 1511426333, 'httponly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.1664590346.1511339933'}, {'domain': '.pythonscraping.com', 'expires': '周五, 22 11月 2019 08:38:53 GMT', 'expiry': 1574411933, 'httponly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.1237346090.1511339933'}, {'domain': 'pythonscraping.com', 'httponly': False, 'name': 'has_js', 'path': '/', 'secure': False, 'value': '1'}]
下面的程序把第一个浏览器实例得到的 cookie 保存下来,再逐个添加到第二个浏览器实例中:
from selenium import webdriver
# Capture the cookies from one PhantomJS session and replay them in a second
# session, printing the cookie list of each.
driver = webdriver.PhantomJS(executable_path='/usr/local/phantomjs/bin/phantomjs')
driver2 = None
try:
    driver.get("http://pythonscraping.com")
    driver.implicitly_wait(1)
    print(driver.get_cookies())
    savedCookies = driver.get_cookies()

    driver2 = webdriver.PhantomJS(executable_path='/usr/local/phantomjs/bin/phantomjs')
    # Load the site first, then clear what it set, so the second browser holds
    # only the cookies copied from the first one.
    driver2.get("http://pythonscraping.com")
    driver2.delete_all_cookies()
    for cookie in savedCookies:
        driver2.add_cookie(cookie)

    # Reload so the request is made with the copied cookies in place.
    driver2.get("http://pythonscraping.com")
    # The wait belongs to the second browser (the original called it on `driver`).
    driver2.implicitly_wait(1)
    print(driver2.get_cookies())
finally:
    # Shut down both browsers even on failure so no PhantomJS process leaks.
    try:
        if driver2 is not None:
            driver2.quit()
    finally:
        driver.quit()
[{'domain': '.pythonscraping.com', 'expires': '周四, 23 11月 2017 08:43:29 GMT', 'expiry': 1511426609, 'httponly': False, 'name': '_gid', 'path': '/', 'secure': False, 'value': 'GA1.2.962480326.1511340209'}, {'domain': '.pythonscraping.com', 'expires': '周五, 22 11月 2019 08:43:29 GMT', 'expiry': 1574412209, 'httponly': False, 'name': '_ga', 'path': '/', 'secure': False, 'value': 'GA1.2.1238410045.1511340209'}, {'domain': 'pythonscraping.com', 'httponly': False, 'name': 'has_js', 'path': '/', 'secure': False, 'value': '1'}]
下面的程序用 Selenium 的 is_displayed() 找出页面上不可见的链接和表单字段:
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
# Report every link and input field on the page that is not displayed; the
# script treats those as traps that a scraper should not follow or fill in.
driver = webdriver.PhantomJS(executable_path='/usr/local/phantomjs/bin/phantomjs')
try:
    driver.get("http://pythonscraping.com/pages/itsatrap.html")

    links = driver.find_elements_by_tag_name("a")
    for link in links:
        if not link.is_displayed():
            # format() instead of "+" so a missing href (None) cannot raise
            # TypeError; output is unchanged when the attribute exists.
            print("The link {} is a trap".format(link.get_attribute("href")))

    fields = driver.find_elements_by_tag_name("input")
    for field in fields:
        if not field.is_displayed():
            print("Do not change value of {}".format(field.get_attribute("name")))
finally:
    # Always shut the browser down so the PhantomJS process does not leak.
    driver.quit()
The link http://pythonscraping.com/dontgohere is a trap
Do not change value of phone
Do not change value of email
分享到