Chapter 3: Starting to Crawl

3.1 Traversing a Single Domain

Start by retrieving Wikipedia's Eric Idle article and printing every link found on the page:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://en.wikipedia.org/wiki/Eric_Idle')
bsObj = BeautifulSoup(html, 'lxml')
for link in bsObj.findAll('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])
#mw-head
#p-search
/wiki/Wikipedia:Biographies_of_living_persons
/wiki/Wikipedia:Citing_sources
/wiki/Wikipedia:Verifiability
/wiki/Wikipedia:Identifying_reliable_sources
/wiki/Wikipedia:Libel
/wiki/Help:Maintenance_template_removal
/wiki/File:Eric_Idle_with_Guitar.jpg
/wiki/County_Durham
/wiki/England
/wiki/United_Kingdom
/wiki/Vermont
/wiki/Pembroke_College,_Cambridge
/wiki/Monty_Python
/wiki/The_Rutles
/wiki/Spamalot
/wiki/Lyn_Ashley
http://ericidle.com/
/wiki/Monty_Python
/wiki/The_Rutles
/wiki/Spamalot
#Early_life_and_education
#Career
#Pre-Python_career_.281965.E2.80.931969.29
#Monty_Python_.281969.E2.80.931983.2C_2014.29
#Post-Python_career_.281973.E2.80.93present.29
#Personal_life
#Other_credits
#Writing
#Bibliography
#Songwriting
#Tributes
#Filmography
#Film
#Television
#Video_games
#Stage
#References
#External_links
/w/index.php?title=Eric_Idle&action=edit&section=1
/wiki/South_Shields
/wiki/County_Durham
#cite_note-1
/wiki/Health_visitor
#cite_note-Tel170207-2
......
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now())

def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org' + articleUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    return bsObj.find('div', {'id': 'bodyContent'}).findAll(
        'a', href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
    newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)
/wiki/Mads_Mikkelsen
/wiki/2013_in_film
/wiki/Patricia_Blair
/wiki/Darby_Hinton
/wiki/Internet_Movie_Data_Base
/wiki/Letterboxd
/wiki/Beta_tester
/wiki/Computer_worm
/wiki/Integrated_Authority_File
/wiki/Library
/wiki/Origen
/wiki/Docetism
/wiki/International_Standard_Book_Number



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)
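
The walk above keeps requesting pages until it is interrupted by hand (hence the KeyboardInterrupt). A minimal sketch of a bounded variant, assuming the getLinks function defined above is still in scope; the limit of 10 hops is an arbitrary choice for illustration:

links = getLinks('/wiki/Kevin_Bacon')
for hop in range(10):  # stop after at most 10 hops instead of looping forever
    if len(links) == 0:
        break
    newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)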

3.2 Crawling an Entire Site

  • Link deduplication: keep a global set of pages already visited, so each link is printed and crawled only once:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org' + pageUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks('')
/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)
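
Because getLinks calls itself once for every new page it finds, a site of any real size will blow past Python's default recursion limit (roughly 1000 frames) long before the crawl finishes. A sketch of the same crawl written iteratively with an explicit queue, assuming the imports from the listing above; the logic is otherwise unchanged:

from collections import deque

pages = set()

def crawlSite(startUrl=''):
    # Breadth-first crawl using a queue instead of recursion
    queue = deque([startUrl])
    while queue:
        pageUrl = queue.popleft()
        html = urlopen('http://en.wikipedia.org' + pageUrl)
        bsObj = BeautifulSoup(html, 'lxml')
        for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                queue.append(newPage)

crawlSite('')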
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org' + pageUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id='mw-content-text').findAll('p')[0])
        print(bsObj.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing some attributes')

    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print('----------------\n' + newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks('')
Main Page
This page is missing some attributes
----------------
/wiki/Wikipedia
Wikipedia
This page is missing some attributes
----------------
/wiki/Wikipedia:Protection_policy#semi
Wikipedia:Protection policy
This page is missing some attributes
----------------
/wiki/Wikipedia:Requests_for_page_protection
Wikipedia:Requests for page protection
This page is missing some attributes

3.3 Crawling Across the Internet

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

# Retrieve a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # Find all links that begin with '/' or contain the current URL
    for link in bsObj.findAll('a', href=re.compile('^(/|.*' + includeUrl + ')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks

# Retrieve a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Find all links that start with 'http' or 'www' and do not contain the current URL
    for link in bsObj.findAll('a', href=re.compile('^(http|www)((?!' + excludeUrl + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def splitAddress(address):
    addressParts = address.replace('http://', '').split('/')
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, 'lxml')
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links on this page, so follow a random internal link instead
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('The random external link is: ' + externalLink)
    followExternalOnly(externalLink)

followExternalOnly('http://oreilly.com')
The random external link is: https://www.safaribooksonline.com/your-experience/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170201+homepage+safari+platform
The random external link is: http://www.oreilly.com/privacy.html
The random external link is: http://www.oreilly.com/about/
The random external link is: https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170601+nav
The random external link is: http://www.oreilly.com/about/editorial_independence.html
The random external link is: http://twitter.com/oreillymedia
The random external link is: https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170505+homepage+get+started+now



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)
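
Note that splitAddress strips only the 'http://' prefix, so an https:// address (like several of the links printed above) keeps 'https:' as its first part and the domain comparison silently misfires. A sketch of a more tolerant replacement built on the standard library's urlparse; this is an illustrative alternative, not the book's listing:

from urllib.parse import urlparse

def splitAddress(address):
    # urlparse handles http and https uniformly; fall back to plain splitting for scheme-less URLs
    parsed = urlparse(address)
    if parsed.netloc:
        return [parsed.netloc] + [part for part in parsed.path.split('/') if part]
    return [part for part in address.split('/') if part]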
  • Collect every external link found across the site:
# Collect a list of all the external links found on the site
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    internalLinks = getInternalLinks(bsObj, splitAddress(siteUrl)[0])
    externalLinks = getExternalLinks(bsObj, splitAddress(siteUrl)[0])
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            print('About to get link: ' + link)
            allIntLinks.add(link)
            getAllExternalLinks(link)

getAllExternalLinks('http://oreilly.com')
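
External links inevitably include dead or unreachable URLs, and the first one that fails raises an exception from urlopen and kills the entire crawl. A minimal guard around the fetch; safeOpen is a hypothetical helper name introduced here for illustration:

from urllib.error import HTTPError, URLError

def safeOpen(url):
    # Return the opened page, or None if the URL cannot be fetched
    try:
        return urlopen(url)
    except (HTTPError, URLError, ValueError) as e:
        print('Could not fetch ' + url + ': ' + str(e))
        return None

getAllExternalLinks could then call safeOpen instead of urlopen and simply skip any URL for which it returns None.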

3.4 Crawling with Scrapy
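
Scrapy takes over the crawl loop, link deduplication, and request throttling; you only describe a spider. A minimal sketch of a Wikipedia article spider; the spider name, selectors, and start URL below are illustrative assumptions rather than the book's exact listing:

import scrapy

class ArticleSpider(scrapy.Spider):
    # Illustrative spider: visits Wikipedia article pages and yields their titles
    name = 'article'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Kevin_Bacon']

    def parse(self, response):
        # The page's h1 holds the article title
        title = response.xpath('normalize-space(//h1)').get()
        yield {'url': response.url, 'title': title}
        # Follow links to other articles: /wiki/ paths with no colon (no special pages)
        for href in response.css('a::attr(href)').re(r'^/wiki/[^:]*$'):
            yield response.follow(href, callback=self.parse)

Saved as a standalone file, this can be run with scrapy runspider article_spider.py -o articles.json once Scrapy is installed.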
