Chapter 3: Starting to Crawl

3.1 Traversing a Single Domain

Start by retrieving Wikipedia's Eric Idle article and printing every link found on the page:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://en.wikipedia.org/wiki/Eric_Idle')
bsObj = BeautifulSoup(html, 'lxml')
for link in bsObj.findAll('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])
#mw-head
#p-search
/wiki/Wikipedia:Biographies_of_living_persons
/wiki/Wikipedia:Citing_sources
/wiki/Wikipedia:Verifiability
/wiki/Wikipedia:Identifying_reliable_sources
/wiki/Wikipedia:Libel
/wiki/Help:Maintenance_template_removal
/wiki/File:Eric_Idle_with_Guitar.jpg
/wiki/County_Durham
/wiki/England
/wiki/United_Kingdom
/wiki/Vermont
/wiki/Pembroke_College,_Cambridge
/wiki/Monty_Python
/wiki/The_Rutles
/wiki/Spamalot
/wiki/Lyn_Ashley
http://ericidle.com/
/wiki/Monty_Python
/wiki/The_Rutles
/wiki/Spamalot
#Early_life_and_education
#Career
#Pre-Python_career_.281965.E2.80.931969.29
#Monty_Python_.281969.E2.80.931983.2C_2014.29
#Post-Python_career_.281973.E2.80.93present.29
#Personal_life
#Other_credits
#Writing
#Bibliography
#Songwriting
#Tributes
#Filmography
#Film
#Television
#Video_games
#Stage
#References
#External_links
/w/index.php?title=Eric_Idle&action=edit&section=1
/wiki/South_Shields
/wiki/County_Durham
#cite_note-1
/wiki/Health_visitor
#cite_note-Tel170207-2
......
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now())

def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org' + articleUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    return bsObj.find('div', {'id': 'bodyContent'}).findAll(
        'a', href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
while len(links) > 0:
    newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)
/wiki/Mads_Mikkelsen
/wiki/2013_in_film
/wiki/Patricia_Blair
/wiki/Darby_Hinton
/wiki/Internet_Movie_Data_Base
/wiki/Letterboxd
/wiki/Beta_tester
/wiki/Computer_worm
/wiki/Integrated_Authority_File
/wiki/Library
/wiki/Origen
/wiki/Docetism
/wiki/International_Standard_Book_Number



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)
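
The walk above keeps requesting pages until it is interrupted by hand (hence the KeyboardInterrupt). A minimal sketch of a bounded variant, assuming the getLinks function defined above is still in scope; the limit of 10 hops is an arbitrary choice for illustration:

links = getLinks('/wiki/Kevin_Bacon')
for hop in range(10):  # stop after at most 10 hops instead of looping forever
    if len(links) == 0:
        break
    newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)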

3.2 Crawling an Entire Site

  • Link deduplication: keep a global set of pages already visited, so each link is printed and crawled only once:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org' + pageUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks('')
/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)
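
Because getLinks calls itself once for every new page it finds, a site of any real size will blow past Python's default recursion limit (roughly 1000 frames) long before the crawl finishes. A sketch of the same crawl written iteratively with an explicit queue, assuming the imports from the listing above; the logic is otherwise unchanged:

from collections import deque

pages = set()

def crawlSite(startUrl=''):
    # Breadth-first crawl using a queue instead of recursion
    queue = deque([startUrl])
    while queue:
        pageUrl = queue.popleft()
        html = urlopen('http://en.wikipedia.org' + pageUrl)
        bsObj = BeautifulSoup(html, 'lxml')
        for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                queue.append(newPage)

crawlSite('')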
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org' + pageUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id='mw-content-text').findAll('p')[0])
        print(bsObj.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        print('This page is missing some attributes')

    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print('----------------\n' + newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks('')
Main Page
This page is missing some attributes
----------------
/wiki/Wikipedia
Wikipedia
This page is missing some attributes
----------------
/wiki/Wikipedia:Protection_policy#semi
Wikipedia:Protection policy
This page is missing some attributes
----------------
/wiki/Wikipedia:Requests_for_page_protection
Wikipedia:Requests for page protection
This page is missing some attributes

3.3 Crawling Across the Internet

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

# Retrieve a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # Find all links that begin with '/' or contain the current URL
    for link in bsObj.findAll('a', href=re.compile('^(/|.*' + includeUrl + ')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks

# Retrieve a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Find all links that start with 'http' or 'www' and do not contain the current URL
    for link in bsObj.findAll('a', href=re.compile('^(http|www)((?!' + excludeUrl + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def splitAddress(address):
    addressParts = address.replace('http://', '').split('/')
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, 'lxml')
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links on this page, so follow a random internal link instead
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('The random external link is: ' + externalLink)
    followExternalOnly(externalLink)

followExternalOnly('http://oreilly.com')
The random external link is: https://www.safaribooksonline.com/your-experience/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170201+homepage+safari+platform
The random external link is: http://www.oreilly.com/privacy.html
The random external link is: http://www.oreilly.com/about/
The random external link is: https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170601+nav
The random external link is: http://www.oreilly.com/about/editorial_independence.html
The random external link is: http://twitter.com/oreillymedia
The random external link is: https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170505+homepage+get+started+now



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)
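
Note that splitAddress strips only the 'http://' prefix, so an https:// address (like several of the links printed above) keeps 'https:' as its first part and the domain comparison silently misfires. A sketch of a more tolerant replacement built on the standard library's urlparse; this is an illustrative alternative, not the book's listing:

from urllib.parse import urlparse

def splitAddress(address):
    # urlparse handles http and https uniformly; fall back to plain splitting for scheme-less URLs
    parsed = urlparse(address)
    if parsed.netloc:
        return [parsed.netloc] + [part for part in parsed.path.split('/') if part]
    return [part for part in address.split('/') if part]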
  • Collect every external link found across the site:
# Collect a list of all the external links found on the site
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    internalLinks = getInternalLinks(bsObj, splitAddress(siteUrl)[0])
    externalLinks = getExternalLinks(bsObj, splitAddress(siteUrl)[0])
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            print('About to get link: ' + link)
            allIntLinks.add(link)
            getAllExternalLinks(link)

getAllExternalLinks('http://oreilly.com')
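
External links inevitably include dead or unreachable URLs, and the first one that fails raises an exception from urlopen and kills the entire crawl. A minimal guard around the fetch; safeOpen is a hypothetical helper name introduced here for illustration:

from urllib.error import HTTPError, URLError

def safeOpen(url):
    # Return the opened page, or None if the URL cannot be fetched
    try:
        return urlopen(url)
    except (HTTPError, URLError, ValueError) as e:
        print('Could not fetch ' + url + ': ' + str(e))
        return None

getAllExternalLinks could then call safeOpen instead of urlopen and simply skip any URL for which it returns None.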

3.4 Crawling with Scrapy
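
Scrapy takes over the crawl loop, link deduplication, and request throttling; you only describe a spider. A minimal sketch of a Wikipedia article spider; the spider name, selectors, and start URL below are illustrative assumptions rather than the book's exact listing:

import scrapy

class ArticleSpider(scrapy.Spider):
    # Illustrative spider: visits Wikipedia article pages and yields their titles
    name = 'article'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Kevin_Bacon']

    def parse(self, response):
        # The page's h1 holds the article title
        title = response.xpath('normalize-space(//h1)').get()
        yield {'url': response.url, 'title': title}
        # Follow links to other articles: /wiki/ paths with no colon (no special pages)
        for href in response.css('a::attr(href)').re(r'^/wiki/[^:]*$'):
            yield response.follow(href, callback=self.parse)

Saved as a standalone file, this can be run with scrapy runspider article_spider.py -o articles.json once Scrapy is installed.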
