Chapter 5: Storing Data

5.1 Media Files

  • urllib.request.urlretrieve can download a file directly, given the file's URL
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com')
bsObj = BeautifulSoup(html, 'lxml')
imageLocation = bsObj.find('a', {'id':'logo'}).find('img')['src']
urlretrieve(imageLocation, 'logo.jpg')
('logo.jpg', <http.client.HTTPMessage at 0x7f7a9ce0bcf8>)
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

downloadDirectory = 'downloaded'
baseUrl = 'http://pythonscraping.com'

def getAbsoluteURL(baseUrl, source):
    # Normalize a src attribute into an absolute URL on the target site
    if source.startswith('http://www.'):
        url = 'http://' + source[11:]
    elif source.startswith('http://'):
        url = source
    elif source.startswith('www.'):
        url = 'http://' + source[4:]
    else:
        url = baseUrl + '/' + source
    if baseUrl not in url:
        return None
    return url

def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    # Map the absolute URL to a local path under downloadDirectory,
    # creating intermediate directories as needed
    path = absoluteUrl.replace('www.', '')
    path = path.replace(baseUrl, '')
    path = downloadDirectory + path
    directory = os.path.dirname(path)

    if not os.path.exists(directory):
        os.makedirs(directory)

    return path

html = urlopen('http://www.pythonscraping.com')
bsObj = BeautifulSoup(html, 'lxml')
downloadList = bsObj.findAll(src=True)   # every tag that carries a src attribute

for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download['src'])
    if fileUrl is not None:
        print(fileUrl)
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
http://pythonscraping.com/misc/jquery.js?v=1.4.4
http://pythonscraping.com/misc/jquery.once.js?v=1.2
http://pythonscraping.com/misc/drupal.js?os2esm
http://pythonscraping.com/sites/all/themes/skeletontheme/js/jquery.mobilemenu.js?os2esm
http://pythonscraping.com/sites/all/modules/google_analytics/googleanalytics.js?os2esm
http://pythonscraping.com/sites/default/files/lrg_0.jpg
http://pythonscraping.com/img/lrg%20(1).jpg



---------------------------------------------------------------------------

5.2 Storing Data to CSV

import csv

csvFile = open('test.csv', 'w+')
try:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i+2, i*2))
finally:
    csvFile.close()
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')
bsObj = BeautifulSoup(html, 'lxml')
# The main comparison table is the first table on the page
table = bsObj.findAll('table', {'class':'wikitable'})[0]
rows = table.findAll('tr')

csvFile = open('editors.csv', 'wt', newline='', encoding='utf-8')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()

5.3 MySQL

  • CREATE DATABASE scraping;
  • USE scraping;
  • CREATE TABLE pages (id BIGINT(7) NOT NULL AUTO_INCREMENT, title VARCHAR(200), content VARCHAR(10000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id));
  • DESCRIBE pages;
  • INSERT INTO pages (title, content) VALUES ("Test page title", "This is some test page content. It can be up to 10,000 characters long.");
  • SELECT id, title FROM pages WHERE content LIKE "%page content%";
  • DELETE FROM pages WHERE id = 1;
  • UPDATE pages SET title="A new title", content="Some new content" WHERE id=2;
import pymysql
conn = pymysql.connect(host='127.0.0.1',user='root', passwd='1234', db='mysql')
cur = conn.cursor()
cur.execute('USE scraping')

cur.execute('SELECT * FROM pages WHERE id=3')
print(cur.fetchone())
cur.close()
conn.close()
(3, 'A new title', 'some new content', datetime.datetime(2014, 9, 21, 10, 25, 32))
  • The connection object (conn) and the cursor object (cur)
  • Besides connecting to the database, the connection is responsible for sending database information, handling rollbacks (when a query or group of queries is interrupted, the database needs to return to its initial state, usually implemented with transactions), creating new cursor objects, and so on.
  • A single connection can have many cursors. A cursor tracks one kind of state information, such as which database is currently in use. If you have several databases and need to write to all of them, you need several cursors. A cursor also holds the result of the last executed query; calling a cursor method such as cur.fetchone() retrieves it. A minimal commit/rollback sketch follows this list.
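Because pymysql does not autocommit by default, a write only becomes permanent after conn.commit(); if a query in the transaction fails, conn.rollback() restores the previous state. A minimal sketch of that pattern, assuming the scraping.pages table created above (the inserted values are placeholders):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='1234', db='scraping', charset='utf8')
cur = conn.cursor()
try:
    cur.execute('INSERT INTO pages (title, content) VALUES (%s, %s)',
                ('Another title', 'Some more content'))
    conn.commit()      # make the insert permanent
except Exception:
    conn.rollback()    # undo the interrupted transaction
finally:
    cur.close()
    conn.close()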
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='1234',
                       db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute('USE scraping')

random.seed(datetime.datetime.now().timestamp())

def store(title, content):
    # Parameterized insert; pymysql quotes and escapes the values itself
    cur.execute('INSERT INTO pages (title, content) VALUES (%s, %s)', (title, content))
    cur.connection.commit()

def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org'+articleUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    title = bsObj.find('h1').get_text()
    content = bsObj.find('div', {'id':'mw-content-text'}).find('p').get_text()
    store(title, content)
    return bsObj.find('div', {'id':'bodyContent'}).findAll('a',
                      href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
try:
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()
/wiki/Daniel_Day-Lewis
/wiki/Charles_Boyer
/wiki/Federico_Fellini
/wiki/Rashomon
/wiki/Son_of_Saul
/wiki/Shoeshine_(film)
/wiki/Caroline_Link
/wiki/Jos%C3%A9_Luis_Garci
/wiki/Ingmar_Bergman
/wiki/Hanif_Kureishi
/wiki/Academy_Awards
/wiki/List_of_Jewish_Academy_Award_winners_and_nominees
/wiki/Watch_on_the_Rhine
/wiki/Lillian_Hellman
/wiki/Dashiell_Hammett
/wiki/United_States_Army
/wiki/Uniformed_services_of_the_United_States



---------------------------------------------------------------------------
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='1234', db='mysql', charset='utf8')

cur = conn.cursor()
cur.execute('USE wikipedia')

def insertPageIfNotExists(url):
    cur.execute('SELECT * FROM pages WHERE url = %s', (url,))
    if cur.rowcount == 0:
        cur.execute('INSERT INTO pages (url) VALUES (%s)', (url,))
        conn.commit()
        return cur.lastrowid
    else:
        return cur.fetchone()[0]

def insertLink(fromPageId, toPageId):
    cur.execute('SELECT * FROM links WHERE fromPageId = %s AND toPageId = %s',
                (int(fromPageId), int(toPageId)))
    if cur.rowcount == 0:
        cur.execute('INSERT INTO links (fromPageId, toPageId) VALUES (%s, %s)',
                    (int(fromPageId), int(toPageId)))
        conn.commit()

pages = set()
def getLinks(pageUrl, recursionLevel):
    global pages
    if recursionLevel > 4:
        return
    pageId = insertPageIfNotExists(pageUrl)
    html = urlopen('http://en.wikipedia.org'+pageUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)((?!:).)*$')):
        insertLink(pageId, insertPageIfNotExists(link.attrs['href']))
        if link.attrs['href'] not in pages:
            # A page we have not seen before: add it to the set and crawl its links
            newPage = link.attrs['href']
            pages.add(newPage)
            getLinks(newPage, recursionLevel+1)

getLinks('/wiki/Kevin_Bacon', 0)
cur.close()
conn.close()
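The crawler above stores pages and links in a separate wikipedia database whose tables are never created in these notes. A plausible one-time setup sketch that satisfies the queries above (the column types and sizes are my assumption):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='1234', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS wikipedia')
cur.execute('USE wikipedia')
# One row per page URL seen
cur.execute('CREATE TABLE IF NOT EXISTS pages ('
            'id INT NOT NULL AUTO_INCREMENT, '
            'url VARCHAR(255) NOT NULL, '
            'created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, '
            'PRIMARY KEY(id))')
# One row per directed link between two page ids
cur.execute('CREATE TABLE IF NOT EXISTS links ('
            'id INT NOT NULL AUTO_INCREMENT, '
            'fromPageId INT NULL, '
            'toPageId INT NULL, '
            'created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, '
            'PRIMARY KEY(id))')
conn.commit()
cur.close()
conn.close()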
---------------------------------------------------------------------------

5.4 Email

import smtplib
from email.mime.text import MIMEText

msg = MIMEText('The body of the email is here', 'plain', 'utf-8')

from_addr = '******@126.com'
password = '******'
to_addr = '******@qq.com'
smtp_server = 'smtp.126.com'

server = smtplib.SMTP(smtp_server, 25)
server.set_debuglevel(1)
server.login(from_addr, password)
server.sendmail(from_addr, [to_addr], msg.as_string())
server.quit()
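Many providers also accept SMTP over an SSL connection (commonly port 465), and some require a dedicated authorization code instead of the login password. A minimal variant of the block above using smtplib.SMTP_SSL; the port is an assumption to check against the provider's documentation:

import smtplib
from email.mime.text import MIMEText

msg = MIMEText('The body of the email is here', 'plain', 'utf-8')

from_addr = '******@126.com'
password = '******'
to_addr = '******@qq.com'
smtp_server = 'smtp.126.com'

# Same message as above, but over an implicit-SSL connection (port 465 assumed)
server = smtplib.SMTP_SSL(smtp_server, 465)
server.login(from_addr, password)
server.sendmail(from_addr, [to_addr], msg.as_string())
server.quit()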

Fetching Zhejiang University mathematics seminar announcements and sending email reminders

from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr

import smtplib

def _format_addr(s):
    name, addr = parseaddr(s)
    return formataddr((Header(name, 'utf-8').encode(), addr))

from_addr = '******@126.com'
password = '******'
to_addr = '******@qq.com'
smtp_server = 'smtp.126.com'

msg = MIMEText('数学学术讲座内容', 'plain', 'utf-8')
msg['From'] = _format_addr('Python <%s>' % from_addr)
msg['To'] = _format_addr('zc <%s>' % to_addr)
msg['Subject'] = Header('数学学术讲座内容更新', 'utf-8').encode()

server = smtplib.SMTP(smtp_server, 25)
#server.set_debuglevel(1)
server.login(from_addr, password)
server.sendmail(from_addr, [to_addr], msg.as_string())
server.quit()
(221, b'Bye')
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import time

from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr

import smtplib

def _format_addr(s):
    name, addr = parseaddr(s)
    return formataddr((Header(name, 'utf-8').encode(), addr))

from_addr = '******@126.com'
password = '******'
to_addr = '******@qq.com'
smtp_server = 'smtp.126.com'


# Seminar announcements page
html = urlopen('http://www.math.zju.edu.cn/news.asp?id=6624&TabName=%B1%BE%BF%C6%BD%CC%D1%A7')
bsObj = BeautifulSoup(html, 'lxml')


# Registration link of the current seminar
URL = bsObj.find('a', href=re.compile('^(http://cn.mikecrm.com/)'))

# Poll the announcements page every 10 minutes; when the registration link
# changes, scrape the new notice and send it by email
while True:
    if URL is None:
        break
    html = urlopen('http://www.math.zju.edu.cn/news.asp?id=6624&TabName=%B1%BE%BF%C6%BD%CC%D1%A7')
    bsObj = BeautifulSoup(html, 'lxml')
    if URL != bsObj.find('a', href=re.compile('^(http://cn.mikecrm.com/)')):
        URL = bsObj.find('a', href=re.compile('^(http://cn.mikecrm.com/)'))
        # Link to the full seminar notice
        newsURL = bsObj.find('a', href=re.compile('^http://www.math.zju.edu.cn/news'))

        # Fetch the seminar content
        htmlnews = urlopen(newsURL['href'])
        bsObjNews = BeautifulSoup(htmlnews, 'lxml')
        news = bsObjNews.findAll('td', class_='NewsBody')
        newtext = '您好!**同学:\n以下是更新的学术讲座内容,欢迎报名\n报名地址: '+URL['href']+'\n'+'通知地址: '+newsURL['href']+'\n'
        for new in news:
            newtext = newtext + new.get_text() + '\n'
        print(newtext)

        msg = MIMEText(newtext, 'plain', 'utf-8')
        msg['From'] = _format_addr('** <%s>' % from_addr)
        msg['To'] = _format_addr('******* <%s>' % to_addr)
        msg['Subject'] = Header('浙江大学数学科学学术报告', 'utf-8').encode()

        server = smtplib.SMTP(smtp_server, 25)
        #server.set_debuglevel(1)
        server.login(from_addr, password)
        server.sendmail(from_addr, [to_addr], msg.as_string())
        server.quit()
    time.sleep(600)
您好!**同学:
以下是更新的学术讲座内容,欢迎报名
报名地址: http://cn.mikecrm.com/ajIyYIp
通知地址: http://www.math.zju.edu.cn/news.asp?id=7212&TabName=%D1%A7%CA%F5%BF%C6%D1%D0
题目 

Waves near Resonance: from Fast Train Track to Moving Loads on Very Large Floating Structures

摘要 
The problem of forced unsteady water waves under an elastic sheet is a model for waves under ice or under very large floating structures. Even though small amplitude solitary waves are not predicted to exist by standard perturbation analyses, we find large amplitude solitary waves, and explore their crucial role in the forced problem of a moving load on the surface. This is meant to represent a model of the use of extended ice sheets as roads and aircraft runways. Some open problems on the theoretical side of this topic will be mentioned in the end.

Speaker: 王展,中科院力学所



Time and place: Friday Nov 17th,15:40-16:40, 欧阳纯美楼316
联系人:张庆海老师(qinghai@zju.edu.cn)