Chapter 6: Reading Documents

6.1 Document Encoding

6.2 Plain Text

from urllib.request import urlopen
textPage = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')
print(textPage.read())
b'CHAPTER I\n\n"Well, Prince, so Genoa and Lucca are now just family estates of the\nBuonapartes. But I warn you, if you don\'t tell me that this means war,\nif you still try to defend the infamies and horrors perpetrated by\nthat Antichrist- I really believe he is Antichrist- I will have\nnothing more to do with you and you are no longer my friend, no longer\nmy \'faithful slave,\' as you call yourself! But how do you do? I see\nI have frightened you- sit down and tell me all the news."\n\nIt was in July, 1805, and the speaker was the well-known Anna\nPavlovna Scherer, maid of honor and favorite of the Empress Marya\nFedorovna. With these words she greeted Prince Vasili Kuragin, a man\nof high rank and importance, who was the first to arrive at her\nreception. Anna Pavlovna had had a cough for some days. She was, as\nshe said, suffering from la grippe; grippe
......
from urllib.request import urlopen
textPage = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt')
print(str(textPage.read(), 'utf-8'))
b"\xd0\xa7\xd0\x90\xd0\xa1\xd0\xa2\xd0\xac \xd0\x9f\xd0\x95\xd0\xa0\xd0\x92\xd0\x90\xd0\xaf\n\nI\n\n\xe2\x80\x94 Eh bien, mon prince. G\xc3\xaanes et Lucques ne sont plus que des apanages, des \xd0\xbf\xd0\xbe\xd0\xbc\xd0\xb5\xd1\x81\xd1\x82\xd1\x8c\xd1\x8f, de la famille Buonaparte. Non, je vous pr\xc3\xa9viens que si vous ne me dites pas que nous avons la guerre, si vous vous permettez encore de pallier toutes les infamies, toutes les atrocit\xc3\xa9s de cet Antichrist (ma parole, j'y crois) \xe2\x80\x94 je ne vous connais plus, vous n'\xc3\xaates plus mon ami, vous n'\xc3\xaates plus \xd0\xbc\xd0\xbe\xd0\xb9 \xd0\xb2\xd0\xb5\xd1\x80\xd0\xbd\xd1\x8b\xd0\xb9
......
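
The example above hard-codes 'utf-8'. When the encoding is not known in advance, the HTTP response headers may declare it; the following is a minimal sketch that reads the charset from the Content-Type header of the same page and assumes UTF-8 as a fallback when none is declared:

from urllib.request import urlopen

textPage = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1-ru.txt')
# get_content_charset() pulls the charset out of the Content-Type header, if the
# server sends one; falling back to UTF-8 here is an assumption, not a guarantee
encoding = textPage.headers.get_content_charset() or 'utf-8'
print(str(textPage.read(), encoding))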
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bsObj = BeautifulSoup(html, 'lxml')
content = bsObj.find('div', {'id':'mw-content-text'}).get_text()
# round-trip the text through UTF-8: encode the str to bytes, then decode it back
content = bytes(content, 'UTF-8')
content = content.decode('UTF-8')
print(content)
Python




Paradigm
multi-paradigm: object-oriented, imperative, functional, procedural, reflective


Designed by
Guido van Rossum


Developer
Python Software Foundation


First appeared
20 February 1991; 26 years ago (1991-02-20)[1]
......

6.3 CSV

from urllib.request import urlopen
from io import StringIO
import csv
data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('ascii', 'ignore')
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)

for row in csvReader:
    print(row)
['Name', 'Year']
["Monty Python's Flying Circus", '1970']
['Another Monty Python Record', '1971']
["Monty Python's Previous Record", '1972']
['The Monty Python Matching Tie and Handkerchief', '1973']
['Monty Python Live at Drury Lane', '1974']
['An Album of the Soundtrack of the Trailer of the Film of Monty Python and the Holy Grail', '1975']
['Monty Python Live at City Center', '1977']
['The Monty Python Instant Record Collection', '1977']
["Monty Python's Life of Brian", '1979']
["Monty Python's Cotractual Obligation Album", '1980']
["Monty Python's The Meaning of Life", '1983']
['The Final Rip Off', '1987']
['Monty Python Sings', '1989']
['The Ultimate Monty Python Rip Off', '1994']
['Monty Python Sings Again', '2014']
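
The first row of the file is a header rather than an album. One possible refinement, sketched here against the same CSV file, is to pull that row off with next() before iterating; the DictReader example below handles headers even more cleanly.

from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('ascii', 'ignore')
csvReader = csv.reader(StringIO(data))
header = next(csvReader)   # discard the ['Name', 'Year'] header row
for row in csvReader:
    print('The album "' + row[0] + '" was released in ' + str(row[1]))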
from urllib.request import urlopen
from io import StringIO
import csv
data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('ascii', 'ignore')
dataFile = StringIO(data)
dictReader = csv.DictReader(dataFile)

print(dictReader.fieldnames)

for row in dictReader:
    print(row)
['Name', 'Year']
OrderedDict([('Name', "Monty Python's Flying Circus"), ('Year', '1970')])
OrderedDict([('Name', 'Another Monty Python Record'), ('Year', '1971')])
OrderedDict([('Name', "Monty Python's Previous Record"), ('Year', '1972')])
OrderedDict([('Name', 'The Monty Python Matching Tie and Handkerchief'), ('Year', '1973')])
OrderedDict([('Name', 'Monty Python Live at Drury Lane'), ('Year', '1974')])
OrderedDict([('Name', 'An Album of the Soundtrack of the Trailer of the Film of Monty Python and the Holy Grail'), ('Year', '1975')])
OrderedDict([('Name', 'Monty Python Live at City Center'), ('Year', '1977')])
OrderedDict([('Name', 'The Monty Python Instant Record Collection'), ('Year', '1977')])
OrderedDict([('Name', "Monty Python's Life of Brian"), ('Year', '1979')])
OrderedDict([('Name', "Monty Python's Cotractual Obligation Album"), ('Year', '1980')])
OrderedDict([('Name', "Monty Python's The Meaning of Life"), ('Year', '1983')])
OrderedDict([('Name', 'The Final Rip Off'), ('Year', '1987')])
OrderedDict([('Name', 'Monty Python Sings'), ('Year', '1989')])
OrderedDict([('Name', 'The Ultimate Monty Python Rip Off'), ('Year', '1994')])
OrderedDict([('Name', 'Monty Python Sings Again'), ('Year', '2014')])
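
Because DictReader yields one dictionary-like row per record, fields can be read by column name rather than by position. A short sketch, again assuming the same file:

from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('ascii', 'ignore')
dictReader = csv.DictReader(StringIO(data))
for row in dictReader:
    # each row is a dict-like object keyed by the header names
    print(row['Year'], row['Name'])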

6.4 PDF

from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open

def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)

    process_pdf(rsrcmgr, device, pdfFile)
    device.close()

    content = retstr.getvalue()
    retstr.close()
    return content

pdfFile = urlopen('http://pythonscraping.com/pages/warandpeace/chapter1.pdf')
outputString = readPDF(pdfFile)
print(outputString)
pdfFile.close()
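
Since readPDF() returns an ordinary string, the extracted text can just as easily be written to a local file as printed. A minimal sketch, reusing the readPDF() function above (the output filename chapter1.txt is arbitrary):

pdfFile = urlopen('http://pythonscraping.com/pages/warandpeace/chapter1.pdf')
# write the extracted text to a local file instead of printing it
with open('chapter1.txt', 'w', encoding='utf-8') as f:
    f.write(readPDF(pdfFile))
pdfFile.close()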
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO

wordFile = urlopen('http://pythonscraping.com/pages/AWordDocument.docx').read()
wordFile = BytesIO(wordFile)
document = ZipFile(wordFile)
xml_content = document.read('word/document.xml')
print(xml_content.decode('utf-8'))
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.
......
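
A .docx file is simply a ZIP archive of XML parts, with the body text stored in word/document.xml. To see which other parts a given archive contains, ZipFile's namelist() can be used; a quick sketch on the same file:

from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO

wordFile = urlopen('http://pythonscraping.com/pages/AWordDocument.docx').read()
document = ZipFile(BytesIO(wordFile))
# print every part packed inside the .docx archive, e.g. word/document.xml
for name in document.namelist():
    print(name)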
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup

wordFile = urlopen('http://pythonscraping.com/pages/AWordDocument.docx').read()
wordFile = BytesIO(wordFile)
document = ZipFile(wordFile)
xml_content = document.read('word/document.xml')

wordObj = BeautifulSoup(xml_content.decode('utf-8'), 'lxml')
textStrings = wordObj.findAll('body')
for textElem in textStrings:
    print(textElem.text)
A Word Document on a WebsiteThis is a Word document, full of content that you want very much. Unfortunately, it’s difficult to access because I’m putting it on my website as a .docx file, rather than just publishing it as HTML
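
Reading the whole <body> runs the document title and the paragraph text together. A possible refinement, sketched here with the xml_content from the example above, is to search for the individual <w:t> text runs instead; the sketch re-parses with Python's built-in html.parser, which keeps 'w:t' as a literal (lowercased) tag name, since whether the 'lxml' parser preserves the prefixed names can vary:

wordObj = BeautifulSoup(xml_content.decode('utf-8'), 'html.parser')
for textElem in wordObj.findAll('w:t'):
    # each w:t element holds one run of text from the .docx body
    print(textElem.text)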