第七章 数据清洗

7.1 编写代码清洗数据

  • n-gram,表示文字或语言中由 n 个连续单词组成的序列
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from urllib.request import urlopen
from bs4 import BeautifulSoup

def ngrams(input, n):
    """Return all n-grams of *input* as lists of n consecutive words.

    input: text to split on single spaces (no cleaning is done here).
    n: number of consecutive words per gram.
    Returns [] when the text has fewer than n words.
    """
    words = input.split(' ')
    # Slide a window of width n over the word list; a comprehension
    # replaces the original manual append loop.
    return [words[i:i + n] for i in range(len(words) - n + 1)]
# Fetch the Wikipedia article, extract its body text, and print every 2-gram.
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bsObj = BeautifulSoup(html, 'lxml')
content = bsObj.find('div', {'id':'mw-content-text'}).get_text()
# Store the result under a distinct name so the ngrams() function is not
# shadowed by its own return value (the original did `ngrams = ngrams(...)`).
bigrams = ngrams(content, 2)
print(bigrams)
print('2-grams count is: '+str(len(bigrams)))
[['\nPython\n\n\n\n\nParadigm\nmulti-paradigm:', 'object-oriented,'], ['object-oriented,', 'imperative,'], ['imperative,', 'functional,'], ['functional,', 'procedural,'], ['procedural,', 'reflective\n\n\nDesigned\xa0by\nGuido'], ['reflective\n\n\nDesigned\xa0by\nGuido', 'van'], ['van', 'Rossum\n\n\nDeveloper\nPython'], ['Rossum\n\n\nDeveloper\nPython', 'Software'], ['Software'
......
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def ngrams(input, n):
    """Clean *input*, then return its n-grams as lists of n consecutive words.

    Cleaning: collapse newlines and runs of spaces to single spaces, then
    drop non-ASCII characters (e.g. the \xa0 non-breaking spaces Wikipedia
    uses) via a UTF-8 -> ASCII round trip.

    Bug fix: the original cleaned the module-level ``content`` global but
    then split the untouched ``input`` argument — strings are immutable, so
    reassigning the global never changed the value already passed in, and
    the cleaning had no effect on the returned n-grams.  The cleaning now
    applies to ``input`` itself and the global is no longer needed.
    """
    # Raw strings keep the regex escapes unambiguous.
    input = re.sub(r'\n+', " ", input)
    input = re.sub(r' +', " ", input)
    # Round-trip through bytes to drop any non-ASCII characters.
    input = bytes(input, 'UTF-8').decode('ascii', 'ignore')
    print(input)
    words = input.split(' ')
    return [words[i:i + n] for i in range(len(words) - n + 1)]
# Fetch the article, extract the body text, and print its cleaned 2-grams.
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bsObj = BeautifulSoup(html, 'lxml')
content = bsObj.find('div', {'id':'mw-content-text'}).get_text()
# Distinct name: the original `ngrams = ngrams(...)` shadowed the function.
bigrams = ngrams(content, 2)
print(bigrams)
print('2-grams count is: '+str(len(bigrams)))
Python Paradigm multi-paradigm: object-oriented, imperative, functional, procedural, reflective Designedby Guido van Rossum Developer Python Software Foundation Firstappeared 20February 1991; 26 years ago(1991-02-20)[1] Stable release 3.6.3 / 3October 2017; 43 days ago(2017-10-03)[2] 2.7.14 / 16September 2017; 60 days ago(2017-09-16)[3] Typing discipline duck, dynamic, strong OS Cross-platform License Python Software Foundation License Filename extensions .py, .pyc, .pyd, .pyo (prior to 3.5),[4] .pyw, .pyz (since 3.5)[5] Website 
......
  • 剔除单字符的“单词”,除非这个字符是“i”或“a”;
  • 剔除维基百科的引用标记(方括号包裹的数字,如 [1]);
  • 剔除标点符号(注意:这个规则有点儿矫枉过正,在第 9 章我们将详细介绍,本例暂时这样处理)。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string

def cleanInput(input):
    """Split *input* into a list of cleaned words.

    Cleaning steps:
    - collapse newlines and runs of spaces into single spaces;
    - remove Wikipedia citation markers such as ``[1]``;
    - drop non-ASCII characters via a UTF-8 -> ASCII round trip;
    - strip surrounding punctuation from each word;
    - discard one-character "words" unless they are 'a' or 'i'.
    """
    # Raw strings avoid invalid-escape warnings for patterns like '\[' .
    input = re.sub(r'\n+', " ", input)
    input = re.sub(r'\[[0-9]*\]', "", input)
    input = re.sub(r' +', " ", input)
    input = bytes(input, "UTF-8").decode('ascii', 'ignore')
    cleaned = []
    for item in input.split(' '):
        # Remove leading/trailing punctuation (commas, periods, brackets...).
        item = item.strip(string.punctuation)
        # Keep multi-character words plus the single-letter words 'a'/'i';
        # this also discards empty strings left by the stripping above.
        if len(item) > 1 or item.lower() in ('a', 'i'):
            cleaned.append(item)
    return cleaned

def ngrams(input, n):
    """Return every n-gram (list of n consecutive cleaned words) in *input*."""
    words = cleanInput(input)
    # One window per starting index at which a full n-gram still fits.
    return [words[start:start + n] for start in range(len(words) - n + 1)]

# Download the article, extract the body text, and print its cleaned 2-grams.
html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bsObj = BeautifulSoup(html, 'lxml')
content = bsObj.find('div', {'id':'mw-content-text'}).get_text()
# Distinct name: the original `ngrams = ngrams(...)` shadowed the function.
bigrams = ngrams(content, 2)
print(bigrams)
print('2-grams count is: '+str(len(bigrams)))
[['Python', 'Paradigm'], ['Paradigm', 'multi-paradigm'], ['multi-paradigm', 'object-oriented'], ['object-oriented', 'imperative'], ['imperative', 'functional'], ['functional', 'procedural'], ['procedural', 'reflective'], ['reflective', 'Designedby'], ['Designedby', 'Guido'], ['Guido', 'van'], ['van', 'Rossum'], 
......
  • 数据标准化: 能够去除重复的序列
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import OrderedDict

def cleanInput(input):
    """Split *input* into a list of cleaned words.

    Cleaning steps:
    - collapse newlines and runs of spaces into single spaces;
    - remove Wikipedia citation markers such as ``[1]``;
    - drop non-ASCII characters via a UTF-8 -> ASCII round trip;
    - strip surrounding punctuation from each word;
    - discard one-character "words" unless they are 'a' or 'i'.
    """
    # Raw strings avoid invalid-escape warnings for patterns like '\[' .
    input = re.sub(r'\n+', " ", input)
    input = re.sub(r'\[[0-9]*\]', "", input)
    input = re.sub(r' +', " ", input)
    input = bytes(input, "UTF-8").decode("ascii", "ignore")
    cleaned = []
    for item in input.split(' '):
        # Remove leading/trailing punctuation (commas, periods, brackets...).
        item = item.strip(string.punctuation)
        # Keep multi-character words plus the single-letter words 'a'/'i';
        # this also discards empty strings left by the stripping above.
        if len(item) > 1 or item.lower() in ('a', 'i'):
            cleaned.append(item)
    return cleaned

def getNgrams(input, n):
    """Count how often each n-gram occurs in *input*.

    Returns a dict mapping each space-joined n-gram of the cleaned words
    to its number of occurrences.
    """
    words = cleanInput(input)
    counts = dict()
    for start in range(len(words) - n + 1):
        gram = " ".join(words[start:start + n])
        # dict.get() folds the membership test and the increment into one step.
        counts[gram] = counts.get(gram, 0) + 1
    return counts

# Download the article, count its 2-grams, and print them most-frequent first.
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()
ngrams = getNgrams(content, 2)
# Sort by count, descending; OrderedDict preserves that ordering when printed.
# (The commented-out duplicate of these calls was removed as dead code.)
ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))
print(ngrams)
OrderedDict([('Python Software', 37), ('Software Foundation', 37), ('of the', 34), ('Foundation Retrieved', 31), ('of Python', 28), ('in the', 23), ('in Python', 23), ('van Rossum', 21), ('such as', 20), ('to the', 20), ('is a', 17), ('February 2012', 17), ('Retrieved 24', 17), ('Python Enhancement', 15), ('from the', 14), 
......

7.2 数据存储后再清洗

分享到