第八章 自然语言处理

8.1概括数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator

def cleanInput(input):
    """Normalize raw text into a list of cleaned lowercase words.

    Collapses newlines and runs of spaces, removes Wikipedia-style
    citation markers like ``[12]``, drops non-ASCII characters, strips
    surrounding punctuation from each word, and keeps only words longer
    than one character (plus the single-letter words 'a' and 'i').
    """
    input = re.sub(r'\n+', ' ', input).lower()
    input = re.sub(r'\[[0-9]*\]', '', input)  # remove citation markers like [7]
    input = re.sub(r' +', ' ', input)
    # Round-trip through bytes to silently drop any non-ASCII characters.
    input = input.encode('UTF-8').decode('ascii', 'ignore')
    cleaned = []
    for item in input.split(' '):
        item = item.strip(string.punctuation)
        # Text is already lowercased above, so compare directly.
        if len(item) > 1 or item in ('a', 'i'):
            cleaned.append(item)
    return cleaned

def ngrams(input, n):
    """Count every n-gram (space-joined run of n consecutive words) in *input*.

    Returns a dict mapping each n-gram string to its occurrence count.
    """
    words = cleanInput(input)
    counts = {}
    for start in range(len(words) - n + 1):
        gram = ' '.join(words[start:start + n])
        counts[gram] = counts.get(gram, 0) + 1
    return counts

content = str(urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(), 'utf-8')
# Store the counts under a new name: rebinding 'ngrams' would shadow the
# function and make it uncallable afterwards.
ngramCounts = ngrams(content, 2)
# Sort bigrams by frequency, most common first.
sortedNGrams = sorted(ngramCounts.items(), key=operator.itemgetter(1), reverse=True)
print(sortedNGrams)
[('of the', 213), ('in the', 65), ('to the', 61), ('by the', 41), ('the constitution', 34), ('of our', 29), ('to be', 26), ('the people', 24), ('from the', 24), ('that the', 23), ('and the', 23), ('it is', 23), ('of a', 22), ('may be', 19), ('of their', 19), ('the 
...... 
  • 过滤没意义的单词
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def isCommon(ngram):
    """Return True if any word in *ngram* is a very common English word.

    *ngram* is an iterable of word strings. Membership is checked against
    a set (O(1) per word); the original list contained duplicates
    ("that", "as", "more"), which a set removes harmlessly.
    """
    commonWords = {
        "the", "be", "and", "of", "a", "in", "to", "have", "it",
        "i", "that", "for", "you", "he", "with", "on", "do", "say", "this",
        "they", "is", "an", "at", "but", "we", "his", "from", "not",
        "by", "she", "or", "as", "what", "go", "their", "can", "who", "get",
        "if", "would", "her", "all", "my", "make", "about", "know", "will",
        "up", "one", "time", "has", "been", "there", "year", "so",
        "think", "when", "which", "them", "some", "me", "people", "take",
        "out", "into", "just", "see", "him", "your", "come", "could", "now",
        "than", "like", "other", "how", "then", "its", "our", "two", "more",
        "these", "want", "way", "look", "first", "also", "new", "because",
        "day", "use", "no", "man", "find", "here", "thing", "give",
        "many", "well",
    }
    return any(word in commonWords for word in ngram)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator

def cleanInput(input):
    """Clean raw text and return a list of normalized lowercase words.

    Newlines and repeated spaces collapse to single spaces, citation
    markers such as ``[3]`` are removed, non-ASCII characters are
    discarded, and punctuation is stripped from each word. Only words
    of length > 1 are kept, plus the one-letter words 'a' and 'i'.
    """
    input = re.sub(r'\n+', ' ', input).lower()
    input = re.sub(r'\[[0-9]*\]', '', input)  # drop citation markers, e.g. [7]
    input = re.sub(r' +', ' ', input)
    input = input.encode('UTF-8').decode('ascii', 'ignore')  # strip non-ASCII
    words = []
    for token in input.split(' '):
        token = token.strip(string.punctuation)
        # The text was lowercased above, so no per-token .lower() is needed.
        if len(token) > 1 or token in ('a', 'i'):
            words.append(token)
    return words

def ngrams(input, n):
    """Tally all n-grams (n consecutive words joined by spaces) in *input*."""
    words = cleanInput(input)
    tally = {}
    i = 0
    # Slide a window of n words across the cleaned word list.
    while i <= len(words) - n:
        key = ' '.join(words[i:i + n])
        tally[key] = tally.get(key, 0) + 1
        i += 1
    return tally

content = str(urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(), 'utf-8')
# Don't rebind the function name 'ngrams' to its own result; keep the
# counts in a separate variable so the function remains callable.
bigramCounts = ngrams(content, 2)
sortedNGrams = sorted(bigramCounts.items(), key=operator.itemgetter(1), reverse=True)
print(sortedNGrams)
[('the of', 269), ('of the', 264), ('to the', 96), ('the the', 52), ('by the', 49), ('the to', 39), ('to be', 29), ('of our', 29), ('from the', 27), ('the people', 26), ('of to', 21), ('of power', 21), ('the government', 21), ('for the', 20), ('of of', 20), ('the power', 17), ('power to', 17), ('to to', 12), ('but the', 12), ('be by', 12), ('upon the', 12), ('to of', 12), ('or the', 11), ('the by', 10), ('not to', 10), ('people the', 10), ('be of', 10), ('of my', 9), ('the for', 9), ('the be', 9), ('of for', 9), ('of those', 9), ('of or', 9), ('be to', 9), ('on the', 9), ('the from', 9), ('the not', 9), ('the powers', 9), ('of one', 9), ('the other', 9), ('the whole', 9), ('to me', 8), ('to them', 8), ('one of', 8), ('by of', 8), ('would be', 8), ('should be', 8), ('them to', 7), ('country the', 7), ('of such', 7),
......

8.2马尔可夫模型

  • 根据演讲内容的结构,生成任意长度的、由马尔可夫链组成的句子(下面示例中链长为 100)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from urllib.request import urlopen
from random import randint

def wordListSum(wordList):
    """Return the total of all occurrence counts in a word->count mapping."""
    # The builtin sum over .values() replaces the hand-rolled loop
    # (which also shadowed the builtin name 'sum' as a local).
    return sum(wordList.values())

def retrieveRandomWord(wordList):
    """Pick one word at random, weighted by its occurrence count.

    *wordList* maps word -> count. A random index in [1, total] is drawn
    and walked down through the counts; words with larger counts are
    proportionally more likely to be returned. Raises ValueError for an
    empty mapping (randint's range would be empty).
    """
    # Builtin sum over the counts replaces the hand-rolled helper.
    randIndex = randint(1, sum(wordList.values()))
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word

def buildWordDict(text):
    """Build a first-order Markov transition table from *text*.

    Returns a dict mapping each word to a dict of {following word: count}.
    """
    # Remove newlines and double quotes.
    text = text.replace('\n', ' ')
    text = text.replace('"', '')

    # Pad punctuation with spaces so each mark becomes its own token and
    # survives in the Markov chain instead of being lost.
    punctuation = [',', '.', ';', ':']
    for symbol in punctuation:
        text = text.replace(symbol, ' ' + symbol + ' ')

    # Split on spaces and drop the empty tokens the padding produced.
    words = [word for word in text.split(' ') if word != '']

    wordDict = {}
    for i in range(1, len(words)):
        # setdefault/get replace the manual "create-if-missing" dance.
        followers = wordDict.setdefault(words[i - 1], {})
        followers[words[i]] = followers.get(words[i], 0) + 1
    return wordDict
text = str(urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(), 'utf-8')
wordDict = buildWordDict(text)

# Generate a Markov chain 100 words long, starting from 'I'.
length = 100
pieces = []
currentWord = 'I'
for _ in range(length):
    pieces.append(currentWord)
    currentWord = retrieveRandomWord(wordDict[currentWord])
# Join with single spaces; the original also left a trailing space.
chain = ' '.join(pieces) + ' '

print(chain)
I believe to that officer at the sacredness and fall their best realized ; and consequently of the State governments in mine that they did not to make all necessary sacrifices and that the Executive party in a country . To me the authority to their usefulness ends . No matter in the result of want of the power which of Columbia can never to give firmness and surely nothing beyond . As was wanting no appearance of its theory , the love of the never-dying worm in Gaul or continue to the security of superintendent and on the magnitude 

维基百科六度分割:广度优先搜索

8.3自然语言工具包

  • NLTK是一个Python库, 用于识别和标记单词的词性

8.3.1安装

1
2
# Launch the interactive NLTK data downloader to fetch corpora and models.
import nltk
nltk.download()
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml





True

8.3.2统计分析

1
2
3
4
5
from nltk import word_tokenize
from nltk import Text

# Tokenize the sentence and wrap the tokens in an NLTK Text object,
# which supports statistical analysis methods.
tokens = word_tokenize('Here is some not very interesting text')
text = Text(tokens)
1
print(text)
<Text: Here is some not very interesting text...>
1
from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
1
2
3
from nltk import FreqDist
# text6 comes from nltk.book ("Monty Python and the Holy Grail");
# FreqDist counts how often each token appears.
fdist = FreqDist(text6)
fdist.most_common(10)
[(':', 1197),
 ('.', 816),
 ('!', 801),
 (',', 731),
 ("'", 421),
 ('[', 319),
 (']', 312),
 ('the', 299),
 ('I', 255),
 ('ARTHUR', 225)]
1
fdist['Grail']
34
1
2
3
4
from nltk import bigrams
# Keep the result in a new variable: rebinding 'bigrams' would shadow
# the imported function and make it uncallable afterwards.
bigramPairs = bigrams(text6)
bigramsDist = FreqDist(bigramPairs)
bigramsDist[('Sir', 'Robin')]
18
1
2
3
4
from nltk import ngrams
# Build all 4-grams over the Holy Grail text and count them.
fourgrams = ngrams(text6, 4)
fourgramsDist = FreqDist(fourgrams)
fourgramsDist[('father', 'smelt', 'of', 'elderberries')]
1
1
2
3
4
5
6
from nltk.book import *
from nltk import ngrams

# Print every 4-gram that begins with the word 'coconut'.
# (Indentation restored: the loop body was flattened in the original paste.)
fourgrams = ngrams(text6, 4)
for fourgram in fourgrams:
    if fourgram[0] == 'coconut':
        print(fourgram)
('coconut', 'and', 'you', "'")
('coconut', "'", 's', 'tropical')
('coconut', '?', 'ARTHUR', ':')
('coconut', '.', 'ARTHUR', ':')
('coconut', 'back', 'anyway', '...')
('coconut', 'on', 'a', 'line')

8.3.3用NLTK做词性分析

1
2
3
4
5
from nltk.book import *
from nltk import word_tokenize
# Tokenize the sentence so each word can be tagged individually.
text = word_tokenize('Strange women lying in ponds distributing swords is no basis for a system of government. Supreme executive power derives from a mandate from the masses, not from some farcical aquatic ceremony.')
from nltk import pos_tag
# Tag every token with its Penn Treebank part-of-speech label.
pos_tag(text)
[('Strange', 'JJ'),
 ('women', 'NNS'),
 ('lying', 'VBG'),
 ('in', 'IN'),
 ('ponds', 'NNS'),
 ('distributing', 'VBG'),
 ('swords', 'NNS'),
 ('is', 'VBZ'),
 ('no', 'DT'),
 ('basis', 'NN'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('system', 'NN'),
 ('of', 'IN'),
 ('government', 'NN'),
 ('.', '.'),
 ('Supreme', 'NNP'),
 ('executive', 'NN'),
 ('power', 'NN'),
 ('derives', 'VBZ'),
 ('from', 'IN'),
 ('a', 'DT'),
 ('mandate', 'NN'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('masses', 'NNS'),
 (',', ','),
 ('not', 'RB'),
 ('from', 'IN'),
 ('some', 'DT'),
 ('farcical', 'JJ'),
 ('aquatic', 'JJ'),
 ('ceremony', 'NN'),
 ('.', '.')]
1
2
3
4
5
6
7
8
9
10
from nltk import word_tokenize, sent_tokenize, pos_tag

# Print only sentences where 'google' is used as a noun (company name),
# not as a verb. Indentation restored: the loop bodies were flattened
# in the original paste, which is a syntax error as shown.
sentences = sent_tokenize("Google is one of the best companies in the world. I constantly google myself to see what I'm up to.")
nouns = ['NN', 'NNS', 'NNP', 'NNPS']  # Penn Treebank noun tags

for sentence in sentences:
    if 'google' in sentence.lower():
        taggedWords = pos_tag(word_tokenize(sentence))
        for word in taggedWords:
            # word is a (token, tag) pair; match nominal uses of 'google'.
            if word[0].lower() == 'google' and word[1] in nouns:
                print(sentence)
Google is one of the best companies in the world.
分享到