《Python网络数据采集》第七章 数据清洗
下面的代码将返回维基百科词条“Python programming language”的 2-gram列表
在语言学里面有一种模型叫n-gram,表示文字或语言中的n个连续的单词组成的序列。
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re,string
def cleanInput(input):
    """Normalise raw page text into a list of cleaned word tokens.

    Collapses runs of newlines and spaces, strips Wikipedia-style citation
    markers such as "[1]", silently drops non-ASCII characters, trims
    punctuation from each token, and discards one-character tokens other
    than the English words "a"/"i" (any case).

    :param input: raw text extracted from an HTML page
    :return: list of cleaned tokens (may be empty)
    """
    input = re.sub('\n+', " ", input)
    # BUG FIX: the original pattern '\[0-9*\]' matched the literal text
    # "[0-9…]" and never a real citation marker; '\[[0-9]*\]' removes
    # "[1]", "[23]", and the empty "[]".
    input = re.sub(r'\[[0-9]*\]', "", input)
    input = re.sub(' +', " ", input)
    # Round-trip through bytes to drop any non-ASCII characters.
    input = bytes(input, "UTF-8")
    input = input.decode("ascii", "ignore")
    cleanedTokens = []
    for item in input.split(' '):
        item = item.strip(string.punctuation)
        # Keep multi-character words plus the one-letter words "a" and "i".
        if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
            cleanedTokens.append(item)
    return cleanedTokens
def ngrams(input, n):
    """Break *input* text into the ordered list of its n-grams.

    The text is first tokenised with cleanInput(); each n-gram is a list
    of n consecutive tokens.  Returns an empty list when fewer than n
    tokens survive cleaning.
    """
    tokens = cleanInput(input)
    last_start = len(tokens) - n + 1
    return [tokens[start:start + n] for start in range(last_start)]
# Fetch the Wikipedia article, extract the body text, and report its 2-grams.
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id": 'mw-content-text'}).get_text()
# BUG FIX: the original stored the result in a variable named `ngrams`,
# shadowing the ngrams() function and making it uncallable afterwards;
# also renamed `bsoj` to `bsObj` for consistency with the later snippets.
bigrams = ngrams(content, 2)
print(bigrams)
print("2-grams count is: " + str(len(bigrams)))
输出
re.sub(pattern,repl,string,count = 0,flags = 0 )
使用替换值repl替换string中最左侧的、未重叠的pattern的出现位置。repl可以是字符串或函数。
>>> re.sub(r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):',
...        r'static PyObject*\npy_\1(void)\n{',
...        'def myfunc():')
'static PyObject*\npy_myfunc(void)\n{'
如果repl是一个函数,则会为模式的每个非重叠的出现调用它。该函数接受单个匹配对象参数,并返回替换字符串。例如:
>>> def dashrepl(matchobj):
...     if matchobj.group(0) == '-': return ' '
...     else: return '-'
>>> re.sub('-{1,2}', dashrepl, 'pro----gram-files')
'pro--gram files'
>>> re.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam', flags=re.IGNORECASE)
'Baked Beans & Spam'
pattern 可以是字符串,也可以是已编译的正则表达式对象(Pattern 对象)。
可选参数count是要替换的模式最大出现次数;count必须是非负整数。如果省略或为零,则将替换所有出现位置。仅当与前一个空匹配不相邻时,才会替换模式的空匹配,因此 sub('x*', '-', 'abxd') 返回 '-a-b--d-'。
在字符串型repl参数中,除了上面描述的字符转义和反向引用之外,\g&lt;name&gt; 还将使用由 (?P&lt;name&gt;...) 语法定义的命名组所匹配的子字符串。\g&lt;number&gt; 使用相应的组号;因此 \g&lt;2&gt; 等同于 \2,但在诸如 \g&lt;2&gt;0 的替换文本中并不含糊——而 \20 将被解释为对组20的引用,而不是对组2的引用后跟文字字符'0'。反向引用 \g&lt;0&gt; 替换为 RE 匹配的整个子字符串。
纯文本
from urllib.request import urlopen
# Plain-text pages need no HTML parsing: read the response body directly
# and print the raw bytes as returned by the server.
response = urlopen("txt URL")
print(response.read())
UTF-8读取
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch a page and print the text of its main content <div>.
html = urlopen("url")
bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id": "mw-content-text"}).get_text()
# FIX: the original encoded `content` to UTF-8 bytes and immediately
# decoded them back with the same codec — an identity round-trip for a
# str that cannot change the text; the dead code is removed.
print(content)
读取CSV文件
from urllib.request import urlopen
from io import StringIO
import csv
# Download the CSV, drop any non-ASCII bytes, and parse it row by row
# into dictionaries keyed by the header line.
raw = urlopen("csv URL").read().decode('ascii', 'ignore')
reader = csv.DictReader(StringIO(raw))
print(reader.fieldnames)
for record in reader:
    print(record)
PDF文件(这里需要pip3 install PDFMiner3K)
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
from urllib.request import urlopen
def readPDF(pdfFile):
    """Extract the plain-text content of an open PDF file object.

    :param pdfFile: a binary file-like object containing a PDF document
    :return: the extracted text as a str
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    # FIX: close the converter even if process_pdf raises, so the device
    # is never leaked on a malformed PDF.
    try:
        process_pdf(rsrcmgr, device, pdfFile)
    finally:
        device.close()
    content = retstr.getvalue()
    retstr.close()
    return content
# Fetch a remote PDF and print its extracted text.
pdfFile = urlopen("PDF url")
print(readPDF(pdfFile))
pdfFile.close()
DOCX文档
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup
# A .docx file is a zip archive; the body text lives in word/document.xml.
wordFile = urlopen("http://pythonscraping.com/pages/AWordDocument.docx").read()
document = ZipFile(BytesIO(wordFile))
xml_content = document.read('word/document.xml')

# Filter pattern: print each text run, wrapping Title-styled runs in <h1>.
wordObj = BeautifulSoup(xml_content.decode('utf-8'), "lxml-xml")
for textElem in wordObj.findAll("w:t"):
    closeTag = ""
    try:
        style = textElem.parent.previousSibling.find("w:pStyle")
    except AttributeError:
        # No preceding sibling carries style info — plain paragraph text.
        style = None
    if style is not None and style["w:val"] == "Title":
        print("<h1>")
        closeTag = "</h1>"
    print(textElem.text)
    print(closeTag)