在进行自然语言的深入学习中,很重要的一个过程是从互联网平台上抓取文本和资料。我开始尝试进行网络爬虫。
从最简单的不需要模拟登陆的百度贴吧和豆瓣等开始。
firefox浏览器
相较于chrome浏览器而言,最近发现firefox特别好用,根本不需要什么额外的抓包,就可以实时监测而且可读性很强。
F12调取网络检测器
查看器——查看整个网页的HTML脚本,支持搜索
控制台——get/post请求的参数/响应/cookie
各项参数和记录特别具体
百度贴吧爬虫
# coding=utf-8
# -*- coding: utf-8 -*-
import urllib2
import re
class Tool:
    """Clean scraped Tieba post HTML down to plain text.

    A fixed pipeline of regex substitutions: images and anchors are
    dropped, block-level tags become newlines/tabs, and any leftover
    tag is stripped at the end.
    """
    removeImg = re.compile(r'<img.*?>')               # image tags
    removeAddr = re.compile(r'<a href=.*?>|</a>')     # anchor open/close tags
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')  # block breaks -> newline
    replaceBR = re.compile('<br><br>|<br>')             # line breaks -> newline
    replaceTD = re.compile('<td>')                      # table cell -> tab
    replacePara = re.compile('<p.*?>')                  # paragraph open -> newline+space
    removeExtraTag = re.compile('<.*?>')                # anything left -> dropped

    def replace(self, x):
        """Return *x* with HTML markup removed and whitespace normalized.

        Substitution order matters (e.g. <br><br> must collapse before
        the catch-all tag stripper runs), so the pipeline below mirrors
        the required sequence exactly.
        """
        pipeline = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in pipeline:
            x = pattern.sub(replacement, x)
        return x.strip()
class BaiduTieba:
    """Crawler for a single Baidu Tieba thread (Python 2 / urllib2).

    Downloads every page of a thread, extracts the title and post
    bodies with regexes, cleans the HTML through Tool, and writes the
    posts to '<title>.txt' (or a default file name).
    """

    def __init__(self, url, seelz, floortag=1):
        self.url = url  # base thread URL, e.g. http://tieba.baidu.com/p/<id>
        self.seeLz = '?see_lz=' + str(seelz)  # 1 = only the thread starter's posts
        self.tool = Tool()  # HTML tag stripper
        self.file = None  # output handle, opened by newFile()
        self.defaultTitle = "百度贴吧"  # fallback file name when the title regex fails
        self.floortag = floortag  # truthy -> prepend a floor marker to each post
        self.floor = 1  # running floor counter across all pages

    def getPageContent(self, pagenum):
        """Fetch page *pagenum* of the thread and return it as unicode.

        Returns None when the request fails (the reason is printed);
        callers must check for None before using the result.
        """
        url = self.url + self.seeLz + '&pn=' + str(pagenum)
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0'
        headers = {'User-Agent': user_agent}
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            return response.read().decode('utf-8')
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):  # not every URLError carries .reason
                print(e.reason)
            return None  # explicit: fetch failed (was an implicit None)

    def get_title(self, pagenum=1):
        """Return the thread title, or None when the page/regex fails."""
        content = self.getPageContent(pagenum)
        if content is None:  # guard: re.search(None) would raise TypeError
            return None
        pattern_title = re.compile(r'<h3 class="core_title_txt pull-left text-overflow .*?>(.*?)</h3>', re.S)
        title = re.search(pattern_title, content)
        if title:
            return title.group(1).strip()
        return None  # bug fix: original did `print None` instead of returning it

    def get_author(self, pagenum=1):
        """Return the thread starter's name, or None when not found."""
        content = self.getPageContent(pagenum)
        if content is None:
            return None
        pattern_author = re.compile(r'<div class="louzhubiaoshi j_louzhubiaoshi" author="(.*?)">')
        author = re.search(pattern_author, content)
        if author:
            return author.group(1).strip()
        return None

    def get_reply_page(self, pagenum=1):
        """Return (reply_count, page_count) as strings, or None on failure."""
        content = self.getPageContent(pagenum)
        if content is None:
            return None
        pattern_page = re.compile(
            r'<li class="l_reply_num".*? style="margin-right:\dpx">(.*?)</span>.*?<span.*?>(.*?)</span>')
        totalpage = re.search(pattern_page, content)
        if totalpage:
            return totalpage.group(1).strip(), totalpage.group(2).strip()
        return None

    def getContent(self, pagenum):
        """Return a list with ONE utf-8 encoded entry per post on the page.

        Bug fixes vs. the original: the floor marker and the post body
        are a single list item (the old two-items-per-floor layout made
        writedata2File double-count floors), the marker is only added
        when self.floortag is truthy (the flag was stored but ignored),
        and the whole entry is encoded (the raw unicode marker used to
        raise UnicodeEncodeError on write).
        """
        content = self.getPageContent(pagenum)
        if content is None:
            return []  # failed fetch contributes nothing instead of crashing
        pattern_content = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)
        items = re.findall(pattern_content, content)
        contents = []
        for floor, item in enumerate(items, 1):
            entry = u''
            if self.floortag:
                entry += str(floor) + u'楼——————————\n'
            entry += '\n' + self.tool.replace(item) + '\n'
            contents.append(entry.encode('utf-8'))
        return contents

    def writedata2File(self, contents):
        """Write each floor entry to self.file, reporting progress."""
        for item in contents:
            print(u"正在写入" + str(self.floor) + u"楼的内容")
            self.file.write(item)
            self.floor += 1  # one list item == one floor (see getContent)

    def newFile(self, title):
        """Open '<title>.txt' (or the default name) for writing."""
        if title:
            self.file = open(title + '.txt', 'w+')
        else:
            self.file = open(self.defaultTitle + '.txt', 'w+')

    def start_spider(self, pagenum=1):
        """Crawl the whole thread starting from page *pagenum*.

        Uses the first page to learn the title and total page count,
        then fetches every page and writes the cleaned posts to a file.
        """
        # NOTE: the original also fetched the raw page and the author
        # here into unused locals, costing two extra HTTP requests.
        title = self.get_title(pagenum)
        self.newFile(title)
        totalpage = self.get_reply_page(pagenum)
        # bug fix: the page count is a STRING; the original did
        # range(1, totalpage[1]+1) which raises TypeError. Fall back to
        # a single page when the count could not be parsed.
        pagecount = int(totalpage[1]) if totalpage else 1
        totalcontent = []
        for i in range(1, pagecount + 1):
            totalcontent += self.getContent(i)
        try:
            self.writedata2File(totalcontent)
        except IOError as e:
            print('写入文件发生异常' + e.message)
        finally:
            if self.file:
                self.file.close()  # bug fix: the handle was never closed
            print('写入文件完成')
# 测试记录
tips: 1. re.search() 返回的匹配对象用 .group(0,1,2,3……) 取各分组,group(0) 是整个匹配
2. user_agent 等信息根据 firefox 网络检测器中 get 请求的请求头参数确定