Web Crawling
1. Environment: install Anaconda, an integrated distribution for scientific computing (it bundles many libraries, IPython, etc.)
2. Chrome browser extensions: XPath Helper, Proxy-SwitchyOmega-Chromium-2.5.15
3. Request modules for crawling:
1. Versions
1. Python 2: urllib, urllib2
2. Python 3: urllib.request
2. urllib.request usage
# 1. Build the request object, req
req = urllib.request.Request(url, data=None, headers={})
Parameters:
url      the request URL
data     None for GET requests; for POST requests the data must first be encoded with urllib.parse.urlencode(data).encode("utf-8")
headers  usually needs a User-Agent entry
# 2. Send the request
res = urllib.request.urlopen(req)
# 3. Read the response content
html = res.read().decode("utf-8")
# Methods of the response object (res); a combined example follows this list
1. res.read()    : read the content returned by the server (bytes)
2. res.getcode() : return the HTTP status code
3. res.geturl()  : return the URL that actually served the data
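A minimal sketch putting the three methods together; the URL here is only a placeholder for illustration:
import urllib.request

req = urllib.request.Request("http://httpbin.org/get",
                             headers={"User-Agent": "Mozilla/5.0"})
res = urllib.request.urlopen(req)
print(res.getcode())               # HTTP status code, e.g. 200
print(res.geturl())                # final URL after any redirects
html = res.read().decode("utf-8")  # response body as a str
print(html[:200])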
3. urllib.parse module: URL encoding (a short example follows)
# Encoding helpers
urllib.parse.urlencode(dict)   # encode a dict into a query string
urllib.parse.quote(string)     # percent-encode a single string
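A quick sketch of what the two helpers produce (the sample values are only for illustration):
import urllib.parse

params = {"kw": "python 爬虫", "pn": 50}
print(urllib.parse.urlencode(params))    # kw=python+%E7%88%AC%E8%99%AB&pn=50
print(urllib.parse.quote("python 爬虫"))  # python%20%E7%88%AC%E8%99%AB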
4. urllib.error module
Error types: URLError, HTTPError
URLError is usually raised when there is no network connection or the target server does not exist. The exception carries a reason attribute describing the failure; for socket-level errors it contains the error code and error message.
import urllib.request
import urllib.error

req = urllib.request.Request('http://www.pretend_server.com')
try:
    urllib.request.urlopen(req)
except urllib.error.URLError as e:
    print(e.reason)
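HTTPError is listed above but not shown; a minimal sketch of catching both (HTTPError is a subclass of URLError, so it has to be caught first; the URL is just a placeholder that returns an error status):
import urllib.request
import urllib.error

req = urllib.request.Request('http://httpbin.org/status/404')
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print(e.code, e.reason)   # e.g. 404 NOT FOUND
except urllib.error.URLError as e:
    print(e.reason)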
5. Request methods and examples
1. GET
Characteristics: the query parameters are visible in the URL
# Example
import urllib.request
import urllib.parse

class BaiduSpider:
    def __init__(self):
        self.baseurl = "http://tieba.baidu.com/f?"
        self.headers = {"User-Agent": "Mozilla/5.0 "}

    # Fetch a page
    def get_page(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode("utf-8")
        return html

    # Parse a page
    def parse_page(self):
        pass

    # Save the data
    def write_page(self, filename, html):
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)

    # Main entry point
    def workOn(self):
        name = input("请输入贴吧名称:")
        begin = int(input("请输入起始页:"))
        end = int(input("请输入终止页:"))
        # Build the query string for the tieba front page
        kw = urllib.parse.urlencode({"kw": name})
        for page in range(begin, end + 1):
            # Build the full URL of the page-th page
            pn = (page - 1) * 50
            url = self.baseurl + kw + "&pn=" + str(pn)
            html = self.get_page(url)
            filename = "第" + str(page) + "页.html"
            self.write_page(filename, html)
            print("第 %d 页爬取成功" % page)

if __name__ == "__main__":
    spider = BaiduSpider()
    spider.workOn()
2. POST
1. Characteristics: the URL does not change; the data is carried in the form body
2. data: the form data must be submitted as bytes, not str; encode it with urllib.parse.urlencode(data).encode("utf-8"), as shown in the sketch below
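A minimal sketch of just the encoding step (the field names are taken from the example that follows):
import urllib.parse

form = {"i": "hello", "doctype": "json"}
data = urllib.parse.urlencode(form).encode("utf-8")
print(type(data), data)   # <class 'bytes'> b'i=hello&doctype=json'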
# Example
import urllib.request
import urllib.parse
import json

# Read user input
key = input("请输入要翻译的内容:")
# Put the Form Data into one dict
data = {
    "i": key,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "15458120942800",
    "sign": "108feafc7c01c7461a41034463a8df9b",
    "ts": "1545812094280",
    "bv": "363eb5a1de8cfbadd0cd78bd6bd43bee",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTIME",
    "typoResult": "false"
}
# Convert data to bytes
data = urllib.parse.urlencode(data).encode("utf-8")
# Send the request, get the response, read the content
# The URL is the POST address captured with F12, with the "_o" removed from translate_o
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {"User-Agent": "Mozilla/5.0"}
req = urllib.request.Request(url, data=data, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode("utf-8")
# Parse the JSON string into a Python dict
rDict = json.loads(html)
result = rDict["translateResult"][0][0]["tgt"]
print(result)
6. Matching the response content with regular expressions (a small demo follows the snippet)
import re
# 1. Compile a pattern object
regex = re.compile(r'pattern')
result = regex.findall(html)
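A small sketch of how compile/findall with re.S returns a list of tuples, using a made-up HTML snippet:
import re

html = '''
<div class="item"><a title="Title A">x</a><p class="desc">Body A</p></div>
<div class="item"><a title="Title B">x</a><p class="desc">Body B</p></div>
'''
# re.S lets "." also match newlines, so one pattern can span several lines
regex = re.compile(r'title="(.*?)".*?class="desc">(.*?)</p>', re.S)
print(regex.findall(html))   # [('Title A', 'Body A'), ('Title B', 'Body B')]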
# Example
import urllib.request
import re

class NeihanSpider:
    def __init__(self):
        self.baseurl = "https://www.neihan8.com/njjzw/"
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)"}
        self.page = 2

    # Fetch a page
    def get_page(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode("utf-8")
        self.parse_page(html)

    # Parse a page
    def parse_page(self, html):
        regex = re.compile('<div class="text-.*?title="(.*?)">.*?class="desc">(.*?)</div>', re.S)
        rList = regex.findall(html)
        # print(rList)
        # rList looks like [("动物贴墙","海豹"), (), (), ()]
        self.write_page(rList)

    # Save the data
    def write_page(self, rList):
        for rTuple in rList:
            with open("内涵.txt", "a") as f:
                f.write(rTuple[0].strip() + "\n")
                f.write(rTuple[1].strip() + "\n\n")

    # Main entry point
    def workOn(self):
        self.get_page(self.baseurl)
        while True:
            c = input("成功,是否继续(y/n):")
            if c.strip().lower() == "y":
                url = self.baseurl + "index_" + str(self.page) + ".html"
                self.get_page(url)
                self.page += 1
            else:
                print("爬取结束")
                break

if __name__ == "__main__":
    spider = NeihanSpider()
    spider.workOn()
7. csv module workflow (the steps are combined into one sketch below)
1. Open the csv file
with open("测试.csv", "w", newline="", encoding="gb18030") as f:
2. Create the writer object
    writer = csv.writer(f)
3. Write a row of data
    writer.writerow(list)
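The three steps combined into one runnable sketch (the filename and rows are placeholders):
import csv

rows = [["霸王别姬", "张国荣", "1993"], ["movie2", "actor2", "2000"]]
# newline="" prevents blank lines between rows on Windows
with open("测试.csv", "w", newline="", encoding="gb18030") as f:
    writer = csv.writer(f)
    for row in rows:
        writer.writerow(row)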
# Example
import urllib.request
import re
import csv

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "https://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.offset = 0

    # Fetch a page
    def get_page(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode("utf-8")
        self.parse_page(html)

    # Parse a page
    def parse_page(self, html):
        # Compile the pattern object
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        rList = p.findall(html)
        # rList looks like [("霸王别姬","张国荣","1993"), ...]
        self.write_to_csv(rList)

    # Save the data
    def write_to_csv(self, rList):
        for r in rList:
            # r = list(r)
            r = [r[0].strip(), r[1].strip(), r[2].strip()]
            with open("猫眼.csv", "a", newline="", encoding="gb18030") as f:
                # Create the writer object
                writer = csv.writer(f)
                # Call writerow()
                writer.writerow(r)

    # Main entry point
    def workOn(self):
        while True:
            c = input("爬取按y,退出按q:")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.offset)
                self.get_page(url)
                self.offset += 10
            else:
                print("爬取结束")
                break
        # for i in range(0, 91, 10):
        #     url = self.baseurl + str(i)
        #     self.get_page(url)
        #     time.sleep(0.1)

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()