Day03 Notes
The requests module
Common methods
- get() : send a request and get back the response object
- The response object (see the short sketch below):
  1. response.text : the response body as a string
     - default character encoding: ISO-8859-1
     - set response.encoding = "utf-8" to avoid garbled Chinese text
  2. response.content : the response body as bytes
  3. response.status_code : the HTTP status code of the response
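A minimal sketch of the three response attributes, using the Baidu homepage that the later examples also request:

import requests

url = "http://www.baidu.com/"
headers = {"User-Agent": "Mozilla5.0/"}
res = requests.get(url, headers=headers)
res.encoding = "utf-8"        # override the ISO-8859-1 default before reading res.text
print(res.status_code)        # e.g. 200
print(type(res.text))         # <class 'str'>
print(type(res.content))      # <class 'bytes'>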
- GET without query parameters:
  res = requests.get(url, headers=headers)
- GET with query parameters:
  params = {"wd": "美女"}
  res = requests.get(url, params=params, headers=headers)
for example:
import requests

url = "http://www.baidu.com/s?"
headers = {"User-Agent": "Mozilla5.0/"}
s = input("Enter the search term: ")
# the params argument must be a dict; requests URL-encodes it automatically
wd = {"wd": s}
res = requests.get(url, params=wd, headers=headers)
res.encoding = "utf-8"
print(res.text)
- post() : the form data goes in the data parameter
  data = {}  # data is a plain dict; requests encodes it, no need to convert it to bytes
for example:
import requests
import json

# put the form data into a dict; requests handles the encoding
word = input("Enter the text to translate: ")
data = {"i": word,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": "1536648321283",
        "sign": "1e7948e25551448dbfb7184f23dc126c",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_REALTIME",
        "typoResult": "false"}
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {"User-Agent": "Mozilla5.0/"}
res = requests.post(url, data=data, headers=headers)
res.encoding = "utf-8"
result = res.text
# json.loads() : JSON-formatted string -> Python dict
result_dict = json.loads(result)
r = result_dict["translateResult"][0][0]["tgt"]
print(r)
Proxies : proxies
- The second round in the fight between crawlers and anti-crawler measures
- Sites that publish proxy IPs:
  - 西刺代理
  - 快代理
  - 全网代理
- Normal proxy : proxies = {"protocol": "IP:port"}
  proxies = {"http": "222.221.11.119:3128"}
for example:
import requests

url = "http://www.taobao.com/"
proxies = {"http": "222.221.11.119:3128"}
headers = {"User-Agent": "Mozilla5.0/"}
res = requests.get(url, proxies=proxies, headers=headers)
res.encoding = "utf-8"
print(res.text)
- Private (authenticated) proxy : proxies = {"protocol": "protocol://username:password@IP:port"}
  proxies = {"http": "http://309435365:szayclhp@114.67.228.126:16819"}
for example:
import requests

url = "http://www.taobao.com/"
headers = {"User-Agent": "Mozilla5.0/"}
proxies = {"http": "http://309435365:szayclhp@114.67.228.126:16819"}  # 114.67.228.126:16819
res = requests.get(url, proxies=proxies, headers=headers)
res.encoding = "utf-8"
print(res.status_code)
Scraping Lianjia into MongoDB
import requests
import re
import pymongo

class LianJiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.headers = {"User-Agent": "Mozilla5.0/"}
        self.proxies = {"http": "http://309435365:szayclhp@114.67.228.126:16819"}
        self.page = 1
        # connection object
        self.conn = pymongo.MongoClient("localhost", 27017)
        # database object (database name: Lianjia)
        self.db = self.conn.Lianjia
        # collection object (collection name: housePrice)
        self.myset = self.db.housePrice

    # fetch a page
    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    # parse the page with a regular expression
    def parsePage(self, html):
        p = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>', re.S)
        r_list = p.findall(html)
        # e.g. [("首科花园", "595"), (), ()]
        self.writeToMongo(r_list)

    # save to MongoDB
    def writeToMongo(self, r_list):
        for r_tuple in r_list:
            d = {"houseName": r_tuple[0].strip(),
                 "housePrice": float(r_tuple[1].strip()) * 10000}
            # insert_one() works in pymongo 3/4 (the old insert() was removed)
            self.myset.insert_one(d)
        print("Saved to MongoDB")
        # check in the mongo shell:
        #   show dbs;
        #   use Lianjia;
        #   show tables;
        #   db.housePrice.find().pretty();

    # main loop
    def workOn(self):
        while True:
            print("Crawling page %d ..." % self.page)
            # build the URL for this page
            url = self.baseurl + str(self.page) + "/"
            self.getPage(url)
            print("Page %d crawled" % self.page)
            c = input("Continue crawling? (y/n): ")
            if c.strip().lower() == "y":
                self.page += 1
            else:
                print("Done. Thanks for using!")
                break

if __name__ == "__main__":
    spider = LianJiaSpider()
    spider.workOn()
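A quick read-back sketch, assuming the same local MongoDB instance and the Lianjia/housePrice names used above, to confirm the documents were written:

import pymongo

conn = pymongo.MongoClient("localhost", 27017)
myset = conn.Lianjia.housePrice
# print the first few stored documents
for doc in myset.find().limit(5):
    print(doc["houseName"], doc["housePrice"])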
Scraping Lianjia into MySQL
import requests
import re
import pymysql
import warnings

class LianJiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.headers = {"User-Agent": "Mozilla5.0/"}
        self.proxies = {"http": "http://309435365:szayclhp@114.67.228.126:16819"}
        self.page = 1
        # database connection object
        self.db = pymysql.connect(host="localhost", user="root",
                                  password="123456", charset="utf8")
        # cursor object
        self.cursor = self.db.cursor()

    # fetch a page
    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        print("Page fetched, parsing ...")
        self.parsePage(html)

    # parse the page with a regular expression
    def parsePage(self, html):
        p = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>', re.S)
        r_list = p.findall(html)
        # e.g. [("首科花园", "595"), (), ()]
        print("Saving to MySQL ...")
        self.writeToMysql(r_list)

    # save to the MySQL database
    def writeToMysql(self, r_list):
        c_db = "create database if not exists spider;"
        u_db = "use spider;"
        c_tab = "create table if not exists lianjia(\
                 id int primary key auto_increment,\
                 name varchar(30),\
                 price decimal(20,2))charset=utf8;"
        # turn "already exists" warnings into exceptions so they can be ignored
        warnings.filterwarnings("error")
        try:
            self.cursor.execute(c_db)
        except Warning:
            pass
        self.cursor.execute(u_db)
        try:
            self.cursor.execute(c_tab)
        except Warning:
            pass
        # r_list : [("首科花园", "595"), (), ()]
        for r_tuple in r_list:
            s_insert = "insert into lianjia(name,price) values('%s','%s');" % \
                       (r_tuple[0].strip(), float(r_tuple[1].strip()) * 10000)
            self.cursor.execute(s_insert)
            self.db.commit()
        print("Page %d saved to the database" % self.page)

    # main loop
    def workOn(self):
        while True:
            print("Crawling page %d ..." % self.page)
            # build the URL for this page
            url = self.baseurl + str(self.page) + "/"
            self.getPage(url)
            print("Page %d crawled" % self.page)
            c = input("Continue crawling? (y/n): ")
            if c.strip().lower() == "y":
                self.page += 1
            else:
                print("Done. Thanks for using!")
                break

if __name__ == "__main__":
    spider = LianJiaSpider()
    spider.workOn()
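Likewise, a quick read-back sketch for the MySQL version, assuming the spider database and lianjia table created above:

import pymysql

db = pymysql.connect(host="localhost", user="root",
                     password="123456", database="spider", charset="utf8")
cursor = db.cursor()
cursor.execute("select name, price from lianjia limit 5;")
for name, price in cursor.fetchall():
    print(name, price)
cursor.close()
db.close()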
Saving the Lianjia data to a local file
import requests
import re

class LianJiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.headers = {"User-Agent": "Mozilla5.0/"}
        self.proxies = {"http": "http://309435365:szayclhp@114.67.228.126:16819"}
        self.page = 1

    # fetch a page
    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    # parse the page with a regular expression
    def parsePage(self, html):
        p = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>', re.S)
        r_list = p.findall(html)
        # e.g. [("首科花园", "595"), (), ()]
        self.writePage(r_list)

    # append the results to a local text file
    def writePage(self, r_list):
        with open("链家二手房.txt", "a") as f:
            for r_tuple in r_list:       # r_tuple: ("首科花园", "595")
                for r_str in r_tuple:
                    f.write(r_str.strip() + " ")
                f.write("\n")

    # main loop
    def workOn(self):
        while True:
            print("Crawling page %d ..." % self.page)
            # build the URL for this page
            url = self.baseurl + str(self.page) + "/"
            self.getPage(url)
            print("Page %d crawled" % self.page)
            c = input("Continue crawling? (y/n): ")
            if c.strip().lower() == "y":
                self.page += 1
            else:
                print("Done. Thanks for using!")
                break

if __name__ == "__main__":
    spider = LianJiaSpider()
    spider.workOn()
Web client authentication : auth
- auth = ("username", "password")
  auth = ("lht4815", "123456789lht")
  requests.get(url, auth=auth, headers=headers)
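A minimal end-to-end sketch of the auth parameter; the URL and credentials below are placeholders, not a real protected page:

import requests

# placeholder URL protected by HTTP Basic Auth; replace with a real one
url = "http://example.com/protected/"
headers = {"User-Agent": "Mozilla5.0/"}
auth = ("username", "password")   # placeholder credentials
res = requests.get(url, auth=auth, headers=headers)
res.encoding = "utf-8"
print(res.status_code)
print(res.text)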
SSL certificate verification : verify
- verify=True : the default; the SSL certificate is verified
- verify=False : skip certificate verification
for example:
import requests

url = "https://www.12306.cn/mormhweb/"
headers = {"User-Agent": "Mozilla5.0/"}
res = requests.get(url, verify=False, headers=headers)
res.encoding = "utf-8"
print(res.text)
Handler processors (urllib.request)
Definition
A way to build a customized urlopen(); urlopen() itself is simply a special, pre-built opener.
Common methods
build_opener(Handler object)
opener.open(url)
Usage steps
- Create the appropriate Handler object
http_handler = urllib.request.HTTPHandler()
- Create a custom opener object
opener = urllib.request.build_opener(http_handler)
- Send the request with the opener object's open() method (full example below)
Types of Handler processors
- HTTPHandler()
import urllib.request

url = "http://www.baidu.com/"
# 1. create an HTTPHandler object
http_handler = urllib.request.HTTPHandler()
# 2. create a custom opener object
opener = urllib.request.build_opener(http_handler)
# 3. send the request with the opener object's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
- ProxyHandler(proxy dict) : normal proxy
import urllib.request

url = "http://www.baidu.com/"
proxy = {"http": "120.78.196.33:3128"}
# 1. create the ProxyHandler
proxy_handler = urllib.request.ProxyHandler(proxy)
# 2. create a custom opener
opener = urllib.request.build_opener(proxy_handler)
# 3. send the request with open()
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
- ProxyBasicAuthHandler(password manager object) : private (authenticated) proxy
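The notes stop here without an example for this handler; a minimal sketch of the usual pattern, reusing the sample private-proxy credentials from the requests section above (a ProxyHandler is also added so the request actually goes through the proxy):

import urllib.request

url = "http://www.baidu.com/"
proxy_server = "114.67.228.126:16819"   # sample private proxy from the notes
user = "309435365"
password = "szayclhp"

# 1. put the proxy credentials into a password manager
pwd_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
pwd_mgr.add_password(None, proxy_server, user, password)

# 2. route requests through the proxy and let ProxyBasicAuthHandler answer the 407 challenge
proxy_handler = urllib.request.ProxyHandler({"http": proxy_server})
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(pwd_mgr)
opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)

# 3. send the request with the opener
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla5.0/"})
res = opener.open(req)
print(res.read().decode("utf-8"))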