post请求
一般get请求的话,url是会有变化的,如果没有的话就不是get请求。
urllib是怎么区分get和post的
点进去看源码
data是None时,它就是get请求;如果data有值,它就是post请求
urllib的ajax请求
像这种url没有变化的,一定是ajax请求
那遇到这种url没有变化的应该怎么办呢
如果遇到是ajax请求的话
- 先找到请求接口
- 找请求参数 ,既然是post,那么他的参数一定在form表单里面
拿有道来说
分析完之后就写代码
盘它
# POST a translation request to Youdao's legacy translate endpoint and
# print the JSON response body.
import urllib.request
import urllib.parse

# BUG FIX: the header key was 'User_Agent' (underscore). HTTP header names
# use hyphens, so the underscore variant is not sent as the User-Agent
# header at all.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}
# Target URL of the POST (the old endpoint kept for convenience: it does not
# require the `sign` parameter, which is encrypted in the current version).
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null"
# Form payload taken from the browser's network panel (the AJAX request body).
formdata = {
    'i': '你好',
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION',
    'typoResult': 'false',
}
# The form must be urlencoded and then encoded to bytes before sending.
formdata = urllib.parse.urlencode(formdata).encode('utf-8')
# Per the urllib source, a non-None `data` turns the request into a POST,
# so we build a Request object and pass the payload explicitly.
req = urllib.request.Request(url, data=formdata, headers=headers)
# Send the request and fetch the response.
response = urllib.request.urlopen(req)
# Print the decoded response body.
print(response.read().decode('utf-8'))
还有的网页,它的https证书是未经认证(不受信任)的。如果遇到这种不安全的https网站,需要忽略证书验证
就比如下面这种情况
https是安全的传输协议,它有一个ssl证书验证
这个验证需要CA证书,需要这个机构审核,通过之后,网址前面会有一个绿色的锁。
那我们爬虫的话,需要忽略这个验证
# Fetch a page over HTTPS while skipping SSL certificate verification
# (needed for sites whose certificate is not trusted by the client).
import urllib.request
# Python SSL handling module.
import ssl

# Build a context that skips verification of unverified SSL certificates.
context = ssl._create_unverified_context()
# Target url.
url = "https://www.12306.cn/mormhweb/"
# Request headers.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
# BUG FIX: the original only did `from urllib import request` but then
# called `urllib.request.Request`, which raises NameError because the name
# `urllib` was never bound; it also rebound `request`, shadowing the
# imported module. Import `urllib.request` and use a distinct variable.
req = urllib.request.Request(url=url, headers=headers)
# Pass `context` to urlopen() to skip certificate verification.
response = urllib.request.urlopen(req, context=context)
html = response.read().decode()
print(html)
这个验证出现了再加
不出现就不加
案例
拉钩网爬取
存在pycharm的文件中
遇到这种的,就在请求头带参数,挨个带。
要是都不行,只能换ip了
#https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false
import urllib.request
import urllib.parse
import re
import ssl
import json
class LaGouSpider():
    """Crawl Lagou's position-search AJAX endpoint (POST) and save the
    merged JSON result to ``value.json``.

    Anti-crawling note: Lagou checks request headers, so User-Agent,
    Referer and Cookie are all sent; if it still fails, the only option
    left is switching IP.
    """

    def __init__(self):
        self.url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
        # Skip SSL certificate verification for this target.
        self.context = ssl._create_unverified_context()
        # BUG FIXES in the header dict:
        #  - 'User_Agent' -> 'User-Agent' (HTTP header names use hyphens);
        #  - the Referer URL had spaces pasted into it, making it invalid;
        #  - the cookie key was 'Cookie:' (stray colon inside the key).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
            'Referer': 'https://www.lagou.com/jobs/list_python/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=',
            'Cookie': 'user_trace_token=20200610152531-cf8b029a-c3c2-4eba-87a8-9ddbfcc820eb; _ga=GA1.2.1440924686.1591773934; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1591773934; LGUID=20200610152533-c7bc33b9-beb3-4165-a454-9c2299415176; _gid=GA1.2.1630897312.1591774005; LG_LOGIN_USER_ID=343d209a30f90ddb4cff48fa39994c8b707e66f4f9a776a38cc8ab81ceeeb620; LG_HAS_LOGIN=1; _putrc=B77E2932A555A74C123F89F2B170EADC; JSESSIONID=ABAAAECABFAACEA8C9847C1C3638219CF8C5A51F793AFEE; login=true; hasDeliver=0; privacyPolicyPopup=false; sensorsdata2015session=%7B%7D; unick=%E7%94%A8%E6%88%B71055; gate_login_token=fec807de2c19106e1e04ad73f71d0d5e4c0d9d5f3f104cac8a33f5f08058e043; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2217463452%22%2C%22%24device_id%22%3A%221729d1fc5a675-03668b3c0c94c2-4313f6a-1327104-1729d1fc5a7613%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2280.0.3987.132%22%7D%2C%22first_id%22%3A%221729d1fc5a675-03668b3c0c94c2-4313f6a-1327104-1729d1fc5a7613%22%7D; WEBTJ-ID=20200610155018-1729d35592473f-089c543ae1d541-4313f6a-1327104-1729d355925b17; RECOMMEND_TIP=true; index_location_city=%E5%85%A8%E5%9B%BD; _gat=1; LGSID=20200611123341-ba4ad16c-7074-4870-b5a9-c8a74325eb3f; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist%5Fpython%2Fp-city%5F0%3F%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; X_MIDDLE_TOKEN=1d190f2b9c21e7822cf57c588fa8179c; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; X_HTTP_TOKEN=1b104ff15e1198b3292058195127f342ce2fa9871f; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1591850293; LGRID=20200611123813-91ab31d2-20c1-4813-b708-e6eb78639b73; SEARCH_ID=a112c0596f524db69fd01df2a021e334'
        }
        self.result = []  # accumulates the merged JSON result across pages

    def send_request(self, form_data):
        """POST `form_data` (urlencoded bytes) to the API; return the
        response on HTTP 200, otherwise None."""
        request = urllib.request.Request(url=self.url, data=form_data, headers=self.headers)
        response = urllib.request.urlopen(request, context=self.context)
        if response.status == 200:
            return response

    def save_content(self, conntent):
        """Append raw JSON text to value.json (inspect it e.g. on a JSON
        pretty-print site)."""
        # Explicit encoding so the Chinese payload round-trips on any OS.
        with open('value.json', 'a', encoding='utf-8') as f:
            f.write(conntent)

    def parse_content(self, response):
        """Decode one JSON page and merge its position list into
        self.result, then append the raw page to the output file."""
        content = response.read().decode()  # the API returns a JSON string
        dic_result = json.loads(content)  # parse into a Python dict
        if not self.result:
            # First page: keep the whole structure as the merge base.
            self.result = dic_result
        else:
            # Later pages: extend the position list of the merge base.
            self.result['content']['positionResult']['result'] = self.result.get('content').get('positionResult').get('result') + \
                dic_result.get('content').get('positionResult').get('result')
        self.save_content(content)

    def start(self):
        """Fetch pages 1..2 and write the merged result at the end."""
        for i in range(1, 3):
            # BUG FIX: the original sent the literal string 'i' as the page
            # number ("pn": 'i'), so every request asked for the same,
            # invalid page. Send the loop variable instead.
            form_data = {
                "first": "true",
                "pn": str(i),
                "kd": 'python',
            }
            # urlencode and encode to bytes (required for POST data).
            form_data = urllib.parse.urlencode(form_data).encode('utf8')
            response = self.send_request(form_data)
            if response:
                self.parse_content(response)
        self.save_content(json.dumps(self.result))
if __name__ == '__main__':
    # Entry point: build the spider and kick off the crawl.
    LaGouSpider().start()
爬猫眼
存数据库
#https://maoyan.com/board/6?offset=20
import urllib.request
import urllib.parse
import re
import ssl
import json
import pymysql
import time
#注意,在网址中,只有数字和字母不编码,中文需要编码。
# 在浏览器的时候那个网址的中文部分是编码的,你是看不明白的
class MaoYanSpider():
    """Scrape the Maoyan movie board (https://maoyan.com/board/6) and
    store one row per movie in the MySQL table ``cat``.

    Note: in URLs only digits and ASCII letters go unencoded; Chinese
    text must be percent-encoded (the browser shows it encoded).
    """

    def __init__(self):
        self.url = 'https://maoyan.com/board/6?offset='
        # BUG FIX: header key was 'User_Agent'; HTTP headers use hyphens,
        # so the User-Agent header was never actually sent.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        }
        # Skip SSL certificate verification for this target.
        self.context = ssl._create_unverified_context()
        # MySQL connection used by save_mysql().
        self.connection = pymysql.connect(host='localhost',
                                          user='root',
                                          password='123456',
                                          db='sky_cat',
                                          )
        self.cursor = self.connection.cursor()

    def send_request(self, full_url):
        """GET `full_url`; return the response on HTTP 200, else None."""
        request = urllib.request.Request(url=full_url, headers=self.headers)
        response = urllib.request.urlopen(request, context=self.context)
        if response.status == 200:
            return response

    def save_content(self):
        pass

    def save_mysql(self, content):
        """Insert one movie dict into table `cat` (one column per key).

        Column names come from the dict keys; values are bound as query
        parameters (never string-interpolated into the SQL).
        """
        # BUG FIX: `rank` is a reserved word in MySQL 8.0+, so unquoted
        # column names make the INSERT fail with a syntax error —
        # backtick-quote every identifier.
        columns = ','.join('`%s`' % k for k in content.keys())
        placeholders = ','.join(['%s'] * len(content))
        sql = 'insert into cat (%s) values (%s)' % (columns, placeholders)
        print(sql)
        self.cursor.execute(sql, [v for v in content.values()])
        # Execute, then commit the transaction.
        self.connection.commit()

    def parse_content(self, response):
        """Regex-extract rank/pic/name/actor/time/score from the board
        HTML and persist each movie."""
        content = response.read().decode()
        pattern = re.compile(r'<dd>.*?<i.*?>(.*?)</i>.*?<img\sdata-src="(.*?)".*?alt="(.*?)".*?<p\sclass="star">(.*?)</p>.*?>(.*?)<.*?<i.*?>(.*?)</i>.*?>(.*?)<.*?</dd>',
                             re.S)
        result = re.findall(pattern, content)
        for movie in result:
            dict_movie = {}
            dict_movie['rank'] = movie[0]
            dict_movie['pic'] = movie[1]
            dict_movie['name'] = movie[2]
            dict_movie['actor'] = movie[3]
            dict_movie['time'] = movie[4]
            # The score is split into integer and fraction parts in the
            # HTML; join them into one string.
            dict_movie['grate'] = movie[5] + movie[6]
            self.save_mysql(dict_movie)

    def start(self):
        """Crawl board pages 1..5 (offset 0, 10, ..., 40), one per second."""
        for i in range(1, 6):
            offset = (i - 1) * 10
            full_url = self.url + str(offset)
            print(full_url)  # sanity-check the generated URL
            response = self.send_request(full_url)
            # BUG FIX: send_request returns None on non-200 responses;
            # guard before parsing instead of crashing on None.
            if response:
                self.parse_content(response)
            time.sleep(1)
if __name__ == '__main__':
    # Entry point: build the spider and kick off the crawl.
    MaoYanSpider().start()
csv存储
csv既可以写列表也可以写字典
上面是保存到了数据库
那如果要保存到csv呢
#https://maoyan.com/board/6?offset=20
import urllib.request
import urllib.parse
import re
import ssl
import json
import pymysql
import time
import csv
#注意,在网址中,只有数字和字母不编码,中文需要编码。
# 在浏览器的时候那个网址的中文部分是编码的,你是看不明白的
class MaoYanSpider():
    """Scrape the Maoyan movie board and store each movie both in the
    MySQL table ``cat`` and in ``movies.csv``.

    Note: in URLs only digits and ASCII letters go unencoded; Chinese
    text must be percent-encoded (the browser shows it encoded).
    """

    def __init__(self):
        self.url = 'https://maoyan.com/board/6?offset='
        # BUG FIX: header key was 'User_Agent'; HTTP headers use hyphens,
        # so the User-Agent header was never actually sent.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        }
        # Skip SSL certificate verification for this target.
        self.context = ssl._create_unverified_context()
        # MySQL connection used by save_mysql().
        self.connection = pymysql.connect(host='localhost',
                                          user='root',
                                          password='123456',
                                          db='sky_cat',
                                          )
        self.cursor = self.connection.cursor()
        # CSV output. BUG FIX: the csv module requires newline='' on the
        # file object (otherwise blank rows appear on Windows); an explicit
        # encoding keeps the Chinese fields stable across platforms.
        csv_file = open('movies.csv', 'a', newline='', encoding='utf-8')
        filenames = ['rank', 'pic', 'name', 'actor', 'time', 'grate']
        self.writer = csv.DictWriter(csv_file, filenames)
        self.writer.writeheader()  # write the header row once

    def send_request(self, full_url):
        """GET `full_url`; return the response on HTTP 200, else None."""
        request = urllib.request.Request(url=full_url, headers=self.headers)
        response = urllib.request.urlopen(request, context=self.context)
        if response.status == 200:
            return response

    def save_csv(self, dic_movies):
        """Append one movie dict as a CSV row."""
        self.writer.writerow(dic_movies)

    def save_mysql(self, content):
        """Insert one movie dict into table `cat` (one column per key).

        Column names come from the dict keys; values are bound as query
        parameters (never string-interpolated into the SQL).
        """
        # BUG FIX: `rank` is a reserved word in MySQL 8.0+, so unquoted
        # column names make the INSERT fail with a syntax error —
        # backtick-quote every identifier.
        columns = ','.join('`%s`' % k for k in content.keys())
        placeholders = ','.join(['%s'] * len(content))
        sql = 'insert into cat (%s) values (%s)' % (columns, placeholders)
        print(sql)
        self.cursor.execute(sql, [v for v in content.values()])
        # Execute, then commit the transaction.
        self.connection.commit()

    def parse_content(self, response):
        """Regex-extract rank/pic/name/actor/time/score from the board
        HTML and persist each movie to MySQL and CSV."""
        content = response.read().decode()
        pattern = re.compile(r'<dd>.*?<i.*?>(.*?)</i>.*?<img\sdata-src="(.*?)".*?alt="(.*?)".*?<p\sclass="star">(.*?)</p>.*?>(.*?)<.*?<i.*?>(.*?)</i>.*?>(.*?)<.*?</dd>',
                             re.S)
        result = re.findall(pattern, content)
        for movie in result:
            dict_movie = {}
            dict_movie['rank'] = movie[0]
            dict_movie['pic'] = movie[1]
            dict_movie['name'] = movie[2]
            dict_movie['actor'] = movie[3]
            dict_movie['time'] = movie[4]
            # The score is split into integer and fraction parts in the
            # HTML; join them into one string.
            dict_movie['grate'] = movie[5] + movie[6]
            self.save_mysql(dict_movie)
            self.save_csv(dict_movie)

    def start(self):
        """Crawl board pages 1..5 (offset 0, 10, ..., 40), one per second."""
        for i in range(1, 6):
            offset = (i - 1) * 10
            full_url = self.url + str(offset)
            print(full_url)  # sanity-check the generated URL
            response = self.send_request(full_url)
            # BUG FIX: send_request returns None on non-200 responses;
            # guard before parsing instead of crashing on None.
            if response:
                self.parse_content(response)
            time.sleep(1)
if __name__ == '__main__':
    # Entry point: build the spider and kick off the crawl.
    MaoYanSpider().start()
写完之后,会多一个csv文件,其实就是一个文件,但是有规律
这个可以用表格打开,电脑上有表格的话,它可以自动打开
打开是这样
其实就是有规律的文本