Without accumulating small steps, you will never reach a thousand li.
Previous post: https://blog.csdn.net/cht2511/article/details/99085532
On to the next one.
Scraping Lagou honestly had me beaten; it is hard! And even saying that much is flattering myself.
Although I had taught myself a bit and felt I knew plenty of crawling techniques (requests, selenium, phantomjs + selenium), I chose plain requests for the style points, with nothing but a headers dict in the request. Every attempt came back with "您的请求太过频繁,请稍后再试" ("Your requests are too frequent, please try again later"). Proxy IPs (scraped from free lists) came to mind at once; same result. So I gritted my teeth and bought paid proxies, and sure enough, the same message again. I had never had a single successful request, so where was the "too frequent"? Infuriating. I kept studying: add cookies, except they contain parameters I could not pin down; add a timestamp, still no good. My mood was close to collapse and I could have used a girlfriend's comfort. Next I looked at selenium, which everyone says is slow and which the experts refuse to touch (if you find it slow, you are not using it right). In the end I did not use it either, because I learned another method, and it works very well. To Lagou: apologies for whatever load this caused; if this post infringes, contact me and I will take it down!
Straight to the code:
"""
@author: cht
@time: 2019/7/30 21:23
"""
# -*- coding: utf-8 -*-
import requests
import time
import json
import csv
import urllib.parse  # import the submodule explicitly; a bare `import urllib` is not enough
def gethttpIp():
    # The paid proxy service returns a fresh address as plain text, e.g. "1.2.3.4:8080"
    httpurl = "<URL of the purchased proxy-IP service>"
    result = requests.get(httpurl)
    print(result.text)
    IPList = result.text.split(":")
    print(IPList)
    # Proxy host and port parsed from the "IP:PORT ..." response
    proxyHost = IPList[0]
    proxyPort = IPList[1].split(" ")[0]  # drop anything after the port
    proxyMeta = "http://%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
    }
    proxies = {"https": proxyMeta, }
    # For plain-http targets use proxies = {"http": proxyMeta, } instead
    print(proxies)
    return proxies
def encode_cities():
    # URL-encode each city name once so it can be spliced into the query string
    city = ["全国", "北京", "深圳", "广东", "杭州", "苏州", "上海", "成都", "厦门", "长沙", "南京", "西安", "天津", "重庆", "合肥"]
    citydict = {}
    for i in city:
        citydict[i] = urllib.parse.quote(i)  # e.g. quote('全国') -> '%E5%85%A8%E5%9B%BD'
    return citydict

def encode_positions():
    # Same idea for the search keywords (this was originally, and misleadingly, named decode_city)
    position = ["python", "java", "php", "大数据", "数据分析", "测试工程师", "爬虫"]
    positionList = {}
    for i in position:
        positionList[i] = urllib.parse.quote(i)
    return positionList
def main(url_start, url_parse, proxies, keyword, position):
    # (An earlier attempt hardcoded a handful of free proxies and random.sample'd
    # one per request; it never got past the rate limiter, so it is gone.)
    # url_start is a list page, e.g. https://www.lagou.com/jobs/list_python?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=
    # url_parse is the Ajax endpoint, e.g. https://www.lagou.com/jobs/positionAjax.json?city=全国&needAddtionalResult=false
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        # Referer copied from one of the Lagou list pages
        'Referer': 'https://www.lagou.com/jobs/list_%E8%BF%90%E7%BB%B4?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    for x in range(1, 30):  # page through the results
        data = {'first': 'true', 'pn': str(x), 'kd': keyword}  # kd is the searched keyword; the original hardcoded 'python' here
        s = requests.Session()
        s.get(url_start, headers=headers, timeout=3)  # hit the list page first so the server issues cookies
        cookie = s.cookies  # the cookies obtained on this visit
        response = s.post(url_parse, data=data, headers=headers, proxies=proxies, cookies=cookie, timeout=10)  # fetch this page of results
        time.sleep(5)
        response.encoding = response.apparent_encoding
        text = json.loads(response.text)
        print("JSON data: %s" % text)
        info = text["content"]["positionResult"]["result"]
        if not info:  # an empty result list means we have run past the last page
            break
        jobinfolist = []
        # "skillLables" is Lagou's own (misspelled) field name and must stay verbatim
        fields = ["companyFullName", "positionName", "salary", "companySize",
                  "skillLables", "createTime", "district", "stationname"]
        for i in info:
            row = [i[f] for f in fields]
            print(row)
            jobinfolist.append(row)
        writeCSV(jobinfolist, position)
def writeCSV(jobInfo, position):
    # Append mode, gb18030 encoding so Excel displays the Chinese text correctly.
    # The files are numbered instead of named after the keyword because creating
    # a file with a Chinese name failed here.
    with open('C:\\Users\\Administrator\\PycharmProjects\\boss\\laguodata\\lagou_%s.csv' % position,
              'a+', newline='', encoding='gb18030') as file:
        content = csv.writer(file, dialect='excel')  # write in Excel-compatible CSV
        for unitinfo in jobInfo:
            content.writerow(unitinfo)
if __name__ == '__main__':
    positionList = encode_positions()
    citydict = encode_cities()  # the city list never changes, so build it once
    pt = 0
    for p in positionList:
        pt = pt + 1  # file number for this keyword: lagou_1.csv, lagou_2.csv, ...
        for c in citydict:
            url_start = "https://www.lagou.com/jobs/list_{}?city={}&cl=false&fromSearch=true&labelWords=&suginput=".format(
                positionList[p], citydict[c])
            url_parse = "https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false".format(citydict[c])
            print(url_start)
            print(url_parse)
            proxies = gethttpIp()  # a fresh proxy IP for every city
            try:  # catch errors so one bad combination does not stop the whole crawl
                main(url_start, url_parse, proxies, p, pt)
            except Exception as e:
                print(e)
                print("Taking a breather")
                time.sleep(60)
s = requests.Session()
s.get(url_start, headers=headers, timeout=3)  # request the list page first to obtain cookies
cookie = s.cookies  # the cookies issued for this visit
These three lines are the crux of the whole thing; POST the Ajax endpoint directly without them and the request fails. I looked into why: the keep_connect value in the request headers is set to false, which means a brand-new session, with fresh cookies from the list page, has to be obtained before every request.
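To pull the trick out of the crawler for a clearer look, below is a minimal sketch of just this warm-up pattern. It is a sketch under my own assumptions, not part of the original script: the URLs, the '全国' city parameter, and the 'python' keyword are illustrative values lifted from the code above, and fetch_page is a name invented for the demo.

# -*- coding: utf-8 -*-
# Minimal sketch of the cookie warm-up pattern: GET the list page inside a
# Session so the anti-crawler cookies get set, then POST the Ajax endpoint
# from the same Session so those cookies ride along.
import requests

LIST_URL = "https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput="
AJAX_URL = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%85%A8%E5%9B%BD&needAddtionalResult=false"
HEADERS = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Referer': LIST_URL,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
}

def fetch_page(pn=1, kd='python'):  # hypothetical helper, for illustration only
    with requests.Session() as s:
        s.get(LIST_URL, headers=HEADERS, timeout=3)   # warm-up GET: the server sets cookies here
        resp = s.post(AJAX_URL,                       # the POST now carries those cookies automatically
                      data={'first': 'true', 'pn': str(pn), 'kd': kd},
                      headers=HEADERS, timeout=10)
        return resp.json()

if __name__ == '__main__':
    print(fetch_page(1))

The only load-bearing part is that the GET and the POST share one Session, and therefore one cookie jar; whether Lagou's anti-crawler still accepts this today is a separate question.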