import requests
from lxml import etree
import pymysql
# MySQL connection used to persist the scraped job postings.
# NOTE: positional arguments to pymysql.connect() were removed in PyMySQL 1.0 —
# pass host/user/password/database as keywords.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='mysql',
    database='lagou',
    charset='utf8',
)
cursor = db.cursor()
# The key function is the one below.
def get_ip_port():
    """Fetch one fresh proxy from the Mogu proxy API.

    Returns:
        dict: a ``proxies`` mapping for ``requests``, e.g.
        ``{'https': '1.2.3.4:8080'}``.
    """
    # API endpoint generated by the Mogu proxy service (appKey is account-specific).
    url = 'http://piping.mogumiao.com/proxy/api/get_ip_al?appKey=67e2b2f3b2da46dc89a5b5668760160e&count=1&expiryDate=0&format=1&newLine=2'
    response = requests.get(url)
    # Parse the JSON payload once instead of four separate .json() calls.
    entry = response.json()['msg'][0]
    proxies = {}
    proxies['https'] = entry['ip'] + ':' + entry['port']
    return proxies
# One proxy is fetched per run; the listing request below tunnels through it.
proxies = get_ip_port()
# Headers captured from a real browser session. NOTE(review): the Cookie and
# Referer pair presumably satisfy Lagou's anti-crawl check on the Ajax
# endpoint — verify; the session cookie will expire over time.
headers = {
"Cookie": "_ga=GA1.2.1051578399.1536212058; user_trace_token=20180906133423-82cb3f49-b196-11e8-b620-5254005c3644; LGUID=20180906133423-82cb4620-b196-11e8-b620-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; _gid=GA1.2.924982988.1536540531; WEBTJ-ID=20180910185745-165c321533a2b9-0d45099f38a143-9393265-2073600-165c321533c65a; _gat=1; LGSID=20180910185749-5b7b0120-b4e8-11e8-b62b-5254005c3644; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D1%26tn%3D78000241_9_hao_pg%26wd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_pq%3Dfcce7cd300055386%26rsv_t%3Dbe9dCkB0FbwCshvm9Zlr22uBUpxJStjGJItlVRp9qB%252BMMMV0WqkIz3tvi3WlfoaaGI0TQA4ueo0%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D3%26rsv_sug1%3D2%26rsv_sug7%3D101%26rsv_sug2%3D0%26inputT%3D1195%26rsv_sug4%3D2225%26rsv_sug%3D1; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536212060,1536216177,1536540530,1536577066; JSESSIONID=ABAAABAAAIAACBIC48E7DAFD2D94AF4BB863526FF95C5E6; TG-TRACK-CODE=index_search; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536577352; LGRID=20180910190235-06381294-b4e9-11e8-b62b-5254005c3644; SEARCH_ID=4181860115184d139da22e79442e58a7",
"Referer": "https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
# Lagou's job-search Ajax endpoint — returns JSON rather than an HTML page.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
# POST form fields: first result page ("pn": "1") for the keyword "python".
data = {
"first": "true",
"pn": "1",
"kd": "python"
}
# Fetch the first listing page through the proxy; the job list is JSON.
response = requests.post(url=url, headers=headers, data=data, proxies=proxies)
# Renamed from `all`/`id` to avoid shadowing the builtins.
jobs = response.json()['content']['positionResult']['result']
for job in jobs:
    # Each position has a numeric id; its detail page is plain HTML.
    position_id = job['positionId']
    detail_url = "https://www.lagou.com/jobs/" + str(position_id) + ".html"
    detail_resp = requests.get(url=detail_url, headers=headers)
    html = etree.HTML(detail_resp.text)
    # Scrape detail fields via XPath. NOTE(review): the bare [0] indexing
    # raises IndexError if the layout changes or an anti-crawl page is
    # served — TODO: add length checks.
    name = html.xpath("//div[@class='job-name']/span[@class='name']/text()")[0]
    money = html.xpath("//dd[@class='job_request']/p[1]/span[@class='salary']/text()")[0]
    youhuo = html.xpath("//dl[@id='job_detail']/dd[@class='job-advantage']/p/text()")[0]
    # Job description: concatenate the full text of every matching <div>
    # (str.join instead of quadratic += accumulation).
    zhize1 = ''.join(
        node.xpath('string(.)')
        for node in html.xpath("//dl/dd[@class='job_bt']/div")
    )
    # Work address: whitespace-stripped text of each address container.
    address = ''.join(
        node.xpath('string(.)').strip()
        for node in html.xpath("//dd[@class='job-address clearfix']/div[@class='work_addr']")
    )
    # Company logo URL — protocol-relative in the page, so prepend "http:".
    src = 'http:' + html.xpath("//dl[@id='job_company']/dt/a/img[@class='b2']/@src")[0]
    # (A disabled logo-download snippet was removed here; to restore it,
    # GET `src` and write response.content to a .jpg file.)
    # Company profile: concatenated text of the c_feature list items.
    qingkuang = ''.join(
        node.xpath('string(.)').strip()
        for node in html.xpath("//dd/ul[@class='c_feature']/li")
    )
    fabuzhe = html.xpath("//div[@class='publisher_name']/a/span[@class='name']/text()")[0]
    try:
        # Parameterized insert; first column is presumably an
        # auto-increment primary key (hence the literal 0) — verify schema.
        sql = 'insert into lagou_info values (0,%s,%s,%s,%s,%s,%s,%s,%s)'
        cursor.execute(sql, (name, money, youhuo, zhize1, address, src, qingkuang, fabuzhe))
        db.commit()
    except Exception as e:
        # Best-effort: report the failure, roll back this row, and keep
        # going with the next job posting.
        print(e)
        db.rollback()
cursor.close()
db.close()
# Scraping data through an IP proxy.
# (Blog metadata: latest recommended article published 2023-03-11 01:43:50.)