import requests
from lxml import etree
import pymysql
# MySQL connection used to persist the scraped job postings.
# NOTE: positional arguments to pymysql.connect() were removed in PyMySQL 1.0 —
# pass host/user/password/database as keywords.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='mysql',
    database='lagou',
    charset='utf8',
)
cursor = db.cursor()
# The key function is the one below.
def get_ip_port():
    """Fetch one fresh proxy from the Mogu proxy API.

    Returns:
        dict: a ``proxies`` mapping for ``requests``, e.g.
        ``{'https': '1.2.3.4:8080'}``.
    """
    # API endpoint generated by the Mogu proxy service (appKey is account-specific).
    url = 'http://piping.mogumiao.com/proxy/api/get_ip_al?appKey=67e2b2f3b2da46dc89a5b5668760160e&count=1&expiryDate=0&format=1&newLine=2'
    response = requests.get(url)
    # Parse the JSON payload once instead of four separate .json() calls.
    entry = response.json()['msg'][0]
    proxies = {}
    proxies['https'] = entry['ip'] + ':' + entry['port']
    return proxies
# One proxy is fetched per run; the listing request below tunnels through it.
proxies = get_ip_port()
# Headers captured from a real browser session. NOTE(review): the Cookie and
# Referer pair presumably satisfy Lagou's anti-crawl check on the Ajax
# endpoint — verify; the session cookie will expire over time.
headers = {
"Cookie": "_ga=GA1.2.1051578399.1536212058; user_trace_token=20180906133423-82cb3f49-b196-11e8-b620-5254005c3644; LGUID=20180906133423-82cb4620-b196-11e8-b620-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; _gid=GA1.2.924982988.1536540531; WEBTJ-ID=20180910185745-165c321533a2b9-0d45099f38a143-9393265-2073600-165c321533c65a; _gat=1; LGSID=20180910185749-5b7b0120-b4e8-11e8-b62b-5254005c3644; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D1%26tn%3D78000241_9_hao_pg%26wd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_pq%3Dfcce7cd300055386%26rsv_t%3Dbe9dCkB0FbwCshvm9Zlr22uBUpxJStjGJItlVRp9qB%252BMMMV0WqkIz3tvi3WlfoaaGI0TQA4ueo0%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D3%26rsv_sug1%3D2%26rsv_sug7%3D101%26rsv_sug2%3D0%26inputT%3D1195%26rsv_sug4%3D2225%26rsv_sug%3D1; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536212060,1536216177,1536540530,1536577066; JSESSIONID=ABAAABAAAIAACBIC48E7DAFD2D94AF4BB863526FF95C5E6; TG-TRACK-CODE=index_search; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536577352; LGRID=20180910190235-06381294-b4e9-11e8-b62b-5254005c3644; SEARCH_ID=4181860115184d139da22e79442e58a7",
"Referer": "https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
# Lagou's job-search Ajax endpoint — returns JSON rather than an HTML page.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
# POST form fields: first result page ("pn": "1") for the keyword "python".
data = {
"first": "true",
"pn": "1",
"kd": "python"
}
# Fetch the first listing page through the proxy; the job list is JSON.
response = requests.post(url=url, headers=headers, data=data, proxies=proxies)
# Renamed from `all`/`id` to avoid shadowing the builtins.
jobs = response.json()['content']['positionResult']['result']
for job in jobs:
    # Each position has a numeric id; its detail page is plain HTML.
    position_id = job['positionId']
    detail_url = "https://www.lagou.com/jobs/" + str(position_id) + ".html"
    detail_resp = requests.get(url=detail_url, headers=headers)
    html = etree.HTML(detail_resp.text)
    # Scrape detail fields via XPath. NOTE(review): the bare [0] indexing
    # raises IndexError if the layout changes or an anti-crawl page is
    # served — TODO: add length checks.
    name = html.xpath("//div[@class='job-name']/span[@class='name']/text()")[0]
    money = html.xpath("//dd[@class='job_request']/p[1]/span[@class='salary']/text()")[0]
    youhuo = html.xpath("//dl[@id='job_detail']/dd[@class='job-advantage']/p/text()")[0]
    # Job description: concatenate the full text of every matching <div>
    # (str.join instead of quadratic += accumulation).
    zhize1 = ''.join(
        node.xpath('string(.)')
        for node in html.xpath("//dl/dd[@class='job_bt']/div")
    )
    # Work address: whitespace-stripped text of each address container.
    address = ''.join(
        node.xpath('string(.)').strip()
        for node in html.xpath("//dd[@class='job-address clearfix']/div[@class='work_addr']")
    )
    # Company logo URL — protocol-relative in the page, so prepend "http:".
    src = 'http:' + html.xpath("//dl[@id='job_company']/dt/a/img[@class='b2']/@src")[0]
    # (A disabled logo-download snippet was removed here; to restore it,
    # GET `src` and write response.content to a .jpg file.)
    # Company profile: concatenated text of the c_feature list items.
    qingkuang = ''.join(
        node.xpath('string(.)').strip()
        for node in html.xpath("//dd/ul[@class='c_feature']/li")
    )
    fabuzhe = html.xpath("//div[@class='publisher_name']/a/span[@class='name']/text()")[0]
    try:
        # Parameterized insert; first column is presumably an
        # auto-increment primary key (hence the literal 0) — verify schema.
        sql = 'insert into lagou_info values (0,%s,%s,%s,%s,%s,%s,%s,%s)'
        cursor.execute(sql, (name, money, youhuo, zhize1, address, src, qingkuang, fabuzhe))
        db.commit()
    except Exception as e:
        # Best-effort: report the failure, roll back this row, and keep
        # going with the next job posting.
        print(e)
        db.rollback()
cursor.close()
db.close()
# Scraping data through an IP proxy.
# (Blog metadata: latest recommended article published 2023-03-11 01:43:50.)