Data Analysis --- 1. Data Acquisition: Scraping Job Listings from Lagou


Previous post: https://blog.csdn.net/cht2511/article/details/99085532

Continuing with the next part.

Scraping Lagou really stumped me; it is hard! No wonder its anti-scraping defenses get so much praise.

I had taught myself a fair amount and felt I knew plenty of scraping techniques: requests, selenium, even phantomjs + selenium. To keep things slick I started with plain requests, with nothing but a headers dict in the request, and every attempt came back with "您的请求太过频繁,请稍后再试" ("Your requests are too frequent, please try again later"). My first thought was proxy IPs (free, scraped ones), but the result was the same, so I gritted my teeth and bought proxies. Sure enough, still the same "too frequent" message. I had never had a single successful request, so how could I be requesting too frequently? Infuriating. Kept studying: you also need cookies, but they contain parameters you cannot predict; I added a timestamp and it still failed, and my mood was close to collapsing (I need a girlfriend's comfort). Next I considered selenium, which is known to be slow and which the experts sneer at (if you find it slow, that is because you do not know how to use it). In the end I did not use it either, because I learned another method that works very well. Apologies to Lagou for any load this caused; if anything here infringes, please contact me and I will delete it.

Straight to the code:

"""
@author: cht
@time: 2019/7/30 21:23
"""
# -*- coding: utf-8 -*-

import requests
import time
import json
import urllib.parse
import csv



def gethttpIp():
    # URL of the purchased proxy-IP service; it returns plain text in the form "ip:port ..."
    httpurl = "URL of the purchased proxy IP API"
    result = requests.get(httpurl)
    print(result.text)
    IPList = result.text.split(":")
    print(IPList)
    # proxy server host and port
    proxyHost = IPList[0]
    proxyPort = IPList[1].split(" ")[0]
    proxyMeta = "http://%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
    }
    proxies = {"https": proxyMeta, }
    # if the target uses plain http, use proxies = {"http": proxyMeta, } instead
    print(proxies)
    return proxies
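
# Optional sanity check for the fetched proxy (my addition, a rough sketch): it assumes
# httpbin.org is reachable and simply confirms that requests really go out through the proxy.
def check_proxy(proxies):
    try:
        result = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=5)
        print("exit IP seen by the server:", result.json())
        return True
    except requests.RequestException as e:
        print("proxy check failed:", e)
        return False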

def geturl():
    # URL-encode the Chinese city names so they can be placed in the query string
    city = ["全国", "北京", "深圳", "广东", "杭州", "苏州", "上海", "成都", "厦门", "长沙", "南京", "西安", "天津", "重庆", "合肥"]

    citydict = {}
    for i in city:
        note = urllib.parse.quote(i)
        citydict[i] = note
    return citydict

def decode_city():
    # despite the name, this URL-encodes the position keywords used as search terms
    position = ["python", "java", "php", "大数据", "数据分析", "测试工程师", "爬虫"]
    positionList = {}
    for i in position:
        note = urllib.parse.quote(i)
        positionList[i] = note
    return positionList
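
# For reference, urllib.parse.quote percent-encodes the UTF-8 bytes of each keyword, e.g.
#   urllib.parse.quote("数据分析") -> '%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90'
#   urllib.parse.quote("python")   -> 'python'   (ASCII stays as it is)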

def main(url_start, url_parse, proxies, position, keyword):
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_%E8%BF%90%E7%BB%B4?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    for x in range(1, 30):
        # 'kd' is the search keyword, 'pn' the page number
        data = {'first': 'true', 'pn': str(x), 'kd': keyword}
        s = requests.Session()
        s.get(url_start, headers=headers, timeout=3)  # request the list page first to obtain cookies
        cookie = s.cookies  # cookies from this visit
        response = s.post(url_parse, data=data, headers=headers, proxies=proxies, cookies=cookie, timeout=10)  # fetch the JSON for this page
        time.sleep(5)
        response.encoding = response.apparent_encoding
        text = json.loads(response.text)
        print("JSON data: %s" % text)
        info = text["content"]["positionResult"]["result"]
        if info == []:
            break
        jobinfolist = []
        for i in info:
            List1 = []
            print(i["companyFullName"])
            List1.append(i["companyFullName"])
            print(i["positionName"])
            List1.append(i["positionName"])
            print(i["salary"])
            List1.append(i["salary"])
            print(i["companySize"])
            List1.append(i["companySize"])
            print(i["skillLables"])
            List1.append(i["skillLables"])
            print(i["createTime"])
            List1.append(i["createTime"])
            print(i["district"])
            List1.append(i["district"])
            print(i["stationname"])
            List1.append(i["stationname"])
            jobinfolist.append(List1)
        writeCSV(jobinfolist, position)
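
# A more compact way to build jobinfolist inside the loop above (my sketch, equivalent result):
#     FIELDS = ["companyFullName", "positionName", "salary", "companySize",
#               "skillLables", "createTime", "district", "stationname"]
#     jobinfolist = [[job.get(field) for field in FIELDS] for job in info]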


def writeCSV(jobInfo, position):
    # append mode; note the CSV file name cannot contain Chinese, otherwise the file is not created
    with open('C:\\Users\\Administrator\\PycharmProjects\\boss\\laguodata\\lagou_%s.csv' % position, 'a+', newline='',
              encoding='gb18030') as file:
        content = csv.writer(file, dialect='excel')  # writer in Excel dialect
        for unitinfo in jobInfo:
            content.writerow(unitinfo)
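
# Optional helper (my sketch, not in the original script): write a header row once before the
# first crawl so the CSV is easier to load later; the column names mirror the fields collected above.
def writeCSVHeader(position):
    header = ["companyFullName", "positionName", "salary", "companySize",
              "skillLables", "createTime", "district", "stationname"]
    with open('C:\\Users\\Administrator\\PycharmProjects\\boss\\laguodata\\lagou_%s.csv' % position,
              'a+', newline='', encoding='gb18030') as file:
        csv.writer(file, dialect='excel').writerow(header)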


if __name__ == '__main__':
    positionList = decode_city()
    pt = 1  # numeric index used in the CSV file name
    for p in positionList:
        pt = pt + 1
        citydict = geturl()
        for c in citydict:
            url_start = "https://www.lagou.com/jobs/list_{}?city={}&cl=false&fromSearch=true&labelWords=&suginput=".format(
                positionList[p], citydict[c])
            url_parse = "https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false".format(citydict[c])
            print(url_start)
            print(url_parse)
            proxies = gethttpIp()
            try:  # catch errors so the crawl keeps going
                main(url_start, url_parse, proxies, pt, p)
            except Exception as e:
                print(e)
                print("Take a break")
                time.sleep(60)

The three lines below (from main) are the crux; without them, a bare POST to the Ajax endpoint simply fails:

    s = requests.Session()
    s.get(url_start, headers=headers, timeout=3)  # request the list page to obtain cookies
    cookie = s.cookies  # cookies from this visit

I looked into why: the keep_connect value in the request headers ends up set to false, in other words a fresh session (and fresh cookies) has to be obtained before every request.
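
For reference, here is that pattern on its own. This is a minimal sketch with the keyword "python" and the city "全国" hard-coded; in the full script the keyword, city and proxies come from decode_city(), geturl() and gethttpIp(), and the last two lines assume the request succeeded and Lagou returned at least one result.

import requests

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
}
list_url = "https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput="
ajax_url = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%85%A8%E5%9B%BD&needAddtionalResult=false"

with requests.Session() as s:
    s.get(list_url, headers=headers, timeout=3)           # step 1: visit the list page so Lagou sets the session cookies
    data = {'first': 'true', 'pn': '1', 'kd': 'python'}   # step 2: same keyword and page number as the Ajax call
    resp = s.post(ajax_url, data=data, headers=headers, cookies=s.cookies, timeout=10)
    jobs = resp.json()["content"]["positionResult"]["result"]   # assumes the request succeeded
    print(len(jobs), jobs[0]["positionName"] if jobs else "no results")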