Crawler test (for personal use)

The script below searches 51job.com for a job keyword, crawls a user-chosen number of result pages, pulls five fields out of each page with regular expressions, and writes everything to a CSV file.

```python
import re
import time

import pandas as pd
import requests

# Accumulators that collect results across all pages
lis_firm = []       # company names
lis_name = []       # job titles
lis_workplace = []  # work locations
lis_pay = []        # salary ranges
lis_time = []       # posting dates


keyword = input("Enter the job keyword to search for: ")
page = int(input("Enter the number of pages to crawl: "))
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# Note: these cookie values were copied from a logged-in browser session;
# they expire, so refresh them from your own session before running.
cookies = {
    '_uab_collina': '165494029760362106180467',
    'guid': '038a32b83973a819c180179ba511742c',
    'nsearch': 'jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D',
    'acw_tc': '2f624a4816549990797972577e0f84a5a8fe2c1095ecfd612196c594d90db2',
    'search': 'jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60120500%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60120500%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21',
    'ssxmod_itna': 'QqAx9D2DRQi=f4Cq0d48+Q=4Y5NDKUNDC9IKr2DBqEO4iNDnD8x7YDvIIoKVIW/AAxEYfKtDTwxKW=RDhI+WPOwfNFV=x0aDbqGkqWC84GGUxBYDQxAYDGDDPDogPD1D3qDkD7EZlMBsqDEDYp9DA3Di4D+8MQDmqG0DDU7B4G2D7U9Q7GN8TrUCntdEkDPrDh9D0tQxBLK8cTo1P9NBTrTr1iatqGySPGu0uU/lRbDCxtVRk0sGbx4I05PKO+K7ODeKhq4/7EAaDxt3AxqD4EPYAqckhqQ/ESd/DDAiBwd+HD==',
    'ssxmod_itna2': 'QqAx9D2DRQi=f4Cq0d48+Q=4Y5NDKUNDC9IKrx8dPEwqGNLKGaWB+Ikqw/+zx8r2QCeKxC00CKDbYvie/4ILoWGYRhLSXLYBAlcvCnf8A9Tsphl1W=mareFxHs6fPtudewZ+07IE7p5swgw8YB9bf2Kz3WKs/QiOOgqx4=9bPpWa1AopYaKzqYWF/gPa=l4kvpHtxza7KjnaVipNhZqhDonFyPaTx1ybBtuNqBIXeT02SIlmQTMRkrj2x3ZFN8P2G3QH3h82umLnL3=HotT7r3Lfx9BQdTiCspO620FZNl/H=D8GeQIV0r0+xb35m/cCzhqiyHePqLRDzaG+Y2Qyd7D2Fa1mba7TgFbxTAiSp4sAjz7WBiOK05B+4/0DDTPurdjR69Ia/c++bHfAPq4=9+u3Fxa0tObiLnH0cX9ic8G5h8cbD280i17iR+0b8BD+H/hED+xGgnQSMQie+gN3wn/9KP4xekKsvHvOCxMQ1Mji+kXPCx+5P+8qSaUMHVMNlBaWL+v212rk6bxgL=vM1huQP9HXRyiY1VhD1C3D07S7Dwix2Pur3tw130ecfz2UUrhOqgo1KG3wGdZBqgO9MU0R3QrYTo7QsGDng3Kv=YsKAFhygDEd4BAxqBS3mdaV1HsKKixD7=DY95eD',
}
params = {
    'lang': 'c',
    'postchannel': '0000',
    'workyear': '99',
    'cotype': '99',
    'degreefrom': '99',
    'jobterm': '99',
    'companysize': '99',
    'ord_field': '0',
    'dibiaoid': '0',
    'line': '',
    'welfare': '',
}

for pageNum in range(1, page + 1):
    print(f'=============== Crawling page {pageNum} ===============')
    time.sleep(2)  # pause between pages to avoid hammering the server
    url = "https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html?".format(keyword, pageNum)
    response = requests.get(url=url, headers=headers, cookies=cookies, params=params)
    response.encoding = response.apparent_encoding  # the page is not UTF-8, so let requests detect the charset
    responds = response.text
    print(url)
    # print(responds)  # uncomment to dump the raw HTML when debugging
    # ========= company name == li_firm
    ex = r'"company_name":"(.*?)","'
    li_firm = re.findall(ex, responds, re.S)
    print("========= company name == li_firm")
    print(li_firm)
    print(len(li_firm))
    lis_firm.extend(li_firm)
    # ========= job title == li_name
    ex = r'"job_name":"(.*?)","'
    li_name = re.findall(ex, responds, re.S)
    print("========= job title == li_name")
    print(li_name)
    print(len(li_name))
    lis_name.extend(li_name)
    # ========= work location == li_workplace
    ex = r'"workarea_text":"(.*?)","'
    li_workplace = re.findall(ex, responds, re.S)
    print("========= work location == li_workplace")
    print(li_workplace)
    print(len(li_workplace))
    lis_workplace.extend(li_workplace)
    # ========= salary == li_pay
    ex = r'"providesalary_text":"(.*?)","'
    li_pay = re.findall(ex, responds, re.S)
    print("========= salary == li_pay")
    print(li_pay)
    print(len(li_pay))
    lis_pay.extend(li_pay)
    # ========= posting date == li_time
    ex = r'"issuedate":"(.*?)","'
    li_time = re.findall(ex, responds, re.S)
    print("========= posting date == li_time")
    print(li_time)
    li_time.pop()  # the page appears to carry one extra "issuedate" match; drop it so all five lists stay the same length
    print(len(li_time))
    lis_time.extend(li_time)

    # Rewrite the CSV after every page so partial results survive an interruption
    dataframe = pd.DataFrame({'company': lis_firm, 'job_title': lis_name, 'location': lis_workplace, 'salary': lis_pay, 'posted': lis_time})
    dataframe.to_csv("scraped_data.csv", index=False, encoding='utf-8-sig')  # utf-8-sig keeps Chinese text readable in Excel
```
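
The five regexes above work because the search page embeds its results as a JSON blob (`window.__SEARCH_RESULT__`) inside the HTML; each pattern plucks one field out of that blob. Here is a minimal sketch of a sturdier alternative: parse the blob once with `json.loads` and take all five fields from each job object, so the lists can never drift out of alignment. It assumes the page still embeds that blob; the job-list key has been seen as `engine_jds` or `engine_search_result`, and both names here are assumptions.

```python
import json
import re

def parse_jobs(html):
    """Pull job records out of the JSON blob embedded in the search page.

    Assumes the page contains `window.__SEARCH_RESULT__ = {...}` followed
    immediately by a closing </script>; the list key names are assumptions.
    """
    m = re.search(r'window\.__SEARCH_RESULT__\s*=\s*(\{.*?\})</script>', html, re.S)
    if not m:
        return []  # page layout changed or request was blocked
    data = json.loads(m.group(1))
    items = data.get('engine_jds') or data.get('engine_search_result') or []
    return [{
        'company': item.get('company_name', ''),
        'job_title': item.get('job_name', ''),
        'location': item.get('workarea_text', ''),
        'salary': item.get('providesalary_text', ''),
        'posted': item.get('issuedate', ''),
    } for item in items]
```

Inside the page loop this would replace all five `findall` blocks with a single `for job in parse_jobs(responds): ...`, and the `li_time.pop()` workaround becomes unnecessary because each record carries its own `issuedate`.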



Setting up and testing a crawler environment in IntelliJ IDEA is a common task, and it usually involves the following steps:

1. **Install the relevant support**:
   - Java development support: make sure a Java SDK is installed and the correct version is configured in IDEA.
   - A crawler library: for example Jsoup, Apache HttpClient, or Spring's WebClient. (Jsoup's Maven group ID is `org.jsoup`.)

2. **Create the project structure**:
   - Create a new Java project and choose Maven or Gradle as the build tool, since both support modularization and dependency management.
   - Within the project you can create a dedicated module (such as "spiders" or "web-scraping") for the crawler code.

3. **Add dependencies**: In `pom.xml` (Maven) or `build.gradle` (Gradle), add the crawler library you need. For Jsoup, for example:
   ```xml
   <dependencies>
       <dependency>
           <groupId>org.jsoup</groupId>
           <artifactId>jsoup</artifactId>
           <version>latest version number</version>
       </dependency>
   </dependencies>
   ```

4. **Write the crawler code** with the chosen library. With Jsoup the basic steps might be:
   - `Document doc = Jsoup.connect("http://example.com").get();`
   - Parse the HTML: `Elements elements = doc.select("selector");`
   - Extract the data: `String data = elements.text();`

5. **Test the crawler**:
   - Run test classes in IDEA; JUnit or another unit-testing framework works well for crawler functionality.
   - Check that HTTP requests get the expected responses and that parsing is accurate.

6. **Exception handling and logging**: Don't forget to handle possible network errors with try-catch blocks and write logs for debugging (see the Python sketch after this list, applied to the script above).

7. **Follow the site's rules**: Respect the robots.txt protocol and the site's scraping policy.
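Tying step 6 back to the Python script at the top of this post, here is a minimal sketch of request error handling with logging. The `fetch` helper and the `crawler.log` file name are illustrative choices, not part of the original script:

```python
import logging

import requests

# Write failures to a log file instead of letting one bad page kill the crawl.
logging.basicConfig(level=logging.INFO, filename="crawler.log")

def fetch(url, **kwargs):
    """Fetch a page; return None (and log the error) on any request failure."""
    try:
        resp = requests.get(url, timeout=10, **kwargs)
        resp.raise_for_status()  # treat HTTP 4xx/5xx responses as failures too
        return resp
    except requests.RequestException as exc:
        logging.error("request failed for %s: %s", url, exc)
        return None

if __name__ == "__main__":
    page = fetch("https://example.com")
    print("fetched" if page is not None else "failed, see crawler.log")
```

In the page loop above, `response = requests.get(...)` would become `response = fetch(url, headers=headers, cookies=cookies, params=params)` plus a `continue` when it returns None, so one bad page is logged and skipped rather than crashing the whole run.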
