我的毕设需要爬取招聘网站的数据做数据分析,我的代码在
items = res["resultbody"]['job']["items"]
这里遇到了报错,在网上搜索了也不知道怎么解决
import json
import pprint
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
import time
import pandas as pd
import requests
import csv
need = input("请输入要收集的内容:")
end = int(input("请输入收集的页数:"))
text = quote(need)
print(text)
for i in range(end):
time.sleep(2)
page = i + 1
url = f"https://we.51job.com/api/job/search-pc?api_key=51job×tamp={str(int(time.time()))}&keyword={text}&searchType=2&function=&industry=&jobArea=000000&jobArea2=&landmark=&metro=&salary=&workYear=°ree=&companyType=&companySize=&jobType=&issueDate=&sortType=0&pageNum={page}&requestId=&pageSize=20&source=1&accountId=&pageCode=sou%7Csou%7Csoulb&u_atoken=e08469ac-756f-481e-a8ec-67d3ab35d3bf&u_asession=01c2eg43lMbm3V9il2PXF2HW2ULNr6jRSZXJHv4eKftJMDKbShoLKxItTxt0Q0Shb0lqY2cxDu5LwOIGIqJJMMwtsq8AL43dpOnCClYrgFm6o&u_asig=05C3Y5rfKfZP8Rhggch0Gw0I4wu1zQqxqM08xWL1bQtyd1-B0JdaPZNMbZ5N8A6aeJ_Go9ku2ftGILehsFowCPaRrAA7na6F_OTOWOUcIO51lPgglxhBMqEtGcx5NCqgAcrWQnKt4jSt7Bv88I894lhIkOgasq7AcPFgqcJUJJ2h80yE8FAmyRoU17dknn0HEfksmHjM0JOodanL5-M1Qs1bpSIftQhRA8hsaTv-WvOWJgd-Eqjr8MAF1qDznEw3rfsuvjmrCHdFZl2xk2B9ND9RrxMjR1zjahit8hjru27ZLUpLHxH1iRKZmnjAu0Zefw&u_aref=bQO7OIl0zsYzj2B6PmHj8VYaq%2FA%3D"
print(url)
value = f"/api/job/search-pc?api_key=51job×tamp={str(int(time.time()))}&keyword={text}&searchType=2&function=&industry=&jobArea=000000&jobArea2=&landmark=&metro=&salary=&workYear=°ree=&companyType=&companySize=&jobType=&issueDate=&sortType=0&pageNum={page}&requestId=&pageSize=20&source=1&accountId=&pageCode=sou%7Csou%7Csoulb&u_atoken=e08469ac-756f-481e-a8ec-67d3ab35d3bf&u_asession=01c2eg43lMbm3V9il2PXF2HW2ULNr6jRSZXJHv4eKftJMDKbShoLKxItTxt0Q0Shb0lqY2cxDu5LwOIGIqJJMMwtsq8AL43dpOnCClYrgFm6o&u_asig=05C3Y5rfKfZP8Rhggch0Gw0I4wu1zQqxqM08xWL1bQtyd1-B0JdaPZNMbZ5N8A6aeJ_Go9ku2ftGILehsFowCPaRrAA7na6F_OTOWOUcIO51lPgglxhBMqEtGcx5NCqgAcrWQnKt4jSt7Bv88I894lhIkOgasq7AcPFgqcJUJJ2h80yE8FAmyRoU17dknn0HEfksmHjM0JOodanL5-M1Qs1bpSIftQhRA8hsaTv-WvOWJgd-Eqjr8MAF1qDznEw3rfsuvjmrCHdFZl2xk2B9ND9RrxMjR1zjahit8hjru27ZLUpLHxH1iRKZmnjAu0Zefw&u_aref=bQO7OIl0zsYzj2B6PmHj8VYaq%2FA%3D"
print(value)
chrome_driver = './chromedriver.exe' # 导入Google驱动
options = webdriver.ChromeOptions()
options.add_experimental_option('useAutomationExtension', False)
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(chrome_options=options, executable_path=chrome_driver)
# 获取url
driver.get(url)
time.sleep(1)
# 找到需要滑动的滑块元素
slider = driver.find_element(By.XPATH, r'//*[@id="nc_1_n1z"]')
# 创建操作链
action_chains = ActionChains(driver)
# 将鼠标移动到滑块上
action_chains.move_to_element(slider)
# 模拟按下鼠标左键并保持不松开
action_chains.click_and_hold()
# 移动鼠标使滑块达到目标位置
action_chains.move_by_offset(100, 0)
action_chains.move_by_offset(-9, 0)
action_chains.move_by_offset(101, 0)
action_chains.move_by_offset(100, 0)
# 松开鼠标左键
action_chains.release()
# 执行操作链
action_chains.perform()
time.sleep(10)
html = driver.page_source # 获取网页源码
print(html)
print(1111111111111)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
,
"Cookie":"guid=33e05054509b3f433e10192bb65d3607; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2233e05054509b3f433e10192bb65d3607%22%2C%22first_id%22%3A%2218755965a0340-033c4294c10036-26031851-1327104-18755965a043f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg3NTU5NjVhMDM0MC0wMzNjNDI5NGMxMDAzNi0yNjAzMTg1MS0xMzI3MTA0LTE4NzU1OTY1YTA0M2YxIiwiJGlkZW50aXR5X2xvZ2luX2lkIjoiMzNlMDUwNTQ1MDliM2Y0MzNlMTAxOTJiYjY1ZDM2MDcifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%2233e05054509b3f433e10192bb65d3607%22%7D%2C%22%24device_id%22%3A%2218755965a0340-033c4294c10036-26031851-1327104-18755965a043f1%22%7D; Hm_lvt_1370a11171bd6f2d9b1fe98951541941=1707988755; search=jobarea%7E%60%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%B9%E3%B6%AB%B4%F3%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%B4%F3%CA%FD%BE%DD%B9%E3%B6%AB%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAjava%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; ssxmod_itna=eqGhGKY5iKBK0KDtDXtwgAr+m2xWurTWW4FRxKDsqsTDSxGKidDqxBnnlxyn42+Nk8AbYGO0l2BqarzCiawNd+b0A347Z=x0aDbqGkKHGg2eDx1q0rD74irDDxD3yD7PGmDinZuD7xU1S25CfxiOD7eDXxGCDQFh0xGWDiPD7xvreYeCjxi7DD5DnpBve4DWDWcCeDDzK7iiU6DDEDHzz67h=Kfrg=0D0xG1DQ5DsZr2eDyvKwQxVpHXVtL3fp7+DCKDjxg1GcYDUQHGM19j/Dc4EiGNWlxdK=io57cTT02xKPhrw1hrb=GPBxpGmd70oiDTaGd0UxYD; ssxmod_itna2=eqGhGKY5iKBK0KDtDXtwgAr+m2xWurTWW4FRxikErqmDlxQTT403sNscrN522XDnRDk9Dtl0Uk45AL2RhtL1Cij8NE8Zr65QTUZquxyf6Goh8sOUGMWIFcPzYSm5zv91iwgyfaAtLKiYLYmE8KLxqOujG7kQMikQgKw=MKS0lKTRH+Q=PdwTQHi0tTYW3NZi4b6LtTjTuGc0=rw6o2nb3ZlR3q1rsgSukVpy8Qa92idKxTtBIwYndxfgU7TSHKwyH7dRa6KcjrWI8UtCi646foV26HyTnZlkyGytf5HkUwXKn4dscwekMkogutgk84jG4eY1fs4QQ=Lz+3GwgszbTSq=oRrGSOTeWknQHjd=dr=wOYQa5e=5qTsGW78h7fabWHibIEQGd3PD7jaxGcDG7eiDD===; NSC_ohjoy-bmjzvo-200-159=ffffffffc3a0d61945525d5f4f58455e445a4a423660; acw_tc=ac11000117081668581892599e00df82b417f67a772b57fa9a45250dbc3f53; tfstk=ehyyvxjUCTBzVsK_ajDEgCvgiGM-pAQ_tJgIxkqnV40kFgF2nDaSPMEhyyyEoyuQVYVQ8Xu0v7s-OaKU3kUL8XI-R20URkV5cO6_2uHKK271COTaucUV5K5h_Bq-pvbf5O6_2uU6tmJVXNJ3KJols69hIj0g30llGu0DKmeq4bugqB9HKCnraVqougz6JmDiGJFy-BloDmu10io34F-S9iHdQBdKwSnq5gIJ9BhoDmu10iRp9bHt0VsR2"
}
# res = requests.get(url=html, headers=headers).json()
res=json.dumps(html)
print(res)
res=json.loads(res)
pprint.pprint(res)
items = res["resultbody"]['job']["items"]
aa = []
print("?????????")
for i in range(50):
data = {'jobName': items[i]['jobName'], 'companyName': items[i]['companyName'],
'jobAreaString': items[i]['jobAreaString'],
'provideSalaryString': items[i]['provideSalaryString'],
'confirmDateString': items[i]['confirmDateString']}
print(data)
aa.append(data)
with open('51job.csv', 'a', encoding='ANSI') as f:
wr = csv.writer(f)
# wr.writerow(['jobName', 'companyName', 'provideSalaryString'])
df = pd.DataFrame.from_dict(aa)
df.to_csv("51job.csv", header=None, mode="a")
print("------------------------------------------写入成功--------------------------------------------------")
print("-----------------------------------已收集" + str(page) + "页----------------------------------------")
driver.quit()
以上是我的代码