from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import json
import csv
import random
from pyquery import PyQuery as pq
# Create a Chrome driver; images are disabled to speed up page loads.
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
browser = webdriver.Chrome(options=options)
# Maximize the browser window.
browser.maximize_window()
# Implicit-wait alternative (unused): wait = browser.implicitly_wait(10)
# Explicit wait: up to 10 seconds for expected conditions.
wait = WebDriverWait(browser, 10)
# Entry URL for the crawl.
url = 'https://www.51job.com/'
keyword = input('请输入职位:').strip() # search keyword, e.g. "ipad"
# Accumulator: one dict per scraped job listing.
data_list = []
def star_spider():
    """Open 51job, search for the module-level `keyword`, select all cities,
    and return the total number of result pages.

    Side effects: drives the module-level `browser`.
    Returns:
        int: total page count parsed from the pager text ("current / total").
    """
    browser.get(url)
    # The second '.nlink a' link leads to the job-search entry page.
    # NOTE: Selenium 4 removed find_element(s)_by_* — use find_element(By, ...).
    zhiwei_base = browser.find_elements(By.CSS_SELECTOR, '.nlink a')
    zhiwei_base[1].click()
    # Type the keyword into the search box and submit.
    browser.find_element(By.ID, 'keywordInput').send_keys(keyword)
    browser.find_element(By.ID, 'search_btn').click()
    # A div popup appears right after the search; sleep so the page settles
    # and the city-filter controls become clickable.
    time.sleep(5)
    browser.find_element(By.CLASS_NAME, 'allcity').click()
    browser.find_element(By.CLASS_NAME, 'ttag').click()
    time.sleep(5)
    browser.find_element(By.CLASS_NAME, 'but_box').find_element(By.CLASS_NAME, 'p_but').click()
    time.sleep(5)
    # Wait until the pager "next" button exists, i.e. results are rendered.
    wait.until(
        EC.presence_of_all_elements_located(
            (By.CLASS_NAME, 'next')
        )
    )
    # Scroll to the bottom so lazily-loaded listings render, then back up.
    browser.execute_script('document.documentElement.scrollTop=10000')
    time.sleep(random.randint(1, 3))
    browser.execute_script('document.documentElement.scrollTop=0')
    # Pager text looks like "1 / 2000"; the part after '/' is the total.
    # ('rt rt_page' is a compound class, so match it with a CSS selector.)
    page = browser.find_element(By.CSS_SELECTOR, '.rt.rt_page').text
    pages = page.split('/')
    # BUG FIX: the original returned pages[1].split() — a *list* — which made
    # the `count == all_page` exit test in index_page() never succeed.
    return int(pages[1].strip())
def index_page():
    """Visit every result page in order and scrape it via get_projucts().

    Side effects: drives the module-level `browser`; get_projucts() appends
    to the module-level `data_list`.
    """
    all_page = star_spider()
    # star_spider historically returned a list like ['2000']; accept either
    # that shape or a plain int/str so this works with both versions.
    if isinstance(all_page, (list, tuple)):
        all_page = all_page[0]
    total_pages = int(all_page)
    count = 0
    while True:
        # Jump directly to page count+1 via the pager's input box.
        # (Renamed from `input`, which shadowed the builtin.)
        jump_box = browser.find_element(By.ID, 'jump_page')
        jump_box.clear()
        jump_box.send_keys(count + 1)
        browser.find_element(By.CLASS_NAME, 'og_but').click()
        print('第{}页开始...................................................................'.format(count + 1))
        count += 1
        try:
            # Scroll to the bottom so every listing on the page loads,
            # then back to the top before parsing.
            browser.execute_script('document.documentElement.scrollTop=10000')
            time.sleep(random.randint(1, 3))
            browser.execute_script('document.documentElement.scrollTop=0')
            get_projucts()
        except Exception as e:
            # Log instead of silently swallowing: one bad page should not
            # abort the crawl, but the failure must be visible.
            print('第{}页抓取失败: {}'.format(count, e))
        else:
            print('第{}页完成...................................................................'.format(count))
        # Check the exit condition unconditionally (the original's `continue`
        # inside `except` skipped it, risking an endless loop). Use >= so an
        # overshoot still terminates.
        if count >= total_pages:
            break
def get_projucts():
    """Parse the current result page with pyquery and append one dict per
    job listing to the module-level `data_list`.

    Reads `browser.page_source`; each '.j_joblist .e' node is one job card.
    """
    doc = pq(browser.page_source)
    for item in doc('.j_joblist .e').items():
        data_dict = {
            'post': item.find('.jname').text(),           # job title
            'post_link': item.find('.el').attr('href'),   # job detail URL
            'data_issue': item.find('.time').text(),      # posting date
            'salary': item.find('.sal').text(),
            'extra': item.find('.d').text(),              # city/experience/degree line
            # pyquery's .attr() returns None when the attribute is missing,
            # so the original try/except around this call was dead code.
            'tags': item.find('.tags').attr('title'),
            'cname': item.find('.cname').text(),          # company name
            'clink': item.find('.cname').attr('href'),    # company URL
            # key kept as 'ctpye' (original typo) so downstream JSON/CSV
            # consumers keep seeing the same column name.
            'ctpye': item.find('.dc').text(),             # company type/size
            'cint': item.find('.int').text(),             # industry
        }
        print(data_dict)
        data_list.append(data_dict)
def main():
    """Run the crawl, then persist `data_list` as JSON and CSV files."""
    index_page()
    print(data_list)
    if not data_list:
        # Nothing scraped: data_list[0] below would raise IndexError and the
        # output files would be empty/invalid. Bail out explicitly.
        print('没有抓取到数据')
        return
    # BUG FIX: mode 'w' instead of 'a+' — appending a second JSON document to
    # the same file would make it unparseable.
    with open('data_json.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)
    print('json文件写入完成')
    with open('data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        # Header row comes from the first record's keys; all records share
        # the same key set (built in get_projucts).
        writer = csv.DictWriter(f, data_list[0].keys())
        writer.writeheader()
        writer.writerows(data_list)
    print('csv文件写入完成')
# Script entry point: run the crawl and write the JSON/CSV outputs.
if __name__ == '__main__':
    main()
# (removed non-code residue: blog timestamps/view counters captured during
# copy-paste — they were not Python and broke the file's syntax)