后续再重爬拉勾网,发现了很多原贴Python selenium+beautifulsoup 登录爬取拉勾网的代码问题,本文对代码进行了改进:
- beautifulsoup存在定位不准确,解析重复的问题,此版本不再使用;
- 增加了防selenium识别设置,可以参考代码option部分,或者见我的帖子python selenium被反爬系统识别的问题;
- 使用1个for循环进行多个列表元素遍历,具体实例可见我的帖子python 多元素多列表循环实例;
- 增加防止UnicodeEncodeError的代码(见末端)
上代码块:
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver import Chrome
import time
import csv
# Accumulators for each scraped field; filled across all keyword/city
# sessions and written out to CSV at the end of the script.
job_title_list = []
job_link_list = []
job_skills = []
experience_list = []
company_list = []
industry_list = []
company_describe_list = []
company_link_list = []
salary_list = []
location_list = []

# CSV header row, in the same order as the per-field lists above.
head = [
    'job_title', 'job_link', 'job_skills', 'experience', 'company',
    'industry', 'company_describe', 'company_link', 'salary', 'location',
]

# XPath selectors for the four target cities (Beijing, Shanghai,
# Guangzhou, Shenzhen — 北上广深), keyed by lagou.com's data-id attribute.
cities = [
    '//div/a[@data-id="5"]',
    '//div/a[@data-id="6"]',
    '//div/a[@data-id="765"]',
    '//div/a[@data-id="763"]',
]

# Search keywords; each one gets its own full login + search session.
keywords = ['数据分析', 'bi', '商业数据分析']
for keyword in keywords:
    # Chrome options tuned to make Selenium harder to detect by the
    # site's anti-bot checks (hides the "controlled by automation" flag).
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = Chrome(options=options)
    url = 'https://www.lagou.com/'
    driver.get(url)
    time.sleep(2)
    # Dismiss the nationwide ("全国") city-selection popup.
    quanguo = driver.find_element_by_xpath('//p[@class="checkTips"]/a')
    quanguo.click()
    # Open the login dialog.
    login = driver.find_element_by_xpath('//li/a[@class="login"]')
    login.click()
    username = driver.find_element_by_xpath('//div/input[@type="text"]')
    # Fill in the credentials.
    # NOTE(review): 'xxx' are placeholders — supply real credentials,
    # preferably read from the environment rather than hard-coded.
    username.send_keys('xxx')
    psw = driver.find_element_by_xpath('//div/input[@type="password"]')
    psw.send_keys('xxx')
    log_in = driver.find_element_by_xpath(
        '//div/div[@class="login-btn login-password sense_login_password btn-green"]')
    log_in.click()
    # Generous pause so any manual captcha/verification can be completed.
    time.sleep(20)
    search = driver.find_element_by_xpath('//input[@id="search_input"]')
    search.send_keys(keyword)
    time.sleep(2)
    search_button = driver.find_element_by_xpath('//input[@id="search_button"]')
    search_button.click()
    time.sleep(3)
    # Close the red-envelope promo popup that covers the result list.
    hongbao = driver.find_element_by_xpath('//div[@class="body-btn"]')
    hongbao.click()
    time.sleep(2)
    for city in cities:
        # A default city is pre-selected, so re-submit the search first
        # to reset the result list before switching to the next city.
        search_fresh = driver.find_element_by_xpath('//input[@id="submit"]')
        search_fresh.click()
        time.sleep(2)
        city_tab = driver.find_element_by_xpath(city)
        city_tab.click()
        time.sleep(2)
        total_num = driver.find_element_by_xpath('//span[@class="span totalNum"]').text
        current_page = driver.find_element_by_xpath('//span[@class="pager_is_current"]').text
        # The last page triggers a verification popup, so stop one page early.
        while int(current_page) < (int(total_num) - 1):
            time.sleep(3)
            print('当前页码为:'+current_page+'页')
            # One <li> per job card; the <li> also carries the company
            # name and salary in data-* attributes.
            job_titles = driver.find_elements_by_xpath('//div[@class="s_position_list "]//ul[@class="item_con_list"]//li')
            job_links = driver.find_elements_by_xpath('//div[@class="s_position_list "]//a[@class="position_link"]')
            skills = driver.find_elements_by_xpath('//div[@class="s_position_list "]//div[@class="list_item_bot"]/div[@class="li_b_l"]')
            experiences = driver.find_elements_by_xpath('//div[@class="s_position_list "]//div[@class="p_bot"]/div')
            industrys = driver.find_elements_by_xpath('//div[@class="s_position_list "]//div[@class="industry"]')
            company_describes = driver.find_elements_by_xpath('//div[@class="s_position_list "]//div[@class="list_item_bot"]/div[@class="li_b_r"]')
            company_links = driver.find_elements_by_xpath('//div[@class="company_name"]//a')
            locations = driver.find_elements_by_xpath('//span[@class="add"]')
            # zip walks all per-field element lists in lockstep — one
            # iteration per job card on the current page.
            for jobs, jl, sk, ex, ind, cd, cl, loc in zip(
                    job_titles, job_links, skills, experiences,
                    industrys, company_describes, company_links, locations):
                job_title_list.append(jobs.get_attribute("data-positionname"))
                job_link_list.append(jl.get_attribute("href"))
                job_skills.append(sk.text)
                experience_list.append(ex.text)
                company_list.append(jobs.get_attribute('data-company'))
                industry_list.append(ind.text.strip())
                company_describe_list.append(cd.text.strip())
                company_link_list.append(cl.get_attribute('href'))
                salary_list.append(jobs.get_attribute('data-salary'))
                location_list.append(loc.text)
            next_page = driver.find_element_by_xpath("//div[@class='pager_container']/span[@action='next']")
            next_page.click()
            current_page = driver.find_element_by_xpath('//span[@class="pager_is_current"]').text
            time.sleep(3)
    # One browser per keyword session; close it before the next keyword.
    driver.quit()
# Assemble one row per scraped job and dump everything to CSV.
# Materialize the zip into a list: a bare zip is a one-shot iterator, and
# the previous retry-on-UnicodeEncodeError branch re-iterated the already
# exhausted iterator, silently writing nothing on the retry path.
rows = list(zip(job_title_list, job_link_list, job_skills, experience_list,
                company_list, industry_list, company_describe_list,
                company_link_list, salary_list, location_list))
# Writing with encoding='utf-8' prevents UnicodeEncodeError for any scraped
# text, and the context manager guarantees the file is closed even if a
# write fails — so no retry/except block is needed.
with open('lggz.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(head)
    writer.writerows(rows)