Scraping data with Selenium and ChromeDriver
Before turning to Selenium, I tried scraping the data with the requests library. The code did run, but it was fiddly, and the cookies it obtained expired quickly, so it could never crawl the complete data set.
I'll paste that code here as well, in the hope that we can discuss it together:
import requests
import time
import json
from lxml import etree
import re

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/77.0.3865.90 Safari/537.36",
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
    "Origin": "https://www.lagou.com",
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest"
}


def main():
    for i in range(1, 21):
        data = {
            "first": "true",  # sent as the literal string "true" (requests would otherwise encode True as "True")
            "pn": i,          # page number
            "kd": "python"    # search keyword, here python
        }
        # Create a session to hold the cookies: visit the list page first,
        # then use the freshly obtained cookies for the data request.
        session = requests.session()
        session.get("https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=", headers=headers, timeout=3)
        time.sleep(2)
        cookie = session.cookies  # cookies obtained from this visit
        # Request the Ajax endpoint that returns the job list as JSON
        url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
        response = session.post(url=url, data=data, headers=headers, cookies=cookie, timeout=5)
        response.encoding = response.apparent_encoding
        result = json.loads(response.text)
        time.sleep(2)
        positionIds = result["content"]["positionResult"]["result"]  # position entries in the JSON data
        for position in positionIds:
            positionId = position["positionId"]
            detail_url = "https://www.lagou.com/jobs/%s.html" % positionId  # build the detail-page URL
            parse_detail(detail_url)


def parse_detail(url):
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    item = {}
    item["title"] = html.xpath("//div[@class='job-name']//h1[@class='name']/text()")[0]  # job title
    item["company"] = html.xpath("//div[@class='job-name']//h4[@class='company']/text()")[0]  # company
    item["salary"] = html.xpath("//dd[@class='job_request']//span[@class='salary']/text()")[0]  # salary
    item["work_place"] = html.xpath("//dd[@class='job_request']//span[2]/text()")[0]  # work location
    item["work_place"] = re.sub(r'[\s/]', "", item["work_place"])
    item["work_years"] = html.xpath("//dd[@class='job_request']//span[3]/text()")[0]  # work experience
    item["work_years"] = re.sub(r'[\s/]', "", item["work_years"])
    item["education"] = html.xpath("//dd[@class='job_request']//span[4]/text()")[0]  # education
    item["education"] = re.sub(r'[\s/]', "", item["education"])
    item["requirement"] = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
    item["requirement"] = re.sub(r"\s", "", item["requirement"])  # job requirements
    print(item)


if __name__ == '__main__':
    main()
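Before giving up on the requests approach entirely, one mitigation is to notice when the JSON response no longer contains the expected data, which is the usual symptom of stale cookies, and rebuild the session. Below is a rough sketch of mine, reusing the headers defined above; the "content" check and the retry limit are my own additions and are not guaranteed to cover every failure mode the site returns:

def fetch_page(data, retries=3):
    # Hypothetical helper: retry with a fresh session when the JSON
    # response lacks the expected "content" key (what happens once the
    # cookies expire or the crawler gets flagged).
    url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    list_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
    for _ in range(retries):
        session = requests.session()
        session.get(list_url, headers=headers, timeout=3)  # refresh the cookies
        response = session.post(url, data=data, headers=headers, timeout=5)
        result = json.loads(response.text)
        if "content" in result:
            return result
        time.sleep(5)  # back off before retrying with new cookies
    return None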
Since this code does not generalize well and cannot impersonate a real browser the way Selenium can, I switched to Selenium driving ChromeDriver instead.
Tools: Selenium, ChromeDriver
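As for setup: pip install selenium pulls in the Python bindings, while ChromeDriver is a separate download whose version must match your installed Chrome. The script below assumes the executable lives at G:\google\chromedriver.exe, so adjust that path to wherever yours sits.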
Approach:
1. Request the list page and extract each position's detail-page URL, which I located by inspecting the page's traffic in the browser's devtools.
2. After scraping one page of listings, find the next-page button and click it to move on; locating the button works much like step 1.
3. Note that each detail page should be opened in a new window so that the original list page stays put; once its data has been scraped, close that window, open the next detail page, and repeat the cycle (see the sketch after this list).
4. Keep the crawl speed low: scraping too fast not only puts needless load on the target server but also makes it easy for the site to flag you as a crawler.
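Step 3 is the part that trips people up most often, so here is a minimal, self-contained sketch of the open/scrape/close/switch-back cycle; the scrape_detail stub and the random delay are placeholders of mine, not the final spider's code:

import random
import time

def scrape_detail(source):
    # placeholder: parse `source` with lxml here, as the real spider does
    print(len(source))

def visit_detail(driver, link):
    driver.execute_script("window.open('%s')" % link)  # open the detail page in a new window
    driver.switch_to.window(driver.window_handles[1])  # focus the new window
    scrape_detail(driver.page_source)
    driver.close()                                     # close the detail window
    driver.switch_to.window(driver.window_handles[0])  # back to the list page
    time.sleep(random.uniform(2, 4))                   # throttle the crawl (step 4)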
With the approach worked out, all that remains is to implement it.
The code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from lxml import etree
import time
import re


class LagouSpider(object):
    # ChromeOptions prepared e.g. for a proxy; pass via webdriver.Chrome(options=...) if needed
    option = webdriver.ChromeOptions()
    # option.add_argument("--proxy-server=http://118.89.24.136:88")

    def __init__(self):
        self.driver = webdriver.Chrome(r'G:\google\chromedriver.exe')  # path to chromedriver
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.portion = []

    def run(self):
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source  # source of the current list page
            self.get_detail_url(source)
            # locate the next-page button
            next_page = self.driver.find_element_by_xpath("//div[@class='pager_container']//span[last()]")
            if "pager_next_disabled" in next_page.get_attribute("class"):  # a disabled button means the last page
                break
            else:
                next_page.click()
                time.sleep(2)

    def get_detail_url(self, source):
        """Extract the detail-page URLs from the list page."""
        html = etree.HTML(source)
        links = html.xpath("//ul[@class='item_con_list']//div[@class='p_top']/a/@href")  # detail-page URLs
        for link in links:
            self.get_source(link)
            print(link)
            print("=" * 40)
            time.sleep(2)

    def get_source(self, link):
        """Fetch the source code of a detail page."""
        # self.driver.get(link)
        self.driver.execute_script("window.open('%s')" % link)  # open the detail page in a new window
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']//h1[@class='name']")))
        source = self.driver.page_source
        self.get_detail(source)
        # close this detail page
        self.driver.close()
        # switch back to the list page
        self.driver.switch_to.window(self.driver.window_handles[0])

    def get_detail(self, source):
        """Extract the data from a detail page."""
        html = etree.HTML(source)
        item = {}
        item["title"] = html.xpath("//div[@class='job-name']//h1[@class='name']/text()")[0]  # job title
        item["company"] = html.xpath("//div[@class='job-name']//h4[@class='company']/text()")[0]  # company
        item["salary"] = html.xpath("//dd[@class='job_request']//span[@class='salary']/text()")[0]  # salary
        item["work_place"] = html.xpath("//dd[@class='job_request']//span[2]/text()")[0]  # work location
        item["work_place"] = re.sub(r'[\s/]', "", item["work_place"])
        item["work_years"] = html.xpath("//dd[@class='job_request']//span[3]/text()")[0]  # work experience
        item["work_years"] = re.sub(r'[\s/]', "", item["work_years"])
        item["education"] = html.xpath("//dd[@class='job_request']//span[4]/text()")[0]  # education
        item["education"] = re.sub(r'[\s/]', "", item["education"])
        item["requirement"] = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        item["requirement"] = re.sub(r"\s", "", item["requirement"])  # job requirements
        self.portion.append(item)
        self.driver.save_screenshot("%s.png" % item["title"])
        # print(self.portion)
        print(item)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
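One natural follow-up, which the spider above does not do, is to persist the items collected in spider.portion instead of only printing them. A minimal sketch; the save_items helper and its default filename are hypothetical, not part of the original code:

import json

def save_items(items, path="lagou_python.json"):
    # dump the collected dicts to JSON; ensure_ascii=False keeps the
    # Chinese field values human-readable in the output file
    with open(path, "w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)

# usage after the crawl finishes:
# save_items(spider.portion)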
Everyone is welcome to discuss and share ideas!