from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.common.by import By
import csv
import requests
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from selenium.common.exceptions import NoSuchElementException
class LagouSpider(object):
    """Selenium-based crawler for Lagou job listings (Shenzhen searches)."""

    def __init__(self):
        # Headless Chrome: no visible browser window.
        self.canshu = webdriver.ChromeOptions()
        self.canshu.add_argument('headless')
        self.driver_path = r'D:\cd\chromedriver.exe'
        self.driver = webdriver.Chrome(executable_path=self.driver_path, chrome_options=self.canshu)
        # Search-result entry URL (customer service, Shenzhen) and site root.
        self.url = 'https://www.lagou.com/jobs/list_%E5%AE%A2%E6%9C%8D?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput='
        self.job_url = "https://www.lagou.com"
        # Parsed job dicts accumulated by parse_detail_page().
        self.positions = []
        # (sic, "status") one-shot flag: 0 = first pass, 1 = already handled.
        self.stauts = 0
        # MySQL cursor/connection; both are initialised for real in run().
        self.cursor = ''
        self.db = ''
    def run(self):
        """Main crawl loop.

        Reads previously collected company names from MySQL, searches each
        one on Lagou (dismissing the city popup once), then walks every
        result page and hands its HTML to get_jsd() for detail scraping.
        """
        # Connect to the local MySQL database holding the company names.
        self.db = pymysql.connect("127.0.0.1", "root", "111111", "kedou")
        self.cursor = self.db.cursor(pymysql.cursors.DictCursor)
        # Fetch all previously collected company names first.
        sql="select * from kedou where id >=0"
        self.cursor.execute(sql)
        results = self.cursor.fetchall()
        while True:
            for s in results:
                self.driver.implicitly_wait(2)
                print(s['kedou'])
                self.driver.get(self.job_url)
                # Handle the city-selection popup (only on the first load).
                if int(self.stauts)==0:
                    # Mark as done so the popup is dismissed exactly once.
                    self.stauts=1
                    # Wait for the popup container (id=cboxWrapper).
                    WebDriverWait(driver=self.driver, timeout=10).until(
                        EC.presence_of_element_located((By.XPATH, "//div[@id='cboxWrapper']"))
                    )
                    # Pick Shenzhen (6th entry of the city list).
                    sz_btn = self.driver.find_element_by_xpath("//ul[@class='clearfix']/li[6]/a")
                    time.sleep(3)
                    sz_btn.click()
                # Wait for the search box, then type the company name.
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, "//input[@id='search_input']"))
                )
                inputTag = self.driver.find_elements(By.XPATH, "//input[@id='search_input']")[0]
                inputTag.send_keys(s['kedou'])
                # Click the search button.
                search_btn = self.driver.find_element_by_xpath("//input[@id='search_button']")
                time.sleep(3)
                search_btn.click()
                time.sleep(3)
                # Walk the result pages for this company.
                while True:
                    self.driver.implicitly_wait(4)
                    # Last page number vs. the currently highlighted page.
                    page_btn = self.driver.find_element_by_xpath("//div[@class='page-number']/span[last()]").text
                    dangqian_bit = self.driver.find_element_by_xpath("//div[@class='page-number']//span[1]").text
                    if int(page_btn) >1:
                        # Multiple pages: wait for the pager to render.
                        WebDriverWait(driver=self.driver, timeout=10).until(
                            EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
                        )
                        self.driver.implicitly_wait(3)
                        source = self.driver.page_source
                        self.get_jsd(source,s['kedou'])
                        try:
                            # Locate the "next page" control.
                            next_btn = self.driver.find_element_by_xpath("//div[@class='item page']//div[contains(@class,'next')]")
                            # Stop when the current page is the last page.
                            if(int(dangqian_bit)==int(page_btn)):
                                break
                            else:
                                next_btn.click()
                        except:
                            # Pager lookup failed; dump the page and abort.
                            print(source)
                            exit('jd是蝌蚪男!!!!')
                        time.sleep(10)
                    else:
                        # Single result page: parse it and move on.
                        source = self.driver.page_source
                        self.get_jsd(source,s['kedou'])
                        break
            # All companies processed; leave the outer loop.
            break
        print("jd是250加傻缺")
def get_jsd(self,source,company_name):
html=etree.HTML(source)
jianshadiaos = html.xpath("//div[@class='p_top']/a/h3/text()")
jianshadiaosb = html.xpath("//div[@class='p_top']/a/@href")
# print(jianshadiaosb)
# exit('5')
#一个公司所有的职业页面
for i,jianshadiao in enumerate(jianshadiaos):
#print(jianshadiao) #职业名字
#小说运营 编辑 公众号运营 新媒体运营 文案
strs=['小说','运营','文案','新媒体','编辑','行政','前台','客服','php','前端','PHP']
for str1 in strs:
if str1 in jianshadiao:
#点击进入详情吧
url = jianshadiaosb[i]
print(url)
self.driver.execute_script("window.open('%s')" % url)
self.driver.switch_to_window(self.driver.window_handles[1])
self.driver.implicitly_wait(3)
source = self.driver.page_source
html = etree.HTML(source)
zhiweiming = html.xpath("//div[@class ='job-name']/span/text()")[0]
gongzi=html.xpath("//dd[@class='job_request']//span[@class='salary']/text()")[0]
gongsi=html.xpath("//div[@class='job_company_content']//h2//em/text()")[0].strip()
address_list=html.xpath("//div[@class='work_addr']/a/text()")
dizhi2=html.xpath("//div[@class='work_addr']/text()")
# print(type(dizhi2))
# print(dizhi2)
overdizhi=''
for d in dizhi2:
overdizhi+="".join(d)
overdizhi = re.sub(r'\s', "", overdizhi)
dizhi=''
for address in address_list:
if "查看地图" in address:
continue
else:
dizhi+="".join(address).strip()
#print(jianshadiao)
lastdizhi=dizhi+overdizhi
res={
'zhiwei':zhiweiming,
'gongsi':gongsi,
'dizhi':lastdizhi,
'gongzi':gongzi
}
print(res)
sql = "insert into kedou2 (`zhiwei`,`gongsi`,`dizhi`,`gongzi`) values ('%s','%s','%s','%s')" % (zhiweiming,gongsi,lastdizhi,gongzi)
print(sql)
ok = self.cursor.execute(sql)
self.db.commit()
print(ok)
time.sleep(4)
#self.positions.append(res)
# 保持只有2个页面 关闭他
self.driver.close()
# 切回列表页
self.driver.switch_to_window(self.driver.window_handles[0])
time.sleep(4)
break
#exit('jianshaque')
def get_company(self,source):
html=etree.HTML(source)
jianshadiaos=html.xpath("//div[@class='company_name']/a/text()")
for jianshadiao in jianshadiaos:
sql = "insert into kedou (`kedou`) values ('%s')" %(jianshadiao)
print(sql)
ok=self.cursor.execute(sql)
self.db.commit()
print(jianshadiao)
print(ok)
def page_list_page(self, source):
html = etree.HTML(source)
links = html.xpath("//div[@class='p_top']//a/@href")
for link in links:
self.request_detail_page(link)
time.sleep(3)
def request_detail_page(self, url):
# 需要open 详情页
# self.driver.get(url)
self.driver.execute_script("window.open('%s')" % url)
self.driver.switch_to_window(self.driver.window_handles[1])
WebDriverWait(driver=self.driver, timeout=10).until(
# //div[@class='job-name']//span[@class='name']/text() 不能这么写 这个地方不想普通的xpath 只找节点元素 不能找到text
EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
)
source = self.driver.page_source
self.parse_detail_page(source)
# 保持只有2个页面 关闭他
self.driver.close()
# 切回列表页
self.driver.switch_to_window(self.driver.window_handles[0])
def parse_detail_page(self, source):
html = etree.HTML(source)
position_name = html.xpath("//div[@class='job-name']//span[@class='name']/text()")[0]
# print(position_name)
job_request_spans = html.xpath("//dd[@class='job_request']//span")
salary = job_request_spans[0].xpath('.//text()')[0].strip()
# print(salary)
city = job_request_spans[1].xpath('.//text()')[0].strip()
city = re.sub(r"[\s/]", "", city)
# print(city)
work_years = job_request_spans[2].xpath('.//text()')[0].strip()
work_years = re.sub(r"[\s/]", "", work_years)
# print(work_years)
education = job_request_spans[3].xpath('.//text()')[0].strip()
education = re.sub(r"[\s/]", "", education)
# print(education)
desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
# print(desc)
company_name = html.xpath("//h2[@class='fl']/em/text()")[0].strip()
position = {
'name': position_name,
'salary': salary,
'city': city,
'work_years': work_years,
'education': education,
'desc': desc,
'company_name': company_name
}
self.positions.append(position)
print("*" * 40)
# 写入csv
print(position)
if self.stauts == 0:
self.stauts = 1
self.save_csv(position)
else:
print('进来了')
self.save_csv1(position)
def save_csv(self, data):
headers = ['name', 'salary', 'city', 'work_years', 'education', 'desc', 'company_name']
values = []
values.append(data)
with open('job.csv', 'w', encoding='utf-8', newline='') as fp:
writer = csv.DictWriter(fp, headers)
writer.writeheader()
writer.writerows(values)
def save_csv1(self, data):
headers = ['name', 'salary', 'city', 'work_years', 'education', 'desc', 'company_name']
values = []
values.append(data)
with open('job.csv', 'a', encoding='utf-8', newline='') as fp:
writer = csv.DictWriter(fp, headers)
writer.writerows(values)
def read_csv(self, path='job1.csv'):
with open(path, 'r', encoding='utf-8') as fp:
readers = csv.DictReader(fp)
for reader in readers:
print(reader)
print(reader['name'])
print(reader['desc'])
if __name__ == "__main__":
    # Entry point: constructing the spider launches headless Chrome, then
    # run() drives the whole crawl.
    LagouSpider().run()
# Note: building on the company names collected in the previous part, this
# script filters further to fetch more of the desired positions.
# (Blog-page paste residue: "Latest recommended article published 2024-09-20 02:33:21".)