# 根据上一篇公司名称进行进一步筛选获取想要的更多职位

from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.common.by import By
import csv
import requests
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import  pymysql
from selenium.common.exceptions import NoSuchElementException


class LagouSpider(object):
    """Selenium-based crawler for lagou.com job listings.

    Workflow: ``run()`` reads company names out of the MySQL table ``kedou``,
    searches each company on lagou.com (city fixed to Shenzhen), and for every
    listing whose title contains one of a fixed keyword list it opens the
    detail page and inserts title / company / address / salary into ``kedou2``.

    A second, currently unused path (``page_list_page`` ->
    ``request_detail_page`` -> ``parse_detail_page`` -> ``save_csv*``) scrapes
    full job details into ``job.csv``.
    """

    def __init__(self):
        # Headless Chrome so the crawler can run without a display.
        self.canshu = webdriver.ChromeOptions()
        self.canshu.add_argument('headless')
        self.driver_path = r'D:\cd\chromedriver.exe'
        self.driver = webdriver.Chrome(executable_path=self.driver_path, chrome_options=self.canshu)
        # Pre-built search URL for "客服" in Shenzhen (kept for reference).
        self.url = 'https://www.lagou.com/jobs/list_%E5%AE%A2%E6%9C%8D?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput='
        self.job_url = "https://www.lagou.com"
        self.positions = []   # accumulated results for the CSV path
        self.stauts = 0       # flag: 0 until city pop-up dismissed / first CSV write
        self.cursor = ''      # set in run(): pymysql DictCursor (rows as dicts)
        self.db = ''          # set in run(): pymysql connection

    def run(self):
        """Entry point: iterate companies from MySQL and scrape each one."""
        # Connect to the local MySQL database.
        self.db = pymysql.connect(host="127.0.0.1", user="root",
                                  password="111111", database="kedou")
        self.cursor = self.db.cursor(pymysql.cursors.DictCursor)

        # Load every stored company name up front.
        sql = "select * from kedou where id >=0"
        self.cursor.execute(sql)
        results = self.cursor.fetchall()

        for s in results:
            self.driver.implicitly_wait(2)
            print(s['kedou'])
            self.driver.get(self.job_url)
            # The very first page view shows a city-selection pop-up;
            # dismiss it once, then never again for this driver session.
            if int(self.stauts) == 0:
                self.stauts = 1
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, "//div[@id='cboxWrapper']"))
                )
                # Shenzhen is the 6th entry in the city list.
                sz_btn = self.driver.find_element(By.XPATH, "//ul[@class='clearfix']/li[6]/a")
                time.sleep(3)
                sz_btn.click()

            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//input[@id='search_input']"))
            )
            # Search for the current company name.
            input_tag = self.driver.find_element(By.XPATH, "//input[@id='search_input']")
            input_tag.send_keys(s['kedou'])
            search_btn = self.driver.find_element(By.XPATH, "//input[@id='search_button']")
            time.sleep(3)
            search_btn.click()
            time.sleep(3)

            # Walk every result page for this company.
            while True:
                self.driver.implicitly_wait(4)
                # Last page number and current page number from the pager.
                page_btn = self.driver.find_element(By.XPATH, "//div[@class='page-number']/span[last()]").text
                dangqian_bit = self.driver.find_element(By.XPATH, "//div[@class='page-number']//span[1]").text
                if int(page_btn) > 1:
                    WebDriverWait(driver=self.driver, timeout=10).until(
                        EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
                    )
                    self.driver.implicitly_wait(3)
                    source = self.driver.page_source
                    self.get_jsd(source, s['kedou'])
                    try:
                        # Locate the "next page" control.
                        next_btn = self.driver.find_element(
                            By.XPATH, "//div[@class='item page']//div[contains(@class,'next')]")
                        if int(dangqian_bit) == int(page_btn):
                            break  # already on the last page
                        next_btn.click()
                    except NoSuchElementException:
                        # Pager missing (layout change / anti-bot page):
                        # dump the HTML for inspection and abort.
                        print(source)
                        exit('jd是蝌蚪男!!!!')
                    time.sleep(10)
                else:
                    # Single result page: scrape it and move on.
                    source = self.driver.page_source
                    self.get_jsd(source, s['kedou'])
                    break
        print("jd是250加傻缺")

    def get_jsd(self, source, company_name):
        """Parse a result-listing page and persist matching jobs to ``kedou2``.

        :param source: HTML of the listing page.
        :param company_name: company currently being searched (bookkeeping
            only; not used in the insert itself).
        """
        html = etree.HTML(source)
        titles = html.xpath("//div[@class='p_top']/a/h3/text()")
        links = html.xpath("//div[@class='p_top']/a/@href")

        # Job-title keywords worth keeping (novel / operations / copywriting /
        # new media / editor / admin / reception / customer service / php /
        # front-end).
        keywords = ['小说', '运营', '文案', '新媒体', '编辑', '行政', '前台', '客服', 'php', '前端', 'PHP']

        for i, title in enumerate(titles):
            if not any(kw in title for kw in keywords):
                continue
            # Open the detail page in a second tab and switch to it.
            url = links[i]
            print(url)
            self.driver.execute_script("window.open('%s')" % url)
            self.driver.switch_to.window(self.driver.window_handles[1])
            self.driver.implicitly_wait(3)
            detail = etree.HTML(self.driver.page_source)

            zhiweiming = detail.xpath("//div[@class ='job-name']/span/text()")[0]
            gongzi = detail.xpath("//dd[@class='job_request']//span[@class='salary']/text()")[0]
            gongsi = detail.xpath("//div[@class='job_company_content']//h2//em/text()")[0].strip()
            address_list = detail.xpath("//div[@class='work_addr']/a/text()")
            dizhi2 = detail.xpath("//div[@class='work_addr']/text()")

            # Free-text part of the address, with all whitespace stripped.
            overdizhi = re.sub(r'\s', "", "".join(dizhi2))
            # Linked part of the address; skip the "查看地图" (view map) link.
            dizhi = "".join(a.strip() for a in address_list if "查看地图" not in a)

            lastdizhi = dizhi + overdizhi
            res = {
                'zhiwei': zhiweiming,
                'gongsi': gongsi,
                'dizhi': lastdizhi,
                'gongzi': gongzi
            }
            print(res)
            # Parameterized insert: scraped values may contain quotes, so
            # never build this SQL with string formatting (injection/breakage).
            sql = "insert into kedou2 (`zhiwei`,`gongsi`,`dizhi`,`gongzi`) values (%s,%s,%s,%s)"
            print(sql)
            ok = self.cursor.execute(sql, (zhiweiming, gongsi, lastdizhi, gongzi))
            self.db.commit()
            print(ok)

            time.sleep(4)

            # Keep at most two tabs open: close the detail tab, go back.
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
            time.sleep(4)

    def get_company(self, source):
        """Extract company names from a listing page and insert into ``kedou``."""
        html = etree.HTML(source)
        names = html.xpath("//div[@class='company_name']/a/text()")
        for name in names:
            # Parameterized so quotes in company names cannot break the SQL.
            sql = "insert into kedou (`kedou`) values (%s)"
            print(sql)
            ok = self.cursor.execute(sql, (name,))
            self.db.commit()
            print(name)
            print(ok)

    def page_list_page(self, source):
        """Visit every job-detail link found on a listing page."""
        html = etree.HTML(source)
        links = html.xpath("//div[@class='p_top']//a/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(3)

    def request_detail_page(self, url):
        """Open ``url`` in a second tab, parse it, then return to the list tab."""
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(driver=self.driver, timeout=10).until(
            # Must wait on an element node; XPath text() targets are not
            # valid for presence_of_element_located.
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Keep only two tabs alive: close the detail tab and switch back.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one job-detail page and append/save the position record."""
        html = etree.HTML(source)
        position_name = html.xpath("//div[@class='job-name']//span[@class='name']/text()")[0]
        # The request spans are ordered: salary / city / experience / education.
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/em/text()")[0].strip()
        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
            'company_name': company_name
        }
        self.positions.append(position)
        print("*" * 40)
        print(position)
        # First record rewrites the CSV with a header; later ones append.
        if self.stauts == 0:
            self.stauts = 1
            self.save_csv(position)
        else:
            print('进来了')
            self.save_csv1(position)

    def save_csv(self, data):
        """Write ``data`` (one position dict) to job.csv with a header row."""
        headers = ['name', 'salary', 'city', 'work_years', 'education', 'desc', 'company_name']
        with open('job.csv', 'w', encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writeheader()
            writer.writerows([data])

    def save_csv1(self, data):
        """Append ``data`` (one position dict) to job.csv without a header."""
        headers = ['name', 'salary', 'city', 'work_years', 'education', 'desc', 'company_name']
        with open('job.csv', 'a', encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writerows([data])

    def read_csv(self, path='job1.csv'):
        """Print every row of a previously saved CSV (debug helper)."""
        with open(path, 'r', encoding='utf-8') as fp:
            readers = csv.DictReader(fp)
            for reader in readers:
                print(reader)
                print(reader['name'])
                print(reader['desc'])


# Script entry point: build the spider and start crawling.
if __name__ == "__main__":
    LagouSpider().run()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值