Scraping customer-service listings from Lagou and saving them to a database (company names only); the next post will filter by company name

This post shows how to crawl customer-service job listings from Lagou with a web crawler and store the results in a database, focusing on extracting company names. The next post will use those company names for further filtering and analysis.
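Before running the spider, the target table has to exist. The spider below inserts each company name into a table named kedou with a single kedou column; the schema sketch here is an assumption inferred from that INSERT statement, so adjust names and types to match your setup.

import pymysql

# A minimal schema sketch, assumed from the INSERT the spider below uses.
db = pymysql.connect(host="127.0.0.1", user="root", password="111111", database="kedou")
with db.cursor() as cursor:
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS kedou ("
        "  id INT AUTO_INCREMENT PRIMARY KEY,"
        "  kedou VARCHAR(255) NOT NULL"
        ")"
    )
db.commit()
db.close()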
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import re
import time
import csv
import pymysql




class LagouSpider(object):

    def __init__(self):
        # path to the local ChromeDriver binary
        self.driver_path = r'D:\cd\chromedriver.exe'
        self.driver = webdriver.Chrome(service=Service(self.driver_path))
        self.url = 'https://www.lagou.com/jobs/list_%E5%AE%A2%E6%9C%8D?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput='
        self.positions = []
        # 0 until the first CSV row is written (the header goes out with it)
        self.status = 0
        self.cursor = None
        self.db = None

    def run(self):
        # connect to the MySQL database
        self.db = pymysql.connect(host="127.0.0.1", user="root", password="111111", database="kedou")
        self.cursor = self.db.cursor(pymysql.cursors.DictCursor)
        self.driver.get(self.url)
        while True:
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            # print(self.driver.page_source)
            source = self.driver.page_source
            self.get_company(source)
            # self.page_list_page(source)  # detail-page parsing disabled; this run only collects company names
            try:
                # locate the next-page button; stop once it is disabled
                next_btn = self.driver.find_element(By.XPATH, "//div[@class='pager_container']/span[last()]")
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                else:
                    next_btn.click()
            except Exception:
                print(source)
            time.sleep(10)

    def get_company(self, source):
        html = etree.HTML(source)
        company_names = html.xpath("//div[@class='company_name']/a/text()")
        for company_name in company_names:
            # parameterized query, so quotes in a company name cannot break the SQL
            sql = "insert into kedou (`kedou`) values (%s)"
            print(sql)
            ok = self.cursor.execute(sql, (company_name,))
            self.db.commit()
            print(company_name)
            print(ok)

    def page_list_page(self, source):
        html = etree.HTML(source)
        links = html.xpath("//div[@class='p_top']//a/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(3)

    def request_detail_page(self, url):
        # open the detail page in a new tab and switch to it
        # self.driver.get(url)
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(driver=self.driver, timeout=10).until(
            # note: an xpath ending in /text() will not work here, because
            # presence_of_element_located only matches element nodes, not text nodes
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # close the detail tab so only two windows ever exist
        self.driver.close()
        # switch back to the list page
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        position_name = html.xpath("//div[@class='job-name']//span[@class='name']/text()")[0]
        # print(position_name)
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        # print(salary)
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r"[\s/]", "", city)
        # print(city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        # print(work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        # print(education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        # print(desc)
        company_name = html.xpath("//h2[@class='fl']/em/text()")[0].strip()
        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
            'company_name': company_name
        }
        self.positions.append(position)
        print("*" * 40)
        # write the row to CSV; the first row also writes the header
        print(position)
        if self.status == 0:
            self.status = 1
            self.save_csv(position, write_header=True)
        else:
            self.save_csv(position)

    def save_csv(self, data, write_header=False):
        headers = ['name', 'salary', 'city', 'work_years', 'education', 'desc', 'company_name']
        # 'w' truncates the file and writes the header for the first row;
        # every later row is appended
        mode = 'w' if write_header else 'a'
        with open('job.csv', mode, encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, headers)
            if write_header:
                writer.writeheader()
            writer.writerow(data)

    def read_csv(self, path='job.csv'):
        # print each stored row back out of the CSV
        with open(path, 'r', encoding='utf-8') as fp:
            reader = csv.DictReader(fp)
            for row in reader:
                print(row)
                print(row['name'])
                print(row['desc'])


if __name__ == "__main__":
    spider = LagouSpider()
    spider.run()
    # print(spider.positions)
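Since the next post filters on the stored company names, a quick way to sanity-check a run is to read them back out of the table. A minimal sketch, assuming the same local MySQL credentials and the kedou table used above:

import pymysql

# read back the distinct company names the spider stored
db = pymysql.connect(host="127.0.0.1", user="root", password="111111", database="kedou")
with db.cursor() as cursor:
    cursor.execute("select distinct `kedou` from kedou")
    for (company_name,) in cursor.fetchall():
        print(company_name)
db.close()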
