【爬虫】牛客网简易爬虫（二）用Selenium看看你投过简历的公司这个月哪些有笔试+笔试时间

最新推荐文章于 2023-11-22 22:03:13 发布

YYIverson

最新推荐文章于 2023-11-22 22:03:13 发布

阅读量720

点赞数

分类专栏：爬虫文章标签：爬虫 selenium 牛客网秋招

本文链接：https://blog.csdn.net/YYIverson/article/details/100569801

版权

爬虫专栏收录该内容

7 篇文章 1 订阅

订阅专栏

在牛客网--求职--笔试日历中，可以查看每天都有哪些公司有笔试以及笔试时间，如下图

一、需求

1、看看这个月有笔试的公司哪家你还没看过。

2、看看自己投过的公司哪家这个月有笔试及笔试的时间。

当然，你还是得把你投的记在excel中，把已笔试的也记下来！

二、代码

from selenium import webdriver
from collections import deque
import pandas as pd

index_jin = None  # index marking the "今" (today) cell; set by first_extract, used by last_extract to drop earlier days
my_month = None # month part of the date shown on the calendar page; set by first_extract, used in the printed reports

def first_extract(url):
    """Scrape the calendar page and return one cleaned string per non-empty cell.

    Opens Chrome through Selenium, reads the date shown in the page header and
    every ``<td>`` of the calendar component, and normalises each non-empty
    cell to ``"<header date>-<2-digit day> <token> <token> ..."``.

    Side effects: sets the module globals ``my_month`` (the text after the
    last ``-`` of the header date) and ``index_jin`` (the position, inside the
    returned list, of the cell carrying the "今" marker; ``None`` when no cell
    carries it).

    :param url: address of the calendar page.
    :return: list of cleaned cell strings in page order. Past days are still
        included; ``last_extract`` drops them using ``index_jin``.
    """
    global index_jin
    global my_month
    import re
    from selenium.webdriver.common.by import By

    # Reset so a previous call cannot leave a stale "today" index behind.
    index_jin = None
    # Full-width parenthesised remarks are removed from every cell.
    bracket_pattern = re.compile('（.*?）')

    options = webdriver.ChromeOptions()
    options.add_argument('--disable-gpu')
    options.add_argument('blink-settings=imagesEnabled=false')
    options.add_argument('disable-infobars')

    # ``options=`` and ``find_element(By.XPATH, ...)`` are accepted by both
    # Selenium 3.8+ and Selenium 4; the older spellings were removed in 4.x.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url=url)
        driver.maximize_window()

        # Date text shown in the calendar header.
        month_element = driver.find_element(
            By.XPATH,
            '//div[@class="current-date"]/span[@class="current-mouth"]',
        )
        current_month = str(month_element.text)
        my_month = current_month.split('-')[-1]

        cells = driver.find_elements(
            By.XPATH, '//div[@id="jsCpn_9_component_0"]//td'
        )

        cleaned = []
        for cell in cells:
            text = cell.text
            if text == '':
                continue
            content = str(text)
            # NOTE(review): str.strip() treats its argument as a *set of
            # characters*, not as a substring, so these calls can also eat
            # matching characters at the end of a company name. Left unchanged
            # because the exact page text is not visible here - confirm the
            # cell format and switch to suffix removal if needed.
            content = content.replace('\n', ' ').strip(' +').strip(' 我要添加').strip(' +')
            content = content.strip('技术类').strip('非技术类')

            # Only treat "今" as the today marker when it sits at either end
            # of the cell text - the only place the strip below can remove it.
            # A plain substring test would also fire on company names that
            # merely contain the character.
            edge_trimmed = content.strip(' ')
            if edge_trimmed.startswith('今') or edge_trimmed.endswith('今'):
                # Index into the *filtered* result (empty cells are skipped),
                # because that is the list last_extract slices.
                index_jin = len(cleaned)
                content = content.strip('今 ')

            # Zero-pad single-digit days.
            if len(content.split(' ')[0]) == 1:
                content = current_month + '-0' + content
            else:
                content = current_month + '-' + content

            cleaned.append(bracket_pattern.sub('', content))
    finally:
        # Always shut the browser down, even if scraping fails half-way.
        driver.quit()
    return cleaned

def last_extract(my_list):
    """Group the cleaned calendar entries by their key token.

    Entries before position ``index_jin`` (module global) are discarded; when
    ``index_jin`` is ``None`` the whole list is used. Each remaining entry is
    split on single spaces: token 0 is the date and the following tokens are
    read in pairs, the second token of each pair being the dictionary key
    (presumably the company name - confirm against the page format).

    :param my_list: list of strings as returned by ``first_extract``.
    :return: dict mapping each key token to a list of
        ``"<date>.<first token of the pair>"`` strings, in encounter order.
    """
    schedule = {}
    for entry in my_list[index_jin:]:
        tokens = entry.split(' ')
        day = tokens[0]
        # Keys sit at positions 2, 4, 6, ...; a trailing unpaired token is ignored.
        for pos in range(2, len(tokens), 2):
            schedule.setdefault(tokens[pos], []).append(f'{day}.{tokens[pos - 1]}')
    return schedule

# Runs when the module is executed/imported: scrape the calendar page, then
# group the entries from today onwards by their key token.
mylist = first_extract(url='https://www.nowcoder.com/school/calendar')
mydict = last_extract(mylist)

def _filter_unseen(names, seen):
    """Return the names that neither contain nor are contained in any seen name."""
    return [
        name for name in names
        if not any(s in name or name in s for s in seen)
    ]


def new(my_dict, excel_path=r'C:\Users\admin\Desktop\校招.xlsx'):
    """Print the scheduled companies that do not appear in the "已看过" column.

    Names are compared case-insensitively (both sides are upper-cased). A
    scheduled company counts as already seen when its name equals, contains,
    or is contained in any entry of the "已看过" column.

    :param my_dict: dict whose keys are the scheduled company names (as
        returned by ``last_extract``).
    :param excel_path: path of the workbook holding the "已看过" column;
        defaults to the location originally hard-coded in this script.
    :return: None; the result is printed, using the module global
        ``my_month`` in the message.
    """
    company = pd.read_excel(io=excel_path, header=0)
    # Blank cells are read as NaN; drop them so they do not turn into the
    # string 'NAN' and take part in the substring matching.
    seen = [str(value).upper() for value in company['已看过'].dropna()]
    # An empty string is a substring of everything and would hide every company.
    seen = [name for name in seen if name]

    # Build a new list instead of removing from one while iterating over it
    # (the latter skips elements and can raise ValueError on a second match).
    no = _filter_unseen([name.upper() for name in my_dict], seen)

    print(f'!!!牛客网上{my_month}月的有笔试的公司中，你还没看过这些公司:!!!\n{no}')

def look_your_not_test(my_dict, excel_path=r'C:\Users\admin\Desktop\校招.xlsx'):
    """Print the schedule of companies applied to but not yet tested.

    Reads the "已投" and "已笔试" columns, keeps the "已投" entries that are
    absent from "已笔试", and reports every key of ``my_dict`` whose name
    contains one of them. Comparison is case-insensitive, matching ``new``.

    :param my_dict: dict mapping scheduled company name to its list of
        schedule strings (as returned by ``last_extract``).
    :param excel_path: path of the workbook holding the two columns; defaults
        to the location originally hard-coded in this script.
    :return: None; the result is printed, using the module global
        ``my_month`` in the messages.
    """
    company = pd.read_excel(io=excel_path, header=0)
    # Blank cells are read as NaN; drop them instead of turning them into the
    # string 'nan'.
    applied = [str(value).upper() for value in company['已投'].dropna()]
    tested = {str(value).upper() for value in company['已笔试'].dropna()}
    # Filtering (rather than list.remove) also clears duplicated rows.
    pending = [name for name in applied if name and name not in tested]

    last = []
    for name in pending:
        for key in my_dict:
            # Keep the original key for the lookup below; list each key once.
            if name in key.upper() and key not in last:
                last.append(key)

    print(f'{my_month}月你已投但未参加笔试的公司有相关笔试安排的公司如下：')
    print(last)
    print(f'你已投但未参加笔试的公司{my_month}月的笔试安排如下：')
    for key in last:
        print(f'{key}:{my_dict[key]}')

# Print both reports from the scraped schedule.
new(mydict)
look_your_not_test(mydict)