# 爬虫课程设计(爬虫代码)  — Web-scraper course project (crawler code)

import bs4
from selenium import webdriver
import time
import pymysql
import csv


def getWebDriver(url):
    """Launch a maximized Chrome browser, navigate to *url*, and return the driver.

    The browser is configured with a Chinese (zh_CN, UTF-8) locale, a desktop
    Chrome user-agent string, and a 100-second implicit wait for element lookups.
    """
    chrome_options = webdriver.ChromeOptions()
    # Force a Chinese locale so the target site renders in Chinese.
    chrome_options.add_argument('lang=zh_CN.UTF-8')
    # Spoof a regular desktop Chrome user-agent to look less like a bot.
    chrome_options.add_argument(
            'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"')

    driver = webdriver.Chrome(chrome_options=chrome_options)
    # Element lookups wait up to 100 s before raising NoSuchElementException.
    driver.implicitly_wait(100)
    driver.maximize_window()
    driver.get(url)
    return driver



def getjoplist(wd):
    """Scrape all job postings visible on the current 51job results page.

    Args:
        wd: a selenium WebDriver currently showing a 51job search-result page.

    Returns:
        A list of rows, one per posting:
        [job name, salary, city, requirements, company type, company industry].
        Empty salary / company fields are recorded as the string "null".
    """
    rows = []
    html = wd.find_element_by_class_name("j_joblist").get_attribute("outerHTML")
    soup = bs4.BeautifulSoup(html, "html.parser")
    for job in soup.find_all("div", class_="e"):
        link = job.find("a")
        jobname = link.find("p", class_="t").find_all("span")[0].attrs["title"]

        info_spans = link.find("p", class_="info").find_all("span")
        jobsalary = info_spans[0].text or "null"

        # info_spans[1] looks like: "南京  |  3-4年经验  |  本科  |  招1人"
        raw_info = str(info_spans[1].text)
        jobcity = raw_info.split("|")[0].replace(" ", "")
        # Everything after the city, separators kept.  NOTE(review): the
        # original [:-1] slice drops the final character of the string —
        # kept as-is to preserve output, but verify it is not truncating
        # real data (e.g. the last character of "招1人").
        jobneed = raw_info[len(jobcity) + 2:-1].replace(" ", "")

        company = job.find("div", class_="er")
        companyclassify = str(company.find("p", class_="dc at").text).replace(" ", "") or "null"
        companyclassify2 = str(company.find("p", class_="int at").text).replace(" ", "") or "null"

        rows.append([jobname, jobsalary, jobcity, jobneed,
                     companyclassify, companyclassify2])
    return rows

def changePage(wd):
    """Advance to the next results page by clicking the pager's "next" button.

    Propagates whatever selenium raises (e.g. NoSuchElementException) when
    the pager or the next-button cannot be found — the caller relies on
    that to detect/handle the last page.
    """
    # Scope the lookup to the pager container so an unrelated "next"
    # element elsewhere on the page cannot be clicked by mistake.
    pager = wd.find_element_by_css_selector('div[class="j_page"]')
    next_button = pager.find_element_by_css_selector('li[class="next"]')
    next_button.click()


# ---- scraping driver script --------------------------------------------
# Starting point: first page of a 51job search over all regions/categories.
url = "https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
wd = getWebDriver(url)

file_path = "d:/dst/爬虫课程设计3.csv"

# `with` guarantees the CSV file is flushed and closed even if scraping
# dies part-way through; newline='' is required by the csv module so rows
# are not double-spaced on Windows.
with open(file_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)

    for page in range(2000):
        print(page)
        for row in getjoplist(wd):
            writer.writerow(row)
            print(row)

        # The "next" click occasionally fails (slow render / stale pager);
        # retry up to three times — same retry budget as the original's
        # nested try blocks — then let the error propagate.
        for attempt in range(3):
            try:
                changePage(wd)
                break
            except Exception:
                if attempt == 2:
                    raise
# (CSDN page residue — comment/paywall widget text from the blog extraction — removed.)