python 爬取各大招聘网站信息,源码,留给自己看的
1、拉勾
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
from time import sleep
import re
# --- Lagou: collect job-detail links from the search-result pages into a queue ---
what1 = '数据挖掘'   # search keyword
what2 = '全职'       # job type (full-time)
what3 = '北京'       # city
# URL-encode the Chinese terms for use in the query string.
what1 = urllib.parse.quote(what1)
what2 = urllib.parse.quote(what2)
what3 = urllib.parse.quote(what3)

driver = webdriver.PhantomJS()
# driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
url = 'https://www.lagou.com/jobs/list_%s?px=default&gx=%s&city=%s#order' % (what1, what2, what3)
url2 = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=shuju'
driver.implicitly_wait(100)
driver.get(url)

# Explicit parser avoids bs4's "no parser specified" warning and the
# non-deterministic choice of whichever parser happens to be installed.
bs = BeautifulSoup(driver.page_source, 'html.parser')
req = bs.find('ul', class_='item_con_list', style='display: block;')
urllinks = req.find_all('a', class_='position_link')

import queue
que = queue.Queue()
for link in urllinks:
    print(link.get('href'))
    que.put(link.get('href'))

# Advance to the next result page (the trailing space in the class name is
# genuinely present in lagou's markup — do not "fix" it).
link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
link_next.click()

# Collect three more pages of detail-page links.
times = 0
while True:
    times += 1
    driver.implicitly_wait(10)
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    req = bs.find('ul', class_='item_con_list', style='display: block;')
    urllinks = req.find_all('a', class_='position_link')
    for link in urllinks:
        print(link.get('href'))
        que.put(link.get('href'))
    print(times)
    if times == 3:
        break
    link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
    link_next.click()
    sleep(3)
# --- Lagou: visit each queued detail URL, extract the fields, insert into MySQL ---
driver2 = webdriver.PhantomJS()
# driver2 = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')

# Compiled once instead of re-compiling seven identical patterns per page.
tag_re = re.compile("<[^>]*>")  # strips HTML tags left in prettify() output

# BUG FIX: the original looped `while que:` — a Queue object is always truthy,
# even when empty, so que.get() would block forever after the last URL.
while not que.empty():
    try:
        newurl = que.get()
        driver2.get(newurl)
        driver2.implicitly_wait(100)
        bs2 = BeautifulSoup(driver2.page_source, 'html.parser')
        job_info = bs2.find('div', class_='job-name')
        # Department / company label shown next to the job name.
        company = job_info.find('div', class_='company')
        company = tag_re.sub('', company.prettify())
        # Job title.
        job = job_info.find('span', class_='name')
        job = tag_re.sub('', job.prettify()).strip('\n')
        # Salary, location, experience and education live in job_request spans,
        # separated on the page by '/' characters which we strip out.
        job_req = bs2.find('dd', class_='job_request')
        all_info = [tag_re.sub('', span.prettify()) for span in job_req.find_all('span')]
        salary = all_info[0].replace('/', '').strip('\n')
        address = all_info[1].replace('/', '').strip('\n')
        exp = all_info[2].replace('/', '').strip('\n')
        edu = all_info[3].replace('/', '').strip('\n')
        # Job-detail section.
        job_det = bs2.find('dl', class_='job_detail', id='job_detail')
        # "Job temptation" (perks) paragraph.
        job_lu = job_det.find('dd', class_='job-advantage').find('p')
        job_lu = tag_re.sub('', job_lu.prettify())
        # Responsibilities / requirements text.
        job_zong = job_det.find('dd', class_='job_bt')
        job_res = job_zong.find('div')
        job_res = str(tag_re.sub('', job_res.prettify()).strip('\n').strip())
        # Work address.
        job_ad = bs2.find('dd', class_='job-address clearfix').find('div', class_='work_addr')
        job_ad = tag_re.sub('', job_ad.prettify()).strip('\n')
        job_con = bs2.find('dl', class_='job_company', id='job_company')
        # Company name comes from the logo image's alt text.
        com_name = job_con.find('dt').find('a').find('img').get('alt')
        # Company features list: industry ("领域"), funding stage ("发展阶段"),
        # size ("规模") — strip the label word from each value.
        feature_items = job_con.find('ul', class_='c_feature').find_all('li')
        all_info2 = [tag_re.sub('', li.prettify()) for li in feature_items]
        com_cat = all_info2[0].strip('\n').replace('领域', '').strip()
        com_qua = all_info2[1].strip('\n').replace('发展阶段', '').strip()
        com_peo = all_info2[-2].strip('\n').replace('规模', '').strip()

        db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
        cursor = db.cursor()
        cursor.execute('set names utf8')
        # Parameterized INSERT: the driver does the quoting/escaping. The
        # original %-formatting broke on any value containing a quote and
        # was SQL-injection-prone.
        sql = ("INSERT INTO lagou_wajue (job_name,com_name,com_addr,com_cat,com_qua,"
               "com_peo,exp1,edu,salary,com_resp) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        cursor.execute(sql, (job, com_name, address, com_cat, com_qua, com_peo,
                             exp, edu, salary, job_res))
        db.commit()
        cursor.close()
        db.close()
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C still stops the crawl.
        print('该页面无法获取')

driver.close()
driver2.close()
2、猎聘
2、1下载 链接
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
from time import sleep
# --- Liepin: write the first 5 result pages' job-detail links to a text file ---
what1 = '数据分析'                      # search keyword
what1 = urllib.parse.quote(what1)       # URL-encode for the query string
driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
for page in range(5):
    # NOTE: the pasted source contained '15°radeFlag=0' — an HTML-entity
    # mangling ('&deg' -> '°') of '15&degradeFlag=0'; restored here.
    url = ('https://www.liepin.com/zhaopin/?pubTime=&ckid=5ac323b614701474&fromSearchBtn=2'
           '&compkind=&isAnalysis=&init=-1&searchType=1&dqs=070020&industryType=&jobKind='
           '&sortFlag=15&degradeFlag=0&industries=&salary=&compscale=&key=%s'
           '&clean_condition=&headckid=5ac323b614701474&curPage=%d' % (what1, page))
    # url = 'https://www.liepin.com/bj/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&key=%s' % what1
    # driver = webdriver.PhantomJS()
    driver.get(url)
    driver.implicitly_wait(100)
    links = driver.find_elements_by_xpath("//div[@class='job-info']/h3")
    # `with` guarantees the file is closed even if an element lookup raises;
    # append mode so each page's links accumulate in the same file.
    with open('e:/myurl2.txt', 'a', encoding='utf-8') as out:
        for item in links:  # renamed: the original shadowed the outer loop variable
            final = item.find_element_by_xpath("./a")
            print(final.get_attribute('href'))
            out.write(final.get_attribute('href') + '\n')
2、2 信息抓取
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import re
from time import sleep
import threading
from threading import current_thread,Lock
import multiprocessing
import queue
class MyThread(threading.Thread):
    """Thread wrapper that runs ``funcs(*args)`` and carries a custom name."""

    def __init__(self, funcs, args, name=''):
        super().__init__()
        self.funcs = funcs
        self.args = args
        self.name = name

    def run(self):
        # Invoke the stored callable with its stored argument tuple.
        self.funcs(*self.args)
def getcontent(que, driver):
    """Scrape Liepin job-detail pages queued in *que* and insert one MySQL row each.

    que    -- queue.Queue of job-detail page URLs
    driver -- selenium WebDriver used to fetch each page
    Pages that fail to parse are reported and skipped.
    """
    tag_re = re.compile("<[^>]*>")  # strips HTML tags left in prettify() output
    # BUG FIX: the original looped `while que:` — a Queue object is always
    # truthy, so que.get() would block forever once the queue drained.
    while not que.empty():
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='title-info')
            # Company name.
            company = job_info.find('h3').find('a')
            company = tag_re.sub('', company.prettify()).strip('\n').strip()
            print(company)
            # Job title.
            job = job_info.find('h1')
            job = tag_re.sub('', job.prettify()).strip('\n')
            print(job)
            # Salary / location / experience / education.
            job_req = bs2.find('div', class_='job-title-left')
            salary = job_req.p.contents[0].strip()
            com_addr = job_req.find('p', class_='basic-infor').find('span').find('a').text
            qua = job_req.find('div', class_='job-qualifications')
            need = [span.text for span in qua.find_all('span')]
            edu = need[0]
            exps = need[1]
            print(edu)
            print(exps)
            print(com_addr)
            print(salary)
            # Job description text.
            response = bs2.find('div', class_='job-item main-message').find('div', class_='content content-word')
            job_res = tag_re.sub('', response.prettify()).strip('\n').strip()
            print(job_res)
            # Company info list: category, headcount, qualification (in page order).
            com_info = bs2.find('div', class_='company-infor').find('ul').find_all('li')
            infom = []
            for li in com_info:
                infom.append(li.text)
                print(li.text)
            com_cat = infom[0].strip('\n').strip()
            com_peo = infom[1]
            com_qua = infom[2]
            sleep(1)  # be polite to the server between pages
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            cursor = db.cursor()
            cursor.execute('set names utf8')
            # Parameterized INSERT: driver-side quoting; the original
            # %-formatting broke on values containing quotes.
            sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,"
                   "com_peo,exp1,edu,salary,com_resp) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo,
                                 exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C still stops the crawl.
            print('页面发生错误')
def main():
    """Load the saved Liepin URLs from disk, queue them, and scrape each one."""
    # `with` closes the file even if reading raises (the original leaked the
    # handle on error).
    with open('e:/myurl2.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f]
    print(len(urls))
    que = queue.Queue()
    for u in urls:
        que.put(u)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
3、前程无忧
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import re
from time import sleep
import threading
from threading import current_thread,Lock
import multiprocessing
import queue
class MyThread(threading.Thread):
    """Thread wrapper that runs ``funcs(*args)`` and carries a custom name."""

    def __init__(self, funcs, args, name=''):
        super().__init__()
        self.funcs = funcs
        self.args = args
        self.name = name

    def run(self):
        # Invoke the stored callable with its stored argument tuple.
        self.funcs(*self.args)
def getcontent(que, driver):
    """Scrape 51job (前程无忧) job-detail pages queued in *que* and insert one MySQL row each.

    que    -- queue.Queue of job-detail page URLs
    driver -- selenium WebDriver used to fetch each page
    Pages that fail to parse are reported and skipped.
    """
    tag_re = re.compile("<[^>]*>")    # strips HTML tags left in prettify() output
    share_re = re.compile('分享')      # page boilerplate embedded in the description
    report_re = re.compile('举报')
    # BUG FIX: the original looped `while que:` — a Queue object is always
    # truthy, so que.get() would block forever once the queue drained.
    while not que.empty():
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='cn')
            # Company name (from the anchor's title attribute).
            company = job_info.find('p', class_='cname').find('a').get('title')
            # Job title (from the h1's title attribute).
            job = job_info.find('h1').get('title')
            # Salary / location, then the '|'-separated qualification,
            # headcount and category fields.
            com_addr = job_info.find('span', class_='lname').text
            salary = job_info.find('strong').text
            com_all = job_info.find('p', class_='msg ltype').text.strip('\t').strip('\n').split('|')
            com_qua = com_all[0].strip('\n').strip()
            com_peo = com_all[1].strip('\n').strip()
            com_cat = com_all[2].strip('\n').strip()
            print(com_qua)
            job_main = bs2.find('div', class_='tCompany_main')
            info_all = [span.text for span in job_main.find_all('span', class_='sp4')]
            exps = info_all[0].strip()
            edu = info_all[1].strip()
            # Normalize: null out values that are clearly not experience / education.
            if '经验' not in exps:
                exps = None
            if edu not in ['初中及以下', '高中/中技/中专', '大专', '本科', '硕士', '博士']:
                edu = None
            job_res = job_main.find('div', class_='bmsg job_msg inbox')
            job_res = tag_re.sub('', job_res.prettify()).strip('\n').strip()
            # Remove the share/report widget text embedded in the description.
            job_res = share_re.sub('', job_res).strip('\n').strip()
            job_res = report_re.sub('', job_res).strip('\n').strip()
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            cursor = db.cursor()
            cursor.execute('set names utf8')
            # Parameterized INSERT: driver-side quoting; the original
            # %-formatting broke on values containing quotes.
            sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,"
                   "com_peo,exp1,edu,salary,com_resp) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo,
                                 exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C still stops the crawl.
            print('页面发生错误')
def main():
    """Load the saved 51job URLs from disk, queue them, and scrape each one."""
    # `with` closes the file even if reading raises (the original leaked the
    # handle on error).
    with open('e:/myurl10.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f]
    print(len(urls))
    que = queue.Queue()
    for u in urls:
        que.put(u)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
4 中华英才
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import re
from time import sleep
import threading
from threading import current_thread,Lock
import multiprocessing
import queue
class MyThread(threading.Thread):
    """Thread wrapper that runs ``funcs(*args)`` and carries a custom name."""

    def __init__(self, funcs, args, name=''):
        super().__init__()
        self.funcs = funcs
        self.args = args
        self.name = name

    def run(self):
        # Invoke the stored callable with its stored argument tuple.
        self.funcs(*self.args)
def getcontent(que, driver):
    """Scrape ChinaHR (中华英才) job-detail pages queued in *que* and insert one MySQL row each.

    que    -- queue.Queue of job-detail page URLs
    driver -- selenium WebDriver used to fetch each page
    Pages that fail to parse are reported and skipped.
    """
    tag_re = re.compile("<[^>]*>")  # strips HTML tags left in prettify() output
    # BUG FIX: the original looped `while que:` — a Queue object is always
    # truthy, so que.get() would block forever once the queue drained.
    while not que.empty():
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='base_info')
            # Job title.
            job = job_info.find('div').find('h1').find('span').text
            # Salary / location / education / experience spans, in page order.
            min_info = job_info.find('div', class_='job_require')
            all_in = []
            for span in min_info.find_all('span'):
                print(span.text)
                all_in.append(span.text)
            print(all_in)
            salary = all_in[0].strip()
            com_addr = all_in[1].strip()
            edu = all_in[3].strip()
            exps = all_in[4].strip()
            # Job description text.
            job_main = bs2.find('div', class_='job_intro_wrap')
            job_res = job_main.find('div', class_='job_intro_info')
            job_res = tag_re.sub('', job_res.prettify()).strip('\n').strip()
            com_intro = bs2.find('div', class_='job-company jrpadding')
            company = com_intro.find('h4').find('a').text.strip()
            print(company)
            # Company table: the second <td> of each row is the value column
            # (rows with fewer than two cells contribute nothing, matching the
            # original counter-based loop).
            com_info = com_intro.find('tbody').find_all('tr')
            com_s = []
            for row in com_info:
                cells = row.find_all('td')
                if len(cells) >= 2:
                    com_s.append(cells[1].text)
            com_cat = com_s[0].strip()
            com_qua = com_s[2].strip()
            com_peo = com_s[1].strip()
            print(job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res)
            sleep(1)  # be polite to the server between pages
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            cursor = db.cursor()
            cursor.execute('set names utf8')
            # Parameterized INSERT: driver-side quoting; the original
            # %-formatting broke on values containing quotes.
            sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,"
                   "com_peo,exp1,edu,salary,com_resp) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo,
                                 exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C still stops the crawl.
            print('页面发生错误')
def main():
    """Load the saved ChinaHR URLs from disk, queue them, and scrape each one."""
    # `with` closes the file even if reading raises (the original leaked the
    # handle on error).
    with open('e:/myurl8.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f]
    print(len(urls))
    que = queue.Queue()
    for u in urls:
        que.put(u)
    driver = webdriver.PhantomJS()
    # driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()