python 爬取各大招聘网站信息,源码,留给自己看的
1、拉勾
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
from time import sleep
import re
# --- Lagou: collect job-detail links from the search-result pages into a queue ---
what1 = '数据挖掘'   # search keyword
what2 = '全职'       # job type (full-time)
what3 = '北京'       # city
# URL-encode the Chinese terms for use in the query string.
what1 = urllib.parse.quote(what1)
what2 = urllib.parse.quote(what2)
what3 = urllib.parse.quote(what3)

driver = webdriver.PhantomJS()
# driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
url = 'https://www.lagou.com/jobs/list_%s?px=default&gx=%s&city=%s#order' % (what1, what2, what3)
url2 = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=shuju'
driver.implicitly_wait(100)
driver.get(url)

# Explicit parser avoids bs4's "no parser specified" warning and the
# non-deterministic choice of whichever parser happens to be installed.
bs = BeautifulSoup(driver.page_source, 'html.parser')
req = bs.find('ul', class_='item_con_list', style='display: block;')
urllinks = req.find_all('a', class_='position_link')

import queue
que = queue.Queue()
for link in urllinks:
    print(link.get('href'))
    que.put(link.get('href'))

# Advance to the next result page (the trailing space in the class name is
# genuinely present in lagou's markup — do not "fix" it).
link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
link_next.click()

# Collect three more pages of detail-page links.
times = 0
while True:
    times += 1
    driver.implicitly_wait(10)
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    req = bs.find('ul', class_='item_con_list', style='display: block;')
    urllinks = req.find_all('a', class_='position_link')
    for link in urllinks:
        print(link.get('href'))
        que.put(link.get('href'))
    print(times)
    if times == 3:
        break
    link_next = driver.find_element_by_xpath("//span[@class='pager_next ']")
    link_next.click()
    sleep(3)
# --- Lagou: visit each queued detail URL, extract the fields, insert into MySQL ---
driver2 = webdriver.PhantomJS()
# driver2 = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')

# Compiled once instead of re-compiling seven identical patterns per page.
tag_re = re.compile("<[^>]*>")  # strips HTML tags left in prettify() output

# BUG FIX: the original looped `while que:` — a Queue object is always truthy,
# even when empty, so que.get() would block forever after the last URL.
while not que.empty():
    try:
        newurl = que.get()
        driver2.get(newurl)
        driver2.implicitly_wait(100)
        bs2 = BeautifulSoup(driver2.page_source, 'html.parser')
        job_info = bs2.find('div', class_='job-name')
        # Department / company label shown next to the job name.
        company = job_info.find('div', class_='company')
        company = tag_re.sub('', company.prettify())
        # Job title.
        job = job_info.find('span', class_='name')
        job = tag_re.sub('', job.prettify()).strip('\n')
        # Salary, location, experience and education live in job_request spans,
        # separated on the page by '/' characters which we strip out.
        job_req = bs2.find('dd', class_='job_request')
        all_info = [tag_re.sub('', span.prettify()) for span in job_req.find_all('span')]
        salary = all_info[0].replace('/', '').strip('\n')
        address = all_info[1].replace('/', '').strip('\n')
        exp = all_info[2].replace('/', '').strip('\n')
        edu = all_info[3].replace('/', '').strip('\n')
        # Job-detail section.
        job_det = bs2.find('dl', class_='job_detail', id='job_detail')
        # "Job temptation" (perks) paragraph.
        job_lu = job_det.find('dd', class_='job-advantage').find('p')
        job_lu = tag_re.sub('', job_lu.prettify())
        # Responsibilities / requirements text.
        job_zong = job_det.find('dd', class_='job_bt')
        job_res = job_zong.find('div')
        job_res = str(tag_re.sub('', job_res.prettify()).strip('\n').strip())
        # Work address.
        job_ad = bs2.find('dd', class_='job-address clearfix').find('div', class_='work_addr')
        job_ad = tag_re.sub('', job_ad.prettify()).strip('\n')
        job_con = bs2.find('dl', class_='job_company', id='job_company')
        # Company name comes from the logo image's alt text.
        com_name = job_con.find('dt').find('a').find('img').get('alt')
        # Company features list: industry ("领域"), funding stage ("发展阶段"),
        # size ("规模") — strip the label word from each value.
        feature_items = job_con.find('ul', class_='c_feature').find_all('li')
        all_info2 = [tag_re.sub('', li.prettify()) for li in feature_items]
        com_cat = all_info2[0].strip('\n').replace('领域', '').strip()
        com_qua = all_info2[1].strip('\n').replace('发展阶段', '').strip()
        com_peo = all_info2[-2].strip('\n').replace('规模', '').strip()

        db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
        cursor = db.cursor()
        cursor.execute('set names utf8')
        # Parameterized INSERT: the driver does the quoting/escaping. The
        # original %-formatting broke on any value containing a quote and
        # was SQL-injection-prone.
        sql = ("INSERT INTO lagou_wajue (job_name,com_name,com_addr,com_cat,com_qua,"
               "com_peo,exp1,edu,salary,com_resp) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        cursor.execute(sql, (job, com_name, address, com_cat, com_qua, com_peo,
                             exp, edu, salary, job_res))
        db.commit()
        cursor.close()
        db.close()
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C still stops the crawl.
        print('该页面无法获取')

driver.close()
driver2.close()
2、猎聘
2、1下载 链接
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
from time import sleep
# --- Liepin: write the first 5 result pages' job-detail links to a text file ---
what1 = '数据分析'                      # search keyword
what1 = urllib.parse.quote(what1)       # URL-encode for the query string
driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
for page in range(5):
    # NOTE: the pasted source contained '15°radeFlag=0' — an HTML-entity
    # mangling ('&deg' -> '°') of '15&degradeFlag=0'; restored here.
    url = ('https://www.liepin.com/zhaopin/?pubTime=&ckid=5ac323b614701474&fromSearchBtn=2'
           '&compkind=&isAnalysis=&init=-1&searchType=1&dqs=070020&industryType=&jobKind='
           '&sortFlag=15&degradeFlag=0&industries=&salary=&compscale=&key=%s'
           '&clean_condition=&headckid=5ac323b614701474&curPage=%d' % (what1, page))
    # url = 'https://www.liepin.com/bj/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&key=%s' % what1
    # driver = webdriver.PhantomJS()
    driver.get(url)
    driver.implicitly_wait(100)
    links = driver.find_elements_by_xpath("//div[@class='job-info']/h3")
    # `with` guarantees the file is closed even if an element lookup raises;
    # append mode so each page's links accumulate in the same file.
    with open('e:/myurl2.txt', 'a', encoding='utf-8') as out:
        for item in links:  # renamed: the original shadowed the outer loop variable
            final = item.find_element_by_xpath("./a")
            print(final.get_attribute('href'))
            out.write(final.get_attribute('href') + '\n')
2、2 信息抓取
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import re
from time import sleep
import threading
from threading import current_thread,Lock
import multiprocessing
import queue
class MyThread(threading.Thread):
    """Thread wrapper that runs ``funcs(*args)`` and carries a custom name."""

    def __init__(self, funcs, args, name=''):
        super().__init__()
        self.funcs = funcs
        self.args = args
        self.name = name

    def run(self):
        # Invoke the stored callable with its stored argument tuple.
        self.funcs(*self.args)
def getcontent(que, driver):
    """Scrape Liepin job-detail pages queued in *que* and insert one MySQL row each.

    que    -- queue.Queue of job-detail page URLs
    driver -- selenium WebDriver used to fetch each page
    Pages that fail to parse are reported and skipped.
    """
    tag_re = re.compile("<[^>]*>")  # strips HTML tags left in prettify() output
    # BUG FIX: the original looped `while que:` — a Queue object is always
    # truthy, so que.get() would block forever once the queue drained.
    while not que.empty():
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='title-info')
            # Company name.
            company = job_info.find('h3').find('a')
            company = tag_re.sub('', company.prettify()).strip('\n').strip()
            print(company)
            # Job title.
            job = job_info.find('h1')
            job = tag_re.sub('', job.prettify()).strip('\n')
            print(job)
            # Salary / location / experience / education.
            job_req = bs2.find('div', class_='job-title-left')
            salary = job_req.p.contents[0].strip()
            com_addr = job_req.find('p', class_='basic-infor').find('span').find('a').text
            qua = job_req.find('div', class_='job-qualifications')
            need = [span.text for span in qua.find_all('span')]
            edu = need[0]
            exps = need[1]
            print(edu)
            print(exps)
            print(com_addr)
            print(salary)
            # Job description text.
            response = bs2.find('div', class_='job-item main-message').find('div', class_='content content-word')
            job_res = tag_re.sub('', response.prettify()).strip('\n').strip()
            print(job_res)
            # Company info list: category, headcount, qualification (in page order).
            com_info = bs2.find('div', class_='company-infor').find('ul').find_all('li')
            infom = []
            for li in com_info:
                infom.append(li.text)
                print(li.text)
            com_cat = infom[0].strip('\n').strip()
            com_peo = infom[1]
            com_qua = infom[2]
            sleep(1)  # be polite to the server between pages
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            cursor = db.cursor()
            cursor.execute('set names utf8')
            # Parameterized INSERT: driver-side quoting; the original
            # %-formatting broke on values containing quotes.
            sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,"
                   "com_peo,exp1,edu,salary,com_resp) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo,
                                 exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C still stops the crawl.
            print('页面发生错误')
def main():
    """Load the saved Liepin URLs from disk, queue them, and scrape each one."""
    # `with` closes the file even if reading raises (the original leaked the
    # handle on error).
    with open('e:/myurl2.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f]
    print(len(urls))
    que = queue.Queue()
    for u in urls:
        que.put(u)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
3、前程无忧
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import re
from time import sleep
import threading
from threading import current_thread,Lock
import multiprocessing
import queue
class MyThread(threading.Thread):
    """Thread wrapper that runs ``funcs(*args)`` and carries a custom name."""

    def __init__(self, funcs, args, name=''):
        super().__init__()
        self.funcs = funcs
        self.args = args
        self.name = name

    def run(self):
        # Invoke the stored callable with its stored argument tuple.
        self.funcs(*self.args)
def getcontent(que, driver):
    """Scrape 51job (前程无忧) job-detail pages queued in *que* and insert one MySQL row each.

    que    -- queue.Queue of job-detail page URLs
    driver -- selenium WebDriver used to fetch each page
    Pages that fail to parse are reported and skipped.
    """
    tag_re = re.compile("<[^>]*>")    # strips HTML tags left in prettify() output
    share_re = re.compile('分享')      # page boilerplate embedded in the description
    report_re = re.compile('举报')
    # BUG FIX: the original looped `while que:` — a Queue object is always
    # truthy, so que.get() would block forever once the queue drained.
    while not que.empty():
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='cn')
            # Company name (from the anchor's title attribute).
            company = job_info.find('p', class_='cname').find('a').get('title')
            # Job title (from the h1's title attribute).
            job = job_info.find('h1').get('title')
            # Salary / location, then the '|'-separated qualification,
            # headcount and category fields.
            com_addr = job_info.find('span', class_='lname').text
            salary = job_info.find('strong').text
            com_all = job_info.find('p', class_='msg ltype').text.strip('\t').strip('\n').split('|')
            com_qua = com_all[0].strip('\n').strip()
            com_peo = com_all[1].strip('\n').strip()
            com_cat = com_all[2].strip('\n').strip()
            print(com_qua)
            job_main = bs2.find('div', class_='tCompany_main')
            info_all = [span.text for span in job_main.find_all('span', class_='sp4')]
            exps = info_all[0].strip()
            edu = info_all[1].strip()
            # Normalize: null out values that are clearly not experience / education.
            if '经验' not in exps:
                exps = None
            if edu not in ['初中及以下', '高中/中技/中专', '大专', '本科', '硕士', '博士']:
                edu = None
            job_res = job_main.find('div', class_='bmsg job_msg inbox')
            job_res = tag_re.sub('', job_res.prettify()).strip('\n').strip()
            # Remove the share/report widget text embedded in the description.
            job_res = share_re.sub('', job_res).strip('\n').strip()
            job_res = report_re.sub('', job_res).strip('\n').strip()
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            cursor = db.cursor()
            cursor.execute('set names utf8')
            # Parameterized INSERT: driver-side quoting; the original
            # %-formatting broke on values containing quotes.
            sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,"
                   "com_peo,exp1,edu,salary,com_resp) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo,
                                 exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C still stops the crawl.
            print('页面发生错误')
def main():
    """Load the saved 51job URLs from disk, queue them, and scrape each one."""
    # `with` closes the file even if reading raises (the original leaked the
    # handle on error).
    with open('e:/myurl10.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f]
    print(len(urls))
    que = queue.Queue()
    for u in urls:
        que.put(u)
    # driver = webdriver.PhantomJS()
    driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()
4 中华英才
from bs4 import BeautifulSoup
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import pymysql
import re
from time import sleep
import threading
from threading import current_thread,Lock
import multiprocessing
import queue
class MyThread(threading.Thread):
    """Thread wrapper that runs ``funcs(*args)`` and carries a custom name."""

    def __init__(self, funcs, args, name=''):
        super().__init__()
        self.funcs = funcs
        self.args = args
        self.name = name

    def run(self):
        # Invoke the stored callable with its stored argument tuple.
        self.funcs(*self.args)
def getcontent(que, driver):
    """Scrape ChinaHR (中华英才) job-detail pages queued in *que* and insert one MySQL row each.

    que    -- queue.Queue of job-detail page URLs
    driver -- selenium WebDriver used to fetch each page
    Pages that fail to parse are reported and skipped.
    """
    tag_re = re.compile("<[^>]*>")  # strips HTML tags left in prettify() output
    # BUG FIX: the original looped `while que:` — a Queue object is always
    # truthy, so que.get() would block forever once the queue drained.
    while not que.empty():
        try:
            newurl = que.get()
            driver.get(newurl)
            driver.implicitly_wait(100)
            bs2 = BeautifulSoup(driver.page_source, 'html.parser')
            job_info = bs2.find('div', class_='base_info')
            # Job title.
            job = job_info.find('div').find('h1').find('span').text
            # Salary / location / education / experience spans, in page order.
            min_info = job_info.find('div', class_='job_require')
            all_in = []
            for span in min_info.find_all('span'):
                print(span.text)
                all_in.append(span.text)
            print(all_in)
            salary = all_in[0].strip()
            com_addr = all_in[1].strip()
            edu = all_in[3].strip()
            exps = all_in[4].strip()
            # Job description text.
            job_main = bs2.find('div', class_='job_intro_wrap')
            job_res = job_main.find('div', class_='job_intro_info')
            job_res = tag_re.sub('', job_res.prettify()).strip('\n').strip()
            com_intro = bs2.find('div', class_='job-company jrpadding')
            company = com_intro.find('h4').find('a').text.strip()
            print(company)
            # Company table: the second <td> of each row is the value column
            # (rows with fewer than two cells contribute nothing, matching the
            # original counter-based loop).
            com_info = com_intro.find('tbody').find_all('tr')
            com_s = []
            for row in com_info:
                cells = row.find_all('td')
                if len(cells) >= 2:
                    com_s.append(cells[1].text)
            com_cat = com_s[0].strip()
            com_qua = com_s[2].strip()
            com_peo = com_s[1].strip()
            print(job, company, com_addr, com_cat, com_qua, com_peo, exps, edu, salary, job_res)
            sleep(1)  # be polite to the server between pages
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            cursor = db.cursor()
            cursor.execute('set names utf8')
            # Parameterized INSERT: driver-side quoting; the original
            # %-formatting broke on values containing quotes.
            sql = ("INSERT INTO lagou (job_name,com_name,com_addr,com_cat,com_qua,"
                   "com_peo,exp1,edu,salary,com_resp) "
                   "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            cursor.execute(sql, (job, company, com_addr, com_cat, com_qua, com_peo,
                                 exps, edu, salary, job_res))
            db.commit()
            cursor.close()
            db.close()
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C still stops the crawl.
            print('页面发生错误')
def main():
    """Load the saved ChinaHR URLs from disk, queue them, and scrape each one."""
    # `with` closes the file even if reading raises (the original leaked the
    # handle on error).
    with open('e:/myurl8.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f]
    print(len(urls))
    que = queue.Queue()
    for u in urls:
        que.put(u)
    driver = webdriver.PhantomJS()
    # driver = webdriver.Chrome(executable_path=r'E:\package\Chrome64_48.0.2564.109\chromedriver.exe')
    getcontent(que, driver)


if __name__ == '__main__':
    main()