Scraping the national company ranking on a job-listings site (jobui.com): start from the first-level ranking page, then follow each entry into its second-level detail page for the details. I keep getting 503 responses; the code itself should be fine (a retry sketch is at the end of this note). Recorded purely for my own reference.
import requests
from bs4 import BeautifulSoup
from urllib import parse
import pandas as pd
import random
import time
from sqlalchemy import create_engine
import MySQLdb  # MySQL driver behind SQLAlchemy's mysql:// URL below; imported so a missing driver fails fast
class Company_craw:
    # Pick a random user-agent so consecutive requests look less uniform
    def get_ua(self):
        ua_list = [
            # ... paste a long list of user-agent strings here ...
        ]
        return random.choice(ua_list)
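    # A hedged retry helper sketch, assuming the 503s are rate limiting:
    # back off and retry with a fresh user-agent on each attempt. The plain
    # requests.get calls below could then be swapped for self.safe_get(url).
    def safe_get(self, url, retries=3):
        for attempt in range(retries):
            resp = requests.get(url, headers={'user-agent': self.get_ua()}, timeout=10)
            if resp.status_code != 503:
                return resp
            time.sleep(10 * (2 ** attempt))  # back off 10s, 20s, 40s between attempts
        return resp  # still 503 after all retries; the caller can check status_code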
    # Export the companies to crawl (first-level page: the national ranking)
    def com_to_excel(self):
        company_Name_list = []
        url1 = 'https://www.jobui.com/rank/company/view/quanguo/'
        resp = requests.get(url1, headers={'user-agent': self.get_ua()})
        context = resp.text
        # print(context)
        soup = BeautifulSoup(context, 'html.parser')
        time.sleep(15)  # throttle before whatever request comes next
        company_PHB = soup.find_all('div', class_='c-company-list')
        for item in company_PHB:
            try:
                company_Name = item.find('div', class_='company-segmetation').find('h3').text
            except AttributeError:
                company_Name = '该公司不想被你看到'  # fallback: "this company hides its name"
            # append the fallback too, so the export keeps one row per card
            company_Name_list.append(company_Name)
        company_Name_dict = {'company_Name': company_Name_list}
        print(company_Name_dict)
        df2 = pd.DataFrame(company_Name_dict)
        df2.to_excel('企业信息.xlsx', sheet_name='公司名称')
    # Read the company names back from the Excel file
    def get_company(self):
        file_path = '企业信息.xlsx'
        # usecols=[1]: column 0 is the index that to_excel wrote
        df = pd.read_excel(io=file_path, sheet_name='公司名称', usecols=[1])
        return list(df['company_Name'])
    # Build the search URL for each company (area fixed to 全国, already percent-encoded in the URL)
    def city_url(self):
        companies = self.get_company()
        url_list = []
        for item in companies:
            keyword = parse.quote(item)  # e.g. parse.quote('腾讯') -> '%E8%85%BE%E8%AE%AF'
            url = f'https://www.jobui.com/cmp?area=%E5%85%A8%E5%9B%BD&keyword={keyword}'
            url_list.append(url)
        return url_list
    # Crawl the data: first the search-result card, then the company detail page
    def company_info_craw(self):
        url_list = self.city_url()
        company_name_list = []
        company_viewNum_list = []
        company_fol_list = []
        company_info_list = []
        company_nature_list = []
        company_scale_list = []
        company_create_time_list = []
        company_industry_list = []
        company_info2_list = []
        for item in url_list:
            resp = requests.get(item, headers={'user-agent': self.get_ua()})
            context = resp.text
            soup = BeautifulSoup(context, 'html.parser')
            time.sleep(15)  # throttle between companies
            # NOTE: on a 503 page this is None and the next line raises AttributeError
            company_find = soup.find('div', class_='company-content')
            # company name
            company_name = company_find.find('div', class_='company-segmetation').find('h3').text
            # page views / follower count, shown as e.g. "1234/56"
            company_view = company_find.find('span', class_='company-desc').text.replace('\n', '').strip().replace('\t', '').split('/')
            if len(company_view) == 2:
                company_viewNum = company_view[0]  # page views
                company_fol = company_view[1]      # followers
            else:
                company_viewNum = company_view[0]
                company_fol = 0
            try:
                # short profile on the search card
                company_info = company_find.find('div', class_='company-short-content company-segmetation').text
            except AttributeError:
                company_info = '公司无简介'  # "no profile"
            # link to the company detail page
            company_url = company_find.find('div', class_='company-segmetation').find('a').get('href')
            print(company_name)
            print(company_viewNum)
            print(company_fol)
            print(company_info)
            print(company_url)
            # second-level page: the company detail page
            company_url = 'https://www.jobui.com' + company_url
            resp = requests.get(company_url, headers={'user-agent': self.get_ua()})
            context = resp.text
            soup = BeautifulSoup(context, 'html.parser')
            time.sleep(10)
            try:
                # detail block
                company_det_info = soup.find('div', class_='cfix fs16')
                # ownership type, taken from the tag's title attribute
                company_nature = company_det_info.find('div', class_='company-nature').get('title').split(':')[1]
                # company size, same pattern
                company_scale = company_det_info.find('div', class_='company-worker').get('title').split(':')[1]
                # founding date
                company_create_time = company_det_info.find('span', class_='fs18 fwb').text
                # industry
                company_industry = company_det_info.find('span', class_='comInd').text
                # full profile
                company_info2 = company_det_info.find('p', class_='mb10 cmp-txtshow').text
            except (AttributeError, IndexError):
                company_nature = '无'
                company_scale = '无'
                company_create_time = '无'
                company_industry = '无'
                company_info2 = '无'
            print(company_nature)
            print(company_scale)
            print(company_create_time)
            print(company_industry)
            print(company_info2)
            company_name_list.append(company_name)
            company_viewNum_list.append(company_viewNum)
            company_fol_list.append(company_fol)
            company_info_list.append(company_info)
            company_nature_list.append(company_nature)
            company_scale_list.append(company_scale)
            company_create_time_list.append(company_create_time)
            company_industry_list.append(company_industry)
            company_info2_list.append(company_info2)
        company_list = [company_name_list,
                        company_viewNum_list,
                        company_fol_list,
                        company_info_list,
                        company_nature_list,
                        company_scale_list,
                        company_create_time_list,
                        company_industry_list,
                        company_info2_list]
        return company_list
    # Persist the data to Excel
    def data_to_excel(self):
        company_list = self.company_info_craw()
        company_dict = {'company_name': company_list[0],
                        'company_viewNum': company_list[1],
                        'company_fol': company_list[2],
                        'company_info': company_list[3],
                        'company_nature': company_list[4],
                        'company_scale': company_list[5],
                        'company_create_time': company_list[6],
                        'company_industry': company_list[7],
                        'company_info2': company_list[8]
                        }
        df = pd.DataFrame(company_dict)
        df.to_excel('company.xlsx', sheet_name='公司信息')
        return df
class Mysql:
    def dataToMysql(self, df):
        # URL shape: mysql://<user>:<password>@<host>/<database>?charset=utf8
        engine = create_engine('mysql://root:<password-here>@<host-here>/<database-here>?charset=utf8')
        df.to_sql('公司', con=engine, if_exists='append', index=False)
if __name__ == '__main__':
    company_craw = Company_craw()
    # run company_craw.com_to_excel() first if 企业信息.xlsx does not exist yet
    df1 = company_craw.data_to_excel()  # crawl once and keep the DataFrame
    mysql = Mysql()
    mysql.dataToMysql(df1)
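Something else I could try for the 503s, instead of hand-rolled sleeps: let requests/urllib3 do the retrying with exponential backoff on a shared Session. A minimal sketch, assuming the site only ever throttles with 503 (the Retry numbers are untuned guesses):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry up to 3 times on a 503 response, backing off exponentially in between
retry = Retry(total=3, backoff_factor=5, status_forcelist=[503])
session.mount('https://', HTTPAdapter(max_retries=retry))
resp = session.get('https://www.jobui.com/rank/company/view/quanguo/',
                   headers={'user-agent': 'Mozilla/5.0'}, timeout=10)
print(resp.status_code)

A Session also keeps the TCP connection alive across requests, which is a bit friendlier to the server than opening a new connection for every call.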