A simple 福利吧 (fuliba2020.net) image crawler in Python

The script below probes the site's numbered post pages, records the ones that actually exist, scrapes the image URLs out of each post, and finally downloads the images.

```python
import os
import random
import re
import time

import requests
from lxml import etree

start_url = 'https://fuliba2020.net/2020'


def get_headers(url, use='pc'):
    """Build request headers with a randomly chosen User-Agent."""
    pc_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0"
    ]
    phone_agent = [
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        # "NOKIA5700/ UCWEB7.0.2.37/28/999",
        "Openwave/ UCWEB7.0.2.37/28/999",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    ]
    # User-Agent lists adapted from: https://blog.csdn.net/IT__LS/java/article/details/78880903

    # Extract "scheme://domain/" from the target URL to use as the Referer.
    # Regex adapted from: https://www.cnblogs.com/blacksonny/p/6055357.html
    referer = lambda url: re.search(
        r"^((http://)|(https://))?([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}(/)", url).group()

    if use == 'phone':  # pick a User-Agent at random for the requested device type
        agent = random.choice(phone_agent)
    else:
        agent = random.choice(pc_agent)

    headers = {
        'User-Agent': agent,
        'Referer': referer(url),
        'DNT': "1",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-CN;q=0.8,en;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
    }
    return headers
```
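As a quick sanity check, `get_headers` can be called on its own; the `Referer` is simply the scheme-plus-domain prefix that the regex matches. A minimal sketch, continuing from the function above (the URL is illustrative):

```python
headers = get_headers('https://fuliba2020.net/2020001.html/2')
print(headers['Referer'])     # -> 'https://fuliba2020.net/'
print(headers['User-Agent'])  # one of the desktop agents, chosen at random
```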

Collect the valid post URLs and save them to a txt file:

```python
def create_url(origin_url):
    """Probe candidate post URLs and write the ones that exist to url.txt."""
    url_list = []
    # Posts are numbered 001-148; each post's images sit on sub-pages 2 and 3.
    for i in range(1, 149):
        order = str(i).zfill(3)  # zero-pad the post number: 1 -> '001'
        for j in range(2, 4):
            url_list.append(origin_url + order + '.html' + '/' + str(j))
    for url in url_list:
        response = requests.get(url, headers=get_headers(url))
        # A non-empty response.history means the request was redirected,
        # i.e. the page does not exist; keep only direct hits.
        if not response.history:
            print('Writing ' + url)
            with open('./url.txt', 'a') as f:
                f.write(url + '\n')
```
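The candidate URLs are just the start URL plus a zero-padded post number and a page index, so the probe list begins like this (a REPL sketch of the first few entries):

```python
>>> [start_url + str(i).zfill(3) + '.html/' + str(j) for i in (1, 2) for j in (2, 3)]
['https://fuliba2020.net/2020001.html/2',
 'https://fuliba2020.net/2020001.html/3',
 'https://fuliba2020.net/2020002.html/2',
 'https://fuliba2020.net/2020002.html/3']
```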

Read url.txt back in, strip the newlines, and return the URLs as a list:

```python
def read_txt():
    read_list = []
    with open('./url.txt', 'r') as f:
        for line in f.readlines():
            read_list.append(line.strip())
    return read_list
```
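Since this function only reloads what `create_url` wrote, a more compact equivalent (same behavior, sketch only) is:

```python
def read_txt():
    with open('./url.txt', 'r') as f:
        return [line.strip() for line in f]
```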

Extract the image URLs from each post page and append them to image.txt:

```python
def get_imgurl(read_list):
    new_list = []
    for url in read_list:
        # headers must be passed as a keyword argument: the second positional
        # argument of requests.get() is `params`, not `headers`.
        response = requests.get(url, headers=get_headers(url))
        html = etree.HTML(response.text)
        result = html.xpath('/html/body/section/div[1]/div/article/p/img/@src')
        for img_url in result:
            if img_url not in new_list:  # de-duplicate before writing
                new_list.append(img_url)
                with open('./image.txt', 'a') as f:
                    print('Writing ' + img_url)
                    f.write(img_url + '\n')
```
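The absolute XPath above is tied to the site's exact template and silently returns nothing if the layout shifts. A looser relative query is a common fallback; note this selector is an assumption about the markup, not the article's original:

```python
# Hypothetical, layout-tolerant alternative: any <img> inside the post's <article>
result = html.xpath('//article//img/@src')
```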

Finally, download every image listed in image.txt:

```python
def download_img():
    count = 1
    read_list = []
    with open('./image.txt', 'r') as f:
        for line in f.readlines():
            read_list.append(line.strip())
    for img_url in read_list:
        try:
            response = requests.get(img_url, headers=get_headers(img_url))
            if response.status_code == 200:
                # Strip the fixed-length URL prefix to get a relative file name
                path = './images/' + img_url[30:]
                os.makedirs(os.path.dirname(path), exist_ok=True)  # ensure target folder exists
                with open(path, 'wb') as f:
                    f.write(response.content)
                print('Image ' + img_url + ' downloaded ---- progress [{}/{}]'.format(count, len(read_list)))
                count += 1
        except Exception as e:
            print(e)
        time.sleep(1)  # be polite: pause between downloads


if __name__ == '__main__':
    download_img()
```
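As published, the entry point only runs the download step, so url.txt and image.txt must already exist. To run the crawler from scratch, the steps have to be called in order; a sketch of the full pipeline:

```python
if __name__ == '__main__':
    create_url(start_url)   # step 1: probe post pages, write url.txt
    get_imgurl(read_txt())  # step 2: scrape image URLs, write image.txt
    download_img()          # step 3: download everything in image.txt
```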

