"""使用requests ThreadPoolExecutor wordcloud 获取电脑壁纸的图片与斗米兼职信息,图片信息获取完后会自动生成词云,兼职信息存放在数据库,python编译器版本为3.9,mysql为8.0。各位要是觉得好用,就请点赞加收藏吧。"""
import requests
import os
import re
from lxml import etree
import pymysql
from concurrent.futures import ThreadPoolExecutor
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Shared HTTP request headers for every requests.get() call in this file.
# The user-agent is intentionally left blank for the reader to fill in.
head={
"user-agent":""
}
# Wallpaper category names; jisuan() counts downloaded images in one
# folder per entry under D:\Python项目\.
name =['人文风土', '体育运动', '军事科技', '动物萌宠', '卡通动漫', '城市建筑', '太空科幻',
'家居陈设', '影视剧照', '情感文艺', '明星大咖', '汽车船舶', '游戏原画', '禅意古风',
'精选壁纸', '绘画创意', '网红萝莉', '美食甜品', '肌理纹理', '自然风景', '花卉植物',
'静物特写', '飞机航天', "汽车"]
# NOTE(review): appears unused in this file view — verify before removing.
wenjian=[]
# City URL slugs parsed from the Doumi city-select page (dmalog attribute).
mysqltable =[]
# City display names (Chinese text of each <a>) from the same page;
# kept index-aligned with mysqltable by the __main__ scraping loop.
mysqltablezw =[]
def choose(imfort):
    """Ensure the download folder for one wallpaper category exists.

    Args:
        imfort: Sanitized category name (letters/digits/CJK only) used as
            the folder name under the fixed download root.
    """
    path = f"D:\\Python项目\\{imfort}"
    # exist_ok=True removes the exists()/makedirs() check-then-act race
    # and the duplicated empty-print branches of the original.
    os.makedirs(path, exist_ok=True)
    print()
def hjsavchj(page1):
    """Download up to 30 pages of wallpapers for one toopic.cn category.

    Args:
        page1: Numeric category id used in the toopic.cn query string.

    Side effects: creates the category folder (via choose) and writes each
    image file into it; prints progress / failure messages.
    """
    try:
        for i in range(30):
            page = i + 1
            url = f"https://www.toopic.cn/dnbz/?q=--{page1}--.html&page={page}"
            resp = requests.get(url, headers=head)
            resp.encoding = "utf-8"
            et = etree.HTML(resp.text)
            result = et.xpath("//ul[@class='clearfix pic-list gallery']/li/div/a/img/@data-original")
            result1 = et.xpath("//ul[@class='clearfix pic-list gallery']/li[1]/div/div/p[1]/a[1]/text()")
            imfort = ""
            for itm in result1:
                # Keep letters, digits, whitespace and CJK characters only,
                # so the category text is safe to use as a folder name.
                imfort = re.sub(r'[^\w\s\u4e00-\u9fa5]+', '', itm)
                choose(imfort)
            if not imfort:
                # No category title found on this page — nothing to save into.
                continue
            header = "https://www.toopic.cn"
            for item in result:
                urlx = header + item
                print(urlx)
                # Last URL path segment is the image file name.
                fname = urlx.split("/")[-1]
                resp_img = requests.get(urlx, headers=head)
                # BUG FIX: the original wrote to the raw, unsanitized
                # f"{result1[0]}/{fname}" relative path, but the folder that
                # choose() created is the sanitized absolute one — so every
                # write failed into the bare except. Save into that folder.
                with open(f"D:\\Python项目\\{imfort}\\{fname}", mode="wb") as f:
                    f.write(resp_img.content)  # .content is the raw bytes
    except Exception as e:
        # Original used a bare except that hid the real error entirely.
        print("请求失败", e)
    else:
        print("加载成功")
def texttiqu(table, table2):
    """Scrape Doumi part-time job listings for one city into MySQL.

    Args:
        table: MySQL table name (the city's Chinese display name).
        table2: Doumi URL slug for the same city.
    """
    # pymysql.connect raises on failure, so the original `if conn:` /
    # "连接失败" branch was dead code; reaching the print means success.
    conn = pymysql.connect(host='localhost', user='root', password='123456789', db='domi', port=3306)
    print("成功连接到数据库")
    try:
        cursor = conn.cursor()
        # NOTE(review): `table` is interpolated into the DDL because SQL
        # identifiers cannot be parameterized; it comes from scraped page
        # text, so it should be validated upstream before reaching here.
        cursor.execute(f'''
        CREATE TABLE IF NOT EXISTS `{table}` (
          `shuZi` int(11) NOT NULL AUTO_INCREMENT,
          `RecruitmentInformation` varchar(150) NOT NULL,
          `WorkType` varchar(100) NOT NULL,
          `WorkLocation` varchar(100) NOT NULL,
          `WorkNumber` varchar(100) NOT NULL,
          `Money` char(100) DEFAULT NULL,
          `Settlement` char(100) DEFAULT NULL,
          PRIMARY KEY (`shuZi`)
        ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
        ''')
        for cx in range(1, 5):
            if cx == 1:
                # Probe whether paginated URLs (/o1) exist for this city;
                # fall back to the unpaginated listing and stop if not.
                if pandaunn(1, table2):
                    TextExtraction(True, cx, cursor, conn, table, table2)
                else:
                    TextExtraction(False, cx, cursor, conn, table, table2)
                    break
            else:
                TextExtraction(True, cx, cursor, conn, table, table2)
    finally:
        # BUG FIX: the original leaked the connection whenever scraping or
        # an SQL statement raised; close it unconditionally.
        conn.close()
def TextExtraction(booler ,page3, cursor, conn, table, table2):
    """Fetch one Doumi listing page and insert its rows into `table`.

    Args:
        booler: True to request the paginated URL (/o{page3}); False for
            the city's unpaginated base listing.
        page3: Page number used when booler is True.
        cursor/conn: Open pymysql cursor and connection.
        table: Destination table name (identifier, interpolated into SQL).
        table2: Doumi city slug for the URL.
    """
    url = f"https://www.doumi.com/{table2}/o{page3}" if booler else f"https://www.doumi.com/{table2}"
    resp = requests.get(url, headers=head)
    resp.encoding = "utf-8"
    et = etree.HTML(resp.text)
    result = et.xpath("//div[@class='jzList-txt-t']/h3/a/text()")
    Type = et.xpath("//ul[@class='jzList-field clearfix']/li[2]/text()")
    Location = et.xpath("//ul[@class='jzList-field clearfix']/li[3]/text()")
    Number = et.xpath("//ul[@class='jzList-field clearfix']/li[4]/text()")
    Money = et.xpath("//div[@class='jzList-salary']/span/em/text()")
    Settlement = et.xpath("//div[@class='jzList-salary']/span[@class='money']/text()")
    # Hoist the loop-invariant statement text out of the row loop.
    # NOTE(review): the table name cannot be a %s placeholder; the six
    # values are properly parameterized below.
    insert_sql = (
        f'INSERT INTO `{table}` (RecruitmentInformation, WorkType, WorkLocation, '
        f'WorkNumber, Money, Settlement) VALUES (%s, %s, %s, %s, %s, %s)'
    )
    for row in zip(result, Type, Location, Number, Money, Settlement):
        values = tuple(str(field).strip() for field in row)
        try:
            cursor.execute(insert_sql, values)
            conn.commit()
        except Exception as e:
            print("插入数据时发生异常:", e)
            conn.rollback()  # roll back so later rows start from a clean state
    print(url)
def pandaunn(page, table1):
    """Return True if paginated page `page` exists for city slug `table1`.

    Doumi renders a <p> containing "request fail" for nonexistent pages.
    """
    url = f"https://www.doumi.com/{table1}/o{page}"
    resp = requests.get(url, headers=head)
    resp.encoding = "utf-8"
    et = etree.HTML(resp.text)
    texts = et.xpath('//p/text()')
    # BUG FIX: the original indexed [0] unconditionally and raised
    # IndexError whenever the page contained no <p> text at all.
    if texts and texts[0] == "request fail":
        return False
    return True
def jisuan():
    """Count downloaded images per category and render a word cloud.

    Builds {category: image_count} from the per-category download folders
    and hands it to ciyun().
    """
    image_extensions = {'.jpg', '.png', '.jpeg'}  # set: O(1) membership
    dic = {}
    for category in name:
        folder_path = f'D:\\Python项目\\{category}'
        # BUG FIX: os.listdir raised FileNotFoundError for any category
        # whose folder was never created (e.g. no images downloaded yet);
        # skip such categories instead of crashing.
        if not os.path.isdir(folder_path):
            continue
        files = os.listdir(folder_path)
        dic[category] = sum(
            1 for file in files
            if os.path.splitext(file)[1].lower() in image_extensions
        )
    print(dic)
    ciyun(dic)
def ciyun(dic, font_path="D:\\崩坏铁道\\FangZhengHeiTiJianTi\\FangZhengHeiTiJianTi\\FangZhengHeiTiJianTi-1.ttf"):
    """Render and display a word cloud from {word: frequency} data.

    Args:
        dic: Mapping of word -> frequency (category -> image count here).
        font_path: TTF font able to render CJK glyphs. Generalized into a
            parameter; the default preserves the original hard-coded path.
    """
    wordcloud = WordCloud(width=800, height=400, background_color="white", font_path=font_path)
    # Build the cloud from explicit frequencies rather than raw text.
    wordcloud.generate_from_frequencies(dic)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
if __name__=='__main__':
    # Simple text menu: 1 = scrape job listings into MySQL, 2 = download
    # wallpapers (with a word cloud of per-category image counts after).
    print("欢迎使用网页爬虫0.1版本")
    print("请选择你要使用的功能")
    print("┌──────────────────┯────────────────────┐")
    print(" 1:爬取网页文本信息 2:爬取网页图片 ")
    print("└──────────────────┷────────────────────┘")
    choosex = input("你的选择是:")
    if choosex == '1':
        print("你选自动爬取一个已经固定好的网页文本信息还是自己编写")
        XuZhe = input("自己编写输入:1,自动爬取固定网站输入:2\n")
        if XuZhe == '1':
            print("程序已经退出,请你重新修改程序")
        if XuZhe == '2':
            url = "https://www.doumi.com/cityselect/"
            resp = requests.get(url, headers=head)
            resp.encoding = "utf-8"
            et = etree.HTML(resp.text)
            # Evaluate the xpath once (the original ran it twice) and
            # collect display name and URL slug per <a> in a single pass,
            # keeping the two lists index-aligned.
            dd_elements = et.xpath('//div[@class="all-city"]/dl/dd')
            for dd_element in dd_elements:
                for a_tag in dd_element.xpath('.//a'):
                    texts = a_tag.xpath('./text()')
                    if texts:
                        mysqltablezw.append(texts[0])
                    # dmalog attribute ends with "=<slug>"; None when absent.
                    dmalog = a_tag.get('dmalog')
                    mysqltable.append(dmalog.split('=')[-1] if dmalog else None)
            # Drop the trailing non-city entry from each list.
            mysqltablezw.pop()
            mysqltable.pop()
            print(mysqltablezw)
            # BUG FIX: the original hard-coded range(360) and raised
            # IndexError when fewer cities were scraped; zip stops at the
            # shorter list and needs no magic constant.
            for city_zw, city_slug in zip(mysqltablezw, mysqltable):
                texttiqu(city_zw, city_slug)
    if choosex == '2':
        print("你选自动爬取一个已经固定好的网页图片信息还是自己编写")
        XuZhe=input("自己编写输入:1,自动爬取固定网站输入:2\n")
        if XuZhe == '1':
            jisuan()
            print("程序已经退出,请你重新修改程序")
        if XuZhe == '2':
            # Fan the 27 category downloads out over a thread pool
            # (I/O-bound work, so threads overlap the network waits).
            with ThreadPoolExecutor(22) as x:
                for cx in range(27):
                    page1 = 92 - cx  # category ids run 92 down to 66
                    x.submit(hjsavchj, page1)
            jisuan()