"""使用requests ThreadPoolExecutor wordcloud 获取电脑壁纸的图片与斗米兼职信息,图片信息获取完后会自动生成词云,兼职信息存放在数据库,python编译器版本为3.9,mysql为8.0。各位要是觉得好用,就请点赞加收藏吧。"""
import requests
import os
import re
from lxml import etree
import pymysql
from concurrent.futures import ThreadPoolExecutor
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Shared HTTP request headers for every requests.get() call in this file.
# The user-agent is intentionally left blank for the reader to fill in.
head={
"user-agent":""
}
# Wallpaper category names; jisuan() counts downloaded images in one
# folder per entry under D:\Python项目\.
name =['人文风土', '体育运动', '军事科技', '动物萌宠', '卡通动漫', '城市建筑', '太空科幻',
'家居陈设', '影视剧照', '情感文艺', '明星大咖', '汽车船舶', '游戏原画', '禅意古风',
'精选壁纸', '绘画创意', '网红萝莉', '美食甜品', '肌理纹理', '自然风景', '花卉植物',
'静物特写', '飞机航天', "汽车"]
# NOTE(review): appears unused in this file view — verify before removing.
wenjian=[]
# City URL slugs parsed from the Doumi city-select page (dmalog attribute).
mysqltable =[]
# City display names (Chinese text of each <a>) from the same page;
# kept index-aligned with mysqltable by the __main__ scraping loop.
mysqltablezw =[]
def choose(imfort):
    """Ensure the download folder for one wallpaper category exists.

    Args:
        imfort: Sanitized category name (letters/digits/CJK only) used as
            the folder name under the fixed download root.
    """
    path = f"D:\\Python项目\\{imfort}"
    # exist_ok=True removes the exists()/makedirs() check-then-act race
    # and the duplicated empty-print branches of the original.
    os.makedirs(path, exist_ok=True)
    print()
def hjsavchj(page1):
    """Download up to 30 pages of wallpapers for one toopic.cn category.

    Args:
        page1: Numeric category id used in the toopic.cn query string.

    Side effects: creates the category folder (via choose) and writes each
    image file into it; prints progress / failure messages.
    """
    try:
        for i in range(30):
            page = i + 1
            url = f"https://www.toopic.cn/dnbz/?q=--{page1}--.html&page={page}"
            resp = requests.get(url, headers=head)
            resp.encoding = "utf-8"
            et = etree.HTML(resp.text)
            result = et.xpath("//ul[@class='clearfix pic-list gallery']/li/div/a/img/@data-original")
            result1 = et.xpath("//ul[@class='clearfix pic-list gallery']/li[1]/div/div/p[1]/a[1]/text()")
            imfort = ""
            for itm in result1:
                # Keep letters, digits, whitespace and CJK characters only,
                # so the category text is safe to use as a folder name.
                imfort = re.sub(r'[^\w\s\u4e00-\u9fa5]+', '', itm)
                choose(imfort)
            if not imfort:
                # No category title found on this page — nothing to save into.
                continue
            header = "https://www.toopic.cn"
            for item in result:
                urlx = header + item
                print(urlx)
                # Last URL path segment is the image file name.
                fname = urlx.split("/")[-1]
                resp_img = requests.get(urlx, headers=head)
                # BUG FIX: the original wrote to the raw, unsanitized
                # f"{result1[0]}/{fname}" relative path, but the folder that
                # choose() created is the sanitized absolute one — so every
                # write failed into the bare except. Save into that folder.
                with open(f"D:\\Python项目\\{imfort}\\{fname}", mode="wb") as f:
                    f.write(resp_img.content)  # .content is the raw bytes
    except Exception as e:
        # Original used a bare except that hid the real error entirely.
        print("请求失败", e)
    else:
        print("加载成功")
def texttiqu(table, table2):
    """Scrape Doumi part-time job listings for one city into MySQL.

    Args:
        table: MySQL table name (the city's Chinese display name).
        table2: Doumi URL slug for the same city.
    """
    # pymysql.connect raises on failure, so the original `if conn:` /
    # "连接失败" branch was dead code; reaching the print means success.
    conn = pymysql.connect(host='localhost', user='root', password='123456789', db='domi', port=3306)
    print("成功连接到数据库")
    try:
        cursor = conn.cursor()
        # NOTE(review): `table` is interpolated into the DDL because SQL
        # identifiers cannot be parameterized; it comes from scraped page
        # text, so it should be validated upstream before reaching here.
        cursor.execute(f'''
        CREATE TABLE IF NOT EXISTS `{table}` (
          `shuZi` int(11) NOT NULL AUTO_INCREMENT,
          `RecruitmentInformation` varchar(150) NOT NULL,
          `WorkType` varchar(100) NOT NULL,
          `WorkLocation` varchar(100) NOT NULL,
          `WorkNumber` varchar(100) NOT NULL,
          `Money` char(100) DEFAULT NULL,
          `Settlement` char(100) DEFAULT NULL,
          PRIMARY KEY (`shuZi`)
        ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
        ''')
        for cx in range(1, 5):
            if cx == 1:
                # Probe whether paginated URLs (/o1) exist for this city;
                # fall back to the unpaginated listing and stop if not.
                if pandaunn(1, table2):
                    TextExtraction(True, cx, cursor, conn, table, table2)
                else:
                    TextExtraction(False, cx, cursor, conn, table, table2)
                    break
            else:
                TextExtraction(True, cx, cursor, conn, table, table2)
    finally:
        # BUG FIX: the original leaked the connection whenever scraping or
        # an SQL statement raised; close it unconditionally.
        conn.close()
def TextExtraction(booler ,page3, cursor, conn, table, table2):
    """Fetch one Doumi listing page and insert its rows into `table`.

    Args:
        booler: True to request the paginated URL (/o{page3}); False for
            the city's unpaginated base listing.
        page3: Page number used when booler is True.
        cursor/conn: Open pymysql cursor and connection.
        table: Destination table name (identifier, interpolated into SQL).
        table2: Doumi city slug for the URL.
    """
    url = f"https://www.doumi.com/{table2}/o{page3}" if booler else f"https://www.doumi.com/{table2}"
    resp = requests.get(url, headers=head)
    resp.encoding = "utf-8"
    et = etree.HTML(resp.text)
    result = et.xpath("//div[@class='jzList-txt-t']/h3/a/text()")
    Type = et.xpath("//ul[@class='jzList-field clearfix']/li[2]/text()")
    Location = et.xpath("//ul[@class='jzList-field clearfix']/li[3]/text()")
    Number = et.xpath("//ul[@class='jzList-field clearfix']/li[4]/text()")
    Money = et.xpath("//div[@class='jzList-salary']/span/em/text()")
    Settlement = et.xpath("//div[@class='jzList-salary']/span[@class='money']/text()")
    # Hoist the loop-invariant statement text out of the row loop.
    # NOTE(review): the table name cannot be a %s placeholder; the six
    # values are properly parameterized below.
    insert_sql = (
        f'INSERT INTO `{table}` (RecruitmentInformation, WorkType, WorkLocation, '
        f'WorkNumber, Money, Settlement) VALUES (%s, %s, %s, %s, %s, %s)'
    )
    for row in zip(result, Type, Location, Number, Money, Settlement):
        values = tuple(str(field).strip() for field in row)
        try:
            cursor.execute(insert_sql, values)
            conn.commit()
        except Exception as e:
            print("插入数据时发生异常:", e)
            conn.rollback()  # roll back so later rows start from a clean state
    print(url)
def pandaunn(page, table1):
    """Return True if paginated page `page` exists for city slug `table1`.

    Doumi renders a <p> containing "request fail" for nonexistent pages.
    """
    url = f"https://www.doumi.com/{table1}/o{page}"
    resp = requests.get(url, headers=head)
    resp.encoding = "utf-8"
    et = etree.HTML(resp.text)
    texts = et.xpath('//p/text()')
    # BUG FIX: the original indexed [0] unconditionally and raised
    # IndexError whenever the page contained no <p> text at all.
    if texts and texts[0] == "request fail":
        return False
    return True
def jisuan():
    """Count downloaded images per category and render a word cloud.

    Builds {category: image_count} from the per-category download folders
    and hands it to ciyun().
    """
    image_extensions = {'.jpg', '.png', '.jpeg'}  # set: O(1) membership
    dic = {}
    for category in name:
        folder_path = f'D:\\Python项目\\{category}'
        # BUG FIX: os.listdir raised FileNotFoundError for any category
        # whose folder was never created (e.g. no images downloaded yet);
        # skip such categories instead of crashing.
        if not os.path.isdir(folder_path):
            continue
        files = os.listdir(folder_path)
        dic[category] = sum(
            1 for file in files
            if os.path.splitext(file)[1].lower() in image_extensions
        )
    print(dic)
    ciyun(dic)
def ciyun(dic, font_path="D:\\崩坏铁道\\FangZhengHeiTiJianTi\\FangZhengHeiTiJianTi\\FangZhengHeiTiJianTi-1.ttf"):
    """Render and display a word cloud from {word: frequency} data.

    Args:
        dic: Mapping of word -> frequency (category -> image count here).
        font_path: TTF font able to render CJK glyphs. Generalized into a
            parameter; the default preserves the original hard-coded path.
    """
    wordcloud = WordCloud(width=800, height=400, background_color="white", font_path=font_path)
    # Build the cloud from explicit frequencies rather than raw text.
    wordcloud.generate_from_frequencies(dic)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
if __name__=='__main__':
    # Simple text menu: 1 = scrape job listings into MySQL, 2 = download
    # wallpapers (with a word cloud of per-category image counts after).
    print("欢迎使用网页爬虫0.1版本")
    print("请选择你要使用的功能")
    print("┌──────────────────┯────────────────────┐")
    print(" 1:爬取网页文本信息 2:爬取网页图片 ")
    print("└──────────────────┷────────────────────┘")
    choosex = input("你的选择是:")
    if choosex == '1':
        print("你选自动爬取一个已经固定好的网页文本信息还是自己编写")
        XuZhe = input("自己编写输入:1,自动爬取固定网站输入:2\n")
        if XuZhe == '1':
            print("程序已经退出,请你重新修改程序")
        if XuZhe == '2':
            url = "https://www.doumi.com/cityselect/"
            resp = requests.get(url, headers=head)
            resp.encoding = "utf-8"
            et = etree.HTML(resp.text)
            # Evaluate the xpath once (the original ran it twice) and
            # collect display name and URL slug per <a> in a single pass,
            # keeping the two lists index-aligned.
            dd_elements = et.xpath('//div[@class="all-city"]/dl/dd')
            for dd_element in dd_elements:
                for a_tag in dd_element.xpath('.//a'):
                    texts = a_tag.xpath('./text()')
                    if texts:
                        mysqltablezw.append(texts[0])
                    # dmalog attribute ends with "=<slug>"; None when absent.
                    dmalog = a_tag.get('dmalog')
                    mysqltable.append(dmalog.split('=')[-1] if dmalog else None)
            # Drop the trailing non-city entry from each list.
            mysqltablezw.pop()
            mysqltable.pop()
            print(mysqltablezw)
            # BUG FIX: the original hard-coded range(360) and raised
            # IndexError when fewer cities were scraped; zip stops at the
            # shorter list and needs no magic constant.
            for city_zw, city_slug in zip(mysqltablezw, mysqltable):
                texttiqu(city_zw, city_slug)
    if choosex == '2':
        print("你选自动爬取一个已经固定好的网页图片信息还是自己编写")
        XuZhe=input("自己编写输入:1,自动爬取固定网站输入:2\n")
        if XuZhe == '1':
            jisuan()
            print("程序已经退出,请你重新修改程序")
        if XuZhe == '2':
            # Fan the 27 category downloads out over a thread pool
            # (I/O-bound work, so threads overlap the network waits).
            with ThreadPoolExecutor(22) as x:
                for cx in range(27):
                    page1 = 92 - cx  # category ids run 92 down to 66
                    x.submit(hjsavchj, page1)
            jisuan()