Python 如何给正文内容加富文本标签

给正文加富文本标签,给图片加网址

import os
import shutil
import hashlib
import time
import pandas as pd
import requests
import re

# Name of the staging sub-folder; also used as the path segment of the
# generated image URLs.
file_dir_ = "903"

# Root working folder; each sub-folder in it is processed as one author.
file_dir = r"C:\Users\Administrator\Desktop\download\9.3"

# Base name of the generated workbooks: the last backslash-separated
# component of ``file_dir``.
*_, excel_name = file_dir.split("\\")


def file_name(file_dir_):
    """Return the immediate files and sub-directories of a folder.

    Only the top level of ``file_dir_`` is inspected; nested folders are not
    descended into.

    :param file_dir_: path of the folder to list
    :return: tuple ``(files, dirs)`` of name lists; both lists are empty when
        the folder does not exist or cannot be read
    """
    # os.walk yields nothing for a missing/unreadable folder. The previous
    # for-loop then fell through and returned None, which made every caller
    # fail with TypeError when indexing the result.
    _root, dirs, files = next(os.walk(file_dir_), (file_dir_, [], []))
    return files, dirs


def md5_content(content_):
    """
    Return the hexadecimal MD5 digest of a text fragment.

    :param content_: text to hash; it is encoded as UTF-8 first
    :return: the digest as a hexadecimal string
    """
    return hashlib.md5(content_.encode("utf-8")).hexdigest()


def copy_path(source_path, target_path):
    """Replace ``target_path`` with a fresh recursive copy of ``source_path``.

    Any existing content of ``target_path`` is deleted first.

    :param source_path: existing directory to copy from
    :param target_path: directory to (re)create as the copy
    :raises FileNotFoundError: if ``source_path`` does not exist
    :raises NotADirectoryError: if ``source_path`` is not a directory
    """
    # Validate the source before touching the target. The previous version
    # created an empty target directory and only then failed inside copytree
    # when the source was missing.
    if not os.path.exists(source_path):
        raise FileNotFoundError(source_path)
    if not os.path.isdir(source_path):
        raise NotADirectoryError(source_path)
    # copytree requires that the destination does not exist yet.
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.copytree(source_path, target_path)


def rename_f(src_file, dst_file):
    """Rename ``src_file`` to ``dst_file`` using ``os.rename``."""
    os.rename(src=src_file, dst=dst_file)


def get_img_title_(name_):
    """Stage one author's images under hashed names and build their URLs.

    The folder ``file_dir\\name_`` is copied into its own ``file_dir_``
    sub-folder. Every top-level file of that copy is renamed to an MD5 id
    (derived from the file name and the current time) that keeps the original
    extension.

    :param name_: name of the author sub-folder inside ``file_dir``
    :return: list of ``(key, url)`` tuples, where ``key`` is
        ``"<name_>|<original file stem>"`` and ``url`` points at the hashed
        file name
    """
    source_dir = file_dir + '\\' + name_
    file_dir_copy = source_dir + '\\' + file_dir_
    copy_path(source_path=source_dir, target_path=file_dir_copy)

    records = []
    for original_name in file_name(file_dir_=file_dir_copy)[0]:
        # splitext splits on the LAST dot only. The previous split('.') raised
        # IndexError for extension-less files and, for names with several
        # dots, produced a URL that did not match the renamed file.
        _title, ext = os.path.splitext(original_name)
        id_ = md5_content(original_name + str(time.time()))
        # Build the new name explicitly rather than with str.replace, which
        # would also replace the stem wherever else it occurs in the name.
        hashed_name = id_ + ext
        rename_f(file_dir_copy + "\\" + original_name,
                 file_dir_copy + "\\" + hashed_name)
        url = f"https://wwww.aliyuncs.com/{file_dir_}/{hashed_name}"
        records.append((name_ + "|" + _title, url))
    return records


def get_all_title():
    """Build the image-key to URL workbook for every author folder.

    Each sub-folder of ``file_dir`` is processed by ``get_img_title_`` and the
    collected ``(key, url)`` pairs are written to ``<excel_name>.xlsx`` in the
    current working directory.
    """
    rows = []
    for author_dir in file_name(file_dir_=file_dir)[1]:
        # Skip the flattened output folder left by an earlier run, as
        # copy_file_all and get_all_excel already do; it is not an author.
        if author_dir == file_dir_:
            continue
        rows.extend(get_img_title_(name_=author_dir))

    # Name the columns at construction time so that an empty result still
    # yields a valid two-column sheet. Passing header=[...] to to_excel raises
    # ValueError when the frame has zero columns.
    table = pd.DataFrame(rows, columns=["图片标识", "图片url"])
    table.to_excel(f"{excel_name}.xlsx", index=False)


def createdir(path):
    """Ensure ``path`` exists as an empty directory.

    A missing path is created. An existing one is reported, removed with all
    its content and created again.

    :param path: directory path to (re)create
    """
    if os.path.exists(path):
        # Already present: report it, then wipe and recreate it empty.
        print(path + ' 目录已存在')
        shutil.rmtree(path)
        os.makedirs(path)
        return

    # Not present yet: create it and report success.
    os.makedirs(path)
    print(path + ' 目录创建成功')


def copy_file(filepath, new_path):
    """Copy every file below ``filepath`` flat into ``new_path``.

    Sub-folders are walked recursively but not recreated: all files land
    directly in ``new_path`` under their own base name.

    :param filepath: folder to read from
    :param new_path: existing folder that receives the copies
    """
    for entry in os.listdir(filepath):
        source = f"{filepath}/{entry}"
        if not os.path.isfile(source):
            # Anything that is not a regular file is treated as a folder
            # and descended into.
            copy_file(source, new_path)
            continue
        # shutil.copyfile needs file paths on both sides, not directories.
        shutil.copyfile(source, f"{new_path}/{entry}")


def copy_file_all():
    """Collect every author's staged images into one flat output folder.

    ``file_dir\\file_dir_`` is (re)created empty. For each author sub-folder
    of ``file_dir`` the files of its ``file_dir_`` staging sub-folder are then
    copied flat into that output folder, and the staging sub-folder is
    deleted.
    """
    # Create (or reset) the shared output folder.
    path_ = file_dir + "\\" + file_dir_
    createdir(path_)

    for file_d in file_name(file_dir_=file_dir)[1]:
        # The output folder itself is not an author folder.
        if file_d == file_dir_:
            continue
        # Address the staging sub-folder by name. The previous code took
        # whichever sub-folder os.walk listed first, so it could flatten and
        # then delete an unrelated folder, and raised IndexError when the
        # author folder had no sub-folder at all.
        staged = file_dir + "\\" + file_d + "\\" + file_dir_
        if not os.path.isdir(staged):
            continue
        copy_file(staged, path_)
        shutil.rmtree(staged)


def get_content(content_, url_l, name_, title):
    """Wrap an article body in ``<p>`` tags and inline its images.

    Every line of ``content_`` becomes one ``<p>`` paragraph. A paragraph that
    consists solely of the marker ``<title>_<i>`` (``i`` from 1 to 9) is
    replaced by an ``<img>`` tag surrounded by empty paragraphs. Image URLs
    are looked up under ``"<name_>|<title without '?'>_<i>"``.

    :param content_: plain-text body, one paragraph per line
    :param url_l: mapping from image key to image URL
    :param name_: author name used as the key prefix
    :param title: article title used in the image markers
    :return: tuple ``(rich_text, cover_url)``; ``cover_url`` is the URL of
        image number 1
    :raises KeyError: if a referenced image, or image number 1, is missing
        from ``url_l``
    """
    # Keys are built from the title with "?" removed, while the markers in
    # the text use the title unchanged.
    key_prefix = name_ + "|" + title.replace("?", "") + "_"

    html = "".join(f"<p>{line}</p>\n" for line in content_.split("\n"))

    for idx in range(1, 10):
        marker = f"{title}_{idx}"
        if marker not in html:
            continue
        img_block = f'<p></p>\n<img src="{url_l[key_prefix + str(idx)]}">\n<p></p>'
        html = html.replace(f"<p>{marker}</p>", img_block)

    html += "<p>《图片来源于网络,如有问题请联系作者》</p>"
    return html, url_l[key_prefix + "1"]


def get_keyword(title, base_url="", timeout=10):
    """Fetch the project tag keyword for an article title.

    :param title: article title, sent as the ``title`` query parameter
    :param base_url: endpoint of the keyword service. NOTE(review): the
        endpoint is not present in this file, and the empty default fails
        with requests' invalid-URL error exactly as before. Supply the real
        URL.
    :param timeout: seconds to wait for the service before giving up
    :return: element at index 2 of the decoded JSON response
        (presumably the keyword field; confirm against the service)
    :raises requests.RequestException: on connection problems, timeouts or
        non-2xx responses
    """
    # Let requests build the query string so that characters such as
    # "&", "#" or "+" in the title are escaped instead of corrupting the URL.
    response = requests.get(base_url, params={"title": title}, timeout=timeout)
    # Fail with a clear HTTP error rather than a JSON decoding error when the
    # service answers with an error page.
    response.raise_for_status()
    return response.json()[2]


def get_excel_one(name_):
    """Build the output rows for all articles of one author.

    Reads ``file_dir\\<name_>.xlsx`` (the author's article sheet) and
    ``<excel_name>.xlsx`` (the image key to URL sheet), then produces the rich
    text, cover image and keyword for every article row.

    :param name_: author name; also the base name of the article workbook
    :return: list of tuples ``(queryID, query, headline, plain body, "",
        cover url, rich text, keyword, author)``
    """
    articles = pd.read_excel(f"{file_dir}\\{name_}.xlsx")  # this author's articles
    images = pd.read_excel(f"{excel_name}.xlsx")  # image key -> hashed URL
    url_by_key = dict(zip(images['图片标识'], images['图片url']))

    rows = []
    for row in articles.itertuples():
        # Read all needed columns first so that a missing column fails before
        # any keyword request is made.
        query_id, title, body, headline = row.queryID, row.query, row.正文, row.标题
        rich_text, cover_url = get_content(
            content_=body, url_l=url_by_key, name_=name_, title=title
        )
        keyword = get_keyword(title=title)
        # The empty string fills the image-ratio column.
        rows.append(
            (query_id, title, headline, body, "", cover_url, rich_text, keyword, name_)
        )
    return rows


def get_all_excel():
    """Merge the article rows of every author into one workbook.

    Each sub-folder of ``file_dir`` except the ``file_dir_`` output folder is
    treated as an author and processed by ``get_excel_one``. The combined rows
    are written to ``<excel_name>_all.xlsx`` in the current working directory.
    """
    rows = []
    for author in file_name(file_dir_=file_dir)[1]:
        if author == file_dir_:
            continue
        rows.extend(get_excel_one(name_=author))

    # Name the columns at construction time so that an empty result still
    # yields a valid sheet. Passing header=[...] to to_excel raises ValueError
    # when the frame has zero columns.
    table = pd.DataFrame(
        rows,
        columns=["queryID", "query", "标题", "正文", "图片比例", "封面图片", "富文本", "项目标签", "作者"],
    )
    table.to_excel(f"{excel_name}_all.xlsx", index=False)


def main():
    """Run the whole pipeline: hash images, flatten them, build the workbook."""
    # Step 1: stage and rename every author's images, write the key/URL sheet.
    get_all_title()
    # NOTE(review): the reason for this pause is not visible in the code;
    # presumably it lets file-system changes settle. Confirm before removing.
    time.sleep(2)
    # Step 2: gather all staged images into the single output folder.
    copy_file_all()
    # Step 3: generate the combined rich-text workbook.
    get_all_excel()


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

给图片加网址

import os
import shutil
import hashlib
import time
import pandas as pd

# Suffix of the copy folder; also used as the path segment of the image URLs.
file_dir_ = "726_6"

# Folder holding the original images.
file_dir = r"C:\Users\Administrator\Desktop\download\7.22苏苏8图"
# Sibling folder receiving the renamed copies. The suffix is appended to the
# folder name itself, with no path separator in between.
file_dir_copy = file_dir + file_dir_


def file_name(file_dir_):
    """Return the names of the files directly inside a folder.

    Only the top level of ``file_dir_`` is inspected.

    :param file_dir_: path of the folder to list
    :return: list of file names; empty when the folder does not exist or
        cannot be read
    """
    # os.walk yields nothing for a missing/unreadable folder. The previous
    # for-loop then fell through and returned None, which breaks the caller's
    # iteration over the result.
    _root, _dirs, files = next(os.walk(file_dir_), (file_dir_, [], []))
    return files


def md5_content(content_):
    """
    Compute the MD5 hex digest of a piece of text.

    :param content_: text to hash; it is encoded as UTF-8 first
    :return: the digest as a hexadecimal string
    """
    digest = hashlib.md5()
    digest.update(content_.encode("utf-8"))
    return digest.hexdigest()


def copy_path(source_path, target_path):
    """Replace ``target_path`` with a fresh recursive copy of ``source_path``.

    Any existing content of ``target_path`` is deleted first.

    :param source_path: existing directory to copy from
    :param target_path: directory to (re)create as the copy
    :raises FileNotFoundError: if ``source_path`` does not exist
    :raises NotADirectoryError: if ``source_path`` is not a directory
    """
    # Validate the source before touching the target. The previous version
    # created an empty target directory and only then failed inside copytree
    # when the source was missing.
    if not os.path.exists(source_path):
        raise FileNotFoundError(source_path)
    if not os.path.isdir(source_path):
        raise NotADirectoryError(source_path)
    # copytree requires that the destination does not exist yet.
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.copytree(source_path, target_path)


def rename_f(src_file, dst_file):
    """Rename ``src_file`` to ``dst_file`` using ``os.rename``."""
    os.rename(src=src_file, dst=dst_file)


# Work on a copy so the original images keep their names.
copy_path(source_path=file_dir, target_path=file_dir_copy)
all_p = file_name(file_dir_=file_dir_copy)

# Rename every copied image to an MD5 id and remember (original stem, URL).
dd = []
for t in all_p:
    # splitext splits on the LAST dot only, so extension-less and multi-dot
    # names are handled and the renamed file always matches the URL.
    _title, ext = os.path.splitext(t)
    id_ = md5_content(t + str(time.time()))
    src_file_ = file_dir_copy + "\\" + t
    dst_file_ = file_dir_copy + "\\" + id_ + ext
    rename_f(src_file_, dst_file_)
    # Use the file's real extension. The URL previously hard-coded ".jpg"
    # while the renamed file kept its own extension, so non-JPEG images got
    # URLs that did not match their file names.
    url = f"https://wwww.aliyuncs.com/{file_dir_}/{id_}{ext}"
    dd.append((_title, url))

# Write the mapping, without header or index, to "<source folder name>.xlsx".
ddd = pd.DataFrame(dd)
name = file_dir.split("\\")[-1]
ddd.to_excel(f"{name}.xlsx", index=False, header=False)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值