# Script 1: wrap article bodies in rich-text (HTML) tags and attach URLs to images.
import os
import shutil
import hashlib
import time
import pandas as pd
import requests
import re
# Name of the staging sub-folder; also used as the folder segment of image URLs.
file_dir_ = "903"
# Root download directory (Windows path) that holds one sub-folder per author.
file_dir = r"C:\Users\Administrator\Desktop\download\9.3"
# Last component of file_dir; used as the base name of the generated workbooks.
excel_name = file_dir.rsplit("\\", 1)[-1]
def file_name(file_dir_):
    """Return the file and sub-directory names found directly inside a directory.

    Only the top level of ``file_dir_`` is inspected (the first entry yielded
    by ``os.walk``).

    :param file_dir_: directory to list.
    :return: tuple ``(files, dirs)`` of name lists; both lists are empty when
        the directory does not exist or cannot be read.
    """
    # os.walk yields nothing for a missing/unreadable directory; fall back to
    # empty lists so callers that index the result never receive None.
    _root, dirs, files = next(os.walk(file_dir_), (file_dir_, [], []))
    return files, dirs
def md5_content(content_):
    """
    Return the MD5 hex digest of a piece of text.

    :param content_: text to hash; it is encoded as UTF-8 first.
    :return: hexadecimal digest string.
    """
    return hashlib.md5(content_.encode(encoding='utf-8')).hexdigest()
def copy_path(source_path, target_path):
    """Replace ``target_path`` with a fresh copy of the ``source_path`` tree.

    :param source_path: directory to copy from.
    :param target_path: directory to (re)create; any existing content is removed.
    :raises FileNotFoundError: if ``source_path`` does not exist. The check runs
        before the target is touched, so a bad source never leaves a stray or
        emptied target directory behind.
    """
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"source path does not exist: {source_path}")
    # copytree creates the target itself and refuses an existing one, so only
    # a stale target needs clearing first.
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.copytree(source_path, target_path)
def rename_f(src_file, dst_file):
    """Rename ``src_file`` to ``dst_file`` by delegating to ``os.rename``.

    :param src_file: existing path.
    :param dst_file: new path.
    """
    os.rename(src_file, dst_file)
def get_img_title_(name_):
    """Stage one author's images under hashed names and build their URLs.

    The folder ``file_dir/name_`` is copied into a ``file_dir_`` sub-folder of
    itself, every file at the top level of that copy is renamed to an
    MD5-based id, and the matching URL is recorded.

    :param name_: name of the author sub-directory inside ``file_dir``.
    :return: list of ``("<name_>|<original file stem>", url)`` tuples, one
        per staged file.
    """
    author_dir = os.path.join(file_dir, name_)
    staging_dir = os.path.join(author_dir, file_dir_)
    copy_path(source_path=author_dir, target_path=staging_dir)

    records = []
    for original in file_name(file_dir_=staging_dir)[0]:
        # splitext keeps dots that belong to the title and copes with files
        # that have no extension, unlike splitting on the first ".".
        stem, ext = os.path.splitext(original)
        # The current time is mixed into the hash input, so ids differ per run.
        image_id = md5_content(original + str(time.time()))
        # Build the new name explicitly: str.replace would also rewrite later
        # occurrences of the stem (e.g. "p.png" -> "<id>.<id>ng"), leaving the
        # file on disk out of sync with the URL.
        hashed_name = image_id + ext
        rename_f(
            os.path.join(staging_dir, original),
            os.path.join(staging_dir, hashed_name),
        )
        url = f"https://wwww.aliyuncs.com/{file_dir_}/{hashed_name}"
        records.append((name_ + "|" + stem, url))
    return records
def get_all_title():
    """Build the image-key -> URL workbook for every author directory.

    Runs ``get_img_title_`` on each sub-directory of ``file_dir`` and writes
    the collected rows to ``<excel_name>.xlsx`` (relative to the current
    working directory) with the columns ``图片标识`` and ``图片url``.
    """
    rows = []
    for author in file_name(file_dir_=file_dir)[1]:
        # Skip the shared staging folder that copy_file_all() creates inside
        # file_dir; it is not an author folder, and copy_file_all() and
        # get_all_excel() skip it the same way.
        if author == file_dir_:
            continue
        rows.extend(get_img_title_(name_=author))
    # Naming the columns on the frame (instead of aliasing them via header=)
    # keeps the export working when no image was found: an empty frame has no
    # columns to alias and to_excel(header=[...]) would raise ValueError.
    frame = pd.DataFrame(rows, columns=["图片标识", "图片url"])
    frame.to_excel(f"{excel_name}.xlsx", index=False)
def createdir(path):
    """Ensure ``path`` exists as an empty directory.

    A missing directory is created; an existing one is deleted with all of
    its content and created again. A status line is printed in both cases.

    :param path: directory path (string) to create or reset.
    """
    already_present = os.path.exists(path)
    if already_present:
        print(path + ' 目录已存在')
        # Wipe the old content so the directory starts out empty.
        shutil.rmtree(path)
    os.makedirs(path)
    if not already_present:
        print(path + ' 目录创建成功')
def copy_file(filepath, new_path):
    """Copy every file below ``filepath`` into ``new_path``, flattening folders.

    Sub-directories are walked recursively and their files land directly in
    ``new_path``; a file copied later overwrites an earlier one that has the
    same name.

    :param filepath: directory to read from.
    :param new_path: existing directory that receives the files.
    """
    for entry in os.listdir(filepath):
        source = filepath + '/' + entry
        if not os.path.isfile(source):
            # Anything that is not a regular file is descended into.
            copy_file(source, new_path)
            continue
        shutil.copyfile(source, new_path + '/' + entry)
def copy_file_all():
    """Gather each author's staged images into one shared folder.

    The shared folder ``file_dir\\file_dir_`` is created (or reset) first.
    Then, for every other sub-directory of ``file_dir``, the files of its
    first sub-folder are copied flat into the shared folder and that
    sub-folder is deleted.
    """
    shared_dir = file_dir + "\\" + file_dir_
    createdir(shared_dir)
    for author in file_name(file_dir_=file_dir)[1]:
        if author != file_dir_:
            author_dir = file_dir + "\\" + author
            # NOTE(review): takes whichever sub-folder is listed first and
            # raises IndexError when there is none; presumably this is the
            # staging copy made by get_img_title_ -- confirm.
            first_sub = file_name(file_dir_=author_dir)[1][0]
            staged_dir = author_dir + "\\" + first_sub
            copy_file(staged_dir, shared_dir)
            shutil.rmtree(staged_dir)
def get_content(content_, url_l, name_, title):
    """Convert plain article text to HTML and swap image markers for ``<img>`` tags.

    Every line of ``content_`` becomes a ``<p>`` paragraph. A paragraph that
    consists of ``<title>_<i>`` (i from 1 to 9) is replaced by an image tag
    whose URL is looked up under ``"<name_>|<title without '?'>_<i>"``. A
    fixed attribution paragraph is appended at the end.

    :param content_: article body, lines separated by ``\\n``.
    :param url_l: mapping from image key to image URL.
    :param name_: author name, first part of the image key.
    :param title: article title used to build the markers.
    :return: tuple ``(html, url of image 1)``.
    :raises KeyError: if a needed image key (always including image 1) is
        missing from ``url_l``.
    """
    # Image keys use the title with "?" removed; the markers in the text do not.
    key_prefix = name_ + "|" + title.replace("?", "") + "_"
    html = "".join("<p>" + line + "</p>" + "\n" for line in content_.split("\n"))
    for index in range(1, 10):
        marker = f"{title}_{index}"
        if marker not in html:
            continue
        image_url = url_l[key_prefix + str(index)]
        html = html.replace(f"<p>{marker}</p>", f'<p></p>\n<img src="{image_url}">\n<p></p>')
    html += "<p>《图片来源于网络,如有问题请联系作者》</p>"
    return html, url_l[key_prefix + str(1)]
def get_keyword(title):
    """Fetch the project tag keyword for a title.

    :param title: title placed, unescaped, in the ``title`` query parameter.
    :return: the element at index 2 of the decoded JSON response.
    """
    # NOTE(review): the URL below has no scheme or host, and the request has
    # no timeout -- confirm the real endpoint before relying on this call.
    response = requests.get(f"?title={title}")
    payload = response.json()
    return payload[2]
def get_excel_one(name_):
    """Build the output rows for one author's article workbook.

    Reads ``file_dir\\<name_>.xlsx`` (articles) and ``<excel_name>.xlsx``
    (image key -> URL), then converts every article row with ``get_content``
    and ``get_keyword``.

    :param name_: author name; selects the workbook and is stored as author.
    :return: list of tuples ``(queryID, query, 标题, 正文, "", cover url,
        rich text, keyword, author)``.
    """
    articles = pd.read_excel(file_dir + "\\" + f"{name_}.xlsx")
    images = pd.read_excel(f"{excel_name}.xlsx")
    rows = []
    url_by_key = dict(zip(images['图片标识'], images['图片url']))
    for record in articles.itertuples():
        # Read all needed columns up front so a missing column fails before
        # any conversion or network call is made.
        query_id, query, body, headline = record.queryID, record.query, record.正文, record.标题
        rich_text, cover_url = get_content(content_=body, url_l=url_by_key, name_=name_, title=query)
        tag = get_keyword(title=query)
        # The fifth field is written as an empty string.
        rows.append((query_id, query, headline, body, "", cover_url, rich_text, tag, name_))
    return rows
def get_all_excel():
    """Merge the rows of every author workbook into ``<excel_name>_all.xlsx``.

    Every sub-directory of ``file_dir`` except the shared ``file_dir_`` folder
    is treated as an author name and processed with ``get_excel_one``. The
    workbook is written relative to the current working directory.
    """
    columns = ["queryID", "query", "标题", "正文", "图片比例", "封面图片", "富文本", "项目标签", "作者"]
    rows = []
    for author in file_name(file_dir_=file_dir)[1]:
        if author == file_dir_:
            continue
        rows.extend(get_excel_one(name_=author))
    # Naming the columns on the frame (instead of aliasing them via header=)
    # keeps the export working when there are no rows: an empty frame has no
    # columns to alias and to_excel(header=[...]) would raise ValueError.
    frame = pd.DataFrame(rows, columns=columns)
    frame.to_excel(f"{excel_name}_all.xlsx", index=False)
def main():
    """Run the pipeline: image workbook, shared image folder, merged workbook."""
    # Stage and rename the images and write the image-key -> URL workbook.
    get_all_title()
    # Fixed two-second pause before files are moved (reason not stated here).
    time.sleep(2)
    # Collect the staged images of all authors into the shared folder.
    copy_file_all()
    # Needs the workbook written by get_all_title(), so it must run last.
    get_all_excel()


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()
# Script 2: attach URLs to images.
import os
import shutil
import hashlib
import time
import pandas as pd
# Folder segment used in the image URLs; also the suffix of the copy folder.
file_dir_ = "726_6"
# Source folder that holds the original images (Windows path).
file_dir = r"C:\Users\Administrator\Desktop\download\7.22苏苏8图"
# NOTE(review): joined without a path separator, so the copy is a sibling
# folder whose name ends in "726_6" -- confirm this is intended.
file_dir_copy = file_dir + file_dir_
def file_name(file_dir_):
    """Return the names of the files found directly inside a directory.

    Only the top level of ``file_dir_`` is inspected (the first entry yielded
    by ``os.walk``).

    :param file_dir_: directory to list.
    :return: list of file names; empty when the directory does not exist or
        cannot be read.
    """
    # os.walk yields nothing for a missing/unreadable directory; fall back to
    # an empty list so callers that iterate the result never receive None.
    _root, _dirs, files = next(os.walk(file_dir_), (file_dir_, [], []))
    return files
def md5_content(content_):
    """
    Hash a piece of text with MD5.

    :param content_: text to hash; it is encoded as UTF-8 before hashing.
    :return: the digest as a hexadecimal string.
    """
    raw_bytes = content_.encode(encoding='utf-8')
    return hashlib.md5(raw_bytes).hexdigest()
def copy_path(source_path, target_path):
    """Replace ``target_path`` with a fresh copy of the ``source_path`` tree.

    :param source_path: directory to copy from.
    :param target_path: directory to (re)create; any existing content is removed.
    :raises FileNotFoundError: if ``source_path`` does not exist. The check runs
        before the target is touched, so a bad source never leaves a stray or
        emptied target directory behind.
    """
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"source path does not exist: {source_path}")
    # copytree creates the target itself and refuses an existing one, so only
    # a stale target needs clearing first.
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.copytree(source_path, target_path)
def rename_f(src_file, dst_file):
    """Rename ``src_file`` to ``dst_file`` by delegating to ``os.rename``.

    :param src_file: existing path.
    :param dst_file: new path.
    """
    os.rename(src_file, dst_file)
# Work on a copy so the original images keep their names.
copy_path(source_path=file_dir, target_path=file_dir_copy)
all_p = file_name(file_dir_=file_dir_copy)
dd = []
for t in all_p:
    # splitext keeps dots that belong to the title and copes with files that
    # have no extension, unlike splitting on the first ".".
    _title, _ext = os.path.splitext(t)
    # The current time is mixed into the hash input, so ids differ per run.
    id_ = md5_content(t + str(time.time()))
    src_file_ = os.path.join(file_dir_copy, t)
    # Build the new name explicitly: str.replace would also rewrite later
    # occurrences of the stem (e.g. "p.png" -> "<id>.<id>ng").
    dst_file_ = os.path.join(file_dir_copy, id_ + _ext)
    rename_f(src_file_, dst_file_)
    # Use the file's real extension so the URL matches the renamed file
    # instead of assuming every image is a .jpg.
    url = f"https://wwww.aliyuncs.com/{file_dir_}/{id_}{_ext}"
    dd.append((_title, url))
ddd = pd.DataFrame(dd)
# The workbook is named after the last component of the source folder path.
name = file_dir.split("\\")[-1]
ddd.to_excel(f"{name}.xlsx", index=False, header=False)