# -*- coding: utf-8 -*-
import re
import csv
# Regular expressions used to scrape the saved page source. They are compiled
# once at module level; the patterns themselves are unchanged.
# Author: everything after the first '>' following "p_author_name", up to end of line.
AUTHOR_PATTERN = re.compile(r'''p_author_name.*?>(.*)''', re.M | re.I)
# Inline <img .../> tags that may appear inside the captured author text.
IMG_TAG_PATTERN = re.compile(r"<img.*?/>")
# Post time in the form "YYYY-MM-DD HH:MM".
TIME_PATTERN = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}")
# Post body: text after the tag carrying "post_content_<12 digits>", captured
# up to and including the next '<' on the same line.
CONTENT_PATTERN = re.compile(r'''post_content_\d{12}.*?>\s*(.*?<)''', re.M | re.I)


def extract_names(content):
    """Return the author names found in *content*, in document order.

    Each match is the remainder of the line after the author anchor's opening
    tag; embedded ``<img .../>`` tags and ``</a>`` closers are stripped.
    """
    names = []
    for raw_name in AUTHOR_PATTERN.findall(content):
        without_images = IMG_TAG_PATTERN.sub("", raw_name)
        names.append(without_images.replace("</a>", ""))
    return names


def extract_times(content):
    """Return every ``YYYY-MM-DD HH:MM`` timestamp found in *content*."""
    return TIME_PATTERN.findall(content)


def extract_posts(content):
    """Return the post texts found in *content*, in document order.

    The capture group includes the terminating ``<`` so it is removed here.
    """
    return [raw_post.replace("<", "") for raw_post in CONTENT_PATTERN.findall(content)]


def main(source_path="source.txt", result_path="result.csv", source_encoding=None):
    """Read the saved page source and write one CSV row per post.

    Args:
        source_path: File holding the page source to scrape.
        result_path: CSV file to create (overwritten if it exists).
        source_encoding: Encoding used to read *source_path*; ``None`` keeps
            the platform default, as the original script did.

    The three extracted lists are combined with ``zip()``, so the number of
    rows written equals the length of the shortest list.
    """
    # Read the input first so a missing source file does not truncate an
    # existing result file.
    with open(source_path, "r", encoding=source_encoding) as source_file:
        content = source_file.read()

    rows = zip(extract_names(content), extract_times(content), extract_posts(content))

    # newline="" lets the csv module control line endings (avoids blank rows
    # on Windows); the with-block guarantees the file is flushed and closed.
    with open(result_path, "w", encoding="utf-8", newline="") as result_file:
        csv_writer = csv.writer(result_file)
        csv_writer.writerow(["用户名", "发帖时间", "发帖内容"])
        csv_writer.writerows(rows)


if __name__ == "__main__":
    main()