# Includes removal of punctuation marks, line breaks, spaces and similar characters.
import pandas as pd
def question1():
    """Write every line of the novel that mentions each target name.

    Reads ``HLM-UTF-8.txt`` from the current working directory and writes
    ``result1.txt`` there. For each name in turn the report contains a
    heading line followed by every line containing that name, each prefixed
    with its 1-based line number in the input file.

    Raises:
        OSError: if the input file cannot be read or the report cannot be
            written.
    """
    words = ['贾宝玉', '宝玉', '林黛玉', '黛玉']
    with open("HLM-UTF-8.txt", 'r', encoding='UTF-8') as r:
        lines = r.readlines()
    with open('result1.txt', 'w', encoding='UTF-8') as w:
        for word in words:
            w.write(word + '出现的行数及行如下:\n')
            # enumerate() gives each line its own position. Looking the
            # number up with lines.index(line) returned the first identical
            # line, so repeated lines were all reported under one number.
            for number, line in enumerate(lines, start=1):
                if word in line:
                    # A final line without a newline would otherwise run
                    # straight into the next heading.
                    if not line.endswith('\n'):
                        line += '\n'
                    w.write(str(number) + line)
# ASCII punctuation removed before counting; also the base from which the
# fullwidth forms below are derived.
_Q2_ASCII_PUNCTUATION = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Every character dropped before counting. Non-ASCII characters are written
# as escapes so the literal cannot be cut short by a stray quote again.
# NOTE(review): the literal found in the file looked as if its fullwidth
# punctuation had been folded to ASCII, so the fullwidth and halfwidth CJK
# forms are added back here -- confirm this matches the intended set.
_Q2_REMOVED_CHARS = (
    _Q2_ASCII_PUNCTUATION
    # Line break and plain space.
    + '\n '
    # Fullwidth counterparts of the ASCII punctuation (U+FF01..U+FF5E).
    + ''.join(chr(ord(c) + 0xFEE0) for c in _Q2_ASCII_PUNCTUATION)
    # Middle dot, dashes, curly quotes, ellipsis, hyphenation point.
    + '\u00b7\u2013\u2014\u2018\u2019\u201b\u201c\u201d\u201e\u201f'
    + '\u2026\u2027'
    # White parentheses, ideographic space, CJK punctuation and brackets.
    + '\u2985\u2986\u3000\u3001\u3002\u3003'
    + '\u3008\u3009\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011'
    + '\u3014\u3015\u3016\u3017\u3018\u3019\u301a\u301b'
    + '\u301c\u301d\u301e\u301f\u3030\u303e\u303f'
    # Small-form variants.
    + '\ufe4f\ufe51\ufe54'
    # Halfwidth CJK punctuation.
    + '\uff5f\uff60\uff61\uff62\uff63\uff64'
)

# Deletion table for str.translate: one pass per line instead of one
# str.replace call per punctuation character.
_Q2_DELETE_TABLE = str.maketrans('', '', _Q2_REMOVED_CHARS)


def question2():
    """Count how often each character occurs in ``HLM-UTF-8.txt``.

    Each line is stripped of surrounding whitespace, the characters in
    ``_Q2_REMOVED_CHARS`` are deleted, and the remaining characters are
    tallied. The tallies are written to ``result2.csv`` (UTF-8 with BOM),
    one row per character in order of first appearance.

    Raises:
        OSError: if the input file cannot be read or the CSV cannot be
            written.
    """
    result = {}
    with open("HLM-UTF-8.txt", 'r', encoding='UTF-8') as r:
        for line in r:
            for char in line.strip().translate(_Q2_DELETE_TABLE):
                # dict.get keeps the lookup O(1); the previous membership
                # test rebuilt a list of all keys for every character.
                result[char] = result.get(char, 0) + 1
    # A single-row frame transposed gives one row per character.
    df = pd.DataFrame(result, index=[0]).T
    df.to_csv("result2.csv", encoding='utf-8-sig')
def question3():
    """Count word frequencies in ``JaneEyre.txt`` and save them as CSV.

    Each line is lower-cased, a fixed set of punctuation characters (plus
    the line break and the ideographic space) is deleted, and the line is
    split on single spaces. The tallies are written to ``result3.csv``, one
    row per word in order of first appearance; empty tokens are discarded.

    Raises:
        OSError: if the input file cannot be read or the CSV cannot be
            written.
    """
    # The same characters the per-character replace loop used to remove.
    removed = str.maketrans(
        '', '', '!"#$&()*+,-./:;<=>?@[\\]^_{|}·~‘’\u3000\n\'')
    result = {}
    # NOTE(review): no encoding is passed, so the platform default decodes
    # the file -- confirm the encoding of JaneEyre.txt.
    with open('JaneEyre.txt', 'r') as r:
        for line in r:
            for word in line.lower().translate(removed).split(' '):
                # dict.get keeps the lookup O(1); the previous membership
                # test rebuilt a list of all keys for every word.
                result[word] = result.get(word, 0) + 1
    # Blank lines and runs of spaces yield empty tokens. pop() with a default
    # also copes with input that has none, where `del result['']` raised
    # KeyError.
    result.pop('', None)
    # A single-row frame transposed gives one row per word.
    df = pd.DataFrame(result, index=[0]).T
    df.to_csv("result3.csv")
if __name__ == '__main__':
    # Only the character-frequency task runs by default; call question1()
    # or question3() here instead to produce their outputs.
    question2()