# CET-4 English vocabulary text processing
import re
import sqlite3
from collections import Counter

import xlwt
def get_file(path=r"C:\Users\Administrator\Desktop\英语四级.txt"):
    """
    Read a UTF-8 text file and reduce it to lowercase, space-separated text.

    The text is lowercased, every punctuation mark and digit in the noise
    list is replaced by a space, CJK ideographs in the range U+4E00-U+9FA5
    are deleted (not replaced by a space), and each remaining whitespace
    character is replaced by a space. Runs of spaces are not collapsed.

    :param path: file to read; defaults to the originally hard-coded
        CET-4 word list location
    :return: the cleaned text
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(path, "r", encoding="utf-8") as source:
        content = source.read().lower()

    # One translate() pass maps every unwanted character to a space.
    noise = '!@#$%^&*()(): ;:;_+-{}[]|\\<>?/.,`~1234567890"\'£“”'
    content = content.translate(str.maketrans(dict.fromkeys(noise, " ")))

    content = re.sub(r"[\u4e00-\u9fa5]", "", content)
    # Newlines, tabs and other whitespace become plain spaces so that the
    # result can be split on " ".
    return re.sub(r"\s", " ", content)
def count_words(content):
    """
    Count how often each word occurs and rank the words by frequency.

    :param content: text whose words are separated by spaces
    :return: list of ``(word, count)`` tuples, most frequent first; words
        with equal counts keep their order of first appearance
    """
    # Consecutive spaces make split(" ") yield empty strings; those are not
    # words, so drop them before counting.
    tokens = (token for token in content.split(" ") if token)
    return Counter(tokens).most_common()
def proceedings_ls(words_list, min_length=5, min_count=4):
    """
    Filter the ranked word list down to the entries worth keeping.

    An entry is kept when its word has at least ``min_length`` characters
    and its count is at least ``min_count``. The defaults reproduce the
    original fixed rule (more than 4 characters, seen at least 4 times).

    :param words_list: sequence of ``(word, count)`` entries
    :param min_length: minimum number of characters a word must have
    :param min_count: minimum number of occurrences a word must have
    :return: the kept entries, in their original order, each copied into a
        new list
    """
    return [
        list(entry)
        for entry in words_list
        if len(entry[0]) >= min_length and entry[1] >= min_count
    ]
def save_excel(data_list, filename="english4.xls"):
    """
    Write the word/frequency rows to an Excel workbook.

    The sheet is named "sheet1"; row 0 holds the two column headers and
    each entry of ``data_list`` fills one following row.

    :param data_list: sequence of entries whose first item is the word and
        whose second item is its frequency
    :param filename: output path. xlwt produces the legacy .xls format, so
        the default uses that extension instead of ".xlsx", which would not
        match the file's actual contents.
    """
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet("sheet1")
    worksheet.write(0, 0, "单词")
    worksheet.write(0, 1, "频率")
    # Data rows start at 1 because row 0 is the header.
    for row, entry in enumerate(data_list, start=1):
        worksheet.write(row, 0, entry[0])
        worksheet.write(row, 1, entry[1])
    workbook.save(filename)
def save_db(datalist, dbpath):
    """
    Store the word/frequency rows in the ``word4`` table of a SQLite file.

    The table is prepared by calling ``init_db(dbpath)`` first. Both values
    of every row are stored as text. ``datalist`` itself is left unmodified.

    :param datalist: iterable of ``(word, rate)`` pairs
    :param dbpath: path of the SQLite database file
    """
    init_db(dbpath)
    # Build new tuples instead of rewriting the caller's rows in place.
    rows = [(str(word), str(rate)) for word, rate in datalist]

    conn = sqlite3.connect(dbpath)
    try:
        # Placeholders let SQLite handle quoting, so a word containing
        # quote characters cannot break the statement.
        conn.executemany("insert into word4(Word, Rate) values (?, ?)", rows)
        conn.commit()
    finally:
        # Release the connection even when an insert fails.
        conn.close()
def init_db(dbpath):
    """
    Make sure the ``word4`` table exists in the SQLite database at ``dbpath``.

    The table has an auto-incrementing integer ``id`` plus the text columns
    ``Word`` and ``Rate``. Calling this again on a database that already has
    the table is a no-op; existing rows are kept.

    :param dbpath: path of the SQLite database file
    """
    sql = '''
        create table if not exists word4(
            id integer primary key autoincrement,
            Word text,
            Rate text
        )
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Release the connection even when the statement fails.
        conn.close()
def main():
    """Clean the source text, rank and filter its words, then print and save them."""
    ranked = count_words(get_file())
    data_list = proceedings_ls(ranked)
    print(data_list)
    # Written to the spreadsheet first, then to the "English.db" database.
    save_excel(data_list)
    save_db(data_list, "English.db")


if __name__ == "__main__":
    main()
# English word translation
import pandas as pd
# Load the dictionary and keep only the two columns used for the lookup.
df_dict = pd.read_csv(r"C:\Users\Administrator\Desktop\stardict.csv")
df_dict = df_dict[["word", "translation"]]

# Read the words to translate. The context manager closes the file, and
# split() with no argument also breaks on newlines and repeated spaces, so
# no token carries stray whitespace that would prevent a match.
with open("translate.txt", "r", encoding="utf-8") as f:
    content = f.read()
wordlist = content.split()

df_words = pd.DataFrame({"word": wordlist})

# Join on the shared "word" column; rows follow pd.merge's default join type.
df_merge = pd.merge(left=df_dict, right=df_words, on="word")
df_merge.to_excel("translate.xlsx", index=False)