# -*- coding: utf-8 -*-
"""
Created on Sun Oct 7 09:00:32 2018
@author: asus
"""
#9 作者归属问题
#9.1.3 获取数据
import os
import sys
# Location of the book corpus (a Windows path specific to the author's machine).
# A raw string keeps every backslash literal: "\P" and "\作" are not valid
# escape sequences in an ordinary string literal and trigger a warning there.
data_folder = os.path.join(
    r"E:\books\Python数据挖掘入门与实践\作者归属问题,支持向量机",
    "Data", "books")
#数据可用爬虫爬取下来,但时间太长
#加载文件时跳过古腾堡项目的说明
def clean_book(document):
    """Return the body of a Project Gutenberg text without its boilerplate.

    The document is split on newlines and scanned for the lines that begin
    with the Project Gutenberg start and end markers; only the lines strictly
    between them are kept. When a marker is absent the corresponding edge of
    the document is used instead, so a text without markers comes back
    unchanged. If a marker occurs more than once, its last occurrence wins.

    Args:
        document: Full text of one book as a single string.

    Returns:
        The lines between the two markers, joined with newline characters.
    """
    lines = document.split("\n")
    # Defaults cover documents that lack one or both markers.
    start = 0
    end = len(lines)
    for index, line in enumerate(lines):
        if line.startswith("*** START OF THIS PROJECT GUTENBERG"):
            # Content begins on the line after the start marker.
            start = index + 1
        elif line.startswith("*** END OF THIS PROJECT GUTENBERG"):
            # Slice ends are exclusive, so stopping at the marker's own index
            # drops only the marker and keeps the last line of content.
            end = index
    return "\n".join(lines[start:end])
#创建函数,加载所有图书,进行上述预处理操作
import numpy as np
#声明加载图书的函数,参数为图书所在目录books,该目录下是一系列以作者名字命名的子文件夹,
#图书文件就在这些子文件夹中
def load_books_data(folder=data_folder, encoding=None):
    """Load every book under *folder* together with an author label.

    ``folder`` is expected to contain one sub-folder per author, each holding
    that author's book files. Every file is read, passed through
    ``clean_book`` and appended to the result; its label is the index of the
    author's sub-folder in the sorted list of sub-folders.

    Args:
        folder: Directory containing the per-author sub-folders.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps Python's platform-dependent default encoding.

    Returns:
        A ``(documents, authors)`` tuple: ``documents`` is a list of cleaned
        book texts and ``authors`` is an integer NumPy array of the same
        length holding the author index of each document.
    """
    documents = []
    authors = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # author-number assignment reproducible between runs and machines.
    subfolders = sorted(
        name for name in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, name))
    )
    for author_number, subfolder in enumerate(subfolders):
        full_subfolder_path = os.path.join(folder, subfolder)
        for document_name in sorted(os.listdir(full_subfolder_path)):
            # The file name must be joined into the path; handing it to
            # open() as the second positional argument makes it the file mode.
            document_path = os.path.join(full_subfolder_path, document_name)
            with open(document_path, encoding=encoding) as inf:
                documents.append(clean_book(inf.read()))
            authors.append(author_number)
    return documents, np.array(authors, dtype='int')
# Load the corpus when the script runs: `documents` receives the cleaned book
# texts and `classes` the matching integer author indices.
documents, classes = load_books_data(data_folder)
#9.2 功能词
#统计功能词
function_words = ["a", "able", "aboard", "about", "above", "absent",
"according" , "accordingly", "across", "after", "against",
"ahead", "albeit", "all", "along", "alongside", "although",
"am", "amid", "amidst", "among", "amongst", "amount", "an",
"and", "another", "anti", "any", "anybody", "anyone",
"anything", "are", "around", "as", "aside", "astraddle",
"astride", "at", "away", "bar", "barring", "be", "because",
"been", "before", "behind", "being", "below", "beneath",
# 9.作者归属问题
# 最新推荐文章于 2022-04-30 22:54:32 发布
# 本文深入研究了作者归属问题,分析了在不同情境下确定作品作者的重要性及其挑战。通过对相关案例的解析,阐述了如何运用法律和证据来解决此类争议,并讨论了未来可能的发展趋势和技术在其中的作用。
# 摘要由CSDN通过智能技术生成