unzip_save.py
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Hide log messages below level 2.
import zipfile

# Extract the dataset archive.
local_zip1 = 'E:/Python/pythonProject_1/Sarcasm_Headlines/tmp/archive.zip'  # Path to the dataset archive.
# Open the archive read-only; the context manager closes it even if
# extraction raises part-way through.
with zipfile.ZipFile(local_zip1, 'r') as zip_ref1:
    zip_ref1.extractall('E:/Python/pythonProject_1/Sarcasm_Headlines/tmp/archive/')  # Destination directory.
import json

# Parallel lists filled from the dataset: index i of each list belongs to
# the same record.
sentences = []
labels = []
urls = []  # Not used in this example.

# Each line of the file is parsed as its own JSON object. The context manager
# guarantees the file handle is released, and iterating the handle directly
# avoids loading every line into memory at once.
with open("E:/Python/pythonProject_1/Sarcasm_Headlines/tmp/archive/Sarcasm_Headlines_Dataset.json", 'r', encoding='utf-8') as datastore:
    for items in datastore:
        if not items.strip():
            continue  # Skip blank lines, which json.loads would reject.
        dic = json.loads(items)
        sentences.append(dic['headline'])
        labels.append(dic['is_sarcastic'])
        urls.append(dic['article_link'])
main.py
# Tokenize the headlines and pad the resulting integer sequences to a common length.
# NOTE(review): `sentences` is not defined anywhere in main.py; presumably it is the
# list built in unzip_save.py and must be imported or otherwise made available first.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Hide log messages below level 2; set before TensorFlow is imported.
from tensorflow.keras.preprocessing.text import Tokenizer # Builds the word -> index vocabulary from text.
from tensorflow.keras.preprocessing.sequence import pad_sequences # Pads sequences to equal length.
tokenizer = Tokenizer(oov_token="<OOV>") # "<OOV>" is the placeholder token for words outside the vocabulary.
tokenizer.fit_on_texts(sentences) # Build the vocabulary from the headlines.
word_index = tokenizer.word_index
print(len(word_index)) # Vocabulary size.
print(word_index) # Full word -> index mapping.
senquences = tokenizer.texts_to_sequences(sentences) # Convert each headline into a list of word indices.
padded = pad_sequences(senquences, padding='post') # padding='post' appends the zeros after each sentence.
print(senquences[2])
print(padded[2])
print(padded.shape)
result
29657
{'<OOV>': 1, 'to': 2, 'of': 3, 'the': 4, 'in': 5, 'for': 6, 'a': 7, 'on': 8, 'and': 9, 'with': 10, 'is': 11, 'new': 12, 'trump': 13, 'man': 14, 'from': 15, 'at': 16, 'about': 17, 'you': 18, 'this': 19, 'by': 20, 'after': 21, 'up': 22, 'out': 23, 'be': 24, 'how': 25, 'as': 26, 'it': 27, 'that': 28, 'not':
...........................................................................................
...........................................................................................
, "writin'": 29647, "'easy": 29648, 'drywall': 29649, 'blowhole': 29650, "zimbabwe's": 29651, 'gonzalez': 29652, 'breached': 29653, "'basic'": 29654, 'hikes': 29655, 'gourmet': 29656, 'foodie': 29657}
[145, 838, 2, 907, 1749, 2093, 582, 4719, 221, 143, 39, 46, 2, 10736]
[ 145 838 2 907 1749 2093 582 4719 221 143 39 46
2 10736 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0]
(26709, 40)
数据链接:https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection