process_data

This code implements the data preprocessing for a deep-learning project that uses both text and image data. It covers stop-word removal, text cleaning, word segmentation, and word-vector representation for the text, as well as resizing, center-cropping, and normalization for the images. It also handles splitting and loading the dataset and building the word-embedding matrix.
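
The text side of this pipeline reduces to three steps implemented by functions in the script below: strip punctuation with clean_str_sst, segment with jieba.cut_for_search, and drop tokens found in the stop-word list. A minimal sketch of that chain, assuming the script's functions are available and ../data/weibo/stop_words.txt exists (the sample sentence is illustrative, not taken from the dataset):

stop_words = stopwordslist()        # stop-word dictionary loaded from ../data/weibo/stop_words.txt
raw = "转发微博:【测试】这是一条示例微博。"   # illustrative input
cleaned = clean_str_sst(raw)        # strip punctuation and lowercase
tokens = [w for w in jieba.cut_for_search(cleaned) if w not in stop_words]
post_text = " ".join(tokens)        # space-separated tokens, the form stored in the 'post_text' column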

# encoding=utf-8

#import cPickle as pickle
import _pickle as cPickle
import random
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import os
from collections import defaultdict
import sys, re
import pandas as pd
from PIL import Image
import math
from types import *
from gensim.models import Word2Vec
import jieba
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import os.path

def stopwordslist(filepath='../data/weibo/stop_words.txt'):  # stop-word list
    stopwords = {}
    for line in open(filepath, 'r', encoding="utf-8").readlines():
        # line = unicode(line, "utf-8").strip()
        line = str(line).strip()
        stopwords[line] = 1
    # stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

def clean_str_sst(string):  # remove the characters listed in the regex below
    """
    Tokenization/string cleaning for the SST dataset
    """
    string = re.sub(u"[,。 :,.;|-“”——_/nbsp+&;@、《》~()())#O!:【】]", "", string)
    return string.strip().lower()  # strip leading/trailing whitespace and lowercase
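
# Illustrative example (not from the original post): given the character class above,
# clean_str_sst(u"【测试】,转发微博。") should return u"测试转发微博".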

# The Python 2 idiom below is unnecessary (and unavailable) in Python 3, where str is
# already Unicode:
# reload(sys)
# sys.setdefaultencoding("utf-8")

def read_image():  # store the images in a dictionary
    image_list = {}
    file_list = ['../data/weibo/nonrumor_images/', '../data/weibo/rumor_images/']
    for path in file_list:
        data_transforms = transforms.Compose([
            transforms.Resize(256),        # resize the shorter side to 256 pixels
            transforms.CenterCrop(224),    # center crop to 224x224
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # ImageNet mean/std; speeds up convergence
        ])

        for i, filename in enumerate(os.listdir(path)):  # assuming gif

            # print(filename)
            try:
                im = Image.open(path + filename).convert('RGB')
                im = data_transforms(im)
                # im = 1
                image_list[filename.split('/')[-1].split(".")[0].lower()] = im
            except:
                print(filename)
    print("image length " + str(len(image_list)))
    # print("image names are " + str(image_list.keys()))
    return image_list
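
# image_list maps each lowercased file stem (e.g. "abc123" for abc123.jpg) to a
# normalized 3x224x224 float tensor; files that cannot be opened are skipped and printed.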

def write_txt(data):  # write to a txt file (not really used)
    f = open("../data/weibo/top_n_data.txt", 'w', encoding="utf-8")
    for line in data:
        for l in line:
            f.write(l + "\n")
        f.write("\n")
    f.write("\n")
    f.close()

text_dict = {}

def write_data(flag, image, text_only):  # read the text data

    def read_post(flag):
        stop_words = stopwordslist()
        pre_path = "../data/weibo/tweets/"
        file_list = [pre_path + "test_nonrumor.txt", pre_path + "test_rumor.txt", \
                     pre_path + "train_nonrumor.txt", pre_path + "train_rumor.txt"]
        if flag == "train":
            id = cPickle.load(open("../data/weibo/train_id.pickle", 'rb'))  # load the pickle file into a Python object
        elif flag == "validate":
            id = cPickle.load(open("../data/weibo/validate_id.pickle", 'rb'))
        elif flag == "test":
            id = cPickle.load(open("../data/weibo/test_id.pickle", 'rb'))

        post_content = []
        labels = []
        image_ids = []
        twitter_ids = []
        data = []
        column = ['post_id', 'image_id', 'original_post', 'post_text', 'label', 'event_label']
        key = -1
        map_id = {}
        top_data = []
        for k, f in enumerate(file_list):

            f = open(f, 'rb')
            if (k + 1) % 2 == 1:
                label = 0  # real is 0: test_nonrumor.txt and train_nonrumor.txt
            else:
                label = 1  # fake is 1: test_rumor.txt and train_rumor.txt

            twitter_id = 0
            line_data = []
            top_line_data = []

            for i, l in enumerate(f.readlines()):  # each line of the txt file
                # key += 1

                # if int(key /3) in index:
                # print(key/3)
                # continue

                # every three lines form one record
                if (i + 1) % 3 == 1:  # line 1: take the first field, the post id
                    line_data = []
                    twitter_id = l.decode().split('|')[0]
                    line_data.append(twitter_id)

                if (i + 1) % 3 == 2:  # line 2: image URLs (decode the bytes first)
                    line_data.append(l.decode().lower())

                if (i + 1) % 3 == 0:  # line 3: the post content
                    l = clean_str_sst(l.decode())  # decode, then strip the unwanted characters

                    seg_list = jieba.cut_for_search(l)  # Chinese word segmentation (search-engine mode)
                    new_seg_list = []
                    for word in seg_list:
                        if word not in stop_words:
                            new_seg_list.append(word)

                    clean_l = " ".join(new_seg_list)  # space-separated tokens, stop words removed
                    if len(clean_l) > 10 and line_data[0] in id:  # id: mapping loaded from the pickle file
                        post_content.append(l)
                        line_data.append(l)
                        line_data.append(clean_l)
                        line_data.append(label)
                        event = int(id[line_data[0]])
                        if event not in map_id:
                            map_id[event] = len(map_id)
                            event = map_id[event]
                        else:
                            event = map_id[event]

                        line_data.append(event)

                        data.append(line_data)

            f.close()
            # print(data)
            # return post_content

        data_df = pd.DataFrame(np.array(data), columns=column)
        write_txt(top_data)

        return post_content, data_df

    post_content, post = read_post(flag)
    print("Original post length is " + str(len(post_content)))
    print("Original data frame is " + str(post.shape))


    def find_most(db):
        maxcount = max(len(v) for v in db.values())
        return [k for k, v in db.items() if len(v) == maxcount]  # keys whose value lists are longest

    def select(train, selec_indices):
        temp = []
        for i in range(len(train)):
            ele = list(train[i])
            temp.append([ele[i] for i in selec_indices])
            # temp.append(np.array(train[i])[selec_indices])
        return temp

    def balance_event(data, event_list):
        id = find_most(event_list)[0]
        remove_indice = random.sample(range(min(event_list[id]), \
                                            max(event_list[id])), int(len(event_list[id]) * 0.9))
        select_indices = np.delete(range(len(data[0])), remove_indice)
        return select(data, select_indices)
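
    # balance_event (together with find_most and select) down-samples the most frequent
    # event by removing roughly 90% of its samples; it does not appear to be called
    # anywhere in this script.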

    def paired(text_only=False):
        ordered_image = []
        ordered_text = []
        ordered_post = []
        ordered_event = []
        label = []
        post_id = []
        image_id_list = []
        # image = []

        image_id = ""
        for i, id in enumerate(post['post_id']):
            for image_id in post.iloc[i]['image_id'].split('|'):
                image_id = image_id.split("/")[-1].split(".")[0]
                if image_id in image:
                    break

            if text_only or image_id in image:
                if not text_only:
                    image_name = image_id
                    image_id_list.append(image_name)
                    ordered_image.append(image[image_name])
                ordered_text.append(post.iloc[i]['original_post'])
                ordered_post.append(post.iloc[i]['post_text'])
                ordered_event.append(post.iloc[i]['event_label'])
                post_id.append(id)

                label.append(post.iloc[i]['label'])

        label = np.array(label, dtype=np.int_)
        ordered_event = np.array(ordered_event, dtype=np.int_)

        print("Label number is " + str(len(label)))
        print("Rumor number is " + str(sum(label)))
        print("Non rumor is " + str(len(label) - sum(label)))

        if flag == "test":
            y = np.zeros(len(ordered_post))
        else:
            y = []

        data = {"post_text": np.array(ordered_post),
                "original_post": np.array(ordered_text),
                "image": ordered_image, "social_feature": [],
                "label": np.array(label), \
                "event_label": ordered_event, "post_id": np.array(post_id),
                "image_id": image_id_list}
        # print(data['image'][0])

        print("data size is " + str(len(data["post_text"])))

        return data

    paired_data = paired(text_only)

    print("paired post length is " + str(len(paired_data["post_text"])))
    print("paired data has " + str(len(paired_data)) + " dimension")
    return paired_data

def load_data(train, validate, test):
    vocab = defaultdict(float)
    all_text = list(train['post_text']) + list(validate['post_text']) + list(test['post_text'])
    for sentence in all_text:
        for word in sentence:  # iterating a string yields characters, so this builds a character-level vocab
            vocab[word] += 1
    return vocab, all_text

def build_data_cv(data_folder, cv=10, clean_string=True):
    """
    Loads data and split into 10 folds.
    """
    revs = []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    vocab = defaultdict(float)
    with open(pos_file, "r", encoding="utf-8") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))  # note: clean_str is not defined in this file
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y": 1,
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)
    with open(neg_file, "r", encoding="utf-8") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y": 0,
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)
    return revs, vocab

def get_W(word_vecs, k=32):
    """
    Get word matrix. W[i] is the vector for word indexed by i
    """
    # vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(len(word_vecs) + 1, k), dtype='float32')
    W[0] = np.zeros(k, dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
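
# Illustrative example (assumed, not in the original script): with
#   word_vecs = {"新闻": vec1, "图片": vec2} and k = 32,
# get_W returns a (3, 32) matrix W whose row 0 is the all-zero padding vector, and
# word_idx_map == {"新闻": 1, "图片": 2}; a token list can then be encoded as
#   [word_idx_map.get(w, 0) for w in tokens]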

def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in range(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    word = b''.join(word).decode('utf-8', errors='ignore')
                    break
                if ch != b'\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs

def add_unknown_words(word_vecs, vocab, min_df=1, k=32):
    """
    For words that occur in at least min_df documents, create a separate word vector.
    0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)

def get_data(text_only):
    # text_only = False

    if text_only:
        print("Text only")
        image_list = []
    else:
        print("Text and image")
        image_list = read_image()

    train_data = write_data("train", image_list, text_only)
    validate_data = write_data("validate", image_list, text_only)
    test_data = write_data("test", image_list, text_only)

    print("loading data...")
    vocab, all_text = load_data(train_data, validate_data, test_data)
    print("number of sentences: " + str(len(all_text)))
    print("vocab size: " + str(len(vocab)))
    max_l = len(max(all_text, key=len))
    print("max sentence length: " + str(max_l))
    word_embedding_path = "../data/weibo/w2v.pickle"

    w2v = cPickle.load(open(word_embedding_path, 'rb'), encoding='bytes')
    print("word2vec loaded!")
    print("num words already in word2vec: " + str(len(w2v)))

    add_unknown_words(w2v, vocab)
    W, word_idx_map = get_W(w2v)
    # rand_vecs = {}
    # add_unknown_words(rand_vecs, vocab)
    W2 = rand_vecs = {}
    w_file = open("../data/weibo/word_embedding.pickle", "wb")
    cPickle.dump([W, W2, word_idx_map, vocab, max_l], w_file)
    w_file.close()
    return train_data, validate_data, test_data
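
# Minimal entry point (not part of the original script): run the full pipeline.
# It assumes the ../data/weibo directory layout and pickle files used above are in place.
if __name__ == "__main__":
    train, validate, test = get_data(text_only=False)
    print("train/validate/test sizes: "
          + str(len(train["post_text"])) + "/"
          + str(len(validate["post_text"])) + "/"
          + str(len(test["post_text"])))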