根据文章标题删除重复文章:
eg:头脑风暴 —— 创新药投资及研发成功率的思考
头脑风暴丨创新药投资及研发成功率的思考
迈博斯生物与奕安济世生物药业合并成立Transcenta Holding,加速推进建设国际化整合型生物制药公司
官宣!迈博斯生物与奕安济世生物药业合并成立 Transcenta Holding,加速推进建设国际化整合型生物制药公司
In Vivo's Top Financing Of 2018: Cast Your Vote!
In Vivo's Top M&A Of 2018: Cast Your Vote!
In Vivo's Top Alliance Of 2018: Cast Your Vote!
import pymssql
import pandas as pd
import jieba
from gensim import corpora,models,similarities
import numpy as np
from pandas import DataFrame
### Remove duplicate articles ###
# Segment every title into words with jieba; the token lists are stored on
# the frame as a `title_cut` column and also kept as a plain list.
test['title_cut'] = test.DocTitle.apply(jieba.lcut)
title_cut = list(test['title_cut'])

# Build the token -> id dictionary, then encode each title as a
# bag-of-words vector.
dictionary = corpora.Dictionary(title_cut)
corpus = [dictionary.doc2bow(tokens) for tokens in title_cut]

# Fit TF-IDF on the titles. Applying the model to a corpus is lazy, so the
# transformed corpus is created once and reused for indexing and querying.
tfidf = models.TfidfModel(corpus)
weighted_corpus = tfidf[corpus]

# Index the weighted titles and query the index with those same titles, so
# `sim` holds the similarity of every title against every other title.
index = similarities.SparseMatrixSimilarity(
    weighted_corpus, num_features=len(dictionary)
)
sim = index[weighted_corpus]
# --- Drop near-duplicate articles, keeping the earliest one of each group ---
# `sim` is the square title-similarity matrix computed above: sim[i][j] scores
# the title at row position i of `test` against the title at row position j.


def _find_root(parent, node):
    """Return the representative position of ``node``'s duplicate group.

    ``parent`` is the union-find parent list; the path walked is compressed
    in place so later lookups are short.
    """
    root = node
    while parent[root] != root:
        root = parent[root]
    while parent[node] != root:
        parent[node], node = root, parent[node]
    return root


# (row, col) positions of every pair scoring above 0.7.  The diagonal (each
# title compared with itself) is removed with one boolean mask.
sim_dayu = np.argwhere(sim > 0.7)
similar_pairs = sim_dayu[sim_dayu[:, 0] != sim_dayu[:, 1]]

# Merge similar pairs into groups: titles linked through a chain of similar
# pairs end up in the same group.  The smaller position always becomes the
# representative, so the article that appears first in `test` is the one kept.
group_parent = list(range(sim.shape[0]))
for row, col in similar_pairs:
    root_row = _find_root(group_parent, int(row))
    root_col = _find_root(group_parent, int(col))
    if root_row != root_col:
        group_parent[max(root_row, root_col)] = min(root_row, root_col)

# Keep only the positions that are the representative of their own group.
kept_positions = [
    pos for pos in range(len(group_parent))
    if _find_root(group_parent, pos) == pos
]

# The matrix coordinates are row *positions*, not index labels, so select
# with iloc; this stays correct when `test` does not carry a default
# 0..n-1 index.  `test` is left untouched when no duplicates were found.
if len(kept_positions) < len(group_parent):
    test = test.iloc[kept_positions].copy()