import re

import requests
import xlwt
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
    # Use your own browser cookie here; session tokens like this one expire.
    'Cookie': 'x-wl-uid=1DVw4k4T/jAduWIfwW2jvf029Ha4Bgv/AJGjP/yRfJTdq26dr7oDdeEBdb6zOPUl0ByfsaKJ3GUY=; session-id-time=2082729601l; session-id=457-7649276-4174543; csm-hit=tb:DAHATSQRZZBWHWD4ZXYP+s-T61YJHRDEC6Y6S2VMTVZ|1573355007668&t:1573355007668&adb:adblk_no; ubid-acbcn=459-2457809-1906210; session-token="4sZGQQPKw9CJUOzJFLsTdS3FtlpqIyp0hyvhXL6RMOchbDf7p7YLDEL90YFps2Hl80fBT6uPmzQ00meCLYxsrjuoabX3+kz7OB+CLw8GaAYZB8J9oBBcJLBUsGs6LLm/EHQht5Tm0IpOKR0hz0GGtATgcpJXDfRoEdvNol+CUc3mXOMA5KmEfFWstdV+KwyzSGrGW+DdrAftisgZMl2stffIdhcOLh53B4tJwsR5awKqPrOqZF8uJg=="; lc-acbcn=zh_CN; i18n-prefs=CNY'
}
all_info_list = []

def parse_page(url):
    response = requests.get(url=url, headers=headers)
    html = etree.HTML(response.text)
    reviews = html.xpath('//div[@id="cm_cr-review_list"]/div')
    for review in reviews:
        ratings = review.xpath('.//span[@class="a-icon-alt"]/text()')
        # Join the text nodes so each Excel cell later receives a plain string.
        body = ''.join(review.xpath('.//span[@data-hook="review-body"]/span/text()'))
        date = ''.join(review.xpath('.//span[@data-hook="review-date"]/text()'))
        color = ''.join(review.xpath('.//a[@data-hook="format-strip"]/text()'))
        for rating in ratings:
            score = re.sub('颗星', '', rating)  # strip the "颗星" (stars) suffix
            record = {'评论': body, '颜色和尺寸': color, '评分': score, '日期': date}
            print(record)
            all_info_list.append([body, color, score, date])
def main():
    # Review pages are numbered from 1; range(10) would request a page 0.
    for page in range(1, 11):
        url = 'https://www.amazon.com/product-reviews/B07XGK5QXD/?pageNumber=' + str(page)
        parse_page(url)

if __name__ == '__main__':
    main()
book = xlwt.Workbook(encoding='utf-8')
sheet = book.add_sheet('sheet1')
head = ['评论', '颜色和尺寸', '评分', '日期']  # review, colour/size, rating, date
for h in range(len(head)):
    sheet.write(0, h, head[h])
i = 1
for row in all_info_list:
    for j, data in enumerate(row):
        sheet.write(i, j, data)
    i += 1
book.save('评论信息.xls')  # xlwt writes the legacy .xls format, so don't name the file .xlsx
print('done')
If you want to scrape other fields as well, just add the corresponding XPath expressions yourself!
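For example, to also grab each review's title, one more XPath inside the for review in reviews: loop would do; the data-hook="review-title" selector here is an assumption about Amazon's markup, not something verified above, so check it against the page source:
# Hypothetical extra field: the review title (selector is an assumption).
title = review.xpath('.//a[@data-hook="review-title"]/span/text()')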
import pandas as pd

data_path = '8-8026.xlsx'
df = pd.read_excel(data_path)  # recent pandas versions dropped read_excel's encoding argument
print(len(df))
print(type(df))
df = df.dropna()  # drop rows with missing values
print('After dropping missing values:')
print(len(df))
print(type(df))
# Keep only the distinct review texts in the first column.
df = pd.DataFrame(df.iloc[:, 0].unique())
print(len(df))
def str_unique(raw_str, reverse=False):
    """
    Collapse repeated characters, e.g. turn '我喜欢喜欢喜欢该商品'
    into '我喜欢该商品' by dropping the duplicated '喜欢'.
    :param raw_str: input string
    :param reverse: process the string from back to front
    :return: the compressed string
    """
    if reverse:
        raw_str = raw_str[::-1]
    res_str = ''
    for ch in raw_str:
        if ch not in res_str:
            res_str += ch
    if reverse:
        res_str = res_str[::-1]
    return res_str
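A quick check of what the compression does, using the docstring's example:
# Every character is kept only on its first occurrence.
print(str_unique('我喜欢喜欢喜欢该商品'))  # -> 我喜欢该商品
Since the forward pass already leaves every character unique, the reverse=True pass applied below acts mostly as a symmetric safety net.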
ser1 = df.iloc[:, 0].apply(str_unique)                      # forward pass
df2 = pd.DataFrame(ser1.apply(str_unique, reverse=True))    # backward pass
print('After mechanical compression:')
print(len(df2))
print(type(df2))
print('------------------')
# Drop reviews shorter than 4 characters; they carry too little signal.
df3 = df2[df2.iloc[:, 0].apply(len) >= 4]
print('After short-sentence filtering:')
print(len(df3))
print('------------------')
from snownlp import SnowNLP

# SnowNLP scores each text in [0, 1]; closer to 1 means more positive.
coms = df3.iloc[:, 0].apply(lambda x: SnowNLP(x).sentiments)
positive_df = df3[coms >= 0.9]
negative_df = df3[coms < 0.1]
print('Positive reviews')
print(positive_df)
print('Negative reviews')
print(negative_df)
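The segmentation step below reads comments_post.txt and comments_neg.txt, but nothing above writes them; a minimal bridge, assuming one review per line:
# Save the filtered reviews to the files the jieba step expects.
positive_df.to_csv('comments_post.txt', index=False, header=False, encoding='utf-8')
negative_df.to_csv('comments_neg.txt', index=False, header=False, encoding='utf-8')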
import jieba

data1 = pd.read_csv('comments_post.txt', encoding='utf-8', header=None)
data2 = pd.read_csv('comments_neg.txt', encoding='utf-8', header=None)
mycut = lambda s: ' '.join(jieba.cut(s))  # segment a sentence into space-separated words
data1 = data1[0].apply(mycut)
data2 = data2[0].apply(mycut)
data1.to_csv('好评原因.txt', index=False, header=False, encoding='utf_8_sig')
data2.to_csv('差评原因.txt', index=False, header=False, encoding='utf_8_sig')
print(data2)
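jieba.cut returns a generator of tokens that mycut joins back with spaces; for example:
print(' '.join(jieba.cut('这个商品质量很好')))  # roughly: 这个 商品 质量 很 好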
from gensim import corpora, models

# on_bad_lines='skip' replaces the deprecated error_bad_lines=False in recent pandas.
post = pd.read_csv('好评原因.txt', encoding='utf-8', header=None, on_bad_lines='skip')
neg = pd.read_csv('差评原因.txt', encoding='utf-8', header=None, on_bad_lines='skip')
# 'tipdm' never occurs in the stopword file, so each line is read as one whole field.
stop = pd.read_csv('stopwords.txt', header=None, sep='tipdm', engine='python')
stop = [' ', ''] + list(stop[0])
post[1] = post[0].apply(lambda s: s.split(' '))                       # tokenise on spaces
post[2] = post[1].apply(lambda x: [i for i in x if i not in stop])    # drop stopwords
neg[1] = neg[0].apply(lambda s: s.split(' '))
neg[2] = neg[1].apply(lambda x: [i for i in x if i not in stop])
'''Positive topic analysis'''
post_dict = corpora.Dictionary(post[2])
post_corpus = [post_dict.doc2bow(i) for i in post[2]]
post_lda = models.LdaModel(post_corpus, num_topics=3, id2word=post_dict)
for i in range(3):
    print('Positive topic', post_lda.print_topic(i))

'''Negative topic analysis'''
neg_dict = corpora.Dictionary(neg[2])
neg_corpus = [neg_dict.doc2bow(i) for i in neg[2]]
neg_lda = models.LdaModel(neg_corpus, num_topics=3, id2word=neg_dict)
for i in range(3):
    print('Negative topic', neg_lda.print_topic(i))