# 闲来随笔 (casual notes)
from urllib import request
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import jieba
import matplotlib. pyplot as plotShow
def replace(content):
    """Return *content* with every ASCII space character removed.

    Only the plain space ``' '`` is dropped; tabs, newlines and other
    whitespace are kept as they are.

    Args:
        content: A string, or any iterable whose items are strings.

    Returns:
        A ``str`` made of the items of *content*, in order, skipping every
        item equal to a single space.
    """
    # Join once instead of growing a string with ``+=`` in a loop, and
    # avoid shadowing the builtin ``str`` with a local accumulator.
    return "".join(ch for ch in content if ch != ' ')
def pythonpScrap1Camouflage():
    """Download the Douban Top 250 page and dump movie fields to ``wordCount.txt``.

    The request is sent with a Firefox ``User-Agent`` header. For every
    ``div.item`` element in the response the following pieces are written,
    back to back with no separators, to ``wordCount.txt`` (UTF-8, truncated
    on each run):

    * the link ``href``, the poster ``src`` and the text of the ``em`` tag
      found inside ``div.pic``;
    * the first child of the first three ``span`` tags inside ``div.hd``;
    * the first child of the first ``p`` inside ``div.bd``, with spaces
      removed;
    * the first child of the second ``span`` inside ``div.star``, with
      spaces removed;
    * the text of ``p.quote`` with spaces removed, when that element exists.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        AttributeError, IndexError, TypeError: If an item does not have the
            markup described above (other than a missing quote).
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
    }
    url = "https://movie.douban.com/top250?format=text"
    req = request.Request(url, headers=head)

    # Close the HTTP response as soon as the body has been read.
    with request.urlopen(req) as res:
        html = res.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    # ``with`` guarantees the file is flushed and closed even when an item
    # with unexpected markup raises part-way through the loop.
    with open('wordCount.txt', 'w+', encoding='utf-8') as fileWrite:
        for tag in soup.find_all("div", class_="item"):
            # Poster column: detail link, image URL and the <em> text.
            pic = tag.find("div", class_="pic")
            link = pic.find('a')
            fileWrite.write(link.get('href'))
            fileWrite.write(link.find("img")["src"])
            fileWrite.write(pic.find('em').get_text())

            info = tag.find("div", class_="info")

            # Header: first child of each of the first three <span> tags.
            title_spans = info.find("div", class_="hd").find_all("span")
            fileWrite.write(title_spans[0].contents[0])
            fileWrite.write(title_spans[1].contents[0] + title_spans[2].contents[0])

            # Body: first paragraph, second span of the star block, quote.
            body = info.find("div", class_="bd")
            paragraphs = body.find_all("p")
            fileWrite.write(replace(paragraphs[0].contents[0]))
            star_spans = body.find("div", class_="star").find_all("span")
            fileWrite.write(replace(star_spans[1].contents[0]))

            # ``find`` returns None when an item has no ``p.quote``; skip the
            # quote in that case instead of aborting the whole scrape.
            quote = body.find("p", class_="quote")
            if quote is not None:
                fileWrite.write(replace(quote.get_text()))
def pythonpScrap2wordcloud1():
    """Render a word cloud from ``wordCount.txt`` and display it.

    The text file is read as UTF-8, segmented with ``jieba.cut`` and the
    resulting tokens are joined with single spaces before being handed to
    ``WordCloud.generate``. The image is saved as ``wordClound.png`` in the
    current working directory and then shown in a matplotlib window titled
    ``"jay"`` with the axes hidden.

    Raises:
        FileNotFoundError: If ``wordCount.txt`` does not exist.
    """
    # Use a context manager so the input file handle is closed right after
    # reading instead of being left for the garbage collector.
    with open(r'wordCount.txt', "r", encoding='utf-8') as source:
        text = source.read()

    result = " ".join(jieba.cut(text))

    wc = WordCloud(
        # NOTE(review): Windows-only font path; presumably chosen because it
        # can render Chinese glyphs -- confirm before running elsewhere.
        font_path="C:/Windows/Fonts/STXINGKA.TTF",
        background_color='white',
        width=500,
        height=350,
        max_font_size=50,
        min_font_size=10,
        mode='RGBA',
        colormap='pink'
    )
    wc.generate(result)
    # The file name (including its spelling) is kept unchanged so existing
    # consumers of the image still find it.
    wc.to_file(r'wordClound.png')

    plotShow.figure("jay")
    plotShow.imshow(wc)
    plotShow.axis("off")
    plotShow.show()
if __name__ == "__main__":
    # Run the pipeline only when executed as a script: scrape the page into
    # wordCount.txt first, then build the word cloud from that file.
    pythonpScrap1Camouflage()
    pythonpScrap2wordcloud1()