# 闲来随笔 (idle jottings)
from urllib import request
from bs4 import BeautifulSoup
def replace(content):
    """Return *content* with every space character (``' '``) removed.

    Only the plain space character is dropped; tabs, newlines and any
    other whitespace are kept unchanged.

    Args:
        content: A string (or any iterable yielding strings) to filter.

    Returns:
        A new ``str`` built from the items of *content* that are not
        equal to ``' '``. An empty input yields ``''``.
    """
    # str.join builds the result in one pass and avoids a local variable
    # named ``str`` that would shadow the builtin.
    return "".join(ch for ch in content if ch != " ")
if __name__ == "__main__":
    # Fetch the page at ``url`` and, for every <div class="item"> in it,
    # print: link, poster image URL, rank text, title, alternative titles,
    # the first text node of the first <p> in the "bd" block, the text of
    # the second <span> in the "star" block, and the quote text.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
    }
    url = "https://movie.douban.com/top250?format=text"
    req = request.Request(url, headers=head)
    # The context manager closes the HTTP response even if reading or
    # decoding raises.
    with request.urlopen(req) as res:
        html = res.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    for tag in soup.find_all("div", class_="item"):
        # --- "pic" block: link, poster and rank ---
        pic = tag.find("div", class_="pic")
        link = pic.find("a")
        print(link.get("href"))
        print(link.find("img")["src"])
        print(pic.find("em").get_text())

        info = tag.find("div", class_="info")

        # --- "hd" block: title spans ---
        spans = info.find("div", class_="hd").find_all("span")
        print(spans[0].contents[0])
        # Slice instead of indexing [1] and [2] directly so an item with
        # fewer than three spans prints what is available rather than
        # raising IndexError.
        print("".join(span.contents[0] for span in spans[1:3]))

        # --- "bd" block: description, rating and quote ---
        body = info.find("div", class_="bd")
        paragraphs = body.find_all("p")
        print(replace(paragraphs[0].contents[0]))
        star_spans = body.find("div", class_="star").find_all("span")
        print(replace(star_spans[1].contents[0]))
        # find() returns None when no matching element exists; print an
        # empty line in that case instead of failing on None.get_text().
        quote = body.find("p", class_="quote")
        print(replace(quote.get_text()) if quote is not None else "")