import re
import time

import pymysql
import requests
from lxml import etree
author = 'qewwc'

# Single MySQL connection shared by the whole script; rows are written through
# `cursor` and only become permanent once `conn.commit()` is called.
# NOTE(review): host and credentials are hard-coded -- consider moving them to
# configuration before running this anywhere but a local test database.
conn = pymysql.connect(
    host='localhost',
    user='root',
    passwd='root',
    db='test',
    charset="utf8",
    use_unicode=True,  # was the string "True"; a real boolean is what is meant
    port=3306,
)
cursor = conn.cursor()

# Browser-like User-Agent sent with every HTTP request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}

# Parameterized INSERT for one movie row; the eight placeholders are filled in
# the column order listed here.
sql_in = '''insert into doubanmovie (name,director,actor,style,country,release_time,time,score) value (%s,%s,%s,%s,%s,%s,%s,%s)'''
# Regular expressions for the fields that are read straight from the raw HTML.
# NOTE(review): in the previous revision these pattern strings had lost their
# HTML tags -- three were the bare '(.*?)', whose first match is always the
# empty string, and the country pattern was an unterminated literal.  The tags
# below are reconstructed on the assumption that the movie page marks these
# fields with `property="v:..."` spans and ends the country line with <br/>;
# confirm against a live page before relying on them.
GENRE_PATTERN = re.compile(r'<span property="v:genre">(.*?)</span>', re.S)
COUNTRY_PATTERN = re.compile(r'制片国家/地区:</span>\s*(.*?)<br\s*/?>', re.S)
RELEASE_PATTERN = re.compile(
    r'<span property="v:initialReleaseDate"[^>]*>(.*?)</span>', re.S)
RUNTIME_PATTERN = re.compile(
    r'<span property="v:runtime"[^>]*>(.*?)</span>', re.S)

# Seconds to wait for a response; without a timeout a stalled connection
# would block the scrape indefinitely.
REQUEST_TIMEOUT = 10


def _first_or_blank(items):
    """Return the first element of *items* as ``str``, or ``''`` if empty."""
    return str(items[0]) if items else ''


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, else ``''``."""
    match = pattern.search(text)
    return match.group(1) if match else ''


def get_movie_url(url):
    """Collect the link of every movie on one list page and scrape each one.

    Fetches *url*, takes the ``href`` of each ``<a>`` directly under a
    ``<div class="hd">`` and passes it to :func:`get_movie_info`.

    Args:
        url: Address of a list page.
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    selector = etree.HTML(response.text)
    for movie_href in selector.xpath('//div[@class="hd"]/a/@href'):
        get_movie_info(movie_href)


def get_movie_info(url):
    """Scrape one movie page and insert its details as a database row.

    Every field that cannot be found on the page is stored as an empty
    string, so a row is inserted even when the page is only partly parsed.
    The insert goes through the module-level ``cursor`` using ``sql_in``;
    committing is left to the caller.

    Args:
        url: Address of a movie detail page, e.g.
            ``https://movie.douban.com/subject/33967902/``.
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    page = response.text
    selector = etree.HTML(page)

    name = _first_or_blank(
        selector.xpath('//*[@id="content"]/h1/span[1]/text()'))
    director = _first_or_blank(
        selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()'))

    # The cast list is spread over nested elements, so take the flattened
    # text of the whole node instead of a single text() child.
    actor_nodes = selector.xpath('//*[@class="actor"]/span[2]')
    actor = str(actor_nodes[0].xpath('string(.)')) if actor_nodes else ''

    style = _first_group(GENRE_PATTERN, page)
    country = _first_group(COUNTRY_PATTERN, page)
    release_time = _first_group(RELEASE_PATTERN, page)
    # Named `runtime` rather than `time` so the `time` module is not shadowed.
    runtime = _first_group(RUNTIME_PATTERN, page)

    score = _first_or_blank(
        selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'))

    cursor.execute(
        sql_in,
        [name, director, actor, style, country, release_time, runtime, score],
    )
# NOTE(review): presumably a sample of the scraped release_time format: 2019-06-03(英国)
# List pages are addressed by a `start` offset: 0, 25, ..., 225 (ten pages).
urls = [
    'https://movie.douban.com/top250?start={}&filter='.format(i)
    for i in range(0, 250, 25)
]

for url in urls:
    get_movie_url(url)
    # Commit after every page so rows already scraped survive a failure on a
    # later request instead of being lost with the uncommitted transaction.
    conn.commit()
    # Pause between list pages to avoid hammering the site.
    time.sleep(5)

print('我好了!')
# Final commit kept from the original flow; a no-op when nothing is pending.
conn.commit()