Python 爬取豆瓣电影 Top250 并存储到 MySQL

import re
import time

import pymysql
import requests
from lxml import etree

# Author tag carried over from the original script.
author = 'qewwc'

# Module-level MySQL connection used for every insert in this script.
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment before running this anywhere but a local test database.
conn = pymysql.connect(
    host='localhost',
    user='root',
    passwd='root',
    db='test',
    charset="utf8",
    use_unicode=True,  # was the string "True"; a real boolean is what is meant
    port=3306,
)

# Single cursor reused for all INSERT statements.
cursor = conn.cursor()

# Browser-like User-Agent sent with every HTTP request.
headers = {'User-Agent' :'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

# Parameterized insert; the eight placeholders are filled in the column order
# listed here.
sql_in = '''insert into doubanmovie (name,director,actor,style,country,release_time,time,score) value (%s,%s,%s,%s,%s,%s,%s,%s)'''

# NOTE(review): the HTML tags inside these patterns were stripped from the
# pasted source (three of them had degenerated to a bare '(.*?)', which only
# ever matches the empty string). They are reconstructed here from the markup
# of a Douban movie detail page -- confirm against a live page before relying
# on them.
_GENRE_RE = re.compile(r'<span property="v:genre">(.*?)</span>', re.S)
_COUNTRY_RE = re.compile(r'<span class="pl">制片国家/地区:</span> (.*?)<br/>', re.S)
_RELEASE_RE = re.compile(
    r'<span property="v:initialReleaseDate" content="(.*?)">', re.S
)
_RUNTIME_RE = re.compile(r'<span property="v:runtime" content="(.*?)">', re.S)


def _first(items, default=''):
    """Return the first element of *items*, or *default* when it is empty."""
    return items[0] if items else default


def get_movie_url(url):
    """Scrape every movie linked from one list page.

    Downloads ``url`` with the module-level ``headers``, collects the
    ``href`` of each ``<div class="hd"><a>`` link, and passes each link to
    :func:`get_movie_info`.

    Args:
        url: Address of a list page.

    Returns:
        None.
    """
    html = requests.get(url=url, headers=headers)
    selector = etree.HTML(html.text)

    # Each link found here points at one movie's detail page.
    movie_hrefs = selector.xpath('//div[@class="hd"]/a/@href')
    for movie_href in movie_hrefs:
        get_movie_info(movie_href)


def get_movie_info(url):
    """Scrape one movie detail page and insert a row for it.

    Every field falls back to an empty string when it cannot be found on the
    page. The row is written through the module-level ``cursor`` using
    ``sql_in``; this function does not commit.

    Example detail page: https://movie.douban.com/subject/33967902/

    Args:
        url: Address of a movie detail page.

    Returns:
        None.
    """
    html = requests.get(url=url, headers=headers)
    page = html.text
    selector = etree.HTML(page)

    name = _first(selector.xpath('//*[@id="content"]/h1/span[1]/text()'))
    director = _first(
        selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
    )

    # The cast is spread over several child nodes; string(.) flattens the
    # whole element into one text value.
    actor_nodes = selector.xpath('//*[@class="actor"]/span[2]')
    actor = actor_nodes[0].xpath('string(.)') if actor_nodes else ''

    # Only the first match of each pattern is kept, as in the original.
    style = _first(_GENRE_RE.findall(page))
    country = _first(_COUNTRY_RE.findall(page))
    release_time = _first(_RELEASE_RE.findall(page))
    # Named ``runtime`` rather than ``time`` so the imported ``time`` module
    # is not shadowed; it still fills the ``time`` column.
    runtime = _first(_RUNTIME_RE.findall(page))

    score = _first(
        selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
    )

    # str() is applied to every value before it is handed to the driver,
    # exactly as the original did.
    cursor.execute(
        sql_in,
        [
            str(name),
            str(director),
            str(actor),
            str(style),
            str(country),
            str(release_time),
            str(runtime),
            str(score),
        ],
    )

#2019-06-03(英国)

# The ten list pages: start=0, 25, ..., 225.
urls = ['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0,250,25)]

try:
    for url in urls:
        get_movie_url(url)
        # Pause between list pages.
        # NOTE(review): indentation was lost in the pasted source; the sleep
        # is assumed to belong inside the loop -- confirm.
        time.sleep(5)

    print('我好了!')

    # All inserts are committed together once every page has been processed.
    conn.commit()
finally:
    # Release the database resources whether or not the scrape succeeded;
    # the original never closed them.
    cursor.close()
    conn.close()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值