豆瓣 Top250 电影信息爬取并写入 MySQL 数据库

最新推荐文章于 2024-08-28 14:26:16 发布

m0_53460357

最新推荐文章于 2024-08-28 14:26:16 发布

阅读量392

点赞数 10

文章标签：爬虫 python 数据库

本文链接：https://blog.csdn.net/m0_53460357/article/details/135173485

版权

import requests
from lxml import etree
import pymysql

# Number of movies on one Douban Top250 page. The `start` query parameter is
# a movie offset, so page N begins at offset (N - 1) * PAGE_SIZE.
PAGE_SIZE = 25

# Browser-like User-Agent sent with every page request.
HEADERS = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
}

# Connection settings for the target MySQL database.
DB_CONFIG = {
    'host': '192.168.48.132',
    'port': 3306,
    'user': 'root',
    'password': '123456',
    'database': 'mysql_test',
    'charset': 'utf8',
}

# Parameterized insert: the driver escapes each value, so quotes inside a
# scraped title or summary can no longer break (or inject into) the statement.
INSERT_SQL = "INSERT into doubanmovie VALUES (%s,%s,%s,%s,%s,%s,%s)"


def get_first_text(nodes):
    """Return the first XPath result stripped of surrounding whitespace.

    Returns an empty string when the XPath matched nothing, so one missing
    field on a movie entry does not abort the whole crawl.
    """
    return nodes[0].strip() if nodes else ''


def page_offsets(startpag, endpag):
    """Return the `start` offsets for pages startpag..endpag.

    Pages are 1-based and the end page is included. Page numbers below 1 are
    clamped to 1; an end page smaller than the start page yields an empty list.
    """
    first_page = max(startpag, 1)
    return [(page - 1) * PAGE_SIZE for page in range(first_page, endpag + 1)]


def fetch_page(offset):
    """Download one listing page and return its movie <li> elements."""
    url = rf'https://movie.douban.com/top250?start={offset}&filter='
    # A timeout keeps one stalled request from hanging the crawl forever.
    res = requests.get(url=url, headers=HEADERS, timeout=10)
    html = etree.HTML(res.text)
    return html.xpath('//*[@id="content"]/div/div[1]/ol/li')


def parse_movie(li):
    """Extract (title, link, credits line, score, comment count, summary) from one <li>."""
    title = get_first_text(li.xpath('./div/div[2]/div[1]/a/span[1]/text()'))
    herf = get_first_text(li.xpath("./div/div[2]/div[1]/a/@href"))
    p = get_first_text(li.xpath('./div/div[2]/div[2]/p[1]/text()'))
    score = get_first_text(li.xpath('./div/div[2]/div[2]/div/span[2]/text()'))
    comment = get_first_text(li.xpath('./div/div[2]/div[2]/div/span[4]/text()'))
    summary = get_first_text(li.xpath('./div/div[2]/div[2]/p[2]/span/text()'))
    return title, herf, p, score, comment, summary


def main():
    """Prompt for a page range, crawl those pages and insert every movie into MySQL."""
    startpag = int(input('请输入你的起始页数'))
    endpag = int(input("请输入你结束的页数"))

    count = 0
    # One connection for the whole run instead of one per movie.
    mydb = pymysql.connect(**DB_CONFIG)
    try:
        with mydb.cursor() as cur:
            for offset in page_offsets(startpag, endpag):
                for li in fetch_page(offset):
                    movie = parse_movie(li)
                    count += 1
                    print(*movie, f'爬取第{count}部....')
                    try:
                        # '0' is kept as the first column value, as before.
                        cur.execute(INSERT_SQL, ('0', *movie))
                        mydb.commit()
                    except pymysql.MySQLError as e:
                        # Best effort: report the failed row and keep crawling.
                        mydb.rollback()
                        print(f'写入数据库失败: {movie[0]} ({e})')
    finally:
        mydb.close()


if __name__ == "__main__":
    main()

#爬虫#python#mysql