Scrape the Douban Movie Top 250 and store it in a database
Tech used: requests, pymysql, lxml, and XPath
What to scrape
First, view the page source (right-click → View Page Source). You can see that every movie's information lives inside an li tag:
That means no packet-capture analysis is needed. The plan is simple: send a request → get the page source → extract with XPath → save the data.
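Before writing the full crawler, it's worth a quick sanity check that the XPath really matches one li per movie. A minimal sketch, reusing the same browser User-Agent as the full script below:

import requests
from lxml import etree

# One Top 250 page should yield exactly 25 <li> movie entries.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36'}
resp = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers)
page = etree.HTML(resp.text)
print(len(page.xpath('//div[@id="wrapper"]//ol[1]/li')))  # expect 25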
Here is the full, heavily commented source code: 👇👇
import requests
from lxml import etree
import pymysql

def zhu():
    # Connect to the database.
    # host: address  port: port  user: username  password: password
    # db: database name  charset: encoding
    db = pymysql.connect(host="localhost", port=3306, user="root",
                         password="123456", db="douban", charset="utf8")
    cursor = db.cursor()  # create a cursor
    # Build the page URLs. range(start, stop, step): stop is exclusive,
    # so range(0, 226, 25) yields 0, 25, ..., 225 -- the 10 pages of the Top 250.
    urls = ['https://movie.douban.com/top250?start=' + str(i) + '&filter=' for i in range(0, 226, 25)]
    # Pretend to be a browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36'
    }
    for url in urls:  # loop over the page URLs
        resp = requests.get(url, headers=headers)  # fetch the page with requests.get
        ye = etree.HTML(resp.text)  # parse the fetched page with lxml's etree module
        # Once parsed we can query with XPath: grab the <li> nodes, one per movie.
        infos = ye.xpath('//div[@id="wrapper"]//ol[1]/li')
        for info in infos:  # loop over the movies
            # XPath tutorial: https://www.runoob.com/xpath/xpath-tutorial.html
            num = info.xpath('.//div/div[1]/em/text()')[0]  # [0] takes the first list element
            # /text() gets a tag's text, /@title its title attribute, /@href its href attribute
            name = info.xpath('.//div/div[2]/div[1]/a/span[1]/text()')[0]
            pingfen = info.xpath('.//div/div[2]/div[2]/div/span[2]/text()')[0]
            mingju = info.xpath('.//div/div[2]/div[2]/p[2]/span/text()')  # may be empty, so check
            mingju = mingju[0] if mingju else "无"  # fall back to "无" (none) if missing
            print(num, name, pingfen, mingju)
            # Parameterized insert -- safer than string formatting, since a quote
            # character inside a movie's famous line would break the raw SQL.
            sql = 'insert into douban_data values (%s, %s, %s, %s)'
            cursor.execute(sql, (num, name, pingfen, mingju))
            db.commit()

zhu()
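The insert statement assumes a douban_data table with exactly four columns. The post never shows its schema, so here is one plausible definition (the column names and types are an assumption), created through pymysql so everything stays in Python:

import pymysql

db = pymysql.connect(host="localhost", port=3306, user="root",
                     password="123456", db="douban", charset="utf8")
cursor = db.cursor()
# Assumed schema -- adjust types and lengths to match your data.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS douban_data (
        num     VARCHAR(10),   -- rank
        name    VARCHAR(100),  -- movie title
        pingfen VARCHAR(10),   -- rating
        mingju  VARCHAR(255)   -- famous quote
    ) DEFAULT CHARSET=utf8
''')
db.commit()
db.close()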
As you can see, that's not every piece of information for each movie. Next, let's extract the complete information, which is a bit trickier.
Extended: scraping the complete information ↓
This time we scrape 8 fields: rank, movie title, director, actors, country, rating, number of raters, and famous quote.
The source also includes a (commented-out) way to save the data to CSV.
Here the scraped data needs some extra post-processing; without further ado, let's take a look.
Source code:
import csv  # needed if you enable the CSV output below
import pymysql
import requests
from lxml import etree

def zhu():
    # db = pymysql.connect(host="", port=3306, user="root",
    #                      password="123456", db="bo", charset="utf8")
    # cursor = db.cursor()
    # csvv = open('E://爬虫数据/豆瓣排行数据+演员.csv', 'w+', newline='', encoding='utf-8-sig')
    # writer = csv.writer(csvv)
    # CSV header row (Chinese field names kept as in the original):
    # writer.writerow(('排名', '电影名称', '导演', '演员', '国家', '评分', '评价人数', '名句'))
    # range's stop is exclusive: range(0, 250, 25) yields the 10 page offsets 0..225.
    urls = ['https://movie.douban.com/top250?start={}&filter='.format(i) for i in range(0, 250, 25)]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
    }
    for url in urls:
        resp = requests.get(url, headers=headers)
        ye = etree.HTML(resp.text)
        infos = ye.xpath('//div[@id="wrapper"]//ol[1]/li')
        for info in infos:
            # Director/actors/country all sit in one <p> as a blob of text;
            # string(.) flattens it, then we slice the blob apart.
            bd = info.xpath('.//div[@class="bd"]/p')[0].xpath('string(.)').strip()
            # Line 1 looks like "导演: X\xa0\xa0\xa0主演: Y"; the last \xa0 chunk holds the actors.
            yanyuan = bd.split('\n')[0].strip().split('\xa0')[-1].split(':')[-1]
            num = info.xpath('.//div[@class="pic"]/em/text()')[0]
            name = info.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]
            # The first \xa0 chunk of line 1 is "导演: X"; take what follows the colon.
            daoyan = bd.split('\n')[0].split('\xa0')[0].split(':')[1].strip()
            # Line 2 looks like "1994 / 美国 / 犯罪 剧情"; the second "/" field is the country.
            guojia = bd.replace('\xa0', '').split('\n')[1].split('/')[1].split()[0]
            pingfen = info.xpath('.//div[@class="star"]/span[2]/text()')[0]
            pingjiarenshu = info.xpath('.//div[@class="star"]/span[4]/text()')[0]
            mingju = info.xpath('.//div[@class="bd"]//p//span/text()')
            mingju = mingju[0] if mingju else "无"  # fall back to "无" if the quote is missing
            print(num, name, daoyan, yanyuan, guojia, pingfen, pingjiarenshu, mingju)
            # sql = 'insert into doub values (%s, %s, %s, %s, %s, %s, %s, %s)'
            # cursor.execute(sql, (num, name, daoyan, yanyuan, guojia, pingfen, pingjiarenshu, mingju))
            # db.commit()
            # writer.writerow((num, name, daoyan, yanyuan, guojia, pingfen, pingjiarenshu, mingju))

zhu()
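To see what those split chains are actually doing, take the flattened <p> text of one entry (the sample below mirrors the page's structure; the exact wording on the live page may differ) and slice it step by step:

# Illustrative flattened <p> text: line 1 holds director/actors, line 2 year/country/genre.
bd = ('导演: 弗兰克·德拉邦特 Frank Darabont\xa0\xa0\xa0主演: 蒂姆·罗宾斯 Tim Robbins /...\n'
      '        1994\xa0/\xa0美国\xa0/\xa0犯罪 剧情')

line1, line2 = bd.split('\n')
daoyan = line1.split('\xa0')[0].split(':')[1].strip()        # '弗兰克·德拉邦特 Frank Darabont'
yanyuan = line1.strip().split('\xa0')[-1].split(':')[-1]     # ' 蒂姆·罗宾斯 Tim Robbins /...'
guojia = line2.replace('\xa0', '').split('/')[1].split()[0]  # '美国'
print(daoyan, yanyuan, guojia)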
And that's a simple little crawler. Start with the easy stuff; next I'll write about scraping data from some other sites.