Python爬虫(用xpath和bs4两种方法爬取)
爬取豆瓣Top250第一页上的电影标题、电影评分和评价人数
## 1.用xpath爬取网页
from lxml import etree
import requests
import csv
#获取网页源代码
def pares_page(url):
    """Download a Douban Top250 listing page and print every movie found on it.

    The page is parsed with lxml and three parallel lists are extracted via
    XPath: movie titles (the poster image's ``alt`` attribute), rating
    scores, and the number of people who rated (with the ``人评价`` suffix
    removed). Each list is printed, then one dict per movie is printed.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts with the keys ``'title'``, ``'score'`` and ``'num'``
        (all values are strings), in page order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    # NOTE(review): the Cookie below looks like a captured, account-specific
    # browser session. Presumably it should come from configuration rather
    # than source code -- confirm whether the request needs it at all.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'Cookie': 'll="118088"; bid=aM7RLYFbEsk; _vwo_uuid_v2=D1DC27FB8057246811F6BDE00F96C97D7|893c526630afcd7358975bfe19c09bc1; __gads=ID=ca9a0498524401aa-2296b43e28cf0073:T=1637238150:RT=1637238150:S=ALNI_MZ7xyFOBvnQUa0kWR_GchHjytkLRg; dbcl2="222359803:VVR4kCUBRI4"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.22235; __yadk_uid=k5F550ejP1LRsJVQ0Z80jkmfSKfsUucx; ck=6NaA; __utmc=30149280; __utmc=223695111; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639106319%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DA0CYe4BP_EkCZ1Cszo4V1ppMwodo2od0eZ9JQgpsLgyNyh944Y-JrNfcZP142gre%26wd%3D%26eqid%3D9812afa40000ece00000000361b2c6e7%22%5D; _pk_id.100001.4cf6=1b1ec4a63617978b.1637238125.6.1639106319.1638966630.; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1559656286.1637238120.1638966631.1639106319.7; __utmb=30149280.0.10.1639106319; __utmz=30149280.1639106319.7.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.542904913.1637238124.1638966631.1639106319.6; __utmb=223695111.0.10.1639106319; __utmz=223695111.1639106319.6.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of parsing an error page into
    # three empty lists.
    response.raise_for_status()
    text = response.content.decode('utf-8')
    html = etree.HTML(text)

    # Movie titles: xpath() returns a list of attribute values.
    title_name = list(html.xpath('//div[@class="pic"]/a/img/@alt'))
    print(title_name)

    # Rating scores: text() is required to get the node's text content.
    source_name = list(html.xpath('//div/span[@class="rating_num"]/text()'))
    print(source_name)

    # Rating counts: drop the trailing "人评价" so only the number remains.
    num_name = [
        raw.replace('人评价', '')
        for raw in html.xpath('//div[@class="star"]/span[4]/text()')
    ]
    print(num_name)

    # zip() stops at the shortest list, so a page where one field is missing
    # no longer aborts with IndexError part-way through the output.
    movies = []
    for title, score, num in zip(title_name, source_name, num_name):
        dic = {'title': title, 'score': score, 'num': num}
        print(dic)
        movies.append(dic)
    return movies
def main():
    """Scrape the Douban Top250 landing page and print the results."""
    pares_page('https://movie.douban.com/top250')


# Only run the scraper when executed as a script, not when imported.
if __name__ == '__main__':
    main()
总结:用xpath爬取网站时,提取标签内的文本要在路径末尾加上text(),提取属性值则用“@属性名”(例如上面的@alt)。
## 2.用bs4爬取网页
from bs4 import BeautifulSoup
import requests
import csv
#获取网页源代码
def pares_page(url):
    """Download a Douban Top250 listing page and print every movie found on it.

    The page is parsed with BeautifulSoup (lxml parser) and three parallel
    lists are extracted with CSS selectors: movie titles (the poster image's
    ``alt`` attribute), rating scores, and the number of people who rated
    (with the ``人评价`` suffix removed). One dict per movie is printed.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts with the keys ``'title'``, ``'score'`` and ``'num'``
        (all values are strings), in page order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    # NOTE(review): the Cookie below looks like a captured, account-specific
    # browser session. Presumably it should come from configuration rather
    # than source code -- confirm whether the request needs it at all.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'Cookie': 'll="118088"; bid=aM7RLYFbEsk; _vwo_uuid_v2=D1DC27FB8057246811F6BDE00F96C97D7|893c526630afcd7358975bfe19c09bc1; __gads=ID=ca9a0498524401aa-2296b43e28cf0073:T=1637238150:RT=1637238150:S=ALNI_MZ7xyFOBvnQUa0kWR_GchHjytkLRg; dbcl2="222359803:VVR4kCUBRI4"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.22235; __yadk_uid=k5F550ejP1LRsJVQ0Z80jkmfSKfsUucx; ck=6NaA; __utmc=30149280; __utmc=223695111; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639106319%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DA0CYe4BP_EkCZ1Cszo4V1ppMwodo2od0eZ9JQgpsLgyNyh944Y-JrNfcZP142gre%26wd%3D%26eqid%3D9812afa40000ece00000000361b2c6e7%22%5D; _pk_id.100001.4cf6=1b1ec4a63617978b.1637238125.6.1639106319.1638966630.; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1559656286.1637238120.1638966631.1639106319.7; __utmb=30149280.0.10.1639106319; __utmz=30149280.1639106319.7.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.542904913.1637238124.1638966631.1639106319.6; __utmb=223695111.0.10.1639106319; __utmz=223695111.1639106319.6.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of parsing an error page into
    # three empty lists.
    response.raise_for_status()
    tex = response.content.decode('utf-8')

    # Build the BeautifulSoup tree using the lxml parser.
    soup = BeautifulSoup(tex, 'lxml')

    # Movie titles come from the poster image's alt attribute.
    title_name = [img['alt'] for img in soup.select('div[class="pic"]>a>img')]

    # Rating scores. get_text() always yields a plain str, whereas .string
    # is None when the tag does not hold exactly one text node.
    source_name = [
        tag.get_text() for tag in soup.select('span[class="rating_num"]')
    ]

    # Rating counts: drop the trailing "人评价" so only the number remains.
    num_name = [
        tag.get_text().replace('人评价', '')
        for tag in soup.select('div[class=star]>span:nth-child(4)')
    ]

    # Combine the three lists into one dict per movie. zip() stops at the
    # shortest list, so a missing field no longer aborts with IndexError.
    movies = []
    for title, score, num in zip(title_name, source_name, num_name):
        dic = {'title': title, 'score': score, 'num': num}
        print(dic)
        movies.append(dic)
    return movies
def main():
    """Scrape the Douban Top250 landing page and print the results."""
    pares_page('https://movie.douban.com/top250')


# Only run the scraper when executed as a script, not when imported.
if __name__ == '__main__':
    main()