猫眼电影(xpath)
- 目标
1、地址: 猫眼电影 - 榜单 - top100榜
2、目标: 电影名称、主演、上映时间
- 步骤
1、确定是否为静态页面(右键-查看网页源代码,搜索关键字确认)
2、写xpath表达式
3、写程序框架
- xpath表达式
1、基准xpath: 匹配所有电影信息的节点对象列表
//dl[@class="board-wrapper"]/dd # 先匹配大的节点对象
2、遍历对象列表,依次获取每个电影信息
for dd in dd_list:
电影名称 :dd.xpath('./a/@title')[0].strip() # 必须得加 . ,它代表当前的节点对象
电影主演 :dd.xpath('.//p[@class="star"]/text()')[0].strip()
上映时间 :dd.xpath('.//p[@class="releasetime"]/text()')[0].strip()
- 代码实现
import requests
from lxml import etree
import time
import random
class MaoyanSpider(object):
    """Scrape movie name, stars and release time from the Maoyan Top100 board."""

    def __init__(self):
        # {} is filled with the paging offset (0, 10, 20, 30)
        self.url = 'https://maoyan.com/board/4?offset={}'
        self.headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
        # page counter, used only for progress reporting
        self.page = 1

    def get_page(self, url):
        """Fetch one board page and hand the decoded HTML to the parser."""
        res = requests.get(url, headers=self.headers)
        res.encoding = 'utf-8'
        self.parse_page(res.text)

    def parse_page(self, html):
        """Extract name/star/time for every movie on the page and print each record."""
        parse_html = etree.HTML(html)
        # Base xpath: one <dd> node per movie on the board
        dd_list = parse_html.xpath('//dl[@class="board-wrapper"]/dd')
        for dd in dd_list:
            # Build a FRESH dict per movie.  The original reused a single dict
            # created before the loop; that works for printing but would make
            # every stored record alias the same object if results were ever
            # appended to a list.
            movie_dict = {
                'name': dd.xpath('.//p/a/@title')[0].strip(),
                'star': dd.xpath('.//p[@class="star"]/text()')[0].strip(),
                'time': dd.xpath('.//p[@class="releasetime"]/text()')[0].strip(),
            }
            print(movie_dict)

    def main(self):
        """Crawl offsets 0,10,20,30 with a random pause between requests."""
        for offset in range(0, 31, 10):
            url = self.url.format(str(offset))
            self.get_page(url)
            print('第%d页完成' % self.page)
            # random delay so request timing looks less bot-like
            time.sleep(random.randint(1, 3))
            self.page += 1
if __name__ == '__main__':
    # Entry point: run the crawler only when executed as a script.
    MaoyanSpider().main()
链家二手房案例(xpath)
- 实现步骤
1. 确定是否为静态
打开二手房页面 -> 查看网页源码 -> 搜索关键字
- xpath表达式
1、修改方法: 右键 -> copy xpath -> 测试修改
2、基准xpath表达式(匹配每个房源信息节点列表)
//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"] | //ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]
3、依次遍历每个房源信息节点时使用的xpath表达式
* 名称: .//a[@data-el="region"]/text()
* 总价: .//div[@class="totalPrice"]/span/text()
* 单价: .//div[@class="unitPrice"]/span/text()
- 代码实现
import requests
from lxml import etree
import time
class LianjiaSpider(object):
    """Scrape name, total price and unit price from Lianjia second-hand listings."""

    def __init__(self):
        # {} is filled with the 1-based page number
        self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'
        self.headers = {'User-Agent' : 'Mozilla/5.0'}

    def get_page(self, url):
        """Fetch one listing page and hand the decoded HTML to the parser."""
        res = requests.get(url, headers=self.headers, timeout=10)
        res.encoding = 'utf-8'
        self.parse_page(res.text)

    def parse_page(self, html):
        """Extract name/total price/unit price for each listing and print it."""
        parse_html = etree.HTML(html)
        # Base xpath: the site marks listing <li> nodes with two class
        # variants, so match either with the xpath union operator |
        li_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"] | //ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
        print(len(li_list))
        for li in li_list:
            # Fresh dict per listing.  The original reused one dict created
            # before the loop, which would alias all records if they were
            # ever collected instead of printed.
            house_dict = {
                'house_name': li.xpath('.//a[@data-el="region"]/text()')[0].strip(),
                'total_price': li.xpath('.//div[@class="totalPrice"]/span/text()')[0].strip(),
                'unit_price': li.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip(),
            }
            print(house_dict)

    def main(self):
        """Crawl pages 1-3, pausing briefly between requests."""
        for pg in range(1, 4):
            url = self.url.format(str(pg))
            self.get_page(url)
            print('第%d页爬取成功' % pg)
            time.sleep(0.5)
if __name__ == '__main__':
    # Entry point: run the crawler only when executed as a script.
    LianjiaSpider().main()
百度贴吧图片抓取
- 目标
抓取指定贴吧所有图片
注意:一般User-Agent要换成IE的
- 思路
1、获取贴吧主页URL,下一页,找到不同页的URL规律
2、获取1页中所有帖子URL地址: [帖子链接1,帖子链接2,...]
3、对每个帖子链接发请求,获取图片URL
4、向图片的URL发请求,以wb方式写入本地文件
# 向一个贴子链接发请求后--->提取图片链接,然后把图片保存到本地
- 实现步骤
- 贴吧URL规律
http://tieba.baidu.com/f?kw=??&pn=50
- xpath表达式
1、帖子链接xpath
//*[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a/@href
百度贴吧贴子地址
//*[@id="thread_list"]/li//div[@class='t_con cleafix']/div/div/div/a/@href
2、图片链接xpath
贴子里面图片地址
//div[@class='d_post_content_main d_post_content_firstfloor']//div[@class='d_post_content j_d_post_content ']/img/@src
3、视频链接xpath(百度贴吧视频抓取反爬机制)(对响应内容做处理)
//div[@class="video_src_wrapper"]/embed/@data-video
# 注意: 此处视频链接前端对响应内容做了处理,需要查看网页源代码来查看,复制HTML代码在线格式化
- 代码实现
import requests
from urllib import parse
from lxml import etree
class BaiduImgSpider(object):
    """Download all images (and embedded videos) from posts of a given Tieba forum."""

    def __init__(self):
        # {} is filled with url-encoded query parameters (kw=..&pn=..)
        self.url = 'http://tieba.baidu.com/f?{}'
        # IE-style User-Agent: per the notes above, Tieba serves simpler,
        # easier-to-parse markup to old IE browsers
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}

    def get_tlink(self, url):
        """Collect post links from one forum index page and process each post."""
        html = requests.get(url, headers=self.headers).text
        parse_html = etree.HTML(html)
        # NOTE: "cleafix" is a typo in Tieba's own markup — do not "fix" it
        tlink_list = parse_html.xpath(
            '//*[@id="thread_list"]/li//div[@class="t_con cleafix"]/div/div/div/a/@href')
        # tlink_list holds relative paths, e.g. ['/p/23234', '/p/9032323']
        for tlink in tlink_list:
            t_url = 'http://tieba.baidu.com' + tlink
            # extract and save every media link inside the post
            self.get_imglink(t_url)

    def get_imglink(self, t_url):
        """Extract image and video URLs from one post page and save each file."""
        res = requests.get(t_url, headers=self.headers)
        res.encoding = 'utf-8'
        parse_html = etree.HTML(res.text)
        # xpath union (|): first-floor images OR embedded video sources
        imglink_list = parse_html.xpath(
            '//div[@class="d_post_content_main d_post_content_firstfloor"]//div[@class="d_post_content j_d_post_content "]/img/@src | //div[@class="video_src_wrapper"]/embed/@data-video')
        for imglink in imglink_list:
            self.write_img(imglink)

    def write_img(self, imglink):
        """Download one media URL and save it under a name derived from the URL."""
        res = requests.get(imglink, headers=self.headers)
        # Derive the filename from the URL's last path component so it cannot
        # contain '/'.  The original sliced the raw URL (imglink[-10:]), which
        # breaks whenever a path separator falls inside the last 10 chars.
        filename = imglink.split('/')[-1][-10:]
        with open(filename, 'wb') as f:
            f.write(res.content)
        print('%s下载成功' % filename)

    def main(self):
        """Prompt for forum name and page range, then crawl each index page."""
        name = input('请输入贴吧名:')
        begin = int(input('请输入起始页:'))
        end = int(input('请输入终止页:'))
        for page in range(begin, end + 1):
            # url-encode the query string (kw may contain Chinese characters);
            # pn advances by 50 posts per page
            params = parse.urlencode({
                'kw': name,
                'pn': str((page - 1) * 50),
            })
            url = self.url.format(params)
            # fetch the page and download its media
            self.get_tlink(url)
if __name__ == '__main__':
    # Entry point: run the crawler only when executed as a script.
    BaiduImgSpider().main()
最后附上git地址(会不断更新自己写的爬虫代码):https://github.com/RyanLove1/spider_code.git