xpath练习_适合练习xpath的网址-CSDN博客

本文链接：https://blog.csdn.net/hmh4640219/article/details/114896135

买家秀

http://www.tbqq.net/
爬取图片与名字

import requests
import os
from lxml import etree
from urllib import request
import threading

def maijiaxiu(url,page):
    """Download the images listed on one forum page into ``maijiaxiu{page}``.

    Fetches ``url``, selects every ``<li class="deanactions fadeInUp">``
    element, and saves each entry's image under the entry's displayed name.
    The request headers are read from the module-level ``headers`` name.

    Args:
        url: Address of the listing page to scrape.
        page: Page number; used for the output directory name and the
            completion message.

    Raises:
        requests.HTTPError: If an image request returns an error status.
    """
    # Local import so this function does not depend on file-level changes.
    from urllib.parse import urlsplit

    res = requests.get(url=url, headers=headers)
    tree = etree.HTML(res.text)
    li_list = tree.xpath('//li[@class="deanactions fadeInUp"]')

    path = f'maijiaxiu{page}'
    os.makedirs(path, exist_ok=True)

    for li in li_list:
        srcs = li.xpath('./div[@class="deanmadoupic"]//img/@src')
        # Ignore whitespace-only text nodes so the first real label is used.
        names = [
            text.strip()
            for text in li.xpath('.//div[@class="deanmadouname"]//text()')
            if text.strip()
        ]
        if not srcs or not names:
            # Entry without an image or a name: skip it instead of raising
            # IndexError and aborting the rest of the page.
            continue

        # The scraped src is relative, e.g.
        # forum.php?mod=image&aid=8329&size=280x350&key=52eae99ad14ec1b8
        src = 'http://www.tbqq.net/' + srcs[0]
        # The name comes from scraped HTML; keep path separators out of it.
        name = names[0].replace('/', '_').replace('\\', '_')

        # The scraped src carries no file extension. Requesting it resolves
        # to a final URL that does, so take the suffix from the path of the
        # final URL. The same response already holds the image bytes, so
        # write them out directly rather than downloading a second time.
        img_res = requests.get(url=src, headers=headers)
        img_res.raise_for_status()
        suffix = os.path.splitext(urlsplit(img_res.url).path)[1]
        with open(f'{path}/{name}{suffix}', 'wb') as img_file:
            img_file.write(img_res.content)
    print(f'第{page}页下载成功')
if __name__ == '__main__':
    # Browser-like request headers; maijiaxiu() reads this module-level name.
    headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    # Crawl listing pages 1 through 4, one thread per page.
    for page_no in range(1, 5):
        page_url = f'http://www.tbqq.net/forum.php?mod=forumdisplay&fid=2&sortid=2&sortid=2&page={page_no}'
        worker = threading.Thread(target=maijiaxiu, args=(page_url, page_no))
        worker.start()

斗图网

http://www.bbsnet.com/
同样是爬取图片与名字

import requests
from urllib import request
import os
from lxml import etree
import threading

def doutu(url,page):
    """Download the images listed on one page into ``doutu{page}``.

    Fetches ``url``, selects every ``<li class="post box row fixed-hight">``
    element, and saves the image inside its ``<a class="zoom">`` link, named
    after the image's ``alt`` text plus the extension found in its ``src``.
    The request headers are read from the module-level ``headers`` name.

    Args:
        url: Address of the listing page to scrape.
        page: Page number; used for the output directory name and the
            completion message.

    Raises:
        requests.HTTPError: If an image request returns an error status.
    """
    # Local import so this function does not depend on file-level changes.
    from urllib.parse import urlsplit

    res = requests.get(url=url, headers=headers)
    tree = etree.HTML(res.text)
    li_list = tree.xpath('//li[@class="post box row fixed-hight"]')

    path = f'doutu{page}'
    os.makedirs(path, exist_ok=True)

    for li in li_list:
        srcs = li.xpath('.//a[@class="zoom"]/img/@src')
        alts = li.xpath('.//a[@class="zoom"]/img/@alt')
        if not srcs or not alts:
            # Entry without a zoom image: skip it instead of raising
            # IndexError and aborting the rest of the page.
            continue

        src = srcs[0]
        # The alt text comes from scraped HTML; keep path separators out of
        # the file name.
        alt = alts[0].strip().replace('/', '_').replace('\\', '_')
        # Take the extension from the URL path only, so a query string
        # cannot end up in the file name.
        suffix = os.path.splitext(urlsplit(src).path)[1]
        name = alt + suffix

        # Fetch the image with the same headers as the page request.
        img_res = requests.get(url=src, headers=headers)
        img_res.raise_for_status()
        with open(f'{path}/{name}', 'wb') as img_file:
            img_file.write(img_res.content)
    print(f'第{page}页下载完成')


if __name__ == '__main__':
    # Browser-like request headers; doutu() reads this module-level name.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    # Crawl listing pages 1 through 5, one thread per page.
    for page_no in range(1, 6):
        page_url = f'http://www.bbsnet.com/page/{page_no}'
        worker = threading.Thread(target=doutu, args=(page_url, page_no))
        worker.start()