# 买家秀 (buyer-show) image scraper
# Target site: http://www.tbqq.net/
# 爬取图片与名字 — scrapes each listing's image and its name
import requests
import os
from lxml import etree
from urllib import request
import threading
def maijiaxiu(url, page):
    """Download all listing images and names from one page of tbqq.net.

    Args:
        url: Listing-page URL for the given page number.
        page: 1-based page index; used to name the output directory.

    Side effects:
        Creates directory ``maijiaxiu{page}`` (if absent) and writes one
        image file per listing item. Reads the module-level ``headers``
        dict for the User-Agent.
    """
    res = requests.get(url=url, headers=headers)
    tree = etree.HTML(res.text)
    li_list = tree.xpath('//li[@class="deanactions fadeInUp"]')
    path = f'maijiaxiu{page}'
    # exist_ok avoids the check-then-create race when pages run in threads.
    os.makedirs(path, exist_ok=True)
    for li in li_list:
        # The img src is a relative link with no file extension, e.g.
        # forum.php?mod=image&aid=8329&size=280x350&key=52eae99ad14ec1b8
        src = 'http://www.tbqq.net/' + li.xpath('./div[@class="deanmadoupic"]//img/@src')[0]
        name = li.xpath('.//div[@class="deanmadouname"]//text()')[0]
        # NOTE(review): ``name`` comes straight from page text — if it can
        # contain '/' or other path characters the open() below fails; verify.
        # Single request: the final (post-redirect) URL carries the real
        # extension and the body is the image bytes. The original fetched
        # the image twice (once for the URL, again via urlretrieve) and the
        # second fetch sent no User-Agent.
        img_res = requests.get(url=src, headers=headers)
        suffix = os.path.splitext(img_res.url)[1]
        with open(f'{path}/{name}{suffix}', 'wb') as f:
            f.write(img_res.content)
    print(f'第{page}页下载成功')
if __name__ == '__main__':
    # Module-level headers: maijiaxiu() reads this name as a global.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    # Spawn one worker thread per listing page (pages 1 through 4).
    for page_no in range(1, 5):
        page_url = f'http://www.tbqq.net/forum.php?mod=forumdisplay&fid=2&sortid=2&sortid=2&page={page_no}'
        worker = threading.Thread(target=maijiaxiu, args=(page_url, page_no))
        worker.start()
# 斗图网 (meme site) scraper
# Target site: http://www.bbsnet.com/
# 同样是爬取图片与名字 — likewise scrapes each image and its name
import requests
from urllib import request
import os
from lxml import etree
import threading
def doutu(url, page):
    """Download all meme images and names from one page of bbsnet.com.

    Args:
        url: Listing-page URL for the given page number.
        page: 1-based page index; used to name the output directory.

    Side effects:
        Creates directory ``doutu{page}`` (if absent) and writes one image
        file per listing item. Reads the module-level ``headers`` dict.
    """
    res = requests.get(url=url, headers=headers)
    tree = etree.HTML(res.text)
    li_list = tree.xpath('//li[@class="post box row fixed-hight"]')
    path = f'doutu{page}'
    # exist_ok avoids the check-then-create race when pages run in threads.
    os.makedirs(path, exist_ok=True)
    for li in li_list:
        src = li.xpath('.//a[@class="zoom"]/img/@src')[0]
        alt = li.xpath('.//a[@class="zoom"]/img/@alt')[0]
        # Here src already ends with a real extension, so take it directly.
        suffix = os.path.splitext(src)[1]
        name = alt + suffix
        # Fix: the original urlretrieve sent no User-Agent even though the
        # script sets one for the listing page; download with the same
        # headers via requests instead.
        img_res = requests.get(url=src, headers=headers)
        with open(f'{path}/{name}', 'wb') as f:
            f.write(img_res.content)
    print(f'第{page}页下载完成')
if __name__ == '__main__':
    # Module-level headers: doutu() reads this name as a global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    # Spawn one worker thread per listing page (pages 1 through 5).
    for page_no in range(1, 6):
        page_url = f'http://www.bbsnet.com/page/{page_no}'
        worker = threading.Thread(target=doutu, args=(page_url, page_no))
        worker.start()