# A Python crawler that scrapes all the images in a Baidu Tieba forum.
import urllib.request
import urllib.parse
from lxml import etree
import os


class tieba_img(object):
    def __init__(self, tieba_name, start_index, end_index):
        """
        Initialize the crawler.
        tieba_name: name of the Tieba forum to crawl
        start_index: first page to crawl
        end_index: last page to crawl
        """
        self.tieba_name = tieba_name
        self.start_index = start_index
        self.end_index = end_index
        self.url = "http://tieba.baidu.com/f?"
        # Send a browser User-Agent so Tieba does not reject the request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
        }
    def start(self):
        """
        Scheduler that kicks off the crawl.
        """
        print("Starting the crawl:")
        self.tieba_spider()

    def tieba_spider(self):
        """
        Build the URL of every listing page to crawl and hand each one off.
        """
        for page in range(self.start_index, self.end_index + 1):
            # Tieba's pn parameter counts posts, 50 per page, so page n
            # corresponds to pn = (n - 1) * 50.
            now_page = (page - 1) * 50
            # Percent-encode the forum name so non-ASCII names form a valid URL.
            full_name = self.url + "kw=" + urllib.parse.quote(self.tieba_name) + "&pn=" + str(now_page)
            self.load_page(full_name)
    def load_page(self, full_name):
        """
        Send an HTTP request for a listing page and extract the address of
        every thread on it.
        full_name: URL of the listing page to crawl
        """
        my_request = urllib.request.Request(full_name, headers=self.headers)
        html = urllib.request.urlopen(my_request)
        content = etree.HTML(html.read().decode("utf-8"))
        # "cleafix" (sic) is the class name Tieba actually uses.
        link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
        for link in link_list:
            full_link = "https://tieba.baidu.com" + link
            print(full_link)
            self.load_img(full_link)
    def load_img(self, full_link):
        """
        Send an HTTP request for a thread page, extract the address of every
        image in it, and append those addresses to a file.
        full_link: URL of the thread to crawl
        """
        my_request = urllib.request.Request(full_link, headers=self.headers)
        html = urllib.request.urlopen(my_request)
        content = etree.HTML(html.read().decode("utf-8"))
        link_list = content.xpath("//img[@class='BDE_Image']/@src")
        # Append each image URL to the output file.
        with open("d:xxxxxx.txt", "a") as f:
            for i in link_list:
                f.write(str(i) + "\n")
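
# The crawler above only records image URLs in a text file. Below is a
# minimal sketch of a downloader for that file; the function name, the
# output-directory handling, and the ".jpg" fallback are assumptions of
# this sketch, not part of the original script.
def download_images(list_file, out_dir):
    """Read image URLs from list_file and save each image into out_dir."""
    os.makedirs(out_dir, exist_ok=True)
    with open(list_file) as f:
        for n, url in enumerate(line.strip() for line in f):
            if not url:
                continue
            # Derive a file name from the URL, falling back to a counter.
            name = url.rsplit("/", 1)[-1] or str(n) + ".jpg"
            data = urllib.request.urlopen(url).read()
            with open(os.path.join(out_dir, name), "wb") as img:
                img.write(data)
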
if __name__ == '__main__':
    tieba_name = input("Which Tieba do you want to crawl: ")
    start_index = int(input("First page to crawl: "))
    end_index = int(input("Last page to crawl: "))
    my_tieba = tieba_img(tieba_name, start_index, end_index)
    my_tieba.start()
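    # Example use of the download_images sketch above (the "tieba_imgs"
    # directory is illustrative; "d:xxxxxx.txt" is the path load_img
    # appends to):
    # download_images("d:xxxxxx.txt", "tieba_imgs")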