import os
import time
from urllib.parse import urljoin, urlsplit

import requests
from lxml import html
# Endpoint of the Tieba thread-list page; GetHtml requests it with the
# `kw` and `pn` query parameters.
url="https://tieba.baidu.com/f"
# XPath notes for thread links on the list page (getLINK uses the second one):
# //div[contains(@class,"threadlist_title pull_left")]/a/@href
# //div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div/a/@href
def GetHtml(keywords, pages, delay=3, timeout=10):
    """Fetch thread-list pages for a keyword and pass each one to getLINK.

    Args:
        keywords: Search keyword, sent as the ``kw`` query parameter.
        pages: Number of list pages to fetch, starting with the first one.
            Zero or a negative number fetches nothing.
        delay: Seconds to sleep before every request (default 3).
        timeout: Seconds to wait for the server before the request is
            abandoned (default 10). Without it a stalled connection would
            block the crawl indefinitely.

    Raises:
        requests.RequestException: If a list page cannot be fetched.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    for page_index in range(pages):
        # `pn` advances by 50 for each successive page.
        query = {'kw': keywords, 'pn': page_index * 50}
        time.sleep(delay)
        response = requests.get(url, params=query, headers=headers,
                                timeout=timeout)
        response.encoding = "utf8"
        getLINK(response.text)
def getLINK(File_name):
    """Extract thread links from a list page and hand each one to getIMG.

    Args:
        File_name: HTML text of a thread-list page. Despite the name, this
            is the page content itself, not a path to a file.
    """
    # Strip the comment delimiters so that markup wrapped in HTML comments
    # is parsed as ordinary elements.
    content = File_name.replace('<!--', '').replace('-->', '')
    if not content.strip():
        # Nothing to parse in an empty page.
        return
    tree = html.etree.HTML(content)
    hrefs = tree.xpath('//div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div/a/@href')
    for href in hrefs:
        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute hrefs intact; plain concatenation would mangle them.
        getIMG(urljoin('https://tieba.baidu.com', href))
def getIMG(link, timeout=10):
    """Download every post image found on one thread page.

    The page is fetched, images matching the ``BDE_Image`` class inside
    ``p_content`` blocks are located, and each one is passed to
    download_img. A failure on a single image is reported and skipped so
    the remaining images are still attempted.

    Args:
        link: URL of the thread page.
        timeout: Seconds to wait for the thread page before the request is
            abandoned (default 10).

    Raises:
        requests.RequestException: If the thread page cannot be fetched.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(link, headers=headers, timeout=timeout)
    # Strip the comment delimiters so that markup wrapped in HTML comments
    # is parsed as ordinary elements.
    page = response.text.replace('<!--', '').replace('-->', '')
    if not page.strip():
        # Nothing to parse in an empty page.
        return
    tree = html.etree.HTML(page)
    img_links = tree.xpath('//div[@class="p_content "]//img[@class="BDE_Image"]/@src')
    for img_link in img_links:
        try:
            print("正在下载:%s" % (img_link))
            download_img(img_link)
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C and
            # interpreter exit still stop the crawl; show the cause.
            print("出错正在跳过", exc)
def download_img(link, timeout=10):
    """Download one image into the local ``pics`` directory.

    The file is named after the last segment of the URL path; any query
    string or fragment is ignored.

    Args:
        link: URL of the image.
        timeout: Seconds to wait for the server before the request is
            abandoned (default 10).

    Raises:
        ValueError: If the URL path has no final segment to use as a name.
        requests.RequestException: On a network failure or an HTTP error
            status.
        OSError: If the file cannot be written.
    """
    # Derive the name from the path only, so "?..." never ends up in it.
    file_name = os.path.basename(urlsplit(link).path)
    if not file_name:
        raise ValueError("no file name in image URL: %s" % link)

    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(link, headers=headers, timeout=timeout)
    # Fail on 4xx/5xx instead of saving an error page under an image name.
    response.raise_for_status()

    # The target directory is not guaranteed to exist on a first run.
    os.makedirs('pics', exist_ok=True)
    with open(os.path.join('pics', file_name), 'wb') as f:
        f.write(response.content)
if __name__ == '__main__':
    # Ask for the search keyword first, then for the number of list pages.
    search_term = input("请输入搜索关键字:")
    page_total = int(input("请输入需要获得的页数:"))
    # The pause between page requests is set inside GetHtml; the prompt
    # that once asked for it remains disabled.
    GetHtml(keywords=search_term, pages=page_total)
# with open("1111.html" , encoding="utf8") as f:
# code = f.read()
# code = code.replace("<!--","").replace("-->","")
# codeTree = html.etree.HTML(code)
# list = codeTree.xpath('//div[@class="result-op c-container xpath-log"]/h3/a/text()')
# print(list)
# 百度贴吧帖子图片爬虫
# 最新推荐文章于 2024-04-03 09:50:08 发布