python爬取换页_python爬虫获取下一页

from time import sleep

import faker
import requests
from lxml import etree

fake=faker.Faker()

base_url= "http://angelimg.spbeen.com"

defget_next_link(url):

content=downloadHtml(url)

html=etree.HTML(content)

next_url= html.xpath("//a[@class='ch next']/@href")ifnext_url:return base_url +next_url[0]else:returnFalsedefdownloadHtml(ur):

user_agent=fake.user_agent()

headers= {'User-Agent': user_agent,"Referer":"http://angelimg.spbeen.com/"}

response= requests.get(url, headers=headers)returnresponse.textdefgetImgUrl(content):

html=etree.HTML(content)

img_url= html.xpath('//*[@id="content"]/a/img/@src')

title= html.xpath(".//div['@class=article']/h2/text()")returnimg_url[0],title[0]defsaveImg(title,img_url):if img_url is not None and title is notNone:

with open("txt/"+str(title)+".jpg",'wb') as f:

user_agent=fake.user_agent()

headers= {'User-Agent': user_agent,"Referer":"http://angelimg.spbeen.com/"}

content= requests.get(img_url, headers=headers)#request_view(content)

f.write(content.content)

f.close()defrequest_view(response):importwebbrowser

request_url=response.url

base_url= '

' %(request_url)

base_url=base_url.encode()

content= response.content.replace(b"

",base_url)

tem_html= open('tmp.html','wb')

tem_html.write(content)

tem_html.close()

webbrowser.open_new_tab('tmp.html')defcrawl_img(url):

content=downloadHtml(url)

res=getImgUrl(content)

title= res[1]

img_url=res[0]

saveImg(title,img_url)if __name__ == "__main__":

url= "http://angelimg.spbeen.com/ang/4968/1"

whileurl:print(url)

crawl_img(url)

url= get_next_link(url)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值