Python crawler for qiubaichengren.net

import urllib
from time import sleep

import requests
from lxml import etree



try:
    def all_links(url, page):
        # if "900.html" in url:
        #     print("done")
        #     return None
        url = url + str(page) + ".html"
        response = requests.get(url)
        print(url, response.status_code)
        html = etree.HTML(response.content.decode('gbk'))
        # grab the images on this page and save them
        imgs = html.xpath('.//div[@id="wrapper"]//div[@class="ui-module"]//img/@src')
        for img in imgs:
            file_name = img.split('/')[-1]
            first = img.split('/')[0]
            if first != 'http:' and first != 'https:':
                print("bad image URL: " + img)
            else:
                dir_path = "/www/spider/images/"
                try:
                    file_content = requests.get(img)
                    if file_content.status_code != 200:
                        print(img, "download failed")
                    else:
                        # urllib.request.urlretrieve(img, dir_path + file_name)
                        with open(dir_path + file_name, "wb") as f:
                            f.write(file_content.content)
                            print("saved image " + dir_path + file_name)
                except Exception as ee:
                    print(str(ee))
        # links = html.xpath('.//div[@class="page"]//a[contains(text(),"下一页")]/@href')
        # print(links)
        # if len(links) < 1:
        #     pass
        # else:
        sleep(1)  # be polite between pages

    # pages are numbered 1.html .. 990.html; the loop drives pagination,
    # so the function no longer calls itself with a hard-coded page number
    for i in range(1, 991):
        all_links("http://www.qiubaichengren.net/", i)
except Exception as e:
    print(str(e))
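
One caveat with the block above: the save directory /www/spider/images/ must already exist, or the open() call raises FileNotFoundError, and some hosts reject requests that carry no User-Agent header. A minimal sketch of a download helper that covers both points (the helper name save_image and the User-Agent value are my own additions, not part of the original script):

import os

import requests


def save_image(img_url, dir_path="/www/spider/images/"):
    """Download one image into dir_path, creating the directory first if needed."""
    os.makedirs(dir_path, exist_ok=True)             # avoid FileNotFoundError on open()
    file_name = img_url.split('/')[-1]
    headers = {"User-Agent": "Mozilla/5.0"}          # assumed browser-like UA string
    resp = requests.get(img_url, headers=headers, timeout=10)
    if resp.status_code != 200:
        print(img_url, "download failed")
        return False
    with open(os.path.join(dir_path, file_name), "wb") as f:
        f.write(resp.content)
    print("saved image " + file_name)
    return True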

The loop-based version

import urllib.request
from time import sleep

import requests
from lxml import etree


try:
    def all_links(url):
        if "100.html" in url:
            print("done")
            return None
        response = requests.get(url)
        print(url, response.status_code)
        html = etree.HTML(response.content.decode('gbk'))
        # grab the images on this page and save them
        imgs = html.xpath('.//div[@id="wrapper"]//div[@class="ui-module"]//img/@src')
        for img in imgs:
            file_name = img.split('/')[-1]
            first = img.split('/')[0]
            if first != 'http:' and first != 'https:':
                print("bad image URL: " + img)
            else:
                dir_path = "d:\\www\\spider\\images\\"
                urllib.request.urlretrieve(img, dir_path + file_name)
                print("saved image " + dir_path + file_name)
        links = html.xpath('.//div[@class="page"]//a[contains(text(),"下一页")]/@href')
        print(links)
        if len(links) < 1:
            pass
        else:
            sleep(5)
            host = 'http://www.qiubaichengren.net/'
            new_url = host + links[0]
            all_links(new_url)

    all_links("http://www.qiubaichengren.net/8.html")
except Exception as e:
    print(str(e))
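
Because all_links() calls itself once per page, a long chain of "下一页" (next page) links will eventually hit Python's default recursion limit (roughly 1000 frames). A sketch of the same crawl rewritten as a plain loop (the name crawl_pages and the max_pages cap are my own additions; the image-saving body is elided and would be the same as in the listing above):

from time import sleep

import requests
from lxml import etree


def crawl_pages(start_url, host='http://www.qiubaichengren.net/', max_pages=100):
    """Follow the "下一页" (next page) link iteratively instead of recursively."""
    url = start_url
    for _ in range(max_pages):                       # hard page cap instead of the "100.html" check
        response = requests.get(url)
        print(url, response.status_code)
        html = etree.HTML(response.content.decode('gbk', errors='ignore'))
        # ... save the images here, exactly as in the recursive version ...
        links = html.xpath('.//div[@class="page"]//a[contains(text(),"下一页")]/@href')
        if not links:
            break                                    # no "next page" link: last page reached
        sleep(5)                                     # same politeness delay as the original
        url = host + links[0]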

 
