Scraping beauty-gallery images from 7160

# coding=utf-8

import re
import urllib.request

from bs4 import BeautifulSoup

# Gallery categories on www.7160.com; only 'xiaohua' is crawled below.
ls = ['zhenrenxiu', 'meinv', 'lianglichemo', 'rentiyishu', 'xiaohua']


def validateTitle(title):
    # Replace characters that are illegal in Windows file names with underscores.
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    new_title = re.sub(rstr, "_", title)
    return new_title


for j in range(1, 60000):
    url_origin = "http://www.7160.com/xiaohua/" + str(j)
    try:
        page_obj = urllib.request.urlopen(url_origin)
        page_soup = BeautifulSoup(page_obj, 'lxml')
        # Each gallery shows a "共N页" ("N pages in total") label; pull N out of it.
        total_page_obj = page_soup.find(text=re.compile('共'))
        match = re.search(r'\d+', total_page_obj)
        total_page = 0 if match is None else int(match.group())

        for i in range(1, total_page):
            # The first page is index.html, later pages are index_2.html, index_3.html, ...
            if i == 1:
                url = url_origin + "/index.html"
            else:
                url = url_origin + "/index_" + str(i) + ".html"
            request = urllib.request.Request(url)
            try:
                res = urllib.request.urlopen(request)
                soup = BeautifulSoup(res, 'lxml')
                title_obj = soup.find(attrs={"class": "picmainer"})

                if title_obj is not None:
                    print(url)
                    title = title_obj.h1.string
                    content = soup.find('img')
                    src = content.get("src")

                    # Save under D://img2/ (the directory must already exist).
                    file_name = validateTitle(title) + ".jpg"
                    urllib.request.urlretrieve(src, "D://img2/" + file_name)
                    print(file_name + " saved")
            except Exception as e:
                print("error on page " + url + ": " + str(e))
    except Exception as e:
        print("error on gallery " + str(j) + ": " + str(e))