Python教程：老师傅带你爬取全网妹子图！

最新推荐文章于 2024-03-29 17:18:16 发布

MC数据局

最新推荐文章于 2024-03-29 17:18:16 发布

阅读量1.3k

点赞数

分类专栏：博士生涯

本文链接：https://blog.csdn.net/WASEFADG/article/details/80907682

版权

博士生涯专栏收录该内容

267 篇文章 60 订阅

订阅专栏

https://www.bilibili.com/video/av12721444/

代码如下（Python 3 版本，截至 2018-07-04 测试可用）：

#-*-coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib #python 3.x中urllib库和urilib2库合并成了urllib库,像你这个 import urlib 然后把urllib2.urlopen() 改成 urllib.request.urlopen（）即可
#import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')
# Start page handed to crawl(); `paper_offset` is presumably the page index
# (the notes at the end of the script substitute other values for it).
url = 'http://www.dbmeinv.com/?paper_offset=1'
def crawl(url):
    """Download every image referenced by an ``<img>`` tag on the page at *url*.

    The page is requested with a browser-like ``User-Agent`` header because,
    per the original author's note, plain requests either fail or do not
    return the wanted content (anti-crawler measures). Each image is saved
    as ``image/<n>.jpg`` (``n`` starting at 0) relative to the current
    working directory, and progress is printed to stdout.

    Args:
        url: Address of the HTML page to scan for images.

    Raises:
        urllib.error.URLError: If the page or one of the images cannot be
            fetched (propagated from ``urllib.request``).
    """
    # Imported locally so the submodule is guaranteed to be loaded: a bare
    # ``import urllib`` does not make ``urllib.request`` available by itself.
    import os
    import urllib.request
    from urllib.parse import urljoin

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)

    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(request, timeout=20) as page:
        contents = page.read()

    soup = BeautifulSoup(contents, 'html.parser')

    # urlretrieve does not create missing directories.
    os.makedirs('image', exist_ok=True)

    count = 0
    for img_tag in soup.find_all('img'):
        src = img_tag.get('src')
        if not src:
            # <img> without a usable src attribute: nothing to download.
            continue

        # Resolve relative and protocol-relative paths against the page URL;
        # absolute URLs pass through unchanged.
        link = urljoin(url, src)
        print(link)

        # os.path.join replaces the hard-coded 'image\%s.jpg', which relied
        # on an invalid escape sequence and a Windows-only separator.
        urllib.request.urlretrieve(link, os.path.join('image', '%s.jpg' % count))
        count += 1
        print("正在下载第%s张" % count)
# Script entry point: scrape the start page as soon as the file is executed.
crawl(url)

'''
html=''
soup=BeautifulSoup(open('a.html'),'html.parser')
print(soup.prettify())
'''
'''
html='<title>同学们都很棒</title>'
soup=BeautifulSoup(html,'html.parser') #创建对象，解析网页
print(soup.title)
'''
#'http://www.dbmeinv.com/?paper_offset=%s'%'2' %占位符

#'http://www.dbmeinv.com/?paper_offset={}'.format(2)  str.format 使用 {} 作为占位符