目录
练习:从网上爬取图片到本地
图片来自1号店
可以先将一号店的网页代码爬取到一个HTML中
import urllib.request
import os
import re
def imageCrawler(url, topath):
    """Fetch the page at *url* and save its raw HTML bytes to 'a.html'.

    First step of the tutorial: dump the page source so the <img src>
    pattern can be inspected offline.

    NOTE(review): *topath* is accepted but not used -- the HTML is always
    written to 'a.html' in the current directory, exactly as the original
    did. TODO: confirm whether the output should go under *topath*.
    """
    # Desktop-browser User-Agent: many sites reject urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    # Close the HTTP response deterministically (the original leaked it).
    with urllib.request.urlopen(req) as response:
        html = response.read()
    with open(r'a.html', 'wb') as f:
        f.write(html)
# Entry point: fetch the Yhd ("1号店") search page for fashion dresses.
# The keyword in the URL is double-percent-encoded ("%25..."), copied
# straight from the browser address bar.
url = 'https://search.yhd.com/c0-0/k%25E6%2597%25B6%25E5%25B0%259A%25E8%25A3%2599%25E8%25A3%2585/'
# Local image directory. NOTE(review): this first version of
# imageCrawler never actually uses this argument.
toPath = r'C:\Users\asus\Desktop\img'
imageCrawler(url,toPath)
a.html
在网页中找到每张图片的img src来源
然后利用正则表达式去匹配(.*?)里的图片地址,再通过urllib.request.urlretrieve把图片下载到本地存储
最终代码:
import os
import re
import urllib.request
def imageCrawler(url, topath):
    """Crawl product images from the search-result page at *url*.

    Finds every <img src="//..."> that follows a
    <div style="position: relative"> in the page source and downloads the
    images into the directory *topath* as 1.jpg, 2.jpg, ...
    """
    # Desktop-browser User-Agent: many sites reject urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    # with-block closes the response (the original leaked the connection).
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    # The page uses protocol-relative URLs ("//host/path"); the group
    # captures everything after the leading "//". re.S lets '.' span
    # newlines inside the tag.
    pat = r'<div style="position: relative">\n<img src="//(.*?)"/>'
    re_image = re.compile(pat, re.S)
    imagelist = re_image.findall(html)
    print(imagelist)
    print(len(imagelist))
    # enumerate replaces the original's manual counter; numbering still
    # starts at 1 so the saved filenames are identical.
    for num, imageurl in enumerate(imagelist, start=1):
        path = os.path.join(topath, str(num) + '.jpg')
        # Download the image to local storage; re-attach a scheme because
        # the captured address is protocol-relative.
        urllib.request.urlretrieve('http://' + imageurl, filename=path)
# Run the crawler: search URL (double-percent-encoded keyword) and the
# local directory where the numbered .jpg files are written.
url = 'https://search.yhd.com/c0-0/k%25E6%2597%25B6%25E5%25B0%259A%25E8%25A3%2599%25E8%25A3%2585/'
toPath = r'C:\Users\asus\Desktop\img'
imageCrawler(url,toPath)
运行结果:
练习:爬取网络中的QQ号
从豆瓣的这个页面中爬取QQ号
代码:
import urllib.request
import os
import re
import ssl
from collections import deque
def writeFile1(htmlBytes, topath):
    """Dump raw response bytes to the file at *topath* (binary mode)."""
    with open(topath, 'wb') as out:
        out.write(htmlBytes)
def writeFileStr(htmlBytes, topath):
    """Write the page content to *topath* as UTF-8 text.

    Fix: the original body was a byte-for-byte copy of writeFile1 (it
    opened the file in 'wb' and wrote raw bytes), despite the name and
    its intended use (dumping a readable .txt). Bytes arguments are now
    decoded to str before writing; a str argument passes straight through.
    """
    if isinstance(htmlBytes, bytes):
        text = htmlBytes.decode('utf-8')
    else:
        text = htmlBytes
    with open(topath, 'w', encoding='utf-8') as f:
        f.write(text)
def gethtmlbytes(url):
    """Return the raw body bytes of the page at *url*.

    Sends a desktop-browser User-Agent (some sites reject urllib's
    default) and disables SSL certificate verification, as the original
    did. NOTE(review): the unverified SSL context is insecure -- it is
    tolerable only for this scraping exercise.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    context = ssl._create_unverified_context()
    # with-block closes the response (the original leaked the connection).
    with urllib.request.urlopen(req, context=context) as response:
        return response.read()
def qqCrawler(url, topath):
    """Scrape candidate QQ numbers from *url* and append them to *topath*.

    Returns the list of URL regex matches found on the page so the caller
    can keep crawling. Each item is the tuple of all regex groups; the
    full URL is at index 0 (center() relies on item[0]).
    """
    htmlbytes = gethtmlbytes(url)
    # str() on bytes keeps the original behaviour: patterns are matched
    # against the repr ("b'...'"), not against decoded text. That is good
    # enough for digit and URL patterns and avoids decode errors.
    htmlStr = str(htmlbytes)

    # Candidate QQ numbers: 5-10 digits with no leading zero.
    pat = r'[1-9]\d{4,9}'
    re_qq = re.compile(pat)
    qqlist = re_qq.findall(htmlStr)
    # De-duplicate (result order is not significant).
    qqlist = list(set(qqlist))
    # Append to the output file; the with-block guarantees the handle is
    # closed even if a write fails (original used open/close by hand).
    with open(topath, 'a') as f:
        for qqstr in qqlist:
            f.write(qqstr + '\n')

    # http/ftp/https URLs to feed back into the crawl queue. Raw string
    # added so the backslash escapes stop triggering invalid-escape
    # warnings; the pattern text itself is unchanged.
    pat = r'(((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?)'
    re_url = re.compile(pat)
    urllist = re_url.findall(htmlStr)
    # De-duplicate the URL tuples as well.
    urllist = list(set(urllist))
    return urllist
# Douban topic page to scrape, and the output file for the QQ numbers.
url = 'https://www.douban.com/group/topic/110094603/'
topath = r'b.txt'
# qqCrawler(url,topath)  -- single-page run, kept for reference; the
# multi-page crawl is driven by center() instead.
# Central controller: breadth-first crawl driven by a FIFO queue of URLs.
def center(url, topath):
    """BFS-crawl starting from *url*, feeding every page to qqCrawler.

    Fix: a *visited* set now prevents re-queueing URLs that were already
    scheduled -- the original queue grew without bound and revisited the
    same pages forever. The crawl still runs until the frontier is empty
    (which may be a very long time on a well-linked site).
    """
    queue = deque([url])
    visited = {url}
    while queue:
        targetUrl = queue.popleft()
        urllist = qqCrawler(targetUrl, topath)
        for item in urllist:
            tempurl = item[0]  # group 0 of the regex match = full URL
            if tempurl not in visited:
                visited.add(tempurl)
                queue.append(tempurl)
center(url, topath)
运行结果:可以一直爬很久,这里是手动停止的
一起学习,一起进步 -.- ,如有错误,可以发评论