Python (Part 26): Crawler Exercises (Scraping Images, Scraping QQ Numbers)

Contents

Exercise: scrape images from the web to local disk

Exercise: scrape QQ numbers from the web


Exercise: scrape images from the web to local disk

The images come from 1号店 (yhd.com).

Start by fetching the page source and saving it to a local HTML file:

import urllib.request
import os
import re

def imageCrawler(url, topath):
    # Spoof a browser User-Agent so the site does not reject the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    # For now, just dump the raw page to a.html so its structure can be inspected
    with open(r'a.html', 'wb') as f:
        f.write(html)


url = 'https://search.yhd.com/c0-0/k%25E6%2597%25B6%25E5%25B0%259A%25E8%25A3%2599%25E8%25A3%2585/'
toPath = r'C:\Users\asus\Desktop\img'
imageCrawler(url,toPath)

(Saved page source: a.html)

Open the saved page and locate the img src attribute of each product image.
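As a minimal sketch of what the extraction will match, here is the pattern run against a made-up HTML fragment shaped like the product markup (the domain and filename are hypothetical, only the surrounding tags mirror the real page):

import re

# Hypothetical fragment mirroring the structure the pattern targets
sample = '<div style="position: relative">\n<img src="//img.example.com/dress1.jpg"/>'
pat = r'<div style="position: relative">\n<img src="//(.*?)"/>'
print(re.findall(pat, sample))   # ['img.example.com/dress1.jpg']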

Then use a regular expression to capture each image address in the (.*?) group, and download the images to local storage with urllib.request.urlretrieve.

Final code:

import urllib.request
import os
import re

def imageCrawler(url, topath):
    # Spoof a browser User-Agent so the site does not reject the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')

    # A looser first attempt was pat = r'<div style="position: relative">(.*?)</div>';
    # anchoring on the <img> tag captures just the image address instead
    pat = r'<div style="position: relative">\n<img src="//(.*?)"/>'
    re_image = re.compile(pat, re.S)
    imagelist = re_image.findall(html)
    print(imagelist)
    print(len(imagelist))

    num = 1
    for imageurl in imagelist:
        path = os.path.join(topath, str(num) + '.jpg')
        num += 1
        # Download the image to local storage
        urllib.request.urlretrieve('http://' + imageurl, filename=path)

url = 'https://search.yhd.com/c0-0/k%25E6%2597%25B6%25E5%25B0%259A%25E8%25A3%2599%25E8%25A3%2585/'
toPath = r'C:\Users\asus\Desktop\img'
imageCrawler(url, toPath)
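Note that the download loop as written aborts on the first image that fails to fetch. A slightly more defensive variant (a sketch, not part of the original code; it reuses imagelist and topath from imageCrawler above) would skip failures and keep going:

# Sketch only: same download loop, but skip any image that fails to fetch
for num, imageurl in enumerate(imagelist, start=1):
    path = os.path.join(topath, str(num) + '.jpg')
    try:
        urllib.request.urlretrieve('http://' + imageurl, filename=path)
    except OSError as e:   # URLError/HTTPError are OSError subclasses
        print('skipped', imageurl, e)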

Run result: (screenshots of the downloaded images omitted)

Exercise: scrape QQ numbers from the web

Scrape QQ numbers from this Douban page (and from the pages it links to).

Code:

import urllib.request
import os
import re
import ssl
from collections import deque

def writeFile1(htmlBytes, topath):
    # Dump raw bytes (handy for saving a page for inspection)
    with open(topath, 'wb') as f:
        f.write(htmlBytes)

def writeFileStr(htmlStr, topath):
    # Dump decoded text
    with open(topath, 'w') as f:
        f.write(htmlStr)

def gethtmlbytes(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    # Skip certificate verification so HTTPS pages load without extra setup
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)
    return response.read()

def qqCrawler(url, topath):
    htmlbytes = gethtmlbytes(url)
    # writeFile1(htmlbytes, r'c.html')
    # writeFileStr(htmlbytes.decode('utf-8'), r'c.txt')
    # Crude bytes-to-str conversion; unlike decode(), str() never fails on odd encodings
    htmlStr = str(htmlbytes)

    # Scrape QQ numbers: 5 to 10 digits, no leading zero
    pat = r'[1-9]\d{4,9}'
    re_qq = re.compile(pat)
    qqlist = re_qq.findall(htmlStr)
    # Dedupe the QQ list
    qqlist = list(set(qqlist))
    # Append the scraped QQ numbers to a txt file
    with open(topath, 'a') as f:
        for qqstr in qqlist:
            f.write(qqstr + '\n')

    # Also scrape the URLs on the page so the crawl can continue
    pat = r'(((http|ftp|https)://)(([a-zA-Z0-9._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9&%_./~-]*)?)'
    re_url = re.compile(pat)
    urllist = re_url.findall(htmlStr)
    # Dedupe the URL list
    urllist = list(set(urllist))
    return urllist

url = 'https://www.douban.com/group/topic/110094603/'
topath = r'b.txt'
# qqCrawler(url, topath)

# Central controller: breadth-first crawl driven by a queue
def center(url, topath):
    queue = deque()
    queue.append(url)

    while len(queue) != 0:
        targetUrl = queue.popleft()
        urllist = qqCrawler(targetUrl, topath)
        for item in urllist:
            # findall returns a tuple per match (one entry per group); item[0] is the full URL
            tempurl = item[0]
            queue.append(tempurl)

center(url, topath)

Run result: it can keep crawling for a very long time, so I simply stopped it by hand.
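Part of the reason it never finishes: the queue never records which pages have already been visited, so the same URLs can be fetched over and over. A minimal deduplicating variant of center (a sketch reusing deque and qqCrawler from the code above; the visited set and the max_pages cap are my additions, not in the original):

def center_dedup(url, topath, max_pages=100):
    # Same breadth-first idea, plus a visited set and a page cap (both assumptions)
    queue = deque([url])
    visited = set()
    while queue and len(visited) < max_pages:
        targetUrl = queue.popleft()
        if targetUrl in visited:
            continue
        visited.add(targetUrl)
        try:
            urllist = qqCrawler(targetUrl, topath)
        except Exception as e:
            # A dead link should not kill the whole crawl
            print('failed:', targetUrl, e)
            continue
        for item in urllist:
            if item[0] not in visited:
                queue.append(item[0])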


Let's learn and improve together -.- If you spot a mistake, feel free to leave a comment.
