When you want to use Core ML for visual recognition on iOS, Apple's developer site provides plenty of sample code to learn from. Below I walk through the source (ViewController.swift and VisionObjectRecognitionViewController.swift) to explain the logic and flow of the whole visual-recognition implementation. To use this feature yourself, you only need to create new Swift files and paste in the source, import a Core ML model, change the model name at the indicated location, and bind VisionObjectRecognitionViewController.swift to the Main.storyboard scene.
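For reference, the only line that ties the code to a particular model is the model lookup inside setupVision() in VisionObjectRecognitionViewController.swift. A minimal sketch, assuming your own compiled model is named MyDetector.mlmodelc (a hypothetical name; Xcode compiles MyDetector.mlmodel into that bundle resource):

// Hypothetical: "MyDetector" stands for your own model; replace "ObjectDetector" below with it.
guard let modelURL = Bundle.main.url(forResource: "MyDetector", withExtension: "mlmodelc") else {
    return NSError(domain: "VisionObjectRecognitionViewController", code: -1,
                   userInfo: [NSLocalizedDescriptionKey: "Model file is missing"])
}
// The rest of setupVision() stays unchanged.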
ViewController.swift
import UIKit
import AVFoundation
import Vision

class ViewController: UIViewController, AVCaptureVideoDataOutputSampleBufferDelegate {

    var bufferSize: CGSize = .zero
    var rootLayer: CALayer! = nil

    @IBOutlet weak private var previewView: UIView! // the on-screen preview view
    // The AVCaptureSession instance; it moves video data through the capture pipeline
    private let session = AVCaptureSession()
    // Core layer used to display the captured video
    private var previewLayer: AVCaptureVideoPreviewLayer! = nil
    // Captures the output video data and allows each frame to be processed
    private let videoDataOutput = AVCaptureVideoDataOutput()
    // Queue that receives frames from videoDataOutput
    private let videoDataOutputQueue = DispatchQueue(label: "VideoDataOutput", qos: .userInitiated, attributes: [], autoreleaseFrequency: .workItem)

    override func viewDidLoad() { // effectively the entry point, comparable to a main function
        super.viewDidLoad()
        setupAVCapture()
    }

    override func didReceiveMemoryWarning() {
        super.didReceiveMemoryWarning()
        // Dispose of any resources that can be recreated.
    }

    func setupAVCapture() {
        // Holds the input data stream coming from the camera
        var deviceInput: AVCaptureDeviceInput!

        // videoDevice selects the camera, e.g. front or back
        let videoDevice = AVCaptureDevice.DiscoverySession(deviceTypes: [.builtInWideAngleCamera], mediaType: .video, position: .back).devices.first
        // Bind deviceInput to the chosen device so data can be read from it
        do {
            deviceInput = try AVCaptureDeviceInput(device: videoDevice!)
        } catch {
            print("Could not create video device input: \(error)")
            return
        }

        session.beginConfiguration() // start configuring the session
        session.sessionPreset = .vga640x480 // resolution of the input frames

        // Check whether the input can be added; if not, bail out
        guard session.canAddInput(deviceInput) else {
            print("Could not add video device input to the session")
            session.commitConfiguration() // finish configuring the session
            return
        }
        session.addInput(deviceInput) // reaching this point means the input can be added
        if session.canAddOutput(videoDataOutput) { // if the output can be added
            session.addOutput(videoDataOutput) // add it
            // Discard newly captured frames while a frame is still being processed
            videoDataOutput.alwaysDiscardsLateVideoFrames = true
            videoDataOutput.videoSettings = [kCVPixelBufferPixelFormatTypeKey as String: Int(kCVPixelFormatType_420YpCbCr8BiPlanarFullRange)]
            // Register the delegate: frames go into the queue, and once the videoDataOutput
            // connection is enabled, captureOutput keeps receiving them from that queue
            videoDataOutput.setSampleBufferDelegate(self, queue: videoDataOutputQueue)
        } else {
            // Otherwise bail out
            print("Could not add video data output to the session")
            session.commitConfiguration()
            return
        }
        // Enable the videoDataOutput connection
        let captureConnection = videoDataOutput.connection(with: .video)
        captureConnection?.isEnabled = true
        do {
            try videoDevice!.lockForConfiguration() // lock the device
            // Record the dimensions of the device's current input format
            let dimensions = CMVideoFormatDescriptionGetDimensions((videoDevice?.activeFormat.formatDescription)!)
            bufferSize.width = CGFloat(dimensions.width)
            bufferSize.height = CGFloat(dimensions.height)
            videoDevice!.unlockForConfiguration() // unlock
        } catch {
            print(error)
        }
        session.commitConfiguration() // session configuration finished
        // Bind the preview layer to the session
        previewLayer = AVCaptureVideoPreviewLayer(session: session)
        // Fit previewLayer to rootLayer and rootLayer to previewView
        previewLayer.videoGravity = AVLayerVideoGravity.resizeAspectFill
        rootLayer = previewView.layer
        previewLayer.frame = rootLayer.bounds
        // Add previewLayer as the bottom sublayer of rootLayer; label and box layers are
        // stacked on top of it later, so only rootLayer needs to be displayed
        rootLayer.addSublayer(previewLayer)
    }

    func startCaptureSession() {
        session.startRunning()
    }

    // Clean up capture setup
    func teardownAVCapture() {
        previewLayer.removeFromSuperlayer()
        previewLayer = nil
    }

    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        // to be implemented in the subclass
    }

    func captureOutput(_ captureOutput: AVCaptureOutput, didDrop didDropSampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        // print("frame dropped")
    }

    public func exifOrientationFromDeviceOrientation() -> CGImagePropertyOrientation {
        let curDeviceOrientation = UIDevice.current.orientation
        let exifOrientation: CGImagePropertyOrientation

        switch curDeviceOrientation {
        case UIDeviceOrientation.portraitUpsideDown: // Device oriented vertically, home button on the top
            exifOrientation = .left
        case UIDeviceOrientation.landscapeLeft: // Device oriented horizontally, home button on the right
            exifOrientation = .upMirrored
        case UIDeviceOrientation.landscapeRight: // Device oriented horizontally, home button on the left
            exifOrientation = .down
        case UIDeviceOrientation.portrait: // Device oriented vertically, home button on the bottom
            exifOrientation = .up
        default:
            exifOrientation = .up
        }
        return exifOrientation
    }
}
VisionObjectRecognitionViewController.swift
import UIKit
import AVFoundation
import Vision

class VisionObjectRecognitionViewController: ViewController {

    private var detectionOverlay: CALayer! = nil

    // Vision parts
    private var requests = [VNRequest]()

    @discardableResult
    func setupVision() -> NSError? {
        // Setup Vision parts
        let error: NSError! = nil

        // Load the Core ML model
        guard let modelURL = Bundle.main.url(forResource: "ObjectDetector", withExtension: "mlmodelc") else {
            return NSError(domain: "VisionObjectRecognitionViewController", code: -1, userInfo: [NSLocalizedDescriptionKey: "Model file is missing"])
        }
        do {
            let visionModel = try VNCoreMLModel(for: MLModel(contentsOf: modelURL))
            // The completionHandler keeps processing results in the background as frames come in
            let objectRecognition = VNCoreMLRequest(model: visionModel, completionHandler: { (request, error) in
                DispatchQueue.main.async(execute: { // hop to the main queue
                    // Render the recognition results (labels and bounding boxes)
                    if let results = request.results {
                        self.drawVisionRequestResults(results)
                    }
                })
            })
            self.requests = [objectRecognition]
        } catch let error as NSError {
            print("Model loading went wrong: \(error)")
        }

        return error
    }

    // Add the layers carrying the recognition results to the detectionOverlay layer
    func drawVisionRequestResults(_ results: [Any]) {
        CATransaction.begin()
        CATransaction.setValue(kCFBooleanTrue, forKey: kCATransactionDisableActions)
        detectionOverlay.sublayers = nil // remove the previous results from detectionOverlay
        for observation in results where observation is VNRecognizedObjectObservation {
            guard let objectObservation = observation as? VNRecognizedObjectObservation else {
                continue
            }
            // Keep only the label with the highest confidence
            let topLabelObservation = objectObservation.labels[0]
            let objectBounds = VNImageRectForNormalizedRect(objectObservation.boundingBox, Int(bufferSize.width), Int(bufferSize.height))

            // Create the bounding-box layer and the label layer
            let shapeLayer = self.createRoundedRectLayerWithBounds(objectBounds)
            let textLayer = self.createTextSubLayerInBounds(objectBounds,
                                                            identifier: topLabelObservation.identifier,
                                                            confidence: topLabelObservation.confidence)
            shapeLayer.addSublayer(textLayer)
            detectionOverlay.addSublayer(shapeLayer) // add both layers to detectionOverlay
        }
        self.updateLayerGeometry() // fit detectionOverlay to the screen
        CATransaction.commit()
    }

    // Override the superclass method; it responds to the delegate (active while the
    // connection is enabled) and reads frames from the queue
    override func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        // Read a frame from the queue and process it
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
            return
        }

        let exifOrientation = exifOrientationFromDeviceOrientation()
        let imageRequestHandler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: exifOrientation, options: [:])
        // Hand the frame to the Vision requests
        do {
            try imageRequestHandler.perform(self.requests)
        } catch {
            print(error)
        }
    }

    override func setupAVCapture() { // override the superclass method
        super.setupAVCapture() // configure the session so it can capture and output camera data

        setupLayers()         // create detectionOverlay and add it to the root layer
        updateLayerGeometry() // fit and refresh the layers
        setupVision()         // set up recognition; results are drawn and the layers refreshed

        startCaptureSession() // start the session and deliver the current camera output
    }

    // Create a new detectionOverlay layer that hosts the result layers (labels and boxes)
    func setupLayers() {
        detectionOverlay = CALayer() // container layer for all the detection overlays
        // Configure it
        detectionOverlay.name = "DetectionOverlay"
        detectionOverlay.bounds = CGRect(x: 0.0,
                                         y: 0.0,
                                         width: bufferSize.width,
                                         height: bufferSize.height)
        detectionOverlay.position = CGPoint(x: rootLayer.bounds.midX, y: rootLayer.bounds.midY)
        rootLayer.addSublayer(detectionOverlay) // add it to the root layer
    }

    // Fit detectionOverlay to the screen and refresh the layer
    func updateLayerGeometry() {
        let bounds = rootLayer.bounds
        var scale: CGFloat

        let xScale: CGFloat = bounds.size.width / bufferSize.height
        let yScale: CGFloat = bounds.size.height / bufferSize.width

        scale = fmax(xScale, yScale)
        if scale.isInfinite {
            scale = 1.0
        }
        CATransaction.begin()
        CATransaction.setValue(kCFBooleanTrue, forKey: kCATransactionDisableActions)

        // rotate the layer into screen orientation and scale and mirror
        detectionOverlay.setAffineTransform(CGAffineTransform(rotationAngle: CGFloat(.pi / 2.0)).scaledBy(x: scale, y: -scale))
        // center the layer
        detectionOverlay.position = CGPoint(x: bounds.midX, y: bounds.midY)

        CATransaction.commit()
    }

    // Create the label layer
    func createTextSubLayerInBounds(_ bounds: CGRect, identifier: String, confidence: VNConfidence) -> CATextLayer {
        let textLayer = CATextLayer()
        textLayer.name = "Object Label"
        let formattedString = NSMutableAttributedString(string: String(format: "\(identifier)\nConfidence: %.2f", confidence))
        let largeFont = UIFont(name: "Helvetica", size: 24.0)!
        formattedString.addAttributes([NSAttributedString.Key.font: largeFont], range: NSRange(location: 0, length: identifier.count))
        textLayer.string = formattedString
        textLayer.bounds = CGRect(x: 0, y: 0, width: bounds.size.height - 10, height: bounds.size.width - 10)
        textLayer.position = CGPoint(x: bounds.midX, y: bounds.midY)
        textLayer.shadowOpacity = 0.7
        textLayer.shadowOffset = CGSize(width: 2, height: 2)
        textLayer.foregroundColor = CGColor(colorSpace: CGColorSpaceCreateDeviceRGB(), components: [0.0, 0.0, 0.0, 1.0])
        textLayer.contentsScale = 2.0 // retina rendering
        // rotate the layer into screen orientation and scale and mirror
        textLayer.setAffineTransform(CGAffineTransform(rotationAngle: CGFloat(.pi / 2.0)).scaledBy(x: 1.0, y: -1.0))
        return textLayer
    }

    // Create the bounding-box layer
    func createRoundedRectLayerWithBounds(_ bounds: CGRect) -> CALayer {
        let shapeLayer = CALayer()
        shapeLayer.bounds = bounds
        shapeLayer.position = CGPoint(x: bounds.midX, y: bounds.midY)
        shapeLayer.name = "Found Object"
        shapeLayer.backgroundColor = CGColor(colorSpace: CGColorSpaceCreateDeviceRGB(), components: [1.0, 1.0, 0.2, 0.4])
        shapeLayer.cornerRadius = 7
        return shapeLayer
    }
}
The purpose and logic of the key functions are already explained in some detail in the comments; only a few points still need to be added:
Clearly, VisionObjectRecognitionViewController is a subclass of ViewController. For the app to run at all, there has to be something analogous to a main entry point. When we create a Swift file that subclasses one of these view controller classes, it comes with methods such as viewDidLoad(); these are overrides of superclass methods, they are tied to the view hierarchy, and the system calls them automatically while the UI is being built. Because we override such a method, the superclass implementation is hidden and only our override executes, which effectively gives us an automatically executed entry point just like main. If our subclass does not define such a method, however, the system simply executes the superclass's version.
So although Main.storyboard is bound to VisionObjectRecognitionViewController.swift, that file contains no method that can serve as this "main" entry point, and at run time execution falls back to its superclass ViewController to find the first method to run.
The superclass's viewDidLoad() then calls setupAVCapture(), and because that method is overridden in the subclass, the subclass's version is the one actually executed.
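The mechanism behind this is ordinary dynamic dispatch. A minimal, self-contained sketch (the Base/Sub names are hypothetical and not part of the project): the superclass provides the entry point, but the call inside it resolves to the subclass override at run time.

class Base {
    // Plays the role of ViewController.viewDidLoad(): only the superclass defines the entry point.
    func load() {
        configure() // dynamically dispatched; resolves to Sub.configure() when called on a Sub instance
    }
    func configure() {
        print("Base.configure") // stands in for ViewController.setupAVCapture()
    }
}

class Sub: Base {
    // Stands in for VisionObjectRecognitionViewController.setupAVCapture()
    override func configure() {
        super.configure()      // run the superclass setup first
        print("Sub.configure") // then the subclass-specific setup
    }
}

Sub().load() // prints "Base.configure", then "Sub.configure"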
So how does the whole pipeline keep running continuously? It mainly comes down to these two pieces of code:
videoDataOutput.setSampleBufferDelegate(self, queue: videoDataOutputQueue)
let objectRecognition = VNCoreMLRequest(model: visionModel, completionHandler: { (request, error) in
    DispatchQueue.main.async(execute: {
        if let results = request.results {
            self.drawVisionRequestResults(results)
        }
    })
})
The first line registers a delegate: whenever the session writes a video frame into the videoDataOutputQueue queue, the subclass's captureOutput method fires and hands that frame to the Vision requests stored in requests. This keeps the input side of the pipeline running continuously.
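As an isolated illustration of this delegate mechanism (FrameCounter is a hypothetical class, not part of the sample): once an output's sample-buffer delegate is set, AVFoundation calls captureOutput(_:didOutput:from:) on the supplied queue for every frame the running session delivers.

import AVFoundation

final class FrameCounter: NSObject, AVCaptureVideoDataOutputSampleBufferDelegate {
    let output = AVCaptureVideoDataOutput()
    private let queue = DispatchQueue(label: "FrameCounterQueue", qos: .userInitiated)
    private var frameCount = 0

    // Register self as the delegate; `output` must still be added to a running AVCaptureSession.
    func attach() {
        output.alwaysDiscardsLateVideoFrames = true
        output.setSampleBufferDelegate(self, queue: queue)
    }

    // Called by AVFoundation on `queue`, once per delivered frame.
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        frameCount += 1
        // In the sample, this is where the frame is wrapped in a VNImageRequestHandler
        // and handed to the Vision requests.
    }
}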
The second snippet has two parts: the completionHandler runs off the main thread whenever the Core ML model has finished processing a frame, and DispatchQueue.main.async() then enqueues a closure on the main queue that reads the results and draws them into the layers. What the two have in common is that each keeps a piece of work executing continuously as frames keep arriving.
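The same "process off the main thread, draw on the main queue" pattern, reduced to a hedged sketch (OverlayUpdater, processingQueue, and the classify closure are hypothetical stand-ins for the sample's VNCoreMLRequest completion handler and detectionOverlay):

import UIKit
import CoreVideo

final class OverlayUpdater {
    private let overlay = CALayer() // stands in for detectionOverlay
    private let processingQueue = DispatchQueue(label: "ProcessingQueue", qos: .userInitiated)

    // `classify` stands in for evaluating the Core ML model on one frame.
    func handle(_ frame: CVPixelBuffer, classify: @escaping (CVPixelBuffer) -> String) {
        // Heavy work stays off the main thread, like the VNCoreMLRequest completionHandler.
        processingQueue.async {
            let labelText = classify(frame)

            // Layer and view updates must be posted back to the main queue.
            DispatchQueue.main.async {
                let textLayer = CATextLayer()
                textLayer.string = labelText
                self.overlay.sublayers = nil        // clear the previous result
                self.overlay.addSublayer(textLayer) // draw the new one
            }
        }
    }
}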