Предварительный просмотр API Vision от Apple

Предварительный просмотр API Vision от Apple ⇐ IOS

1 сообщение • Страница 1 из 1

Гость

Предварительный просмотр API Vision от Apple

Цитата

Сообщение Гость » 14 мар 2024, 12:40

Я пытаюсь написать приложение для iOS, которое сканирует документы для обработки с помощью Apple Vision API. Цель состоит в том, чтобы прямая трансляция видео отображала то, что камера видит на экране, с выделенными контурами документа и заполненными полупрозрачным цветом, а также отслеживание видео по мере перемещения телефона до тех пор, пока пользователь не он создает кадр так, как им нравится, и делает снимок.
Для этого я создаю 3 вещи: AVCaptureVideoDataOutput для получения буферов кадров для анализа, AVCaptureVideoPreviewLayer для отображения предварительного просмотра видео и CAShapeLayer, чтобы показать контур/заливку. Кажется, все работает отлично: я получаю кадр, запускаю VNDetectDocumentSegmentationRequest в фоновом режиме, чтобы быстро получить четырехугольник-кандидат, и бросаю задачу в основную DispatchQueue для обновления слоя формы.
Однако системы координат не совпадают. В зависимости от настроек сеанса захвата или в зависимости от устройства или размера экрана слоя предварительного просмотра системы координат буфера кадра и отображаемой области могут меняться. Я перепробовал все возможные комбинации преобразований, но пока не нашел волшебной формулы. Кто-нибудь знает, как это сделать?
Вот код...
Я инициализирую выходные данные/слои:
detectionOutput = AVCaptureVideoDataOutput()
detectionOutput.alwaysDiscardsLateVideoFrames = true
detectionOutput.setSampleBufferDelegate(self, queue: DispatchQueue(label: "sampleBufferQueue"))
if let captureSession = captureSession, captureSession.canAddOutput(detectionOutput) {
captureSession.addOutput(detectionOutput)
} else {
print("Capture session could not be established.")
return
}

videoPreviewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
videoPreviewLayer.frame = view.layer.bounds
videoPreviewLayer.videoGravity = .resizeAspectFill
view.layer.addSublayer(videoPreviewLayer)

documentOverlayLayer = CAShapeLayer()
documentOverlayLayer.frame = videoPreviewLayer.frame
documentOverlayLayer.strokeColor = UIColor.red.cgColor
documentOverlayLayer.lineWidth = 2
documentOverlayLayer.fillColor = UIColor.clear.cgColor
videoPreviewLayer.addSublayer(documentOverlayLayer)

Затем я захватываю кадровые буферы следующим образом:
func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
let ciImage = CIImage(cvPixelBuffer: pixelBuffer)
detectDocument(in: ciImage, withOrientation: exifOrientationFromDeviceOrientation())
}

Определите документ следующим образом:
private func detectDocument(in image: CIImage, withOrientation orientation: CGImagePropertyOrientation) {
let requestHandler = VNImageRequestHandler(ciImage: image, orientation: orientation, options: [:])
let documentDetectionRequest = VNDetectDocumentSegmentationRequest { [weak self] request, error in
DispatchQueue.main.async {
guard let self = self else { return }

guard let results = request.results as? [VNRectangleObservation],
let result = results.first else {
// No results
self.detectedRectangle = nil
self.documentOverlayLayer.path = nil
return
}

if result.confidence < 0.5 {
// Too low confidence
self.detectedRectangle = nil
self.documentOverlayLayer.path = nil
}
else {
self.detectedRectangle = result
self.drawRectangle(result, inBounds: image.extent)
}
}
}

А затем попытайтесь просмотреть его следующим образом:
private func drawRectangle(_ rectangle: VNRectangleObservation, inBounds: CGRect) {
let xScale = videoPreviewLayer.frame.width * videoPreviewLayer.contentsScale
let yScale = videoPreviewLayer.frame.height * videoPreviewLayer.contentsScale

// Transforming Vision coordinates to UIKit coordinates
// HELP!!! Despite all kinds of combinations of outputRectConverted, layerRectConverted, manually-created transforms or others, I can't get the rectangles to consistently line up with the image...
let topLeft = CGPoint(x: rectangle.topLeft.x * xScale, y: (1 - rectangle.topLeft.y) * yScale)
let topRight = CGPoint(x: rectangle.topRight.x * xScale, y: (1 - rectangle.topRight.y) * yScale)
let bottomLeft = CGPoint(x: rectangle.bottomLeft.x * xScale, y: (1 - rectangle.bottomLeft.y) * yScale)
let bottomRight = CGPoint(x: rectangle.bottomRight.x * xScale, y: (1 - rectangle.bottomRight.y) * yScale)

// Create a UIBezierPath from the transformed points
let path = UIBezierPath()
path.move(to: topLeft)
path.addLine(to: topRight)
path.addLine(to: bottomRight)
path.addLine(to: bottomLeft)
path.close()

DispatchQueue.main.async {
self.documentOverlayLayer.path = path.cgPath
}
}

Источник: https://stackoverflow.com/questions/775 ... ision-apis

1710409208

Гость

Я пытаюсь написать приложение для iOS, которое сканирует документы для обработки с помощью Apple Vision API. Цель состоит в том, чтобы прямая трансляция видео отображала то, что камера видит на экране, с выделенными контурами документа и заполненными полупрозрачным цветом, а также отслеживание видео по мере перемещения телефона до тех пор, пока пользователь не он создает кадр так, как им нравится, и делает снимок.
Для этого я создаю 3 вещи: AVCaptureVideoDataOutput для получения буферов кадров для анализа, AVCaptureVideoPreviewLayer для отображения предварительного просмотра видео и CAShapeLayer, чтобы показать контур/заливку. Кажется, все работает отлично: я получаю кадр, запускаю VNDetectDocumentSegmentationRequest в фоновом режиме, чтобы быстро получить четырехугольник-кандидат, и бросаю задачу в основную DispatchQueue для обновления слоя формы.
Однако системы координат не совпадают. В зависимости от настроек сеанса захвата или в зависимости от устройства или размера экрана слоя предварительного просмотра системы координат буфера кадра и отображаемой области могут меняться. Я перепробовал все возможные комбинации преобразований, но пока не нашел волшебной формулы. Кто-нибудь знает, как это сделать?
Вот код...
Я инициализирую выходные данные/слои:
detectionOutput = AVCaptureVideoDataOutput()
detectionOutput.alwaysDiscardsLateVideoFrames = true
detectionOutput.setSampleBufferDelegate(self, queue: DispatchQueue(label: "sampleBufferQueue"))
if let captureSession = captureSession, captureSession.canAddOutput(detectionOutput) {
captureSession.addOutput(detectionOutput)
} else {
print("Capture session could not be established.")
return
}

videoPreviewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
videoPreviewLayer.frame = view.layer.bounds
videoPreviewLayer.videoGravity = .resizeAspectFill
view.layer.addSublayer(videoPreviewLayer)

documentOverlayLayer = CAShapeLayer()
documentOverlayLayer.frame = videoPreviewLayer.frame
documentOverlayLayer.strokeColor = UIColor.red.cgColor
documentOverlayLayer.lineWidth = 2
documentOverlayLayer.fillColor = UIColor.clear.cgColor
videoPreviewLayer.addSublayer(documentOverlayLayer)

Затем я захватываю кадровые буферы следующим образом:
  func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
let ciImage = CIImage(cvPixelBuffer: pixelBuffer)
detectDocument(in: ciImage, withOrientation: exifOrientationFromDeviceOrientation())
}

Определите документ следующим образом:
  private func detectDocument(in image: CIImage, withOrientation orientation: CGImagePropertyOrientation) {
let requestHandler = VNImageRequestHandler(ciImage: image, orientation: orientation, options: [:])
let documentDetectionRequest = VNDetectDocumentSegmentationRequest { [weak self] request, error in
DispatchQueue.main.async {
guard let self = self else { return }

guard let results = request.results as? [VNRectangleObservation],
let result = results.first else {
// No results
self.detectedRectangle = nil
self.documentOverlayLayer.path = nil
return
}

if result.confidence < 0.5 {
// Too low confidence
self.detectedRectangle = nil
self.documentOverlayLayer.path = nil
}
else {
self.detectedRectangle = result
self.drawRectangle(result, inBounds: image.extent)
}
}
}

А затем попытайтесь просмотреть его следующим образом:
  private func drawRectangle(_ rectangle: VNRectangleObservation, inBounds: CGRect) {
let xScale = videoPreviewLayer.frame.width * videoPreviewLayer.contentsScale
let yScale = videoPreviewLayer.frame.height * videoPreviewLayer.contentsScale

// Transforming Vision coordinates to UIKit coordinates
// HELP!!! Despite all kinds of combinations of outputRectConverted, layerRectConverted, manually-created transforms or others, I can't get the rectangles to consistently line up with the image...
let topLeft = CGPoint(x: rectangle.topLeft.x * xScale, y: (1 - rectangle.topLeft.y) * yScale)
let topRight = CGPoint(x: rectangle.topRight.x * xScale, y: (1 - rectangle.topRight.y) * yScale)
let bottomLeft = CGPoint(x: rectangle.bottomLeft.x * xScale, y: (1 - rectangle.bottomLeft.y) * yScale)
let bottomRight = CGPoint(x: rectangle.bottomRight.x * xScale, y: (1 - rectangle.bottomRight.y) * yScale)

// Create a UIBezierPath from the transformed points
let path = UIBezierPath()
path.move(to: topLeft)
path.addLine(to: topRight)
path.addLine(to: bottomRight)
path.addLine(to: bottomLeft)
path.close()

DispatchQueue.main.async {
self.documentOverlayLayer.path = path.cgPath
}
}
 

Источник: [url]https://stackoverflow.com/questions/77529970/live-preview-of-apples-vision-apis[/url]