Обнаружение перекодирования звука с помощью ffmpeg

Обнаружение перекодирования звука с помощью ffmpeg ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Обнаружение перекодирования звука с помощью ffmpeg

Сообщение Anonymous » 12 май 2026, 03:39

Я создаю для себя небольшой инструмент CLI, который поможет мне найти перекодированные аудиофайлы. Идея состоит в том, чтобы обнаружить «поддельные» файлы с высоким битрейтом, которые на самом деле были перекодированы из источника более низкого качества.
Обработка сигналов выходит за рамки моей компетенции, поэтому я хотел бы получить обратную связь:

- Правилен ли этот подход, или в моём анализе есть очевидные ошибки?

- Есть ли более быстрые/эффективные способы сделать это? (Без распараллеливания)

- Есть ли крайние случаи, которые этот подход пропускает?

Код: Выделить всё

# Analyzes the frequency content of audio files to verify whether
# the reported bitrate reflects the actual audio quality, or whether
# the file has been transcoded from a lower-quality source.

import os
import time
import subprocess
import numpy as np

LIBS_PATH = "../libs"

def generate_spectrogram(path: str):
    """Render a spectrogram image of the audio file at ``path`` with ffmpeg.

    The picture is named ``<input file stem>_spectrogram.png``; the name has
    no directory component, so ffmpeg resolves it against the current
    working directory.  An existing file of that name is overwritten.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero
            status (``check=True``).
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    png_name = stem + "_spectrogram.png"

    command = [f"{LIBS_PATH}/ffmpeg"]
    command += ["-v", "error"]   # Only print error messages
    command += ["-i", path]      # Input file
    # `-y` is a stand-alone flag (overwrite without asking); `png_name` is
    # the output file itself, not a value belonging to `-y`.
    command += ["-y", png_name]
    # NOTE(review): the filtergraph option follows the output name, whereas
    # ffmpeg options conventionally precede the file they apply to --
    # confirm this ordering is honoured by the ffmpeg build in use.
    command += [
        "-lavfi",
        "showspectrumpic=s=400x250:legend=1:scale=log:color=intensity",
    ]

    subprocess.run(command, check=True)

def read_audio_chunk(path: str, start_at_sec: float):
    """Decode a short mono excerpt of an audio file into raw PCM samples.

    ffmpeg seeks to ``start_at_sec``, decodes ``DURATION_SEC`` seconds,
    downmixes to one channel and writes headerless little-endian float32
    samples to stdout, which are returned as a NumPy array.

    Args:
        path: Audio file to decode.
        start_at_sec: Offset into the track, in seconds, where decoding
            starts.

    Returns:
        A 1-D ``np.float32`` array viewing the captured bytes.  It is empty
        when ffmpeg produced no output.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero
            status.  The captured error text is available on the
            exception's ``stderr`` attribute.
    """
    DURATION_SEC = 2

    # Use `ffmpeg` to decode the chunk of the audio file to raw PCM samples.
    result = subprocess.run(
        [
            f"{LIBS_PATH}/ffmpeg",
            "-v",  "error",            # Suppress all non-error messages
            "-ss", str(start_at_sec),  # Seek to this position before decoding
            "-i",  path,               # Input file
            "-t",  str(DURATION_SEC),  # Decode only this many seconds
            "-ac", "1",                # Downmix to mono to simplify the analysis
            "-f",  "f32le",            # Raw little-endian float32, no container
            "-",                       # Write to stdout
        ],
        capture_output=True,
        # Fail loudly on a decode error instead of handing the caller an
        # empty buffer that only blows up later inside the FFT.
        check=True,
    )

    return np.frombuffer(result.stdout, dtype=np.float32)

def _probe_audio_stream(path: str):
    """Return ``(codec, sample_rate, duration_sec, bitrate_bps)`` of the first audio stream."""
    ffprobe_out = subprocess.run(
        [
            f"{LIBS_PATH}/ffprobe",
            "-v",              "error",                                            # Suppress all non-error messages
            "-select_streams", "a:0",                                              # Target only the first audio stream
            "-show_entries",   "stream=codec_name,sample_rate,duration,bit_rate",  # Fields to extract
            "-of",             "default=noprint_wrappers=1",                       # `key=value` lines, no section headers
            path,                                                                  # Input file
        ],
        capture_output=True,
        text=True,
    )

    # Parse by key rather than by line position, so the result does not
    # depend on the order in which ffprobe prints the fields.
    fields = {}
    for line in ffprobe_out.stdout.splitlines():
        key, separator, value = line.partition("=")
        if separator:
            fields[key.strip()] = value.strip()

    required = ("codec_name", "sample_rate", "duration", "bit_rate")
    missing = [key for key in required if fields.get(key) in (None, "", "N/A")]
    if missing:
        raise ValueError(
            f"ffprobe did not report {', '.join(missing)} for {path!r}: "
            f"{ffprobe_out.stderr.strip()}"
        )

    return (
        fields["codec_name"],
        int(fields["sample_rate"]),
        float(fields["duration"]),
        int(fields["bit_rate"]),
    )


def _highest_active_band_edge(spectrum_db, band_size, noise_floor_db):
    """Return the exclusive upper bin index of the highest band that is mostly above the noise floor, or ``None``."""
    above = spectrum_db > noise_floor_db

    # running[k] = number of above-floor bins in spectrum_db[:k], so the
    # difference below counts the above-floor bins of every window
    # spectrum_db[k : k + band_size] in one vectorised pass.
    running = np.concatenate(([0], np.cumsum(above)))
    counts = running[band_size:] - running[:-band_size]

    # "At least half of the band is above the floor", in exact integers.
    hits = np.flatnonzero(2 * counts >= band_size)
    if hits.size == 0:
        return None

    # hits[-1] is the start bin of the highest qualifying window.
    return int(hits[-1]) + band_size


def _chunk_cutoff_hz(chunk, sample_rate, band_size=50, noise_floor_db=-100):
    """Return the highest non-silent frequency (Hz) of one PCM chunk, or ``None`` if it has none."""
    if chunk.size == 0:
        return None  # Nothing was decoded (e.g. the seek landed past the end)

    # Apply a Hanning window to reduce spectral leakage, then compute
    # the magnitude spectrum using RFFT.
    windowed = chunk * np.hanning(len(chunk))
    spectrum = np.abs(np.fft.rfft(windowed))

    peak = spectrum.max()
    if peak == 0:
        return None  # This chunk is silent

    # Normalize to `[0, 1]` and convert to decibels relative to the loudest
    # bin: that bin is 0 dB and everything else is negative.  Bins that are
    # exactly zero become -inf, which is what we want, so silence the
    # divide-by-zero warning log10 would otherwise emit.
    with np.errstate(divide="ignore"):
        spectrum_db = 20 * np.log10(spectrum / peak)

    upper_edge = _highest_active_band_edge(spectrum_db, band_size, noise_floor_db)
    if upper_edge is None:
        return None

    bin_resolution_hz = sample_rate / len(chunk)  # Hz per bin
    return float((upper_edge - 1) * bin_resolution_hz)


# Finds the frequency cutoff of an audio file.
#
# Scan the PCM samples in 4 different chunks spaced through the track.  For each chunk,
# apply Real Fast Fourier Transform (RFFT) to separate the frequencies into bins, and analyze
# them to find the highest frequency bin across the track that is not "silence".
def find_frequency_cutoff(path: str):
    """Estimate the highest frequency that carries real content in ``path``.

    Args:
        path: Audio file to analyse.

    Returns:
        A tuple ``(path, codec, bitrate_bps, max_cutoff_hz)``.
        ``max_cutoff_hz`` is the largest per-chunk cutoff in Hz, or ``None``
        when no chunk contained a band above the noise floor (the file is
        likely silent or corrupt).

    Raises:
        ValueError: if ffprobe does not report the codec name, sample rate,
            duration or bit rate of the first audio stream.
    """
    # Use `ffprobe` to get the metadata we need to interpret the PCM samples
    codec, sample_rate, duration_sec, bitrate_bps = _probe_audio_stream(path)

    max_cutoff_hz = None

    # Sample positions as fractions of the track duration.
    for pct in (0.25, 0.4, 0.6, 0.75):
        chunk = read_audio_chunk(path, duration_sec * pct)
        cutoff_hz = _chunk_cutoff_hz(chunk, sample_rate)

        if cutoff_hz is None:
            continue  # Empty or silent chunk

        if max_cutoff_hz is None or cutoff_hz > max_cutoff_hz:
            max_cutoff_hz = cutoff_hz

    return path, codec, bitrate_bps, max_cutoff_hz

# =====

# Collect every MP3 in the downloads directory.  The extension check is
# case-insensitive and the list is sorted so the report order is stable.
FILES = sorted(
    os.path.join("../downloads", f)
    for f in os.listdir("../downloads")
    if f.lower().endswith(".mp3")
)

# Column widths, in order: file name, codec, bitrate, cutoff.
COL_SIZE = [20, 6, 10, 10]
header = (
    f"{'File':<{COL_SIZE[0]}}"
    f"{'Codec':<{COL_SIZE[1]}}"
    f"{'Bitrate':<{COL_SIZE[2]}}"
    f"{'Cutoff':<{COL_SIZE[3]}}"
)

print(header)
print("-" * len(header))

started_at = time.time()

for p in FILES:
    path, codec, bitrate, cutoff_hz = find_frequency_cutoff(p)

    if cutoff_hz is None:
        cutoff_str = "N/A"
    else:
        cutoff_str = f"{cutoff_hz:.0f} Hz"

    # The value is bits per second divided by 1000, i.e. kilobits per second.
    bitrate_str = f"{bitrate / 1000:.0f} kbps"
    name = os.path.basename(p)

    # Shorten long names so they stay inside the first column, leaving at
    # least one space before the next column.
    if len(name) > COL_SIZE[0]:
        name = name[:COL_SIZE[0] - 4] + "..."

    print(
        f"{name:<{COL_SIZE[0]}}"
        f"{codec:<{COL_SIZE[1]}}"
        f"{bitrate_str:<{COL_SIZE[2]}}"
        f"{cutoff_str:<{COL_SIZE[3]}}"
    )

elapsed_sec = time.time() - started_at
print(f"\nAnalyzed {len(FILES)} files in {elapsed_sec:.1f} seconds")

(Не обращайте внимания на избыточные комментарии: я оставил их для будущего себя, который, вероятно, напрочь забудет, почему всё это работает.)

Anonymous

1 сообщение • Страница 1 из 1

Вернуться в «Python»