А расскажите про вашу настройку: модель, параметры, где крутится, через что и как дёргается?
Крутится на локальной машине, вызовы Groq. Стоит при плотном ежедневном использовании где-то $1.5- $2.0 / мес. Под катом исходник main.py. В нем можно найти детали. Личные словари и доп. модули приводить не буду, но смысл из вызываемых функций понятен.
Из приятного - время ответа от сервера в районе 0.5-2 сек.
main.py
# Standard library
import ctypes
import faulthandler
import logging
import os
import struct
import sys
import tempfile
import time
import wave
from ctypes import wintypes
from itertools import chain
from pathlib import Path

# Third-party
import keyboard
import psutil
import pyaudio
import win32clipboard
import win32con
import win32gui
import win32process
from dotenv import load_dotenv
from groq import Groq
from pynput.keyboard import Key, Controller as KeyboardController
# import win32api

# Local modules
import text_processing as tp
from character_groups import get_character_groups
from clipboard_operations import clipboard_manager
from constants import CONTEXT_EXCEPTION_PROCESSES
from runtime_exceptions import RuntimeExceptions
# Silence noisy third-party loggers so only application-level messages surface.
logging.getLogger('groq').setLevel(logging.WARNING)
logging.getLogger('httpx').setLevel(logging.WARNING)
logging.getLogger('httpcore').setLevel(logging.WARNING)

# Application-wide logging: mirror every record to stderr and a local log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('clipboard_manager.log')
    ]
)
logger = logging.getLogger('clipboard_manager')
# Define input structures for SendInput
class MOUSEINPUT(ctypes.Structure):
    """ctypes mirror of the Win32 MOUSEINPUT struct.

    Not used directly by this script, but required as a member of INPUT_union
    so the union has the size Windows expects. Field order must match the
    Win32 declaration exactly.
    """
    _fields_ = [
        ("dx", wintypes.LONG),
        ("dy", wintypes.LONG),
        ("mouseData", wintypes.DWORD),
        ("dwFlags", wintypes.DWORD),
        ("time", wintypes.DWORD),
        ("dwExtraInfo", ctypes.POINTER(ctypes.c_ulong))
    ]
class KEYBDINPUT(ctypes.Structure):
    """ctypes mirror of the Win32 KEYBDINPUT struct: one keyboard event.

    With KEYEVENTF_UNICODE set, wVk is 0 and wScan carries the UTF-16 code
    unit; otherwise wVk holds a virtual-key code. Field order must match the
    Win32 declaration exactly.
    """
    _fields_ = [
        ("wVk", wintypes.WORD),
        ("wScan", wintypes.WORD),
        ("dwFlags", wintypes.DWORD),
        ("time", wintypes.DWORD),
        ("dwExtraInfo", ctypes.POINTER(ctypes.c_ulong))
    ]
class HARDWAREINPUT(ctypes.Structure):
    """ctypes mirror of the Win32 HARDWAREINPUT struct (union member only; unused directly)."""
    _fields_ = [
        ("uMsg", wintypes.DWORD),
        ("wParamL", wintypes.WORD),
        ("wParamH", wintypes.WORD)
    ]
class INPUT_union(ctypes.Union):
    """Anonymous union inside the Win32 INPUT struct: mouse, keyboard, or hardware event."""
    _fields_ = [
        ("mi", MOUSEINPUT),
        ("ki", KEYBDINPUT),
        ("hi", HARDWAREINPUT)
    ]
class INPUT(ctypes.Structure):
    """ctypes mirror of the Win32 INPUT struct passed to SendInput.

    `type` selects which union member is active (INPUT_KEYBOARD here).
    """
    _fields_ = [
        ("type", wintypes.DWORD),
        ("union", INPUT_union)
    ]
# Constants for SendInput (see Win32 SendInput / KEYBDINPUT documentation).
INPUT_KEYBOARD = 1          # INPUT.type value selecting the keyboard union member
KEYEVENTF_KEYUP = 0x0002    # flag: this event is a key release
KEYEVENTF_UNICODE = 0x0004  # flag: wScan carries a Unicode code unit, not a VK code
# VK codes used to synthesize clipboard shortcuts.
VK_CONTROL = 0x11
VK_SHIFT = 0x10
VK_C = 0x43
VK_V = 0x56
# Persistent store of auto-learned per-process failure rules, kept next to this script.
runtime_exceptions = RuntimeExceptions(Path(__file__).resolve().parent / "runtime_exceptions.json")
def get_foreground_process_info():
    """
    Returns (process_name, window_title). If anything fails, returns (None, "").
    """
    try:
        hwnd = win32gui.GetForegroundWindow()
        if not hwnd:
            return None, ""
        # Window title is best-effort: an error simply yields an empty string.
        try:
            title = win32gui.GetWindowText(hwnd)
        except Exception:
            title = ""
        # Resolve the owning process id; without it no name can be obtained.
        try:
            _thread_id, pid = win32process.GetWindowThreadProcessId(hwnd)
        except Exception:
            return None, title
        if not pid:
            return None, title
        # Map pid -> executable name (may fail if the process just exited).
        try:
            name = psutil.Process(pid).name()
        except Exception:
            name = None
        return name, title
    except Exception:
        return None, ""
def send_unicode_text(text: str, chunk_size_chars: int = 32) -> bool:
    """
    Types text without clipboard using SendInput + KEYEVENTF_UNICODE.
    This is an implementation choice for reliability when clipboard is unstable.

    Returns True on full success (or empty input), False if any batch is
    rejected by SendInput or an exception occurs.
    """
    if not text:
        return True

    def _key_event(code_unit, extra_flags):
        # For Unicode injection wVk must be 0 and the character goes in wScan.
        return INPUT(type=INPUT_KEYBOARD, union=INPUT_union(ki=KEYBDINPUT(
            wVk=0,
            wScan=code_unit,
            dwFlags=KEYEVENTF_UNICODE | extra_flags,
            time=0,
            dwExtraInfo=ctypes.POINTER(ctypes.c_ulong)()
        )))

    try:
        # Send in bounded batches so one rejected batch doesn't lose everything.
        for start in range(0, len(text), chunk_size_chars):
            batch = []
            for ch in text[start:start + chunk_size_chars]:
                code = ord(ch)
                batch.append(_key_event(code, 0))                # key down
                batch.append(_key_event(code, KEYEVENTF_KEYUP))  # key up
            native = (INPUT * len(batch))(*batch)
            sent = ctypes.windll.user32.SendInput(len(batch), ctypes.pointer(native), ctypes.sizeof(INPUT))
            if sent != len(batch):
                logger.error(f"SendInput (unicode typing) failed, only sent {sent} of {len(batch)} inputs")
                return False
        return True
    except Exception as exc:
        logger.error(f"send_unicode_text failed: {exc}", exc_info=True)
        return False
def handle_exception(exc_type, exc_value, exc_traceback):
    """Global excepthook: log unexpected crashes, but let Ctrl+C behave normally."""
    if not issubclass(exc_type, KeyboardInterrupt):
        logging.critical("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
        return
    # KeyboardInterrupt: defer to the default hook so the usual traceback prints.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


sys.excepthook = handle_exception
# Load environment variables
main_folder_path = Path(__file__).resolve().parent
# The shared .env lives two levels up in EnvData/, keeping secrets out of this repo.
dotenv_path = main_folder_path.parent.parent / 'EnvData' / '.env'
load_dotenv(dotenv_path=dotenv_path)
# Set up Groq client (GROQ_API_KEY is provided by the .env loaded above).
client = Groq(api_key=os.getenv('GROQ_API_KEY'))
def int16_from_bytes(data: bytes) -> list:
    """
    Convert a little-endian PCM16 byte buffer into a list of signed int16 samples.

    A single struct.unpack call replaces the previous per-sample Python loop
    (one C-level pass instead of O(n) int.from_bytes calls). A trailing odd
    byte, which the old loop silently misread as a one-byte sample, is now
    ignored.
    """
    sample_count = len(data) // 2
    # '<' forces little-endian regardless of platform; 'h' = signed 16-bit.
    return list(struct.unpack(f'<{sample_count}h', data[:sample_count * 2]))
def record_audio(sample_rate: int = 16000, channels: int = 1, chunk: int = 1024) -> tuple:
    """Record audio from the default microphone while a recording hotkey is held down.

    Silence gating: chunks whose peak amplitude does not exceed a per-device
    threshold are dropped, so pauses are not sent to transcription.

    Args:
        sample_rate: capture rate in Hz.
        channels: number of input channels (mono by default).
        chunk: frames per buffer read.

    Returns:
        (frames, sample_rate) — frames is a list of raw PCM16 byte chunks;
        it is empty if the stream could not be opened or nothing exceeded
        the silence threshold.
    """
    p = pyaudio.PyAudio()
    input_device = p.get_default_input_device_info()
    input_device_name = input_device['name']
    if input_device_name:
        logger.info(f"Using microphone: {input_device_name}")
    else:
        logger.error("No default microphone found for recording")
    try:
        stream = p.open(
            format=pyaudio.paInt16,
            channels=channels,
            rate=sample_rate,
            input=True,
            frames_per_buffer=chunk,
        )
    except Exception as exc:
        logger.error(f"Error opening audio stream: {exc}")
        # PortAudio error -9999 usually means the OS denied microphone access.
        if str(exc).find("-9999") != -1:
            logger.error("Please check microphone privacy settings in: Settings -> Privacy & Security -> Microphone")
        p.terminate()
        return [], sample_rate

    # Enable automatically for unknown devices so you can evaluate packets and adjust threshold
    DEBUG_SOUND_LEVEL = False
    logger.info("Recording... (Release the button to stop)")

    # Known devices and their empirically-determined silence thresholds:
    # - Headset Microphone (Plantronics BT600) -> threshold = 250
    # - Headset (2- G9) -> threshold = 50
    silence_threshold = 250  # Default threshold if no match is found
    # Set the threshold conditionally based on the device name
    if "BT600" in input_device_name:
        silence_threshold = 250
    elif "G9" in input_device_name:
        silence_threshold = 50
    elif 'Headset Microphone (Plantronics' in input_device_name:
        silence_threshold = 90
    elif 'Headset (LEVN LE-HS017)' in input_device_name:
        silence_threshold = 200
    else:
        DEBUG_SOUND_LEVEL = True  # Unknown headset: auto-enable so you can evaluate packets and set threshold
        logger.info("Device not in known list - using default threshold; debug sound level auto-enabled to find a suitable value")
    logger.info(f"Silence threshold set to: {silence_threshold} (device: {input_device_name})")

    frames = []
    recorded_count = 0
    skipped_count = 0
    max_amplitude_seen = 0
    last_debug_time = time.time()
    DEBUG_INTERVAL_SEC = 0.5  # Print debug line at most this often

    # Keep capturing while any recording hotkey is still held down.
    while keyboard.is_pressed("pause") or keyboard.is_pressed("scroll lock") or keyboard.is_pressed("shift+F11") or keyboard.is_pressed("shift+F12"):
        data = stream.read(chunk)
        # Convert data to list of int16 audio data values
        audio_data = int16_from_bytes(data)
        # Peak amplitude of the chunk. default=0 guards against an empty read:
        # max() over an empty sequence previously raised ValueError and
        # crashed the recording loop.
        amplitude = max((abs(x) for x in audio_data), default=0)
        above = amplitude > silence_threshold
        if above:
            frames.append(data)
            recorded_count += 1
        else:
            skipped_count += 1
        if amplitude > max_amplitude_seen:
            max_amplitude_seen = amplitude
        if DEBUG_SOUND_LEVEL and above:
            now = time.time()
            # Rate-limit debug output to once per DEBUG_INTERVAL_SEC.
            if now - last_debug_time >= DEBUG_INTERVAL_SEC:
                print(f"[sound] amplitude={amplitude} threshold={silence_threshold} -> RECORDED | total: recorded={recorded_count} skipped={skipped_count} max_amplitude={max_amplitude_seen}")
                last_debug_time = now

    if DEBUG_SOUND_LEVEL and (recorded_count > 0 or skipped_count > 0):
        print(f"[sound] --- Recording ended: chunks recorded={recorded_count} skipped={skipped_count} max_amplitude={max_amplitude_seen} ---")
        print(f"[sound] Tip: set silence_threshold below speech level (e.g. < {max_amplitude_seen}) and above room noise to filter silence.")

    stream.stop_stream()
    stream.close()
    p.terminate()
    return frames, sample_rate
def save_audio(frames: list, sample_rate: int) -> str:
    """Save recorded PCM16 mono audio chunks to a temporary WAV file.

    Args:
        frames: list of raw little-endian PCM16 byte chunks (from record_audio).
        sample_rate: capture rate in Hz to stamp into the WAV header.

    Returns:
        Path of the created file; the caller is responsible for deleting it.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        with wave.open(temp_audio.name, "wb") as wf:
            wf.setnchannels(1)
            # paInt16 samples are always 2 bytes wide. Using the constant
            # avoids instantiating (and never terminating) a throwaway
            # pyaudio.PyAudio() object just to call get_sample_size(paInt16).
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(b"".join(frames))
        return temp_audio.name
def transcribe_audio(audio_file_path: str, language: str) -> str:
    """Transcribe audio using Groq's Whisper implementation with specified language.

    Args:
        audio_file_path: path to the WAV file to send.
        language: ISO language code ('en', 'ru', ...) forcing the decode language.

    Returns:
        Transcription text, or '' on any failure (the error is logged and printed).
    """
    # Domain vocabulary hint: proper nouns Whisper would otherwise misspell.
    prompt = 'SoftCreator, OpenClaw, Goncharenko, Vasiliy, Marina, Denisova, MISMO'
    try:
        with open(audio_file_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(audio_file_path), file.read()),
                model="whisper-large-v3",
                prompt=prompt,
                response_format="text",
                language=language,
                temperature=0.0  # deterministic decoding
            )
        # Ensure we return a string
        return str(transcription) if transcription is not None else ''
    except Exception as exc:
        # Consistency fix: use the module logger (was the root `logging.error`)
        # like every other function in this file.
        logger.error(f"An error occurred during transcription: {str(exc)}")
        print(f"An error occurred during transcription: {str(exc)}")
        return ''
def get_context(process_name: str, window_title: str):
    """Get the characters to the left and right of the cursor.

    Works by temporarily hijacking the clipboard: extends the selection up to
    three words in each direction with Ctrl+Shift+Arrow, copies it, then
    collapses the selection back and restores the original clipboard content.
    The small sleeps give the target application time to process each event.

    Returns:
        (left_context, right_context) strings; ('', '') on any failure.
    """
    try:
        # Preserve whatever the user had on the clipboard before we clobber it.
        original_content_type, original_content = clipboard_manager.get_content(raise_on_error=True)
        keyboard_controller = KeyboardController()

        def get_directional_context(direction):
            # Clear the clipboard first so a failed copy is detected as empty.
            clipboard_manager.set_content("text", '', raise_on_error=True)
            # Extend the selection word-by-word, three words max.
            for _ in range(3):
                with keyboard_controller.pressed(Key.ctrl, Key.shift):
                    keyboard_controller.press(getattr(Key, direction))
                    time.sleep(0.01)
                    keyboard_controller.release(getattr(Key, direction))
                    time.sleep(0.01)
            # Copy the selection (Ctrl+C).
            with keyboard_controller.pressed(Key.ctrl):
                keyboard_controller.press('c')
                time.sleep(0.01)
                keyboard_controller.release('c')
                time.sleep(0.01)
            content_type, content = clipboard_manager.get_content(raise_on_error=True)
            return content if content_type == "text" else ""

        left_context = get_directional_context('left')
        if left_context:
            # Collapse the leftward selection: return the caret to its original spot.
            keyboard_controller.press(Key.right)
            time.sleep(0.01)
            keyboard_controller.release(Key.right)
        right_context = get_directional_context('right')
        if right_context:
            # Collapse the rightward selection likewise.
            keyboard_controller.press(Key.left)
            keyboard_controller.release(Key.left)
        clipboard_manager.set_content(original_content_type, original_content)
        print(f'Left context: {left_context}')
        print(f'Right context: {right_context}')
        return left_context, right_context
    except Exception as exc:
        # If context capture breaks clipboard in a specific app, treat it as a context-exception candidate.
        action = runtime_exceptions.record_failure(process_name or "", "context", reason=str(exc))
        if action.get("action") == "add_context_exception":
            runtime_exceptions.add_context_exception(process_name, reason=f"context failures (count={action.get('count')})")
        logging.error(f"An error occurred while getting context: {str(exc)}")
        return '', ''
def is_application_in_exceptions():
    """Check if current application is in exceptions list.

    Matches the foreground process name or window title against the static
    CONTEXT_EXCEPTION_PROCESSES list plus any auto-learned runtime exceptions.
    Returns True (fail-safe) whenever the check itself cannot be completed.
    """
    try:
        process_name, window_title = get_foreground_process_info()
        if not process_name:
            logger.warning("Failed to get foreground process name")
            return True  # Assume it's in exceptions for safety
        # Log process information for debugging
        logger.info(f"Current process: {process_name}")  #, Window title: {window_title}")
        # Static list: substring match on process name or title, case-insensitive.
        proc_lower = process_name.lower()
        title_lower = window_title.lower()
        matched = any(
            entry.lower() in proc_lower or entry.lower() in title_lower
            for entry in CONTEXT_EXCEPTION_PROCESSES
        )
        # Dynamic runtime exceptions (auto-added after repeated failures)
        if runtime_exceptions.is_context_exception(process_name):
            matched = True
        if matched:
            logger.info(f"Process {process_name} is in exceptions list")
        return matched
    except Exception as exc:
        logger.error(f"Error checking application exceptions: {exc}")
        return True  # If we can't check, assume it's in exceptions for safety
def copy_transcription_to_clipboard(text):
    """
    Copy transcription to clipboard and paste it at cursor position using a thread-safe manager.

    Flow: save the user's clipboard -> optionally reformat `text` against the
    caret's surrounding context -> put `text` on the clipboard -> synthesize
    Ctrl+V -> restore the original clipboard. Falls back to direct Unicode
    typing (send_unicode_text) when the target process has a learned
    clipboard-simplify rule or the clipboard write fails.

    Returns:
        True on success, False on failure.
    """
    try:
        process_name, window_title = get_foreground_process_info()
        # Processes with a learned "clipboard simplify" rule bypass the clipboard entirely.
        if process_name and runtime_exceptions.should_simplify_clipboard(process_name):
            logger.info(f"Clipboard simplify enabled for {process_name} -> typing text directly")
            return send_unicode_text(text)
        # Step 1: Safely get and store the original clipboard content
        original_content_type, original_content = clipboard_manager.get_content()
        if original_content_type is None:
            logger.warning("Failed to save original clipboard content or clipboard was empty.")
        # Get context if application is not in exceptions
        if not is_application_in_exceptions():
            left_context, right_context = get_context(process_name, window_title)
            if left_context or right_context:
                text = format_text_for_context(text, left_context, right_context)
        # Step 2: Safely set the new transcription text to the clipboard
        if not clipboard_manager.set_content("text", text):
            # Clipboard failed while trying to paste transcription.
            # Record and potentially auto-enable clipboard-simplify for this process.
            action = runtime_exceptions.record_failure(process_name or "", "paste", reason="set_content failed (text)")
            if action.get("action") == "add_clipboard_simplify":
                runtime_exceptions.add_clipboard_simplify(process_name, reason=f"clipboard set failures (count={action.get('count')})")
            # Fallback: type directly so the user still gets the transcription.
            return send_unicode_text(text)
        # Let the system process the clipboard change
        time.sleep(0.2)
        # Step 3: Send Ctrl+V to paste
        if not send_ctrl_v():
            logger.error("Failed to send Ctrl+V keystrokes")
            # Attempt to restore clipboard even if paste fails
            if original_content_type is not None:
                clipboard_manager.set_content(original_content_type, original_content)
            return False
        # Wait for paste operation to complete
        time.sleep(0.2)
        # Step 4: Safely restore the original clipboard content
        if original_content_type is not None:
            clipboard_manager.set_content(original_content_type, original_content)
        return True
    except Exception as exc:
        # The clipboard_manager already logs detailed errors, but we log here for function context.
        logger.error(f"An exception occurred in copy_transcription_to_clipboard: {exc}")
        # Attempt to restore the clipboard in case of an error mid-way
        try:
            # locals() check: the failure may have happened before the save step ran.
            if 'original_content_type' in locals() and original_content_type is not None:
                clipboard_manager.set_content(original_content_type, original_content)
        except Exception as restore_exc:
            logger.error(f"Failed to restore clipboard during exception handling: {restore_exc}")
        return False
def format_text_for_context(text: str, left_context: str, right_context: str) -> str:
    """Format the transcribed text based on the surrounding context.

    Rules, checked against the last character of left_context:
      - empty context / after a line break -> capitalize, no leading space
      - after sentence-ending punctuation  -> capitalize, prepend a space
      - after an opening bracket/quote     -> insert as-is
      - mid-sentence (separator, closing
        bracket, or a letter/digit)        -> lowercase, prepend a space
      - anything else (e.g. whitespace)    -> insert as-is

    right_context is currently unused but kept for interface stability.
    """
    left_context_trimmed = left_context.rstrip()
    character_groups = get_character_groups()
    new_line_chars = set(character_groups['line_break_commands']['replacements'].values())
    end_sentence_chars = set(character_groups['sentence_ending']['replacements'].values())
    grouping_open_chars = set(character_groups['grouping_open']['replacements'].values())
    in_sentence_chars = set(chain(character_groups['separators']['replacements'].values(),
                                  character_groups['punctuation']['replacements'].values(),
                                  character_groups['grouping_close']['replacements'].values()))

    if not left_context or left_context[-1] in new_line_chars:
        formatted_text = tp.capitalize_first_letter(text.lstrip())
    elif left_context_trimmed and left_context_trimmed[-1] in end_sentence_chars:
        formatted_text = ' ' + tp.capitalize_first_letter(text.lstrip())
    elif left_context[-1] in grouping_open_chars:
        formatted_text = text
    elif left_context[-1] in in_sentence_chars or left_context[-1].isalnum():
        formatted_text = ' ' + tp.lowercase_first_letter(text.lstrip())
    # NOTE(review): a former `elif left_context[-1].isalnum(): ' ' + text`
    # branch was unreachable — isalnum() is already consumed by the branch
    # above — and has been removed. Behavior is unchanged.
    else:
        formatted_text = text
    return formatted_text
def start_recording(language: str) -> None:
    """Start audio recording and transcribe with specified language.

    Records while the hotkey is held, transcribes via Groq, post-processes
    the text, and pastes it at the caret. The temporary WAV file is always
    deleted, even when post-processing raises.

    Args:
        language: 'en' or 'ru', passed through to the Whisper API.
    """
    lang_name = "English" if language == "en" else "Russian"
    logger.info(f"Starting recording in {lang_name}...")
    frames, sample_rate = record_audio()
    processed_transcription = None  # Initialize to ensure it always exists
    if frames:
        temp_audio_file = save_audio(frames, sample_rate)
        try:
            transcription = transcribe_audio(temp_audio_file, language=language)
            if transcription:
                processed_transcription = tp.process_text(transcription.rstrip())
                print(f'Before processing: {transcription}')
                print(f'After processing: {processed_transcription}\n')
            else:
                print("No transcription received.")
            # Check if processed_transcription has content before copying
            if processed_transcription:
                copy_transcription_to_clipboard(processed_transcription)
            else:
                print("Processed transcription is empty or not available.")
        finally:
            # Robustness fix: previously an exception in tp.process_text or the
            # clipboard step leaked the temp file; now it is always removed.
            os.unlink(temp_audio_file)
    print("Press:\n- PAUSE or SHIFT+F12 for English recording \n- Scroll Lock or SHIFT+F11 for Russian recording.")
def main():
    """Poll the recording hotkeys forever and dispatch to the matching language."""
    print("Press:\n- PAUSE or SHIFT+F12 for English recording \n- Scroll Lock or SHIFT+F11 for Russian recording.")
    # Hotkey combos checked in priority order; first match wins per poll cycle.
    hotkey_languages = (
        (('pause', 'shift+F12'), 'en'),
        (('scroll lock', 'shift+F11'), 'ru'),
    )
    while True:
        for combos, lang in hotkey_languages:
            if any(keyboard.is_pressed(combo) for combo in combos):
                start_recording(lang)
                break
        time.sleep(0.1)
def send_ctrl_v():
    """
    Send Ctrl+V keystrokes using SendInput.

    Returns True if Windows accepted all four key events, False otherwise.
    """
    def _vk_event(vk_code, flags):
        # Build one virtual-key keyboard INPUT record.
        return INPUT(type=INPUT_KEYBOARD,
                     union=INPUT_union(ki=KEYBDINPUT(
                         wVk=vk_code,
                         wScan=0,
                         dwFlags=flags,
                         time=0,
                         dwExtraInfo=ctypes.POINTER(ctypes.c_ulong)()
                     )))

    # Event order matters: Ctrl down, V down, V up, Ctrl up.
    sequence = (INPUT * 4)(
        _vk_event(VK_CONTROL, 0),
        _vk_event(VK_V, 0),
        _vk_event(VK_V, KEYEVENTF_KEYUP),
        _vk_event(VK_CONTROL, KEYEVENTF_KEYUP),
    )
    sent = ctypes.windll.user32.SendInput(4, ctypes.pointer(sequence), ctypes.sizeof(INPUT))
    if sent != 4:
        logger.error(f"SendInput failed, only sent {sent} of 4 inputs")
        return False
    return True
if __name__ == "__main__":
    # Dump Python tracebacks on hard crashes (e.g. segfaults in native extensions).
    faulthandler.enable()
    # Supervisor loop: restart main() after any unhandled exception so the
    # hotkey listener keeps running unattended.
    while True:
        try:
            main()
        except Exception as e:
            logging.error("Unhandled exception occurred: %s", e, exc_info=True)
            print("Application crashed due to an error. Restarting...")
            time.sleep(1)  # Delay 1 sec before restart
Пример лога на английском и русском
2026-03-21 03:15:08,281 - Current process: claude.exe 2026-03-21 03:15:08,281 - Process claude.exe is in exceptions list Press:
PAUSE or SHIFT+F12 for English recording
Scroll Lock or SHIFT+F11 for Russian recording. 2026-03-21 03:15:17,444
Starting recording in English... 2026-03-21 03:15:17,444 - Using microphone: Headset Microphone (Plantronics 2026-03-21 03:15:17,450 - Recording... (Release the button to stop) 2026-03-21 03:15:17,450 - Silence threshold set to: 90 (device: Headset Microphone (Plantronics)
Before processing: The initial run looks great. Period, new paragraph. Colors might need to be adjusted, comma, but this can be taken care of later. Period, new paragraph. Currently the problem is that arrow keys are recognized incorrectly and rendered as text instead of performing commands like arrow up, down, left or right period. New paragraph C screenshot where I was trying to navigate through terminal dialog screen moving cursor up or down, but instead I received text injected into the entry field period
After processing: The initial run looks great.
Colors might need to be adjusted, but this can be taken care of later.
Currently the problem is that arrow keys are recognized incorrectly and rendered as text instead of performing commands like arrow up, down, left or right.
C screenshot where I was trying to navigate through terminal dialog screen moving cursor up or down, but instead I received text injected into the entry field.
Press:
PAUSE or SHIFT+F12 for English recording
Scroll Lock or SHIFT+F11 for Russian recording. 2026-03-20 19:03:13,484
Starting recording in Russian... 2026-03-20 19:03:13,484 - Using microphone: Headset Microphone (Plantronics 2026-03-20 19:03:13,494 - Recording... (Release the button to stop) 2026-03-20 19:03:13,494 - Silence threshold set to: 90 (device: Headset Microphone (Plantronics) 2026-03-20 19:03:27,332 - Current process: WhatsApp.Root.exe 2026-03-20 19:03:27,332 - Process WhatsApp.Root.exe is in exceptions list
Before processing: Если чё, я могу съездить в магаз или сварить ещё яйца.
After processing: Если чё, я могу съездить в магаз или сварить ещё яйца.
Можно, конечно, потешаться над говнокодом, но идея вполне рабочая и при совершенствовании моделей путем обучения на более грамотном коде, ошибок будет меньше.
Не знаю откуда взялась проблема с Whisper. Я пользуюсь им с июня 2024 ежедневно. После добавления простого кода обнаружения пауз и добавления в словаре терминов, по аналогии с Dragon NaturallySpeaking, проблем вообще никаких. Распознает шустро и аккуратно, поддерживает русский и английский.
Кстати, этот коммент тоже был наговорен через Whisper.
Конечно же, не только на Хабр. Но фраза в контексте Хабра. Не допускаю, что русский человек 10 лет нигде не писал заметок на русском.
Резанула сама фраза, что русский дистанцируется от русского языка. Живя в другой стране более четверти века, никогда не дистанцировался от своей русскости. Как-то странно.
Есть ещё speech-to-speech модели типа personaplex, там это всё из коробки смоделировано и заводится на видеокарте с 12гб vram.
Заинтересовался, попытался. Проблемы. 1. поддержка только английского, 2. в 12 гигабайт не влезет.
The RTX A5000 16 GB is not sufficient for this model at its native bf16 precision. You'd need at minimum a 24 GB card (like RTX A5000 24GB variant, RTX 4090, or RTX 3090) to have a chance with some optimization, or ideally a 48+ GB card (A6000, A40) for comfortable operation.
Analysis: PersonaPlex-7B-V1 on RTX A5000 (16 GB VRAM)
Short answer: It will not run out-of-the-box on your card.
Here's the breakdown:
Model Requirements
Parameters: 7 billion
Tested hardware: NVIDIA A100 80 GB
Supported GPUs: A100 (Ampere), H100 (Hopper) — both data-center class
Precision: bf16 (based on the base model moshiko-pytorch-bf16)
VRAM Estimate
A 7B parameter model in bf16 (2 bytes per parameter) requires:
Your RTX A5000 has 16 GB, which is too tight even just for the weights, and insufficient once you account for the audio encoder (Mimi), decoder, and runtime buffers.
Additional Concerns
Architecture: This is not a simple text LLM — it has three components (Mimi encoder, Temporal+Depth transformers, Mimi decoder) all needing VRAM simultaneously
Official support: NVIDIA only lists A100/H100 as supported GPUs — consumer/workstation Ampere cards aren't mentioned
Real-time streaming: The model does full-duplex audio which adds memory pressure
Possible Workarounds (with caveats)
ApproachFeasibility4-bit quantization (GPTQ/AWQ)Could fit weights in ~4-5 GB, but no official quant is provided and the multi-component architecture makes this non-trivialCPU offloadingPossible via accelerate, but real-time audio streaming would be too slowModel sharding across GPUsIf you have a second GPU, you could split the modelWait for official optimizationNVIDIA may release TensorRT-LLM or smaller variants
Индекс vector_kmeans_tree реализует иерархическую кластеризацию данных. ... Параметры:levels: число уровней в дереве, задает глубину поиска (рекомендуется 1-3);
Если вы эмпирически уже вычислили условия определения количества уровней и кластеров, то может задавать эти параметры автоматически? При создании многоуровневого векторного индекса разработчику сложно разобраться с правильными параметрами - новая область.
Работаю в узко-специализированной индустрии (банковское кредитование - соблюдение законов). Раньше каждая компания имела свои особенности и накручивала цены за мелкие фишки. Сейчас все копируют всех по скриншотам и спецификациям - spec-driven development. Прям сейчас набрасываем демо для нового клиента на базе spec-driven development.
Грядет тектонический сдвиг в разработке и копирайтах.
Внесли следующие правки и перезалили пакеты для моделей языков РФ и СНГ:
А где и как протестировать?
Также, если хочу self-hosted text-to-speech с использованием вашей библиотеки для личного использования, каковы условия использования и где взять цены, если модели платные?
Не пессимист, но если всё действительно так замечательно, то компанию поглотят с потрохами.
Ближайший пример: Groq. Поглощен NVIDIA. На этих ребят были большие надежды, пользовался их AI API с 2024. Последние шесть месяцев делают физиономию, что живые.
А вот как бы ваш опыт автоматизировать?
Нет смысла каждому HR тратить время и терять хороших кандидатов.
Может, ваш опыт систематизировать и автоматизировать?
После более 30 лет кодинга, который, кстати, мне очень нравится, я подписан на Cursor, ChatGPT и Anthropic Claude (Max Subscription).
Со всеми вашими аргументами я бы с удовольствием согласился, если бы не реалии.
Приводить весомые доводы глупо, их нет - ситуация быстро меняется,
Давайте тупо подождем годик-другой и посмотрим на результаты.
Крутится на локальной машине, вызовы Groq.
Стоит при плотном ежедневном использовании где-то $1.5- $2.0 / мес.
Под катом исходник main.py. В нем можно найти детали.
Личные словари и доп. модули приводить не буду, но смысл из вызываемых функций понятен.
Whisper V3 Large
https://groq.com/pricing
Из приятного - время ответа от сервера в районе 0.5-2 сек.
main.py
Пример лога на английском и русском
2026-03-21 03:15:08,281 - Current process: claude.exe
2026-03-21 03:15:08,281 - Process claude.exe is in exceptions list
Press:
PAUSE or SHIFT+F12 for English recording
Scroll Lock or SHIFT+F11 for Russian recording. 2026-03-21 03:15:17,444
Starting recording in English...
2026-03-21 03:15:17,444 - Using microphone: Headset Microphone (Plantronics 2026-03-21 03:15:17,450 - Recording... (Release the button to stop)
2026-03-21 03:15:17,450 - Silence threshold set to: 90 (device: Headset Microphone (Plantronics)
Before processing: The initial run looks great. Period, new paragraph. Colors might need to be adjusted, comma, but this can be taken care of later. Period, new paragraph. Currently the problem is that arrow keys are recognized incorrectly and rendered as text instead of performing commands like arrow up, down, left or right period. New paragraph C screenshot where I was trying to navigate through terminal dialog screen moving cursor up or down, but instead I received text injected into the entry field period
After processing: The initial run looks great.
Colors might need to be adjusted, but this can be taken care of later.
Currently the problem is that arrow keys are recognized incorrectly and rendered as text instead of performing commands like arrow up, down, left or right.
C screenshot where I was trying to navigate through terminal dialog screen moving cursor up or down, but instead I received text injected into the entry field.
Press:
PAUSE or SHIFT+F12 for English recording
Scroll Lock or SHIFT+F11 for Russian recording. 2026-03-20 19:03:13,484
Starting recording in Russian...
2026-03-20 19:03:13,484 - Using microphone: Headset Microphone (Plantronics
2026-03-20 19:03:13,494 - Recording... (Release the button to stop)
2026-03-20 19:03:13,494 - Silence threshold set to: 90 (device: Headset Microphone (Plantronics)
2026-03-20 19:03:27,332 - Current process: WhatsApp.Root.exe
2026-03-20 19:03:27,332 - Process WhatsApp.Root.exe is in exceptions list
Before processing: Если чё, я могу съездить в магаз или сварить ещё яйца.
After processing: Если чё, я могу съездить в магаз или сварить ещё яйца.
Время ответа
Можно, конечно, потешаться над говнокодом, но идея вполне рабочая и при совершенствовании моделей путем обучения на более грамотном коде, ошибок будет меньше.
Не знаю откуда взялась проблема с Whisper.
Я пользуюсь им с июня 2024 ежедневно.
После добавления простого кода обнаружения пауз и добавления в словаре терминов, по аналогии с Dragon NaturallySpeaking, проблем вообще никаких.
Распознает шустро и аккуратно, поддерживает русский и английский.
Кстати, этот коммент тоже был наговорен через Whisper.
Конечно же, не только на Хабр.
Но фраза в контексте Хабра.
Не допускаю, что русский человек 10 лет нигде не писал заметок на русском.
Резанула сама фраза, что русский дистанцируется от русского языка.
Живя в другой стране более четверти века, никогда не дистанцировался от своей русскости. Как-то странно.
Так, вы зарегились на Хабре меньше месяца назад. :)
Заинтересовался, попытался.
Проблемы.
1. поддержка только английского,
2. в 12 гигабайт не влезет.
The RTX A5000 16 GB is not sufficient for this model at its native bf16 precision. You'd need at minimum a 24 GB card (like RTX A5000 24GB variant, RTX 4090, or RTX 3090) to have a chance with some optimization, or ideally a 48+ GB card (A6000, A40) for comfortable operation.
Analysis: PersonaPlex-7B-V1 on RTX A5000 (16 GB VRAM)
Short answer: It will not run out-of-the-box on your card.
Here's the breakdown:
Model Requirements
Parameters: 7 billion
Tested hardware: NVIDIA A100 80 GB
Supported GPUs: A100 (Ampere), H100 (Hopper) — both data-center class
Precision: bf16 (based on the base model
moshiko-pytorch-bf16)VRAM Estimate
A 7B parameter model in bf16 (2 bytes per parameter) requires:
Model weights alone: ~14 GB
Inference overhead (KV cache, activations, audio encoder/decoder): additional ~4-10 GB
Total estimated: ~18-24 GB minimum
Your RTX A5000 has 16 GB, which is too tight even just for the weights, and insufficient once you account for the audio encoder (Mimi), decoder, and runtime buffers.
Additional Concerns
Architecture: This is not a simple text LLM — it has three components (Mimi encoder, Temporal+Depth transformers, Mimi decoder) all needing VRAM simultaneously
Official support: NVIDIA only lists A100/H100 as supported GPUs — consumer/workstation Ampere cards aren't mentioned
Real-time streaming: The model does full-duplex audio which adds memory pressure
Possible Workarounds (with caveats)
ApproachFeasibility4-bit quantization (GPTQ/AWQ)Could fit weights in ~4-5 GB, but no official quant is provided and the multi-component architecture makes this non-trivialCPU offloadingPossible via
accelerate, but real-time audio streaming would be too slowModel sharding across GPUsIf you have a second GPU, you could split the modelWait for official optimizationNVIDIA may release TensorRT-LLM or smaller variantsИз доки:
Если вы эмпирически уже вычислили условия определения количества уровней и кластеров, то может задавать эти параметры автоматически?
При создании многоуровневого векторного индекса разработчику сложно разобраться с правильными параметрами - новая область.
Аналогичный опыт:
Работаю в узко-специализированной индустрии (банковское кредитование - соблюдение законов).
Раньше каждая компания имела свои особенности и накручивала цены за мелкие фишки.
Сейчас все копируют всех по скриншотам и спецификациям - spec-driven development.
Прям сейчас набрасываем демо для нового клиента на базе spec-driven development.
Грядет тектонический сдвиг в разработке и копирайтах.
А где и как протестировать?
Также, если хочу self-hosted text-to-speech с использованием вашей библиотеки для личного использования, каковы условия использования и где взять цены, если модели платные?
Не пессимист, но если всё действительно так замечательно, то компанию поглотят с потрохами.
Ближайший пример: Groq. Поглощен NVIDIA.
На этих ребят были большие надежды, пользовался их AI API с 2024.
Последние шесть месяцев делают физиономию, что живые.
Подход спорный, работает для литературных текстов, но создаст проблемы для документов со сложной структурой.
Dell Precision + RTX A5000
Громкая фраза.
Вы себе льстите.
А существуют ли решения, на которых можно развернуть S3 Bucket?
Типа MinIO.
Если я правильно понял суть проблемы, была необходимость отсылки сообщений в определенную дату-время или с определенной периодичностью.
А рассматривали ли вы комбинацию использования Scheduler в связке с очередью?
Была похожая проблема/задача.
Использовал Jitsi.
Если кому интересно, с 2006 года использую MDaemon. Под винды, если что.