{"id":2701,"date":"2024-06-07T15:04:08","date_gmt":"2024-06-07T07:04:08","guid":{"rendered":"https:\/\/blog.md5.red\/?p=2701"},"modified":"2024-06-07T15:39:45","modified_gmt":"2024-06-07T07:39:45","slug":"whisperx%e5%8f%82%e6%95%b0%e8%af%b4%e6%98%8e","status":"publish","type":"post","link":"https:\/\/blog.md5.red\/?p=2701","title":{"rendered":"whisperX\u53c2\u6570\u8bf4\u660e"},"content":{"rendered":"\n<pre class=\"wp-block-code\"><code>usage: whisperx &#91;-h] &#91;--model MODEL] &#91;--model_dir MODEL_DIR] &#91;--device DEVICE]\n                &#91;--device_index DEVICE_INDEX] &#91;--batch_size BATCH_SIZE]\n                &#91;--compute_type {float16,float32,int8}]\n                &#91;--output_dir OUTPUT_DIR]\n                &#91;--output_format {all,srt,vtt,txt,tsv,json,aud}]\n                &#91;--verbose VERBOSE] &#91;--task {transcribe,translate}]\n                &#91;--language {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,yue,zh,Afrikaans,Albanian,Amharic,Arabic,Armenian,Assamese,Azerbaijani,Bashkir,Basque,Belarusian,Bengali,Bosnian,Breton,Bulgarian,Burmese,Cantonese,Castilian,Catalan,Chinese,Croatian,Czech,Danish,Dutch,English,Estonian,Faroese,Finnish,Flemish,French,Galician,Georgian,German,Greek,Gujarati,Haitian,Haitian 
Creole,Hausa,Hawaiian,Hebrew,Hindi,Hungarian,Icelandic,Indonesian,Italian,Japanese,Javanese,Kannada,Kazakh,Khmer,Korean,Lao,Latin,Latvian,Letzeburgesch,Lingala,Lithuanian,Luxembourgish,Macedonian,Malagasy,Malay,Malayalam,Maltese,Maori,Marathi,Moldavian,Moldovan,Mongolian,Myanmar,Nepali,Norwegian,Nynorsk,Occitan,Panjabi,Pashto,Persian,Polish,Portuguese,Punjabi,Pushto,Romanian,Russian,Sanskrit,Serbian,Shona,Sindhi,Sinhala,Sinhalese,Slovak,Slovenian,Somali,Spanish,Sundanese,Swahili,Swedish,Tagalog,Tajik,Tamil,Tatar,Telugu,Thai,Tibetan,Turkish,Turkmen,Ukrainian,Urdu,Uzbek,Valencian,Vietnamese,Welsh,Yiddish,Yoruba}]\n                &#91;--align_model ALIGN_MODEL]\n                &#91;--interpolate_method {nearest,linear,ignore}] &#91;--no_align]\n                &#91;--return_char_alignments] &#91;--vad_onset VAD_ONSET]\n                &#91;--vad_offset VAD_OFFSET] &#91;--chunk_size CHUNK_SIZE]\n                &#91;--diarize] &#91;--min_speakers MIN_SPEAKERS]\n                &#91;--max_speakers MAX_SPEAKERS] &#91;--temperature TEMPERATURE]\n                &#91;--best_of BEST_OF] &#91;--beam_size BEAM_SIZE]\n                &#91;--patience PATIENCE] &#91;--length_penalty LENGTH_PENALTY]\n                &#91;--suppress_tokens SUPPRESS_TOKENS] &#91;--suppress_numerals]\n                &#91;--initial_prompt INITIAL_PROMPT]\n                &#91;--condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT]\n                &#91;--fp16 FP16]\n                &#91;--temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK]\n                &#91;--compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD]\n                &#91;--logprob_threshold LOGPROB_THRESHOLD]\n                &#91;--no_speech_threshold NO_SPEECH_THRESHOLD]\n                &#91;--max_line_width MAX_LINE_WIDTH]\n                &#91;--max_line_count MAX_LINE_COUNT]\n                &#91;--highlight_words HIGHLIGHT_WORDS]\n                &#91;--segment_resolution {sentence,chunk}] 
&#91;--threads THREADS]\n                &#91;--hf_token HF_TOKEN] &#91;--print_progress PRINT_PROGRESS]\n                audio &#91;audio ...]\n\npositional arguments:\n  audio                 audio file(s) to transcribe\n\noptions:\n  -h, --help            show this help message and exit\n  --model MODEL         name of the Whisper model to use (default: small)\n  --model_dir MODEL_DIR\n                        the path to save model files; uses ~\/.cache\/whisper by\n                        default (default: None)\n  --device DEVICE       device to use for PyTorch inference (default: cuda)\n  --device_index DEVICE_INDEX\n                        device index to use for FasterWhisper inference\n                        (default: 0)\n  --batch_size BATCH_SIZE\n                        the preferred batch size for inference (default: 8)\n  --compute_type {float16,float32,int8}\n                        compute type for computation (default: float16)\n  --output_dir OUTPUT_DIR, -o OUTPUT_DIR\n                        directory to save the outputs (default: .)\n  --output_format {all,srt,vtt,txt,tsv,json,aud}, -f {all,srt,vtt,txt,tsv,json,aud}\n                        format of the output file; if not specified, all\n                        available formats will be produced (default: all)\n  --verbose VERBOSE     whether to print out the progress and debug messages\n                        (default: True)\n  --task {transcribe,translate}\n                        whether to perform X->X speech recognition\n                        ('transcribe') or X->English translation ('translate')\n                        (default: transcribe)\n  --language 
{af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,yue,zh,Afrikaans,Albanian,Amharic,Arabic,Armenian,Assamese,Azerbaijani,Bashkir,Basque,Belarusian,Bengali,Bosnian,Breton,Bulgarian,Burmese,Cantonese,Castilian,Catalan,Chinese,Croatian,Czech,Danish,Dutch,English,Estonian,Faroese,Finnish,Flemish,French,Galician,Georgian,German,Greek,Gujarati,Haitian,Haitian Creole,Hausa,Hawaiian,Hebrew,Hindi,Hungarian,Icelandic,Indonesian,Italian,Japanese,Javanese,Kannada,Kazakh,Khmer,Korean,Lao,Latin,Latvian,Letzeburgesch,Lingala,Lithuanian,Luxembourgish,Macedonian,Malagasy,Malay,Malayalam,Maltese,Maori,Marathi,Moldavian,Moldovan,Mongolian,Myanmar,Nepali,Norwegian,Nynorsk,Occitan,Panjabi,Pashto,Persian,Polish,Portuguese,Punjabi,Pushto,Romanian,Russian,Sanskrit,Serbian,Shona,Sindhi,Sinhala,Sinhalese,Slovak,Slovenian,Somali,Spanish,Sundanese,Swahili,Swedish,Tagalog,Tajik,Tamil,Tatar,Telugu,Thai,Tibetan,Turkish,Turkmen,Ukrainian,Urdu,Uzbek,Valencian,Vietnamese,Welsh,Yiddish,Yoruba}\n                        language spoken in the audio, specify None to perform\n                        language detection (default: None)\n  --align_model ALIGN_MODEL\n                        Name of phoneme-level ASR model to do alignment\n                        (default: None)\n  --interpolate_method {nearest,linear,ignore}\n                        For word .srt, method to assign timestamps to non-\n                        aligned words, or merge them into neighbouring.\n                        (default: nearest)\n  --no_align            Do not perform phoneme alignment (default: False)\n  --return_char_alignments\n                        Return character-level alignments in the output json\n                        file (default: False)\n  --vad_onset VAD_ONSET\n   
                     Onset threshold for VAD (see pyannote.audio), reduce\n                        this if speech is not being detected (default: 0.5)\n  --vad_offset VAD_OFFSET\n                        Offset threshold for VAD (see pyannote.audio), reduce\n                        this if speech is not being detected. (default: 0.363)\n  --chunk_size CHUNK_SIZE\n                        Chunk size for merging VAD segments. Default is 30,\n                        reduce this if the chunk is too long. (default: 30)\n  --diarize             Apply diarization to assign speaker labels to each\n                        segment\/word (default: False)\n  --min_speakers MIN_SPEAKERS\n                        Minimum number of speakers to in audio file (default:\n                        None)\n  --max_speakers MAX_SPEAKERS\n                        Maximum number of speakers to in audio file (default:\n                        None)\n  --temperature TEMPERATURE\n                        temperature to use for sampling (default: 0)\n  --best_of BEST_OF     number of candidates when sampling with non-zero\n                        temperature (default: 5)\n  --beam_size BEAM_SIZE\n                        number of beams in beam search, only applicable when\n                        temperature is zero (default: 5)\n  --patience PATIENCE   optional patience value to use in beam decoding, as in\n                        https:\/\/arxiv.org\/abs\/2204.05424, the default (1.0) is\n                        equivalent to conventional beam search (default: 1.0)\n  --length_penalty LENGTH_PENALTY\n                        optional token length penalty coefficient (alpha) as\n                        in https:\/\/arxiv.org\/abs\/1609.08144, uses simple\n                        length normalization by default (default: 1.0)\n  --suppress_tokens SUPPRESS_TOKENS\n                        comma-separated list of token ids to suppress during\n                        sampling; '-1' will suppress most 
special characters\n                        except common punctuations (default: -1)\n  --suppress_numerals   whether to suppress numeric symbols and currency\n                        symbols during sampling, since wav2vec2 cannot align\n                        them correctly (default: False)\n  --initial_prompt INITIAL_PROMPT\n                        optional text to provide as a prompt for the first\n                        window. (default: None)\n  --condition_on_previous_text CONDITION_ON_PREVIOUS_TEXT\n                        if True, provide the previous output of the model as a\n                        prompt for the next window; disabling may make the\n                        text inconsistent across windows, but the model\n                        becomes less prone to getting stuck in a failure loop\n                        (default: False)\n  --fp16 FP16           whether to perform inference in fp16; True by default\n                        (default: True)\n  --temperature_increment_on_fallback TEMPERATURE_INCREMENT_ON_FALLBACK\n                        temperature to increase when falling back when the\n                        decoding fails to meet either of the thresholds below\n                        (default: 0.2)\n  --compression_ratio_threshold COMPRESSION_RATIO_THRESHOLD\n                        if the gzip compression ratio is higher than this\n                        value, treat the decoding as failed (default: 2.4)\n  --logprob_threshold LOGPROB_THRESHOLD\n                        if the average log probability is lower than this\n                        value, treat the decoding as failed (default: -1.0)\n  --no_speech_threshold NO_SPEECH_THRESHOLD\n                        if the probability of the &lt;|nospeech|> token is higher\n                        than this value AND the decoding has failed due to\n                        `logprob_threshold`, consider the segment as silence\n                        (default: 0.6)\n  --max_line_width 
MAX_LINE_WIDTH\n                        (not possible with --no_align) the maximum number of\n                        characters in a line before breaking the line\n                        (default: None)\n  --max_line_count MAX_LINE_COUNT\n                        (not possible with --no_align) the maximum number of\n                        lines in a segment (default: None)\n  --highlight_words HIGHLIGHT_WORDS\n                        (not possible with --no_align) underline each word as\n                        it is spoken in srt and vtt (default: False)\n  --segment_resolution {sentence,chunk}\n                        (not possible with --no_align) the maximum number of\n                        characters in a line before breaking the line\n                        (default: sentence)\n  --threads THREADS     number of threads used by torch for CPU inference;\n                        supercedes MKL_NUM_THREADS\/OMP_NUM_THREADS (default:\n                        0)\n  --hf_token HF_TOKEN   Hugging Face Access Token to access PyAnnote gated\n                        models (default: None)\n  --print_progress PRINT_PROGRESS\n                        if True, progress will be printed in transcribe() and\n                        align() methods. 
(default: False)\n<\/code><\/pre>\n\n\n\n<p>1\u3001-h\uff0c\u5e2e\u52a9<\/p>\n\n\n\n<p>2\u3001--model MODEL \u4f7f\u7528\u7684\u6a21\u578b<\/p>\n\n\n\n<p>3\u3001--model_dir MODEL_DIR \u6a21\u578b\u6587\u4ef6\u7684\u4fdd\u5b58\u8def\u5f84<\/p>\n\n\n\n<p>4\u3001--device DEVICE PyTorch\u63a8\u7406\u4f7f\u7528\u7684\u8bbe\u5907\uff0cCPU\u6216GPU<\/p>\n\n\n\n<p>5\u3001--device_index DEVICE_INDEX \u7b2c\u51e0\u5757\u8bbe\u5907<\/p>\n\n\n\n<p>6\u3001--batch_size BATCH_SIZE<\/p>\n\n\n\n<p>7\u3001--compute_type {float16,float32,int8}<\/p>\n\n\n\n<p>8\u3001--output_dir OUTPUT_DIR \u8f93\u51fa\u7ed3\u679c\u4fdd\u5b58\u7684\u76ee\u5f55\uff0c\u9ed8\u8ba4\u503c\uff1a\u5f53\u524d\u76ee\u5f55<\/p>\n\n\n\n<p>9\u3001--output_format {all,srt,vtt,txt,tsv,json,aud} \u8f93\u51fa\u6587\u4ef6\u7684\u683c\u5f0f\uff0c\u9ed8\u8ba4\u503c\uff1aall<\/p>\n\n\n\n<p>10\u3001--verbose VERBOSE \u662f\u5426\u6253\u5370\u8fdb\u5c55\u548cdebug\u4fe1\u606f\uff0c\u9ed8\u8ba4\u503c\uff1atrue<\/p>\n\n\n\n<p>11\u3001--task {transcribe,translate}  \u8f6c\u5199\u8fd8\u662f\u7ffb\u8bd1<\/p>\n\n\n\n<p>12\u3001--language  \u8bbe\u7f6e\u4e3a\u65e0\u5219\u4f1a\u8fdb\u884c\u8bed\u8a00\u68c0\u6d4b\uff0c\u9ed8\u8ba4\u503c\uff1a\u65e0<\/p>\n\n\n\n<p>13\u3001--align_model ALIGN_MODEL<\/p>\n\n\n\n<p>14\u3001--interpolate_method {nearest,linear,ignore}<\/p>\n\n\n\n<p>15\u3001--no_align<\/p>\n\n\n\n<p>16\u3001--return_char_alignments<\/p>\n\n\n\n<p>17\u3001--vad_onset VAD_ONSET<\/p>\n\n\n\n<p>18\u3001<\/p>\n","protected":false},"excerpt":{"rendered":"<p>1\u3001-h\uff0c\u5e2e\u52a9 2\u3001--model MODEL \u4f7f\u7528\u7684\u6a21\u578b 3\u3001--model_dir MODEL_DIR \u6a21 
...<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[19],"tags":[],"class_list":["post-2701","post","type-post","status-publish","format-standard","hentry","category-19"],"_links":{"self":[{"href":"https:\/\/blog.md5.red\/index.php?rest_route=\/wp\/v2\/posts\/2701","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/blog.md5.red\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/blog.md5.red\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/blog.md5.red\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/blog.md5.red\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=2701"}],"version-history":[{"count":2,"href":"https:\/\/blog.md5.red\/index.php?rest_route=\/wp\/v2\/posts\/2701\/revisions"}],"predecessor-version":[{"id":2707,"href":"https:\/\/blog.md5.red\/index.php?rest_route=\/wp\/v2\/posts\/2701\/revisions\/2707"}],"wp:attachment":[{"href":"https:\/\/blog.md5.red\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=2701"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/blog.md5.red\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=2701"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/blog.md5.red\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=2701"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}