@@ -0,0 +1,7 @@ |
||
| 1 |
+.DS_Store |
|
| 2 |
+*/.DS_Store |
|
| 3 |
+audio-0.6.0.tar.gz |
|
| 4 |
+audio/* |
|
| 5 |
+deepspeech-0.6.0-models.tar.gz |
|
| 6 |
+deepspeech-0.6.0-models/* |
|
| 7 |
+temp_audio/* |
@@ -0,0 +1,161 @@ |
||
| 1 |
+#!/usr/bin/env python |
|
| 2 |
+# -*- coding: utf-8 -*- |
|
| 3 |
+from __future__ import absolute_import, division, print_function |
|
| 4 |
+ |
|
| 5 |
+import argparse |
|
| 6 |
+import numpy as np |
|
| 7 |
+import shlex |
|
| 8 |
+import subprocess |
|
| 9 |
+import sys |
|
| 10 |
+import wave |
|
| 11 |
+import json |
|
| 12 |
+ |
|
| 13 |
+import os |
|
| 14 |
+from bottle import route, run, template, request, response |
|
| 15 |
+ |
|
| 16 |
+from deepspeech import Model, printVersions |
|
| 17 |
+from timeit import default_timer as timer |
|
| 18 |
+ |
|
| 19 |
# ``quote`` lives in ``shlex`` on Python 3; ``pipes.quote`` is only the
# Python 2 fallback (the ``pipes`` module no longer exists on Python 3.13+).
try:
    from shlex import quote
except ImportError:
    from pipes import quote
|
| 23 |
+ |
|
| 24 |
# Acoustic model file handed to ``Model(...)`` in ``start()``.
model = 'deepspeech-0.6.0-models/output_graph.pbmm'

# Language-model binary and trie; ``start()`` enables the LM decoder only
# when both are set.
lm = 'deepspeech-0.6.0-models/lm.binary'
trie = 'deepspeech-0.6.0-models/trie'

# Second argument of ``Model(...)``.
beam_width = 500

# Weights forwarded to ``enableDecoderWithLM`` together with ``lm``/``trie``.
lm_alpha = 0.75
lm_beta = 1.85

# Sample rate (Hz) that ``process()`` requires; other rates are resampled.
desired_sample_rate = 16000

# Placeholder for the loaded model; ``start()`` rebinds it to a ``Model``.
ds = dict()
|
|
| 32 |
+ |
|
| 33 |
def convert_samplerate(audio_path, desired_sample_rate):
    """Resample an audio file with SoX and return it as 16-bit samples.

    SoX is asked to write raw, signed 16-bit little-endian, single-channel
    audio at ``desired_sample_rate`` to stdout, which is captured in memory.

    Args:
        audio_path: Path of the input audio file.
        desired_sample_rate: Target sample rate in Hz.

    Returns:
        A ``(desired_sample_rate, samples)`` tuple where ``samples`` is a
        read-only ``numpy.int16`` array over the captured SoX output.

    Raises:
        RuntimeError: SoX ran but exited with a non-zero status.
        OSError: The ``sox`` executable could not be started.
    """
    # Pass an argument list straight to subprocess: no shell is involved, so
    # the path needs no quoting and cannot be split on whitespace.
    sox_cmd = [
        'sox', audio_path,
        '--type', 'raw',
        '--bits', '16',
        '--channels', '1',
        '--rate', str(desired_sample_rate),
        '--encoding', 'signed-integer',
        '--endian', 'little',
        '--compression', '0.0',
        '--no-dither',
        '-',
    ]
    try:
        output = subprocess.check_output(sox_cmd, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # ``stderr`` is bytes on Python 3 and absent on Python 2; decode it so
        # the message is readable text rather than a ``b'...'`` repr.
        sox_stderr = getattr(e, 'stderr', None) or b''
        if isinstance(sox_stderr, bytes):
            sox_stderr = sox_stderr.decode('utf-8', 'replace')
        raise RuntimeError('SoX returned non-zero status: {}'.format(sox_stderr))
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror))

    return desired_sample_rate, np.frombuffer(output, np.int16)
|
| 43 |
+ |
|
| 44 |
+ |
|
| 45 |
def metadata_to_string(metadata):
    """Return the transcript formed by joining every item's ``character``.

    Args:
        metadata: Object whose ``items`` attribute is an iterable of entries
            that each expose a ``character`` string.

    Returns:
        The characters concatenated in iteration order.
    """
    characters = []
    for entry in metadata.items:
        characters.append(entry.character)
    return ''.join(characters)
|
| 47 |
+ |
|
| 48 |
def words_from_metadata(metadata):
    """Group per-character metadata into words with timing information.

    Args:
        metadata: Object exposing ``num_items`` and an indexable ``items``
            sequence; each item has a ``character`` and a ``start_time``.

    Returns:
        A list of dicts, one per word, with the keys ``"word"``,
        ``"start_time"`` (start of the word's first character, rounded to
        4 places) and ``"duration"`` (start of the terminating character
        minus the word start, clamped at 0 and rounded to 4 places).
        The misspelled legacy key ``"start_time "`` (trailing space) carries
        the same value as ``"start_time"`` so existing consumers keep working.
    """
    word = ""
    word_list = []
    word_start_time = 0
    last_index = metadata.num_items - 1

    for i in range(metadata.num_items):
        item = metadata.items[i]

        if item.character != " ":
            if not word:
                # First character of a new word: record when it starts. Doing
                # this here (rather than after the boundary check) also covers
                # a one-character word at the very end of the transcript.
                word_start_time = item.start_time
            word = word + item.character

        # A word ends at a space or at the last character of the transcript.
        if item.character == " " or i == last_index:
            # Consecutive or leading spaces leave ``word`` empty; emit nothing.
            if word:
                word_duration = max(item.start_time - word_start_time, 0)
                start = round(word_start_time, 4)
                word_list.append({
                    "word": word,
                    "start_time": start,
                    # Legacy key with trailing space, kept for compatibility.
                    "start_time ": start,
                    "duration": round(word_duration, 4),
                })
            # Reset for the next word.
            word = ""
            word_start_time = 0

    return word_list
|
| 80 |
+ |
|
| 81 |
+ |
|
| 82 |
def metadata_json_output(metadata):
    """Serialize word timings and confidence for ``metadata`` as JSON text.

    Args:
        metadata: Object accepted by ``words_from_metadata`` that also
            exposes a ``confidence`` attribute.

    Returns:
        A JSON string of an object with a ``"words"`` list (from
        ``words_from_metadata``) and a ``"confidence"`` value.
    """
    payload = {
        "words": words_from_metadata(metadata),
        "confidence": metadata.confidence,
    }
    return json.dumps(payload)
|
| 87 |
+ |
|
| 88 |
+ |
|
| 89 |
+ |
|
| 90 |
class VersionAction(argparse.Action):
    """argparse action that calls ``printVersions()`` and exits with status 0.

    The action is always registered with ``nargs=0`` so the option consumes
    no command-line values.
    """

    def __init__(self, *args, **kwargs):
        super(VersionAction, self).__init__(nargs=0, *args, **kwargs)

    def __call__(self, *args, **kwargs):
        printVersions()
        # ``sys.exit`` rather than the ``exit`` helper, which is injected by
        # the ``site`` module and is not guaranteed to exist.
        sys.exit(0)
|
| 97 |
+ |
|
| 98 |
@route('/upload', method='POST')
def do_upload():
    """Handle ``POST /upload``: store the uploaded WAV file and transcribe it.

    The multipart field named ``file`` is saved under ``temp_audio/``
    (overwriting any file of the same name) and handed to ``process``.

    Returns:
        The JSON string produced by ``process`` on success. If the ``file``
        part is missing or its extension is not ``.wav`` (case-insensitive),
        the response status is set to 400 and a plain message is returned.
    """
    print('received a request')
    response.content_type = 'application/json'

    upload = request.files.get('file')
    if upload is None:
        response.status = 400
        return "No file uploaded."

    # Keep only the final path component so the upload stays in save_path.
    filename = os.path.basename(upload.filename)
    ext = os.path.splitext(filename)[1]
    # Compare against a real tuple: ``ext in ('.wav')`` is a substring test
    # on a string and would also accept '', '.', '.w' and '.wa'.
    if ext.lower() not in ('.wav',):
        response.status = 400
        return "File extension not allowed."

    save_path = "temp_audio"
    if not os.path.isdir(save_path):
        os.makedirs(save_path)

    file_path = os.path.join(save_path, filename)
    print(file_path)
    upload.save(file_path, overwrite=True)
    return process(file_path)
|
|
| 117 |
+ |
|
| 118 |
def start():
    """Load the acoustic model into the module-level ``ds`` handle.

    Builds ``Model(model, beam_width)`` and, when both ``lm`` and ``trie``
    are set, enables the language-model decoder with ``lm_alpha`` and
    ``lm_beta``. Progress and timings are written to stderr.
    """
    global ds

    print('Loading model from file {}'.format(model), file=sys.stderr)
    began = timer()
    ds = Model(model, beam_width)
    elapsed = timer() - began
    print('Loaded model in {:.3}s.'.format(elapsed), file=sys.stderr)

    # Without both language-model files there is nothing more to load.
    if not (lm and trie):
        return

    print('Loading language model from files {} {}'.format(lm, trie), file=sys.stderr)
    began = timer()
    ds.enableDecoderWithLM(lm, trie, lm_alpha, lm_beta)
    elapsed = timer() - began
    print('Loaded language model in {:.3}s.'.format(elapsed), file=sys.stderr)
|
|
| 134 |
+ |
|
| 135 |
def process(audio):
    """Transcribe a WAV file and return the result as a JSON string.

    Args:
        audio: Path of the WAV file to transcribe.

    Returns:
        The JSON string built by ``metadata_json_output`` from
        ``ds.sttWithMetadata``. The same string is also printed to stdout.

    The file is resampled through ``convert_samplerate`` when its frame rate
    differs from ``desired_sample_rate``; otherwise its frames are read
    directly as 16-bit samples. Timing information is written to stderr.
    """
    fin = wave.open(audio, 'rb')
    try:
        original_rate = fin.getframerate()
        frame_count = fin.getnframes()
        # Duration must use the file's own rate: after resampling the sample
        # rate changes but ``frame_count`` still refers to the original file.
        audio_length = frame_count * (1.0 / original_rate)

        if original_rate != desired_sample_rate:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(original_rate, desired_sample_rate), file=sys.stderr)
            _, samples = convert_samplerate(audio, desired_sample_rate)
        else:
            samples = np.frombuffer(fin.readframes(frame_count), np.int16)
    finally:
        # Always release the file handle, even if reading or SoX fails.
        fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    data = metadata_json_output(ds.sttWithMetadata(samples))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    print(data)
    return data
|
| 155 |
+ |
|
| 156 |
if __name__ == '__main__':
    # Load the model once up front, then serve requests on localhost:8050.
    # (A second guard that called an undefined ``main()`` was removed; it
    # raised NameError as soon as the server stopped.)
    start()
    run(host='localhost', port=8050)