@@ -0,0 +1,7 @@ |
||
1 |
+.DS_Store |
|
2 |
+*/.DS_Store |
|
3 |
+audio-0.6.0.tar.gz |
|
4 |
+audio/* |
|
5 |
+deepspeech-0.6.0-models.tar.gz |
|
6 |
+deepspeech-0.6.0-models/* |
|
7 |
+temp_audio/* |
@@ -0,0 +1,161 @@ |
||
1 |
+#!/usr/bin/env python |
|
2 |
+# -*- coding: utf-8 -*- |
|
3 |
+from __future__ import absolute_import, division, print_function |
|
4 |
+ |
|
5 |
+import argparse |
|
6 |
+import numpy as np |
|
7 |
+import shlex |
|
8 |
+import subprocess |
|
9 |
+import sys |
|
10 |
+import wave |
|
11 |
+import json |
|
12 |
+ |
|
13 |
+import os |
|
14 |
+from bottle import route, run, template, request, response |
|
15 |
+ |
|
16 |
+from deepspeech import Model, printVersions |
|
17 |
+from timeit import default_timer as timer |
|
18 |
+ |
|
19 |
# shlex.quote is available on Python 3.3+; pipes.quote is the Python 2
# fallback. The module name must be spelled "shlex" -- a misspelling makes the
# import fail unconditionally and silently forces the legacy fallback, which no
# longer exists on Python 3.13+.
try:
    from shlex import quote
except ImportError:
    from pipes import quote
|
23 |
+ |
|
24 |
# Relative paths to the DeepSpeech 0.6.0 model files.
model = "deepspeech-0.6.0-models/output_graph.pbmm"
lm = "deepspeech-0.6.0-models/lm.binary"
trie = "deepspeech-0.6.0-models/trie"

# Decoder settings: beam_width is passed to Model(...), lm_alpha / lm_beta to
# enableDecoderWithLM(...).
beam_width = 500
lm_alpha = 0.75
lm_beta = 1.85

# Sample rate (Hz) that uploaded audio is compared against and resampled to.
desired_sample_rate = 16000

# The loaded deepspeech Model. Assigned by start(); None until then, so an
# unloaded model is not mistaken for an (empty) dictionary.
ds = None
|
32 |
+ |
|
33 |
def convert_samplerate(audio_path, desired_sample_rate):
    """Resample an audio file with SoX into raw 16-bit mono samples.

    Args:
        audio_path: Path of the audio file handed to ``sox``.
        desired_sample_rate: Target sample rate in Hz.

    Returns:
        A ``(desired_sample_rate, samples)`` tuple, where ``samples`` is a
        NumPy ``int16`` array built from the bytes SoX wrote to stdout.

    Raises:
        RuntimeError: If ``sox`` exits with a non-zero status.
        OSError: If the ``sox`` executable could not be started.
    """
    # Build argv directly instead of formatting a command string, quoting the
    # path and splitting it again: the result is the same argument list, with
    # no dependency on a shell-quoting helper.
    sox_cmd = [
        'sox', audio_path,
        '--type', 'raw',
        '--bits', '16',
        '--channels', '1',
        '--rate', str(desired_sample_rate),
        '--encoding', 'signed-integer',
        '--endian', 'little',
        '--compression', '0.0',
        '--no-dither',
        '-',  # write the converted audio to stdout
    ]
    try:
        output = subprocess.check_output(sox_cmd, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror))

    return desired_sample_rate, np.frombuffer(output, np.int16)
|
43 |
+ |
|
44 |
+ |
|
45 |
def metadata_to_string(metadata):
    """Return the characters of all ``metadata.items`` joined into one string."""
    characters = [entry.character for entry in metadata.items]
    return ''.join(characters)
|
47 |
+ |
|
48 |
def words_from_metadata(metadata):
    """Group per-character metadata into words with timing information.

    Characters are accumulated until a space or the final item is reached.

    Args:
        metadata: Object exposing ``num_items`` and an indexable ``items``
            whose elements have ``character`` and ``start_time`` attributes.

    Returns:
        A list of dicts, one per word boundary, with the keys ``"word"``,
        ``"start_time"`` (start time of the word's first character) and
        ``"duration"`` (start time of the boundary item minus the word's
        start time, never negative). Both times are rounded to 4 places.
    """
    word_list = []
    word = ""
    word_start_time = 0
    last_index = metadata.num_items - 1

    for i in range(metadata.num_items):
        item = metadata.items[i]
        is_space = item.character == " "

        if not is_space:
            if not word:
                # First character of a new word. Recording the start time
                # here (rather than after the boundary check) keeps it correct
                # for a one-character word that ends the transcript.
                word_start_time = item.start_time
            word += item.character

        # A word ends at a space or at the last item.
        if is_space or i == last_index:
            word_duration = max(item.start_time - word_start_time, 0)
            word_list.append({
                "word": word,
                "start_time": round(word_start_time, 4),
                "duration": round(word_duration, 4),
            })
            # Reset for the next word.
            word = ""
            word_start_time = 0

    return word_list
|
80 |
+ |
|
81 |
+ |
|
82 |
def metadata_json_output(metadata):
    """Serialize the word list and ``metadata.confidence`` as a JSON string."""
    payload = {
        "words": words_from_metadata(metadata),
        "confidence": metadata.confidence,
    }
    return json.dumps(payload)
|
87 |
+ |
|
88 |
+ |
|
89 |
+ |
|
90 |
class VersionAction(argparse.Action):
    """argparse action that calls ``printVersions()`` and exits with status 0."""

    def __init__(self, *args, **kwargs):
        # nargs=0: the option takes no value on the command line.
        super(VersionAction, self).__init__(nargs=0, *args, **kwargs)

    def __call__(self, *args, **kwargs):
        printVersions()
        # Use sys.exit rather than the builtin exit(): the latter is installed
        # by the site module for interactive use and is not always available.
        sys.exit(0)
|
97 |
+ |
|
98 |
@route('/upload', method='POST')
def do_upload():
    """Handle ``POST /upload``: store the uploaded WAV file and transcribe it.

    The upload is read from the multipart field named ``file``, saved under
    the ``temp_audio`` directory (created on demand, overwriting any file of
    the same name) and its path is passed to ``process``.

    Returns:
        The value returned by ``process``, or a plain error message when no
        file was uploaded or its extension is not ``.wav``.
    """
    print('received a request')
    response.content_type = 'application/json'

    upload = request.files.get('file')
    if upload is None:
        # Previously a missing field surfaced as an unhandled KeyError.
        response.status = 400
        return "No file uploaded."

    _, ext = os.path.splitext(upload.filename)
    # The trailing comma matters: ('.wav') is just the string '.wav', which
    # turns "not in" into a substring test that lets '', '.', '.w' and '.wa'
    # through.
    if ext.lower() not in ('.wav',):
        return "File extension not allowed."

    save_path = "temp_audio"
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    file_path = "{path}/{file}".format(path=save_path, file=upload.filename)
    print(file_path)
    upload.save(file_path, overwrite=True)
    return process(file_path)
|
117 |
+ |
|
118 |
def start():
    """Load the DeepSpeech model and, if configured, its language model.

    The ``Model`` instance is stored in the module-level ``ds``. Progress and
    load times are written to stderr.
    """
    global ds

    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_started_at = timer()
    ds = Model(model, beam_width)
    model_elapsed = timer() - model_started_at
    print('Loaded model in {:.3}s.'.format(model_elapsed), file=sys.stderr)

    # NOTE: the sample rate comes from the module-level constant; querying it
    # via ds.sampleRate() is intentionally left disabled here.

    # Nothing more to do unless both language-model files are configured.
    if not (lm and trie):
        return

    print('Loading language model from files {} {}'.format(lm, trie), file=sys.stderr)
    lm_started_at = timer()
    ds.enableDecoderWithLM(lm, trie, lm_alpha, lm_beta)
    lm_elapsed = timer() - lm_started_at
    print('Loaded language model in {:.3}s.'.format(lm_elapsed), file=sys.stderr)
|
134 |
+ |
|
135 |
def process(audio):
    """Transcribe a WAV file and return the result as a JSON string.

    The file is resampled through ``convert_samplerate`` when its frame rate
    differs from ``desired_sample_rate``; otherwise its frames are read
    directly as ``int16`` samples.

    Args:
        audio: Path of the WAV file to transcribe.

    Returns:
        The JSON string produced by ``metadata_json_output`` for the result of
        ``ds.sttWithMetadata``. The same string is also printed to stdout.
    """
    fin = wave.open(audio, 'rb')
    try:
        fs = fin.getframerate()
        # Compute the duration from the file's own frame count and rate; using
        # the post-resampling rate here would misreport the length.
        audio_length = fin.getnframes() / float(fs)

        if fs != desired_sample_rate:
            print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, desired_sample_rate), file=sys.stderr)
            _, samples = convert_samplerate(audio, desired_sample_rate)
        else:
            samples = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    finally:
        # Always release the file handle, even if reading or resampling fails.
        fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    data = metadata_json_output(ds.sttWithMetadata(samples))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    print(data)
    return data
|
155 |
+ |
|
156 |
if __name__ == '__main__':
    # Load the model once up front, then serve uploads on localhost:8050.
    # (A second guard calling an undefined main() used to follow and would
    # raise NameError as soon as the server returned.)
    start()
    run(host='localhost', port=8050)