sapi5.py
9.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# -*- coding: UTF-8 -*-
#synthDrivers/sapi5.py
#A part of NonVisual Desktop Access (NVDA)
#Copyright (C) 2006-2014 NV Access Limited, Peter Vágner, Aleksey Sadovoy
#This file is covered by the GNU General Public License.
#See the file COPYING for more details.
import locale
from collections import OrderedDict
import threading
import time
import os
from ctypes import *
import comtypes.client
from comtypes import COMError
import _winreg
import audioDucking
import NVDAHelper
import globalVars
import speech
from synthDriverHandler import SynthDriver,VoiceInfo
import config
import nvwave
from logHandler import log
class FunctionHooker(object):
    """Hooks a single imported function of a loaded DLL for the lifetime of this object.

    The hook is installed on construction through
    C{NVDAHelper.localLib.dllImportTableHooks_hookSingle} and removed again
    when the object is garbage collected.
    """

    def __init__(self,targetDll,importDll,funcName,newFunction):
        """
        @param targetDll: the DLL whose import table should be patched.
        @param importDll: the DLL which the hooked function is imported from.
        @param funcName: the name of the imported function to replace.
        @param newFunction: the callable to install in place of the imported function.
        @raise RuntimeError: if the function could not be hooked.
        """
        # Keep the handle returned by the helper: __del__ needs it to unhook.
        # It is stored before any failure is reported so that __del__ is safe
        # even on an instance whose construction raised.
        self._hook=NVDAHelper.localLib.dllImportTableHooks_hookSingle(targetDll,importDll,funcName,newFunction)
        if self._hook:
            print("hooked %s"%funcName)
        else:
            print("could not hook %s"%funcName)
            raise RuntimeError("could not hook %s"%funcName)

    def __del__(self):
        # Only unhook when a hook was really installed.
        # getattr guards against the helper call itself having raised in __init__.
        hook=getattr(self,"_hook",None)
        if hook:
            NVDAHelper.localLib.dllImportTableHooks_unhookSingle(hook)
# Maps the value of each successfully opened waveOut handle to the AudioDucker
# that was enabled for it; waveOutClose removes the entry again.
_duckersByHandle={}

@WINFUNCTYPE(windll.winmm.waveOutOpen.restype,*windll.winmm.waveOutOpen.argtypes,use_errno=False,use_last_error=False)
def waveOutOpen(pWaveOutHandle,deviceID,wfx,callback,callbackInstance,flags):
    """Replacement for winmm's waveOutOpen which enables audio ducking for the opened device.

    The call is forwarded unchanged to the real C{winmm.waveOutOpen}.
    When that succeeds and a handle pointer was supplied, an enabled
    L{audioDucking.AudioDucker} is remembered under the new handle's value.
    @return: the result code of the real call (0 on success), or the Windows error code if it raised.
    """
    try:
        status=windll.winmm.waveOutOpen(pWaveOutHandle,deviceID,wfx,callback,callbackInstance,flags) or 0
    except WindowsError as error:
        # Report the failure to the caller as a plain result code.
        status=error.winerror
    if status!=0 or not pWaveOutHandle:
        # Nothing was opened (or no handle was requested), so there is nothing to duck for.
        return status
    handleValue=pWaveOutHandle.contents.value
    ducker=audioDucking.AudioDucker()
    ducker.enable()
    _duckersByHandle[handleValue]=ducker
    return status
@WINFUNCTYPE(c_long,c_long)
def waveOutClose(waveOutHandle):
    """Replacement for winmm's waveOutClose which forgets the audio ducker kept for the handle.

    The call is forwarded to the real C{winmm.waveOutClose}.
    When that succeeds for a non-zero handle, the entry stored for this
    handle in C{_duckersByHandle} (if any) is discarded.
    @return: the result code of the real call (0 on success), or the Windows error code if it raised.
    """
    try:
        status=windll.winmm.waveOutClose(waveOutHandle) or 0
    except WindowsError as error:
        # Report the failure to the caller as a plain result code.
        status=error.winerror
    if status!=0 or not waveOutHandle:
        return status
    # A missing entry is fine: not every handle necessarily got a ducker.
    _duckersByHandle.pop(waveOutHandle,None)
    return status
# Holds the FunctionHooker objects for sapi.dll so that the hooks stay installed.
_waveOutHooks=[]

def ensureWaveOutHooks():
    """Install the waveOutOpen and waveOutClose hooks into sapi.dll, once.

    Nothing is done if hooks are already recorded in C{_waveOutHooks}
    or if audio ducking is not supported.
    """
    if _waveOutHooks:
        # Already hooked.
        return
    if not audioDucking.isAudioDuckingSupported():
        return
    systemRoot=os.path.expandvars("$SYSTEMROOT")
    sapiPath=os.path.join(systemRoot,"system32","speech","common","sapi.dll")
    for funcName,replacement in (("waveOutOpen",waveOutOpen),("waveOutClose",waveOutClose)):
        _waveOutHooks.append(FunctionHooker(sapiPath,"WINMM.dll",funcName,replacement))
class constants:
    """Flag values passed to the SAPI voice's Speak method by this driver."""
    # Used together with SVSFIsXML when speaking, and when cancelling speech.
    SVSFlagsAsync = 0x1
    # Used when cancelling speech.
    SVSFPurgeBeforeSpeak = 0x2
    # Used when speaking, since the text sent to SAPI contains XML tags.
    SVSFIsXML = 0x8
class SynthDriver(SynthDriver):
    """Synthesizer driver which speaks through the COM object named by L{COM_CLASS}.

    Rate and volume are stored on the COM voice object itself.
    Pitch is kept in C{self._pitch} and applied through XML markup
    around each utterance (see L{speak}).
    """
    supportedSettings=(SynthDriver.VoiceSetting(),SynthDriver.RateSetting(),SynthDriver.PitchSetting(),SynthDriver.VolumeSetting())

    # ProgID of the COM class used to create the voice object.
    COM_CLASS = "SAPI.SPVoice"

    name="sapi5"
    description="Microsoft Speech API version 5"

    @classmethod
    def check(cls):
        """Report whether the COM class is registered on this system.
        @return: C{True} if a registry key named L{COM_CLASS} exists under HKEY_CLASSES_ROOT, C{False} otherwise.
        @rtype: bool
        """
        try:
            r=_winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT,cls.COM_CLASS)
            r.Close()
            return True
        except WindowsError:
            # The key could not be opened, so the class is not available.
            return False

    def __init__(self,_defaultVoiceToken=None):
        """
        @param _defaultVoiceToken: an optional sapi voice token which should be used as the default voice (only useful for subclasses)
        @type _defaultVoiceToken: ISpeechObjectToken
        """
        ensureWaveOutHooks()
        self._pitch=50
        self._initTts(_defaultVoiceToken)

    def terminate(self):
        """Drop this driver's reference to the voice object."""
        del self.tts

    def _getAvailableVoices(self):
        """Collect the voices offered by L{_getVoiceTokens}.
        Tokens whose information cannot be read are skipped.
        @return: the available voices keyed by token ID, in enumeration order.
        @rtype: OrderedDict of L{VoiceInfo}
        """
        voices=OrderedDict()
        v=self._getVoiceTokens()
        # #2629: Iterating uses IEnumVARIANT and GetBestInterface doesn't work on tokens returned by some token enumerators.
        # Therefore, fetch the items by index, as that method explicitly returns the correct interface.
        for i in xrange(len(v)):
            try:
                token=v[i]
                ID=token.Id
                name=token.GetDescription()
                try:
                    # The attribute holds one or more hexadecimal language IDs separated by ";"; use the first.
                    language=locale.windows_locale[int(token.getattribute('language').split(';')[0],16)]
                except (KeyError,ValueError):
                    # Unknown or unparsable language ID: keep the voice, without a language.
                    language=None
            except COMError:
                log.warning("Could not get the voice info. Skipping...")
                # Really skip this token: ID, name and language are either
                # undefined or left over from the previous token.
                continue
            voices[ID]=VoiceInfo(ID,name,language)
        return voices

    def _getVoiceTokens(self):
        """Provides a collection of sapi5 voice tokens. Can be overridden by subclasses if tokens should be looked for in some other registry location."""
        return self.tts.getVoices()

    def _get_rate(self):
        """Return the voice object's rate converted to a percentage (inverse of L{_percentToRate})."""
        return (self.tts.rate*5)+50

    def _get_pitch(self):
        """Return the pitch percentage remembered by this driver."""
        return self._pitch

    def _get_volume(self):
        """Return the volume as reported by the voice object."""
        return self.tts.volume

    def _get_voice(self):
        """Return the ID of the voice token currently in use."""
        return self.tts.voice.Id

    def _get_lastIndex(self):
        """Return the most recently reached bookmark as an integer index.
        @return: the index, or C{None} if the voice object reports no bookmark.
        """
        bookmark=self.tts.status.LastBookmark
        if bookmark is None or bookmark=="":
            return None
        return int(bookmark)

    def _percentToRate(self, percent):
        """Convert a rate percentage into the unit used by the voice object's rate (50 maps to 0, each 5 percent is one step)."""
        return (percent - 50) / 5

    def _set_rate(self,rate):
        """Set the speaking rate from a percentage."""
        self.tts.Rate = self._percentToRate(rate)

    def _set_pitch(self,value):
        #pitch is really controlled with xml around speak commands
        self._pitch=value

    def _set_volume(self,value):
        """Set the volume directly on the voice object."""
        self.tts.Volume = value

    def _initTts(self, voice=None):
        """Create a fresh voice object, optionally selecting a voice, and route it to the configured output device.
        @param voice: the voice token to select, or C{None} to keep the default voice of the new object.
        """
        self.tts=comtypes.client.CreateObject(self.COM_CLASS)
        if voice:
            # #749: It seems that SAPI 5 doesn't reset the audio parameters when the voice is changed,
            # but only when the audio output is changed.
            # Therefore, set the voice before setting the audio output.
            # Otherwise, we will get poor speech quality in some cases.
            self.tts.voice = voice
        outputDeviceID=nvwave.outputDeviceNameToID(config.conf["speech"]["outputDevice"], True)
        if outputDeviceID>=0:
            self.tts.audioOutput=self.tts.getAudioOutputs()[outputDeviceID]

    def _set_voice(self,value):
        """Switch to the voice whose token ID equals C{value}.
        The voice object is recreated with that voice selected.
        Nothing happens if no token has this ID.
        """
        tokens = self._getVoiceTokens()
        # #2629: Iterating uses IEnumVARIANT and GetBestInterface doesn't work on tokens returned by some token enumerators.
        # Therefore, fetch the items by index, as that method explicitly returns the correct interface.
        for i in xrange(len(tokens)):
            voice=tokens[i]
            if value==voice.Id:
                break
        else:
            # Voice not found.
            return
        self._initTts(voice=voice)

    def _percentToPitch(self, percent):
        """Convert a pitch percentage into the value used for the C{absmiddle} attribute of the pitch tag (50 maps to 0)."""
        return percent / 2 - 25

    # Maps individual IPA characters to the phoneme symbols placed in the pron tag.
    IPA_TO_SAPI = {
        u"θ": u"th",
        u"s": u"s",
    }

    def _convertPhoneme(self, ipa):
        """Convert an IPA string into a space separated string of phoneme symbols.
        A primary stress mark in the IPA is emitted as C{1} after the phoneme which follows it.
        @raise LookupError: if the current voice's language attribute is not "409",
            or (as KeyError) if a character is missing from L{IPA_TO_SAPI}.
        """
        # We only know about US English phonemes.
        # Rather than just ignoring unknown phonemes, SAPI throws an exception.
        # Therefore, don't bother with any other language.
        if self.tts.voice.GetAttribute("language") != "409":
            raise LookupError("No data for this language")
        out = []
        outAfter = None
        for ipaChar in ipa:
            if ipaChar == u"ˈ":
                # The stress marker is output after the next phoneme rather than before it.
                outAfter = u"1"
                continue
            out.append(self.IPA_TO_SAPI[ipaChar])
            if outAfter:
                out.append(outAfter)
                outAfter = None
        if outAfter:
            out.append(outAfter)
        return u" ".join(out)

    def speak(self, speechSequence):
        """Convert a speech sequence into XML markup and speak it asynchronously.
        @param speechSequence: an iterable of strings and speech commands.
        """
        textList = []

        def escapeText(text):
            # The result is parsed as XML, so a literal "<" in spoken text
            # must not be mistaken for the start of a tag.
            return text.replace("<", "&lt;")

        # NVDA SpeechCommands are linear, but XML is hierarchical.
        # Therefore, we track values for non-empty tags.
        # When a tag changes, we close all previously opened tags and open new ones.
        tags = {}
        # We have to use something mutable here because it needs to be changed by the inner function.
        tagsChanged = [True]
        openedTags = []
        def outputTags():
            if not tagsChanged[0]:
                return
            for tag in reversed(openedTags):
                textList.append("</%s>" % tag)
            del openedTags[:]
            for tag, attrs in tags.iteritems():
                textList.append("<%s" % tag)
                for attr, val in attrs.iteritems():
                    textList.append(' %s="%s"' % (attr, val))
                textList.append(">")
                openedTags.append(tag)
            tagsChanged[0] = False

        pitch = self._pitch
        # Pitch must always be specified in the markup.
        tags["pitch"] = {"absmiddle": self._percentToPitch(pitch)}
        rate = self.rate
        volume = self.volume

        for item in speechSequence:
            if isinstance(item, basestring):
                outputTags()
                textList.append(escapeText(item))
            elif isinstance(item, speech.IndexCommand):
                textList.append('<Bookmark Mark="%d" />' % item.index)
            elif isinstance(item, speech.CharacterModeCommand):
                if item.state:
                    tags["spell"] = {}
                else:
                    tags.pop("spell", None)
                tagsChanged[0] = True
            elif isinstance(item, speech.BreakCommand):
                textList.append('<silence msec="%d" />' % item.time)
            elif isinstance(item, speech.PitchCommand):
                tags["pitch"] = {"absmiddle": self._percentToPitch(int(pitch * item.multiplier))}
                tagsChanged[0] = True
            elif isinstance(item, speech.VolumeCommand):
                if item.multiplier == 1:
                    # Back to the base volume: no tag needed.
                    tags.pop("volume", None)
                else:
                    tags["volume"] = {"level": int(volume * item.multiplier)}
                tagsChanged[0] = True
            elif isinstance(item, speech.RateCommand):
                if item.multiplier == 1:
                    # Back to the base rate: no tag needed.
                    tags.pop("rate", None)
                else:
                    tags["rate"] = {"absspeed": self._percentToRate(int(rate * item.multiplier))}
                tagsChanged[0] = True
            elif isinstance(item, speech.PhonemeCommand):
                try:
                    textList.append(u'<pron sym="%s">%s</pron>'
                        % (self._convertPhoneme(item.ipa), escapeText(item.text or u"")))
                except LookupError:
                    log.debugWarning("Couldn't convert character in IPA string: %s" % item.ipa)
                    # Fall back to speaking the plain text, if there is any.
                    if item.text:
                        textList.append(escapeText(item.text))
            elif isinstance(item, speech.SpeechCommand):
                log.debugWarning("Unsupported speech command: %s" % item)
            else:
                log.error("Unknown speech: %s" % item)
        # Close any tags that are still open.
        tags.clear()
        tagsChanged[0] = True
        outputTags()

        text = "".join(textList)
        flags = constants.SVSFIsXML | constants.SVSFlagsAsync
        self.tts.Speak(text, flags)

    def cancel(self):
        """Stop speech by asynchronously speaking nothing with the purge flag set."""
        #if self.tts.Status.RunningState == 2:
        self.tts.Speak(None, constants.SVSFlagsAsync|constants.SVSFPurgeBeforeSpeak)

    def pause(self,switch):
        """Cancel speech when C{switch} is true; nothing is done when it is false."""
        if switch:
            self.cancel()