onvif · Peggy0422 · Nov 10, 2025 · Nov 10, 2025 · Nov 10, 2025 · Nov 11, 2025
diff --git a/doc/Media2.xml b/doc/Media2.xml
@@ -2355,7 +2355,65 @@
             <para>When the size of the audio clip upload exceeds the MaxAudioClipSize parameter in KB, the device should return an HTTP 413, Request Entity Too Large error to the client.</para>
           </listitem>
         </itemizedlist>              
-      </section>    
+      </section>  
+	  <section xml:id="section_add_tts_audio_clip">
+        <title>AddTTSAudioClip</title>
+        <para>This operation adds a text, an audio clip configuration and a TTS configuration to the device, so that the device can convert the text to an audio clip based on the TTS configuration.
+			The response to the command includes a unique token for the converted audio clip.
+			If the device does not support the language specified in the TTS configuration, the associated configuration will be deleted from the device.</para>
+        <variablelist role="op">
+          <varlistentry>
+            <term>request</term>
+            <listitem>
+      <para role="param">Token - optional[tt:ReferenceToken]</para>
+              <para role="text"> Optional token associated with the audio clip.</para>
+              <para role="param">Configuration - 
+                [tr2:AudioClip]</para>
+              <para role="text"> Audio clip configuration to add.</para>
+      <para role="param">TTSConfiguration - 
+                [tr2:TTSAudio]</para>
+              <para role="text"> TTS configuration to add.</para>
+            </listitem>
+          </varlistentry>
+          <varlistentry>
+            <term>response</term>
+            <listitem>
+              <para role="param">Token - [tt:ReferenceToken]</para>
+              <para role="text">Unique token of the TTS audio clip to be uploaded.</para>             
+            </listitem>
+          </varlistentry>
+          <varlistentry>
+            <term>faults</term>
+            <listitem>
+              <para role="param">env:Receiver - ter:Action - ter:MaxAudioClipLimit</para>
+              <para role="text">The maximum number of audio clip configurations supported by the device has been reached.</para>
+              <para role="param">env:Sender - ter:InvalidArgVal - ter:InvalidConfig</para>
+              <para role="text">The configuration parameters are not possible to set.</para>  
+              <para role="param">env:Sender - ter:InvalidArgVal - ter:InvalidLanguage</para>
+              <para role="text">The language is not supported.</para>  
+            </listitem>
+          </varlistentry>
+          <varlistentry>
+            <term>access class</term>
+            <listitem>
+              <para role="access">WRITE_SYSTEM</para>
+            </listitem>
+          </varlistentry>
+        </variablelist>
+        <para></para>
+        <para><emphasis role="bold">Note:</emphasis> Audio clip uploads to the device can fail in the following scenarios, and a specific HTTP error code should be returned to the client when an upload fails.</para>
+        <itemizedlist>
+          <listitem>
+            <para>When the upload URL has expired, the device should return an HTTP 410 error to the client.</para>
+          </listitem>
+          <listitem>
+            <para>When the format of the audio clip upload does not match the Type parameter in the audio clip configuration, the device should return an HTTP 415 Unsupported Media Type error to the client.</para>
+          </listitem>
+          <listitem>
+            <para>When the size of the audio clip upload exceeds the MaxAudioClipSize parameter in KB, the device should return an HTTP 413, Request Entity Too Large error to the client.</para>
+          </listitem>
+        </itemizedlist>              
+      </section>       
 	  <section xml:id="section_wvd_dzg_rye">
         <title>SetAudioClip</title>
         <para>This operation modifies the existing audio clip configuration on the device.</para>
@@ -2754,6 +2812,10 @@
       </variablelist>    
     <para>The following audio clip Capabilities are available:</para>
 	<variablelist>
+		<varlistentry>
+          <term>TTSCapabilities</term>
+          <listitem><para>Indicates that the device supports the TTS function and TTS configuration.</para></listitem>
+        </varlistentry>
         <varlistentry>
           <term>MaxAudioClipLimit</term>
           <listitem><para>Indicates the maximum number of audio clips that can be uploaded to the device.</para></listitem>

diff --git a/wsdl/ver20/media/wsdl/media.wsdl b/wsdl/ver20/media/wsdl/media.wsdl
@@ -202,6 +202,13 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 			<!--===============================-->
 			<xs:complexType name="AudioClipCapabilities">
 				<xs:sequence>
+                  <!--==============TTS Capability=================-->
+					<xs:element name="TTSCapabilities" type="tr2:TTSCapabilities" minOccurs="0">
+						<xs:annotation>
+							<xs:documentation>Indicates device has TTS capability.</xs:documentation>
+						</xs:annotation>
+					</xs:element>
+                 <!--=============================================-->
 					<xs:any namespace="##any" processContents="lax" minOccurs="0" maxOccurs="unbounded"/>   <!-- first ONVIF then Vendor -->
 				</xs:sequence>
 				<xs:attribute name="MaxAudioClipLimit" type="xs:int">
@@ -222,6 +229,50 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 				<xs:anyAttribute processContents="lax"/>
 			</xs:complexType>
 			<!--===============================-->
+            <!--=============TTS Capability=================-->
+			<xs:complexType name="TTSCapabilities">
+				<xs:sequence>
+					<xs:any namespace="##any" processContents="lax" minOccurs="0" maxOccurs="unbounded"/>   <!-- first ONVIF then Vendor -->
+				</xs:sequence>
+				<xs:attribute name="MaxContentLength" type="xs:int">
+					<xs:annotation>
+						<xs:documentation>Indicates the maximum length of the text content that the device can convert to an audio clip.</xs:documentation>
+					</xs:annotation>
+				</xs:attribute>
+				<xs:attribute name="TTSLanguage" type="tt:StringAttrList">
+					<xs:annotation>
+						<xs:documentation>Indicates which language(s) the device supports for the TTS function. See tr2:TTSLanguage for defined values.</xs:documentation>
+					</xs:annotation>
+				</xs:attribute>
+				<xs:attribute name="TTSVoiceType" type="tt:StringAttrList">
+					<xs:annotation>
+						<xs:documentation>Indicates which voice type(s) the device supports for the TTS function. See tr2:TTSVoiceType for defined values.</xs:documentation>
+					</xs:annotation>
+				</xs:attribute>
+				<xs:anyAttribute processContents="lax"/>
+			</xs:complexType>
+            <!--===============TTS Language================-->
+			<xs:simpleType name="TTSLanguage">
+				<xs:restriction base="xs:string">
+					<xs:enumeration value="english"/>
+					<xs:enumeration value="chinese"/>
+					<xs:enumeration value="spanish"/>
+					<xs:enumeration value="japanese"/>
+					<xs:enumeration value="thai"/>
+					<xs:enumeration value="korean"/>
+					<xs:enumeration value="french"/>
+					<xs:enumeration value="brazilianPortuguese"/>
+				</xs:restriction>
+			</xs:simpleType>
+          <!--=============TTS Voice Type==================-->
+			<xs:simpleType name="TTSVoiceType">
+				<xs:restriction base="xs:string">
+					<xs:enumeration value="male"/>
+					<xs:enumeration value="female"/>
+				</xs:restriction>
+			</xs:simpleType>
+         <!--==============TTS End===============-->
+         <!--====================================-->
 			<xs:simpleType name="SupportedAudioClipFormat">
 				<xs:restriction base="xs:string">
 					<xs:enumeration value="audio/vnd.wave;codec=1"/>
@@ -1445,8 +1496,30 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 				</xs:sequence>
 				<xs:anyAttribute processContents="lax"/>
 			</xs:complexType>
-
-			<!--===============================-->
+             <!--=========TTS Audio======================-->
+			<xs:complexType name="TTSAudio">
+				<xs:sequence>
+					<xs:element name="Content" type="xs:string">
+						<xs:annotation>
+							<xs:documentation>Text content to be converted to an audio clip.</xs:documentation>
+						</xs:annotation>
+					</xs:element>
+					<xs:element name="Language" type="xs:string">
+						<xs:annotation>
+							<xs:documentation>Language of the text content; the audio clip is played back in the same language. See tr2:TTSLanguage for defined values.</xs:documentation>
+						</xs:annotation>
+					</xs:element>
+					<xs:element name="VoiceType" type="xs:string">
+						<xs:annotation>
+							<xs:documentation>Voice type used for audio clip playback. See tr2:TTSVoiceType for defined values.</xs:documentation>
+						</xs:annotation>
+					</xs:element>
+					<xs:any namespace="##any" processContents="lax" minOccurs="0" maxOccurs="unbounded"/>   <!-- first ONVIF then Vendor -->
+				</xs:sequence>
+				<xs:anyAttribute processContents="lax"/>
+			</xs:complexType>
+			<!--=========TTS Audio END=======================-->
+            <!--===============================-->
 			<xs:complexType name="GetAudioClipsResponseItem">
 				<xs:sequence>
 					<xs:element name="Token" type="tt:ReferenceToken">
@@ -1579,7 +1652,40 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 					</xs:sequence>
 				</xs:complexType>
 			</xs:element>
-
+        <!--==============TTS=================-->
+			<xs:element name="AddTTSAudioClip">
+				<xs:complexType>
+					<xs:sequence>
+						<xs:element name="Token" type="tt:ReferenceToken" minOccurs="0">
+							<xs:annotation>
+								<xs:documentation>Optional token associated with the audio clip.</xs:documentation>
+							</xs:annotation>
+						</xs:element>
+						<xs:element name="Configuration" type="tr2:AudioClip">
+							<xs:annotation>
+								<xs:documentation>Audio clip configuration to add.</xs:documentation>
+							</xs:annotation>
+						</xs:element>
+						<xs:element name="TTSConfiguration" type="tr2:TTSAudio">
+							<xs:annotation>
+								<xs:documentation>The configuration for the TTS audio clip to add.</xs:documentation>
+							</xs:annotation>
+						</xs:element>
+					</xs:sequence>
+				</xs:complexType>
+			</xs:element>
+			<xs:element name="AddTTSAudioClipResponse">
+				<xs:complexType>
+					<xs:sequence>
+						<xs:element name="Token" type="tt:ReferenceToken">
+							<xs:annotation>
+								<xs:documentation>Unique token of the TTS audio clip to be uploaded.</xs:documentation>
+							</xs:annotation>
+						</xs:element>
+					</xs:sequence>
+				</xs:complexType>
+			</xs:element>
+<!--==============TTS END=================-->
 			<xs:element name="DeleteAudioClip">
 				<xs:complexType>
 					<xs:sequence>
@@ -2018,6 +2124,14 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 	<wsdl:message name="AddAudioClipResponse">
 		<wsdl:part name="parameters" element="tr2:AddAudioClipResponse"/>
 	</wsdl:message>
+<!--==============TTS=================--> 
+	<wsdl:message name="AddTTSAudioClipRequest">
+		<wsdl:part name="parameters" element="tr2:AddTTSAudioClip"/>
+	</wsdl:message>
+	<wsdl:message name="AddTTSAudioClipResponse">
+		<wsdl:part name="parameters" element="tr2:AddTTSAudioClipResponse"/>
+	</wsdl:message>
+<!--==============================--> 
 	<wsdl:message name="SetAudioClipRequest">
 		<wsdl:part name="parameters" element="tr2:SetAudioClip"/>
 	</wsdl:message>
@@ -2412,6 +2526,13 @@ image will be updated automatically and independent from calls to GetSnapshotUri
 			<wsdl:input message="tr2:AddAudioClipRequest"/>
 			<wsdl:output message="tr2:AddAudioClipResponse"/>
 		</wsdl:operation>
+        <!--==============TTS=================--> 
+		<wsdl:operation name="AddTTSAudioClip">
+			<wsdl:documentation>This operation sends a text and its configuration to a device that supports the TTS function, so that the device can convert the text into an audio clip and play it according to the audio clip configuration and the TTS configuration.</wsdl:documentation>
+			<wsdl:input message="tr2:AddTTSAudioClipRequest"/>
+			<wsdl:output message="tr2:AddTTSAudioClipResponse"/>
+		</wsdl:operation>
+        <!--=============================--> 
 		<wsdl:operation name="SetAudioClip">
 			<wsdl:documentation>This operation modifies the existing audio clip configuration on the device.</wsdl:documentation>
 			<wsdl:input message="tr2:SetAudioClipRequest"/>
@@ -2940,6 +3061,17 @@ image will be updated automatically and independent from calls to GetSnapshotUri
 				<soap:body use="literal"/>
 			</wsdl:output>
 		</wsdl:operation>
+        <!--==============TTS=================-->
+		<wsdl:operation name="AddTTSAudioClip">
+			<soap:operation soapAction="http://www.onvif.org/ver20/media/wsdl/AddTTSAudioClip"/>
+			<wsdl:input>
+				<soap:body use="literal"/>
+			</wsdl:input>
+			<wsdl:output>
+				<soap:body use="literal"/>
+			</wsdl:output>
+		</wsdl:operation>
+       <!--=================================-->
 		<wsdl:operation name="SetAudioClip">
 			<soap:operation soapAction="http://www.onvif.org/ver20/media/wsdl/SetAudioClip"/>
 			<wsdl:input>