Răsfoiți Sursa

阿里云语音识别demo提交

liyanbo 1 lună în urmă
părinte
comite
b8fa4a5033

+ 7 - 0
byzs-module-ai/pom.xml

@@ -233,6 +233,13 @@
             <artifactId>nls-sdk-common</artifactId>
             <version>2.2.14</version>
         </dependency>
+
+        <!-- 阿里云 - 语音识别 -->
+        <dependency>
+            <groupId>com.alibaba.nls</groupId>
+            <artifactId>nls-sdk-transcriber</artifactId>
+            <version>2.2.14</version>
+        </dependency>
     </dependencies>
 
 </project>

+ 337 - 0
byzs-module-ai/src/main/java/cn/iocoder/byzs/module/ai/controller/admin/speech/RealTimeSpeechController.java

@@ -0,0 +1,337 @@
+package cn.iocoder.byzs.module.ai.controller.admin.speech;
+
+import com.alibaba.nls.client.AccessToken;
+import com.alibaba.nls.client.protocol.NlsClient;
+import com.alibaba.nls.client.protocol.InputFormatEnum;
+import com.alibaba.nls.client.protocol.SampleRateEnum;
+import com.alibaba.nls.client.protocol.asr.SpeechTranscriber;
+import com.alibaba.nls.client.protocol.asr.SpeechTranscriberListener;
+import com.alibaba.nls.client.protocol.asr.SpeechTranscriberResponse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.web.bind.annotation.*;
+import org.springframework.web.multipart.MultipartFile;
+import org.springframework.web.socket.CloseStatus;
+import org.springframework.web.socket.TextMessage;
+import org.springframework.web.socket.WebSocketSession;
+import org.springframework.web.socket.handler.TextWebSocketHandler;
+
+import jakarta.servlet.http.HttpServletRequest;
+import org.springframework.web.context.request.RequestContextHolder;
+import org.springframework.web.context.request.ServletRequestAttributes;
+import org.springframework.web.multipart.MultipartFile;
+import org.springframework.web.multipart.MultipartHttpServletRequest;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+@RestController
+@RequestMapping("/admin/ai/speech")
+public class RealTimeSpeechController {
+
+    private static final Logger logger = LoggerFactory.getLogger(RealTimeSpeechController.class);
+
+    @Value("${ai.aliyun.app-key:4SUOF4LfaU7FekyW}")
+    private String appKey;
+
+    @Value("${ai.aliyun.access-key-id:LTAI5tQhMPLXtSgXiPiWbw6D}")
+    private String accessKeyId;
+
+    @Value("${ai.aliyun.access-key-secret:HCXpFYjl4swk0qwfIKa9s2bXx0AWcG}")
+    private String accessKeySecret;
+
+    @Value("${ai.aliyun.nls-gateway-url:wss://nls-gateway-cn-shanghai.aliyuncs.com/ws/v1}")
+    private String nlsGatewayUrl;
+
+    // 存储每个会话的识别结果
+    private final Map<String, StringBuilder> transcriptionResults = new ConcurrentHashMap<>();
+
+    // 存储每个会话的NlsClient
+    private final Map<String, NlsClient> nlsClients = new ConcurrentHashMap<>();
+
+    // 存储每个会话的SpeechTranscriber
+    private final Map<String, SpeechTranscriber> transcribers = new ConcurrentHashMap<>();
+
+    // 存储每个会话的结束信号
+    private final Map<String, CountDownLatch> latches = new ConcurrentHashMap<>();
+    
+    // 存储WebSocket会话,用于实时返回中间结果
+    private final Map<String, WebSocketSession> webSocketSessions = new ConcurrentHashMap<>();
+
+    // WebSocket处理器
+    public class SpeechWebSocketHandler extends TextWebSocketHandler {
+        private final Map<String, WebSocketSession> webSocketSessions;
+        private final Logger logger = LoggerFactory.getLogger(SpeechWebSocketHandler.class);
+        
+        public SpeechWebSocketHandler(Map<String, WebSocketSession> webSocketSessions) {
+            this.webSocketSessions = webSocketSessions;
+        }
+        
+        @Override
+        public void afterConnectionEstablished(WebSocketSession session) throws Exception {
+            // 从会话参数中获取sessionId
+            String sessionId = session.getUri().getQuery().split("=")[1];
+            webSocketSessions.put(sessionId, session);
+            logger.info("WebSocket连接建立,sessionId: {}", sessionId);
+        }
+        
+        @Override
+        protected void handleTextMessage(WebSocketSession session, TextMessage message) throws Exception {
+            // 处理客户端消息
+        }
+        
+        @Override
+        public void afterConnectionClosed(WebSocketSession session, CloseStatus status) throws Exception {
+            // 从会话参数中获取sessionId
+            String sessionId = session.getUri().getQuery().split("=")[1];
+            webSocketSessions.remove(sessionId);
+            logger.info("WebSocket连接关闭,sessionId: {}", sessionId);
+        }
+    }
+
+    /**
+     * 开始语音识别会话
+     */
+    @PostMapping("/start")
+    public Map<String, Object> startRecognition() {
+        // 生成唯一的会话ID
+        String sessionId = java.util.UUID.randomUUID().toString();
+        try {
+            // 清理旧的会话资源
+            cleanupSession(sessionId);
+            
+            // 初始化NlsClient
+            AccessToken accessToken = new AccessToken(accessKeyId, accessKeySecret);
+            accessToken.apply();
+            NlsClient client = new NlsClient(nlsGatewayUrl, accessToken.getToken());
+            nlsClients.put(sessionId, client);
+
+            // 初始化识别结果
+            transcriptionResults.put(sessionId, new StringBuilder());
+
+            // 初始化结束信号
+            latches.put(sessionId, new CountDownLatch(1));
+
+            // 创建SpeechTranscriber
+            SpeechTranscriber transcriber = new SpeechTranscriber(client, getTranscriberListener(sessionId));
+            transcriber.setAppKey(appKey);
+            transcriber.setFormat(InputFormatEnum.PCM);
+            transcriber.setSampleRate(SampleRateEnum.SAMPLE_RATE_16K);
+            transcriber.setEnableIntermediateResult(true);
+            transcriber.setEnablePunctuation(true);
+            transcriber.setEnableITN(false);
+            transcriber.start();
+
+            transcribers.put(sessionId, transcriber);
+
+            // 启动心跳线程,避免WebSocket会话超时
+            startHeartbeatThread(sessionId, transcriber);
+
+            return Map.of("success", true, "message", "语音识别会话已开始", "sessionId", sessionId);
+        } catch (Exception e) {
+            logger.error("开始语音识别失败", e);
+            cleanupSession(sessionId);
+            return Map.of("success", false, "message", "开始语音识别失败: " + e.getMessage());
+        }
+    }
+
+    /**
+     * 启动心跳线程,定期发送空数据保持WebSocket连接
+     */
+    private void startHeartbeatThread(String sessionId, SpeechTranscriber transcriber) {
+        Thread heartbeatThread = new Thread(() -> {
+            try {
+                while (transcribers.containsKey(sessionId)) {
+                    // 发送空数据保持连接
+                    transcriber.send(new byte[0], 0);
+                    Thread.sleep(5000); // 每5秒发送一次心跳
+                }
+            } catch (Exception e) {
+                logger.error("心跳线程异常", e);
+            }
+        });
+        heartbeatThread.setDaemon(true);
+        heartbeatThread.start();
+    }
+
+    /**
+     * 清理会话资源
+     */
+    private void cleanupSession(String sessionId) {
+        try {
+            SpeechTranscriber transcriber = transcribers.remove(sessionId);
+            if (transcriber != null) {
+                transcriber.close();
+            }
+            NlsClient client = nlsClients.remove(sessionId);
+            if (client != null) {
+                client.shutdown();
+            }
+            transcriptionResults.remove(sessionId);
+            latches.remove(sessionId);
+        } catch (Exception e) {
+            logger.error("清理会话资源失败", e);
+        }
+    }
+
+    /**
+     * 接收音频数据并发送到阿里云
+     */
+    @PostMapping("/stream")
+    public Map<String, Object> streamAudio(HttpServletRequest request) {
+        try {
+            // 从请求参数中获取sessionId
+            String sessionId = null;
+            
+            // 首先尝试从URL参数中获取
+            sessionId = request.getParameter("sessionId");
+            
+            // 如果URL参数中没有,尝试从multipart表单中获取
+            if (sessionId == null || sessionId.isEmpty()) {
+                if (request instanceof MultipartHttpServletRequest) {
+                    MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
+                    sessionId = multipartRequest.getParameter("sessionId");
+                }
+            }
+            
+            if (sessionId == null || sessionId.isEmpty()) {
+                logger.error("sessionId参数缺失,请求参数: {}", request.getParameterMap());
+                return Map.of("success", false, "message", "请求参数缺失:sessionId");
+            }
+            
+            // 获取音频文件
+            MultipartFile audioFile = null;
+            if (request instanceof MultipartHttpServletRequest) {
+                MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
+                audioFile = multipartRequest.getFile("audio");
+            }
+            
+            SpeechTranscriber transcriber = transcribers.get(sessionId);
+            if (transcriber == null) {
+                logger.error("会话不存在,sessionId: {}", sessionId);
+                return Map.of("success", false, "message", "语音识别会话未开始");
+            }
+
+            if (audioFile == null || audioFile.isEmpty()) {
+                // 忽略空数据,避免发送空数据到阿里云
+                logger.info("接收到空音频数据,忽略处理");
+                return Map.of("success", true, "message", "音频数据已接收");
+            }
+
+            byte[] audioData = audioFile.getBytes();
+            if (audioData.length > 0) {
+                transcriber.send(audioData, audioData.length);
+            } else {
+                // 忽略空数据,避免发送空数据到阿里云
+                logger.info("接收到空音频数据,忽略处理");
+            }
+
+            return Map.of("success", true, "message", "音频数据已接收");
+        } catch (Exception e) {
+            logger.error("处理音频数据失败", e);
+            return Map.of("success", false, "message", "处理音频数据失败: " + e.getMessage());
+        }
+    }
+
+    /**
+     * 结束语音识别会话并返回结果
+     */
+    @PostMapping("/stop")
+    public Map<String, Object> stopRecognition(@RequestParam("sessionId") String sessionId) {
+        try {
+            SpeechTranscriber transcriber = transcribers.get(sessionId);
+            CountDownLatch latch = latches.get(sessionId);
+
+            if (transcriber != null) {
+                try {
+                    transcriber.stop();
+                    // 等待识别完成
+                    if (latch != null) {
+                        latch.await(5, TimeUnit.SECONDS);
+                    }
+                } catch (Exception e) {
+                    logger.error("停止transcriber失败", e);
+                    // 继续执行,确保资源被清理
+                }
+            }
+
+            // 获取识别结果
+            StringBuilder result = transcriptionResults.get(sessionId);
+            String finalResult = result != null ? result.toString() : "";
+
+            // 清理资源
+            cleanupSession(sessionId);
+
+            return Map.of("success", true, "result", finalResult);
+        } catch (Exception e) {
+            logger.error("停止语音识别失败", e);
+            // 确保资源被清理
+            cleanupSession(sessionId);
+            return Map.of("success", false, "message", "停止语音识别失败: " + e.getMessage());
+        }
+    }
+
+    /**
+     * 获取语音识别监听器
+     */
+    private SpeechTranscriberListener getTranscriberListener(String sessionId) {
+        return new SpeechTranscriberListener() {
+            @Override
+            public void onTranscriptionResultChange(SpeechTranscriberResponse response) {
+                String result = response.getTransSentenceText();
+                logger.info("实时识别中间结果: " + result);
+                
+                // 通过WebSocket实时返回中间结果
+                WebSocketSession session = webSocketSessions.get(sessionId);
+                if (session != null && session.isOpen()) {
+                    try {
+                        session.sendMessage(new TextMessage("{\"type\":\"intermediate\",\"result\":\"" + result + "\"}"));
+                    } catch (Exception e) {
+                        logger.error("发送WebSocket消息失败", e);
+                    }
+                }
+            }
+
+            @Override
+            public void onTranscriberStart(SpeechTranscriberResponse response) {
+                logger.info("语音识别会话开始, task_id: " + response.getTaskId());
+            }
+
+            @Override
+            public void onSentenceBegin(SpeechTranscriberResponse response) {
+                logger.info("开始识别新句子");
+
+            }
+
+            @Override
+            public void onSentenceEnd(SpeechTranscriberResponse response) {
+                logger.info("句子识别完成, 结果: " + response.getTransSentenceText());
+                logger.info("置信度: " + response.getConfidence() + ", 开始时间: " + response.getSentenceBeginTime() + ", 处理时长: " + response.getTransSentenceTime() + "ms");
+                StringBuilder result = transcriptionResults.get(sessionId);
+                if (result != null) {
+                    result.append(response.getTransSentenceText());
+                }
+            }
+
+            @Override
+            public void onTranscriptionComplete(SpeechTranscriberResponse response) {
+                logger.info("语音识别会话完成");
+                CountDownLatch latch = latches.get(sessionId);
+                if (latch != null) {
+                    latch.countDown();
+                }
+            }
+
+            @Override
+            public void onFail(SpeechTranscriberResponse response) {
+                logger.error("语音识别失败: " + response.getStatusText() + ", 状态码: " + response.getStatus());
+                CountDownLatch latch = latches.get(sessionId);
+                if (latch != null) {
+                    latch.countDown();
+                }
+            }
+        };
+    }
+}

+ 189 - 0
byzs-module-ai/src/main/java/cn/iocoder/byzs/module/ai/controller/admin/speech/RealTimeSpeechTranscriberDemo.java

@@ -0,0 +1,189 @@
+package cn.iocoder.byzs.module.ai.controller.admin.speech;
+
+import com.alibaba.nls.client.AccessToken;
+import com.alibaba.nls.client.protocol.NlsClient;
+import com.alibaba.nls.client.protocol.InputFormatEnum;
+import com.alibaba.nls.client.protocol.SampleRateEnum;
+import com.alibaba.nls.client.protocol.asr.SpeechTranscriber;
+import com.alibaba.nls.client.protocol.asr.SpeechTranscriberListener;
+import com.alibaba.nls.client.protocol.asr.SpeechTranscriberResponse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.DataLine;
+import javax.sound.sampled.TargetDataLine;
+
+public class RealTimeSpeechTranscriberDemo {
+    private static final Logger logger = LoggerFactory.getLogger(RealTimeSpeechTranscriberDemo.class);
+    private String appKey;
+    private NlsClient client;
+
+    public RealTimeSpeechTranscriberDemo(String appKey, String id, String secret, String url) {
+        this.appKey = appKey;
+        //创建NlsClient实例,应用全局创建一个即可,默认服务地址为阿里云线上服务地址。
+        //获取token,实际使用时注意在accessToken.getExpireTime()过期前再次获取。
+        AccessToken accessToken = new AccessToken(id, secret);
+        try {
+            accessToken.apply();
+            System.out.println("get token: " + ", expire time: " + accessToken.getExpireTime());
+            if(url.isEmpty()) {
+                client = new NlsClient(accessToken.getToken());
+            }else {
+                client = new NlsClient(url, accessToken.getToken());
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    private SpeechTranscriberListener getTranscriberListener() {
+        return new SpeechTranscriberListener() {
+            //识别出中间结果。仅当setEnableIntermediateResult为true时,才会返回该消息。
+            @Override
+            public void onTranscriptionResultChange(SpeechTranscriberResponse response) {
+                System.out.println("task_id: " + response.getTaskId() +
+                    ", name: " + response.getName() +
+                    //状态码"20000000"表示正常识别。
+                    ", status: " + response.getStatus() +
+                    //句子编号,从1开始递增。
+                    ", index: " + response.getTransSentenceIndex() +
+                    //当前的识别结果。
+                    ", result: " + response.getTransSentenceText() +
+                    //当前已处理的音频时长,单位为毫秒。
+                    ", time: " + response.getTransSentenceTime());
+            }
+
+            @Override
+            public void onTranscriberStart(SpeechTranscriberResponse response) {
+                //task_id是调用方和服务端通信的唯一标识,遇到问题时,需要提供此task_id。
+                System.out.println("task_id: " + response.getTaskId() + ", name: " + response.getName() + ", status: " + response.getStatus());
+            }
+
+            @Override
+            public void onSentenceBegin(SpeechTranscriberResponse response) {
+                System.out.println("task_id: " + response.getTaskId() + ", name: " + response.getName() + ", status: " + response.getStatus());
+
+            }
+
+            //识别出一句话。服务端会智能断句,当识别到一句话结束时会返回此消息。
+            @Override
+            public void onSentenceEnd(SpeechTranscriberResponse response) {
+                System.out.println("task_id: " + response.getTaskId() +
+                    ", name: " + response.getName() +
+                    //状态码"20000000"表示正常识别。
+                    ", status: " + response.getStatus() +
+                    //句子编号,从1开始递增。
+                    ", index: " + response.getTransSentenceIndex() +
+                    //当前的识别结果。
+                    ", result: " + response.getTransSentenceText() +
+                    //置信度
+                    ", confidence: " + response.getConfidence() +
+                    //开始时间
+                    ", begin_time: " + response.getSentenceBeginTime() +
+                    //当前已处理的音频时长,单位为毫秒。
+                    ", time: " + response.getTransSentenceTime());
+            }
+
+            //识别完毕
+            @Override
+            public void onTranscriptionComplete(SpeechTranscriberResponse response) {
+                System.out.println("task_id: " + response.getTaskId() + ", name: " + response.getName() + ", status: " + response.getStatus());
+            }
+
+            @Override
+            public void onFail(SpeechTranscriberResponse response) {
+                //task_id是调用方和服务端通信的唯一标识,遇到问题时,需要提供此task_id。
+                System.out.println("task_id: " + response.getTaskId() +  ", status: " + response.getStatus() + ", status_text: " + response.getStatusText());
+            }
+        };
+    }
+
+    public void process() {
+        SpeechTranscriber transcriber = null;
+        TargetDataLine targetDataLine = null;
+        try {
+            //创建实例、建立连接。
+            transcriber = new SpeechTranscriber(client, getTranscriberListener());
+            transcriber.setAppKey(appKey);
+            //输入音频编码方式。
+            transcriber.setFormat(InputFormatEnum.PCM);
+            //输入音频采样率。
+            transcriber.setSampleRate(SampleRateEnum.SAMPLE_RATE_16K);
+            //是否返回中间识别结果。
+            transcriber.setEnableIntermediateResult(false);
+            //是否生成并返回标点符号。
+            transcriber.setEnablePunctuation(true);
+            //是否将返回结果规整化,比如将一百返回为100。
+            transcriber.setEnableITN(false);
+
+            //此方法将以上参数设置序列化为JSON发送给服务端,并等待服务端确认。
+            transcriber.start();
+
+            //配置音频采集格式(16kHz,16位,单声道,小端序)
+            AudioFormat format = new AudioFormat(16000, 16, 1, true, false);
+            DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
+            
+            //打开麦克风输入流
+            targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
+            targetDataLine.open(format);
+            targetDataLine.start();
+
+            //缓冲区大小(3200字节 = 100ms音频)
+            byte[] buffer = new byte[3200];
+            int len;
+
+            System.out.println("开始实时采集音频...");
+            System.out.println("按Ctrl+C停止程序");
+            
+            //实时读取音频数据并发送
+            while (true) {
+                len = targetDataLine.read(buffer, 0, buffer.length);
+                if (len > 0) {
+                    logger.info("send data pack length: " + len);
+                    transcriber.send(buffer, len);
+                }
+            }
+        } catch (Exception e) {
+            System.err.println(e.getMessage());
+        } finally {
+            //停止音频采集
+            if (targetDataLine != null) {
+                targetDataLine.stop();
+                targetDataLine.close();
+            }
+            //通知服务端语音数据发送完毕,等待服务端处理完成。
+            if (transcriber != null) {
+                try {
+                    long now = System.currentTimeMillis();
+                    logger.info("ASR wait for complete");
+                    transcriber.stop();
+                    logger.info("ASR latency : " + (System.currentTimeMillis() - now) + " ms");
+                } catch (Exception e) {
+                    e.printStackTrace();
+                } finally {
+                    transcriber.close();
+                }
+            }
+        }
+    }
+
+    public void shutdown() {
+        client.shutdown();
+    }
+
+    public static void main(String[] args) throws Exception {
+        String appKey = "4SUOF4LfaU7FekyW";
+        String id = "LTAI5tQhMPLXtSgXiPiWbw6D";
+        String secret = "HCXpFYjl4swk0qwfIKa9s2bXx0AWcG";
+        String url = System.getenv().getOrDefault("NLS_GATEWAY_URL", "wss://nls-gateway-cn-shanghai.aliyuncs.com/ws/v1");
+        
+        System.out.println("实时语音识别Demo启动中...");
+        System.out.println("请确保已正确配置AccessKey凭证");
+        
+        RealTimeSpeechTranscriberDemo demo = new RealTimeSpeechTranscriberDemo(appKey, id, secret, url);
+        demo.process();
+        demo.shutdown();
+    }
+}

+ 201 - 0
byzs-module-ai/src/main/java/cn/iocoder/byzs/module/ai/controller/admin/speech/SpeechTranscriberDemo.java

@@ -0,0 +1,201 @@
+package cn.iocoder.byzs.module.ai.controller.admin.speech;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+
+import com.alibaba.nls.client.AccessToken;
+import com.alibaba.nls.client.protocol.InputFormatEnum;
+import com.alibaba.nls.client.protocol.NlsClient;
+import com.alibaba.nls.client.protocol.SampleRateEnum;
+import com.alibaba.nls.client.protocol.asr.SpeechTranscriber;
+import com.alibaba.nls.client.protocol.asr.SpeechTranscriberListener;
+import com.alibaba.nls.client.protocol.asr.SpeechTranscriberResponse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+/**
+ * 此示例演示了:
+ * ASR实时识别API调用。
+ * 动态获取token。获取Token具体操作,请参见:https://help.aliyun.com/document_detail/450514.html
+ * 通过本地模拟实时流发送。
+ * 识别耗时计算。
+ */
+public class SpeechTranscriberDemo {
+    private String appKey;
+    private NlsClient client;
+    private static final Logger logger = LoggerFactory.getLogger(SpeechTranscriberDemo.class);
+
+    public SpeechTranscriberDemo(String appKey, String id, String secret, String url) {
+        this.appKey = appKey;
+        //应用全局创建一个NlsClient实例,默认服务地址为阿里云线上服务地址。
+        //获取token,实际使用时注意在accessToken.getExpireTime()过期前再次获取。
+        AccessToken accessToken = new AccessToken(id, secret);
+        try {
+            accessToken.apply();
+            System.out.println("get token: " + ", expire time: " + accessToken.getExpireTime());
+            if(url.isEmpty()) {
+                client = new NlsClient(accessToken.getToken());
+            }else {
+                client = new NlsClient(url, accessToken.getToken());
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    private static SpeechTranscriberListener getTranscriberListener() {
+        SpeechTranscriberListener listener = new SpeechTranscriberListener() {
+            //识别出中间结果。仅当setEnableIntermediateResult为true时,才会返回该消息。
+            @Override
+            public void onTranscriptionResultChange(SpeechTranscriberResponse response) {
+                System.out.println("task_id: " + response.getTaskId() +
+                    ", name: " + response.getName() +
+                    //状态码“20000000”表示正常识别。
+                    ", status: " + response.getStatus() +
+                    //句子编号,从1开始递增。
+                    ", index: " + response.getTransSentenceIndex() +
+                    //当前的识别结果。
+                    ", result: " + response.getTransSentenceText() +
+                    //当前已处理的音频时长,单位为毫秒。
+                    ", time: " + response.getTransSentenceTime());
+            }
+
+            @Override
+            public void onTranscriberStart(SpeechTranscriberResponse response) {
+                //task_id是调用方和服务端通信的唯一标识,遇到问题时,需要提供此task_id。
+                System.out.println("task_id: " + response.getTaskId() + ", name: " + response.getName() + ", status: " + response.getStatus());
+            }
+
+            @Override
+            public void onSentenceBegin(SpeechTranscriberResponse response) {
+                System.out.println("task_id: " + response.getTaskId() + ", name: " + response.getName() + ", status: " + response.getStatus());
+
+            }
+
+            //识别出一句话。服务端会智能断句,当识别到一句话结束时会返回此消息。
+            @Override
+            public void onSentenceEnd(SpeechTranscriberResponse response) {
+                System.out.println("task_id: " + response.getTaskId() +
+                    ", name: " + response.getName() +
+                    //状态码“20000000”表示正常识别。
+                    ", status: " + response.getStatus() +
+                    //句子编号,从1开始递增。
+                    ", index: " + response.getTransSentenceIndex() +
+                    //当前的识别结果。
+                    ", result: " + response.getTransSentenceText() +
+                    //置信度
+                    ", confidence: " + response.getConfidence() +
+                    //开始时间
+                    ", begin_time: " + response.getSentenceBeginTime() +
+                    //当前已处理的音频时长,单位为毫秒。
+                    ", time: " + response.getTransSentenceTime());
+            }
+
+            //识别完毕
+            @Override
+            public void onTranscriptionComplete(SpeechTranscriberResponse response) {
+                System.out.println("task_id: " + response.getTaskId() + ", name: " + response.getName() + ", status: " + response.getStatus());
+            }
+
+            @Override
+            public void onFail(SpeechTranscriberResponse response) {
+                //task_id是调用方和服务端通信的唯一标识,遇到问题时,需要提供此task_id。
+                System.out.println("task_id: " + response.getTaskId() +  ", status: " + response.getStatus() + ", status_text: " + response.getStatusText());
+            }
+        };
+
+        return listener;
+    }
+
+    //根据二进制数据大小计算对应的同等语音长度。
+    //sampleRate:支持8000或16000。
+    public static int getSleepDelta(int dataSize, int sampleRate) {
+        // 仅支持16位采样。
+        int sampleBytes = 16;
+        // 仅支持单通道。
+        int soundChannel = 1;
+        return (dataSize * 10 * 8000) / (160 * sampleRate);
+    }
+
+    public void process(String filepath) {
+        SpeechTranscriber transcriber = null;
+        try {
+            //创建实例、建立连接。
+            transcriber = new SpeechTranscriber(client, getTranscriberListener());
+            transcriber.setAppKey(appKey);
+            //输入音频编码方式。
+            transcriber.setFormat(InputFormatEnum.WAV);
+            //输入音频采样率。
+            transcriber.setSampleRate(SampleRateEnum.SAMPLE_RATE_16K);
+            //是否返回中间识别结果。
+            transcriber.setEnableIntermediateResult(false);
+            //是否生成并返回标点符号。
+            transcriber.setEnablePunctuation(true);
+            //是否将返回结果规整化,比如将一百返回为100。
+            transcriber.setEnableITN(false);
+
+            //设置vad断句参数。默认值:800ms,有效值:200ms~6000ms。
+            //transcriber.addCustomedParam("max_sentence_silence", 600);
+            //设置是否语义断句。
+            //transcriber.addCustomedParam("enable_semantic_sentence_detection",false);
+            //设置是否开启过滤语气词,即声音顺滑。
+            //transcriber.addCustomedParam("disfluency",true);
+            //设置是否开启词模式。
+            //transcriber.addCustomedParam("enable_words",true);
+           //设置vad噪音阈值参数,参数取值为-1~+1,如-0.9、-0.8、0.2、0.9。
+            //取值越趋于-1,判定为语音的概率越大,亦即有可能更多噪声被当成语音被误识别。
+            //取值越趋于+1,判定为噪音的越多,亦即有可能更多语音段被当成噪音被拒绝识别。
+            //该参数属高级参数,调整需慎重和重点测试。
+            //transcriber.addCustomedParam("speech_noise_threshold",0.3);
+            //设置训练后的定制语言模型id。
+            //transcriber.addCustomedParam("customization_id","你的定制语言模型id");
+            //设置训练后的定制热词id。
+            //transcriber.addCustomedParam("vocabulary_id","你的定制热词id");
+
+            //此方法将以上参数设置序列化为JSON发送给服务端,并等待服务端确认。
+            transcriber.start();
+
+            File file = new File(filepath);
+            FileInputStream fis = new FileInputStream(file);
+            byte[] b = new byte[3200];
+            int len;
+            while ((len = fis.read(b)) > 0) {
+                logger.info("send data pack length: " + len);
+                transcriber.send(b, len);
+                //本案例用读取本地文件的形式模拟实时获取语音流并发送的,因为读取速度较快,这里需要设置sleep。
+                //如果实时获取语音则无需设置sleep, 如果是8k采样率语音第二个参数设置为8000。
+                int deltaSleep = getSleepDelta(len, 16000);
+                Thread.sleep(deltaSleep);
+            }
+
+            //通知服务端语音数据发送完毕,等待服务端处理完成。
+            long now = System.currentTimeMillis();
+            logger.info("ASR wait for complete");
+            transcriber.stop();
+            logger.info("ASR latency : " + (System.currentTimeMillis() - now) + " ms");
+        } catch (Exception e) {
+            System.err.println(e.getMessage());
+        } finally {
+            if (null != transcriber) {
+                transcriber.close();
+            }
+        }
+    }
+
+    public void shutdown() {
+        client.shutdown();
+    }
+
+    public static void main(String[] args) throws Exception {
+        String appKey = "4SUOF4LfaU7FekyW";
+        String id = "LTAI5tQhMPLXtSgXiPiWbw6D";
+        String secret = "HCXpFYjl4swk0qwfIKa9s2bXx0AWcG";
+        String url = System.getenv().getOrDefault("NLS_GATEWAY_URL", "wss://nls-gateway-cn-shanghai.aliyuncs.com/ws/v1");
+      
+        //本案例使用本地文件模拟发送实时流数据。您在实际使用时,可以实时采集或接收语音流并发送到ASR服务端。
+        String filepath = "D:\\Administrator\\Documents\\录音\\duiuha.mpga.wav";
+        SpeechTranscriberDemo demo = new SpeechTranscriberDemo(appKey, id, secret, url);
+        demo.process(filepath);
+        demo.shutdown();
+    }
+}

+ 286 - 0
byzs-web/src/views/SpeechRecognition.vue

@@ -0,0 +1,286 @@
+<template>
+  <!-- Real-time speech recognition panel: start/stop controls, the live
+       intermediate transcript, the final transcript, and a status line. -->
+  <div class="speech-recognition">
+    <h3>实时语音识别</h3>
+    <div class="controls">
+      <button
+          @click="startRecognition"
+          :disabled="isRecording"
+          class="btn-start"
+      >
+        {{ isRecording ? '识别中...' : '开始识别' }}
+      </button>
+      <button
+          @click="stopRecognition"
+          :disabled="!isRecording"
+          class="btn-stop"
+      >
+        停止识别
+      </button>
+    </div>
+    <!-- Shown as soon as either a partial or a final result exists. -->
+    <div class="result" v-if="intermediateResult || recognitionResult">
+      <h4>实时识别结果:</h4>
+      <p class="intermediate" v-if="intermediateResult">{{ intermediateResult }}</p>
+      <h4 v-if="recognitionResult">最终识别结果:</h4>
+      <p class="final" v-if="recognitionResult">{{ recognitionResult }}</p>
+    </div>
+    <div class="status" v-if="status">
+      {{ status }}
+    </div>
+  </div>
+</template>
+
+<script setup>
+import { ref, onUnmounted } from 'vue';
+import axios from '@/utils/request';
+
+// Reactive UI state.
+const isRecording = ref(false);      // true while the microphone is capturing
+const recognitionResult = ref('');   // final transcript returned by the /stop endpoint
+const intermediateResult = ref('');  // live partial transcript pushed over the WebSocket
+const status = ref('');              // human-readable status / error line
+// Non-reactive session handles, reset between recordings.
+let sessionId = null;          // recognition session id returned by /start
+let mediaStream = null;        // MediaStream obtained from getUserMedia
+let audioContext = null;       // Web Audio context tapping the microphone
+let mediaStreamSource = null;  // source node wrapping mediaStream
+let scriptProcessor = null;    // ScriptProcessorNode delivering PCM frames
+let streamInterval = null;     // NOTE(review): declared but never used — candidate for removal
+let webSocket = null;          // receives intermediate results from the server
+
+// 开始语音识别
+const startRecognition = async () => {
+  try {
+    // 开始会话
+    const startData = await axios({
+      url: 'admin/ai/speech/start',
+      method: 'POST',
+      data: {}
+    });
+
+    if (!startData.success) {
+      status.value = '错误:' + startData.message;
+      return;
+    }
+
+    // 保存会话ID
+    sessionId = startData.sessionId || startData.data?.sessionId;
+    
+    // 建立WebSocket连接,用于接收实时中间结果
+    const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+    const wsUrl = `${wsProtocol}//${window.location.host}/admin/ai/speech/ws?sessionId=${sessionId}`;
+    webSocket = new WebSocket(wsUrl);
+    
+    webSocket.onopen = () => {
+      console.log('WebSocket连接已建立');
+    };
+    
+    webSocket.onmessage = (event) => {
+      try {
+        const data = JSON.parse(event.data);
+        if (data.type === 'intermediate') {
+          intermediateResult.value = data.result;
+        }
+      } catch (error) {
+        console.error('解析WebSocket消息失败', error);
+      }
+    };
+    
+    webSocket.onclose = () => {
+      console.log('WebSocket连接已关闭');
+    };
+    
+    webSocket.onerror = (error) => {
+      console.error('WebSocket错误', error);
+    };
+
+    // 获取麦克风权限
+    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+
+    // 创建AudioContext
+    audioContext = new (window.AudioContext || window.webkitAudioContext)();
+
+    // 创建媒体流源
+    mediaStreamSource = audioContext.createMediaStreamSource(mediaStream);
+
+    // 创建脚本处理器
+    scriptProcessor = audioContext.createScriptProcessor(4096, 1, 1);
+
+    // 连接音频处理链
+    mediaStreamSource.connect(scriptProcessor);
+    scriptProcessor.connect(audioContext.destination);
+
+    // 处理音频数据
+    scriptProcessor.onaudioprocess = (event) => {
+      const inputData = event.inputBuffer.getChannelData(0);
+      // 将Float32Array转换为Int16Array
+      const int16Data = new Int16Array(inputData.length);
+      for (let i = 0; i < inputData.length; i++) {
+        // 缩放并转换为16位整数
+        int16Data[i] = Math.max(-32768, Math.min(32767, inputData[i] * 32768));
+      }
+      // 转换为字节数组
+      const byteData = new Uint8Array(int16Data.length * 2);
+      for (let i = 0; i < int16Data.length; i++) {
+        byteData[i * 2] = int16Data[i] & 0xff;
+        byteData[i * 2 + 1] = (int16Data[i] >> 8) & 0xff;
+      }
+      // 发送音频数据
+      sendAudioData(byteData);
+    };
+
+    isRecording.value = true;
+    status.value = '正在录制...';
+
+  } catch (error) {
+    status.value = '错误:' + error.message;
+    isRecording.value = false;
+    // 关闭WebSocket连接
+    if (webSocket) {
+      webSocket.close();
+      webSocket = null;
+    }
+    // 重置中间结果
+    intermediateResult.value = '';
+    // 重置会话ID
+    sessionId = null;
+  }
+};
+
+// 发送音频数据
+const sendAudioData = async (audioData) => {
+  try {
+    // 确保sessionId存在
+    if (!sessionId) {
+      status.value = '错误:会话ID不存在';
+      return;
+    }
+
+    const formData = new FormData();
+    formData.append('audio', new Blob([audioData], { type: 'application/octet-stream' }), 'audio.pcm');
+    formData.append('sessionId', sessionId);
+
+    const streamData = await axios({
+      url: 'admin/ai/speech/stream',
+      method: 'POST',
+      data: formData,
+      headers: {
+        'content-type': 'multipart/form-data'
+      }
+    });
+
+    if (!streamData.success) {
+      status.value = '错误:' + streamData.message;
+    }
+  } catch (error) {
+    status.value = '发送音频数据失败:' + error.message;
+  }
+};
+
+// 停止语音识别
+const stopRecognition = async () => {
+  if (isRecording.value) {
+    // 停止音频处理
+    if (scriptProcessor) {
+      scriptProcessor.disconnect();
+      scriptProcessor = null;
+    }
+
+    if (mediaStreamSource) {
+      mediaStreamSource.disconnect();
+      mediaStreamSource = null;
+    }
+
+    if (mediaStream) {
+      mediaStream.getTracks().forEach(track => track.stop());
+      mediaStream = null;
+    }
+
+    if (audioContext) {
+      audioContext.close();
+      audioContext = null;
+    }
+
+    // 停止会话并获取结果
+    try {
+      const stopData = await axios({
+        url: `admin/ai/speech/stop?sessionId=${sessionId}`,
+        method: 'POST',
+        data: {}
+      });
+
+      if (stopData.success) {
+        recognitionResult.value = stopData.result;
+        status.value = '识别完成';
+      } else {
+        status.value = '错误:' + stopData.message;
+      }
+    } catch (error) {
+      status.value = '停止识别失败:' + error.message;
+    }
+
+    isRecording.value = false;
+  }
+};
+
+// 组件卸载时清理
+// Safety net: if the component is destroyed mid-recording (e.g. a route
+// change), stop capture and finalize the session so the microphone is freed.
+onUnmounted(() => {
+  if (isRecording.value) {
+    stopRecognition();
+  }
+});
+</script>
+
+<style scoped>
+/* Card-style container for the recognition panel. */
+.speech-recognition {
+  max-width: 600px;
+  margin: 0 auto;
+  padding: 20px;
+  border: 1px solid #ddd;
+  border-radius: 8px;
+  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+}
+
+.controls {
+  margin: 20px 0;
+}
+
+/* Shared button sizing; colors come from the modifier classes below. */
+button {
+  padding: 10px 20px;
+  margin-right: 10px;
+  border: none;
+  border-radius: 4px;
+  cursor: pointer;
+  font-size: 16px;
+}
+
+/* Green "start" button, greyed out while a recording is in progress. */
+.btn-start {
+  background-color: #4CAF50;
+  color: white;
+}
+
+.btn-start:disabled {
+  background-color: #cccccc;
+  cursor: not-allowed;
+}
+
+/* Red "stop" button, greyed out while idle. */
+.btn-stop {
+  background-color: #f44336;
+  color: white;
+}
+
+.btn-stop:disabled {
+  background-color: #cccccc;
+  cursor: not-allowed;
+}
+
+/* Panel holding intermediate/final transcripts. */
+.result {
+  margin-top: 20px;
+  padding: 15px;
+  background-color: #f5f5f5;
+  border-radius: 4px;
+}
+
+/* Muted status / error line below the results. */
+.status {
+  margin-top: 10px;
+  color: #666;
+  font-size: 14px;
+}
+</style>