Procházet zdrojové kódy

语音识别调取阿里云实时语音识别接口,sdk调取

liyanbo před 1 měsícem
rodič
revize
f74becd128

+ 0 - 337
byzs-module-ai/src/main/java/cn/iocoder/byzs/module/ai/controller/admin/speech/RealTimeSpeechController.java

@@ -1,337 +0,0 @@
package cn.iocoder.byzs.module.ai.controller.admin.speech;

import com.alibaba.nls.client.AccessToken;
import com.alibaba.nls.client.protocol.NlsClient;
import com.alibaba.nls.client.protocol.InputFormatEnum;
import com.alibaba.nls.client.protocol.SampleRateEnum;
import com.alibaba.nls.client.protocol.asr.SpeechTranscriber;
import com.alibaba.nls.client.protocol.asr.SpeechTranscriberListener;
import com.alibaba.nls.client.protocol.asr.SpeechTranscriberResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.socket.CloseStatus;
import org.springframework.web.socket.TextMessage;
import org.springframework.web.socket.WebSocketSession;
import org.springframework.web.socket.handler.TextWebSocketHandler;

import jakarta.servlet.http.HttpServletRequest;
import org.springframework.web.context.request.RequestContextHolder;
import org.springframework.web.context.request.ServletRequestAttributes;
// NOTE(review): duplicate import of MultipartFile (already imported above) — remove one.
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.multipart.MultipartHttpServletRequest;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

/**
 * Real-time speech recognition controller: streams microphone audio (16 kHz PCM)
 * from the browser to Alibaba Cloud NLS {@link SpeechTranscriber}, keeping all
 * per-session state in concurrent maps keyed by a server-generated session id.
 *
 * SECURITY NOTE(review): the @Value defaults below embed live AccessKey
 * credentials in source code — rotate these keys and move them to secured
 * configuration with no in-code fallback.
 */
@RestController
@RequestMapping("/admin/ai/speech")
public class RealTimeSpeechController {

    private static final Logger logger = LoggerFactory.getLogger(RealTimeSpeechController.class);

    @Value("${ai.aliyun.app-key:4SUOF4LfaU7FekyW}")
    private String appKey;

    @Value("${ai.aliyun.access-key-id:LTAI5tQhMPLXtSgXiPiWbw6D}")
    private String accessKeyId;

    @Value("${ai.aliyun.access-key-secret:HCXpFYjl4swk0qwfIKa9s2bXx0AWcG}")
    private String accessKeySecret;

    @Value("${ai.aliyun.nls-gateway-url:wss://nls-gateway-cn-shanghai.aliyuncs.com/ws/v1}")
    private String nlsGatewayUrl;

    // Accumulated recognition text per session (appended to on each final sentence).
    private final Map<String, StringBuilder> transcriptionResults = new ConcurrentHashMap<>();

    // NlsClient per session; shut down in cleanupSession().
    private final Map<String, NlsClient> nlsClients = new ConcurrentHashMap<>();

    // Active SpeechTranscriber per session; closed in cleanupSession().
    private final Map<String, SpeechTranscriber> transcribers = new ConcurrentHashMap<>();

    // Latch counted down when the cloud side reports completion/failure,
    // so /stop can wait (bounded) for the final result.
    private final Map<String, CountDownLatch> latches = new ConcurrentHashMap<>();
    
    // WebSocket sessions used to push intermediate results to the browser.
    // NOTE(review): entries are removed only by afterConnectionClosed(), never by
    // cleanupSession() — a client that drops without a clean close leaks its entry.
    private final Map<String, WebSocketSession> webSocketSessions = new ConcurrentHashMap<>();

    /**
     * WebSocket handler that registers/unregisters browser sessions by the
     * {@code sessionId} query parameter, for pushing intermediate results.
     */
    public class SpeechWebSocketHandler extends TextWebSocketHandler {
        private final Map<String, WebSocketSession> webSocketSessions;
        private final Logger logger = LoggerFactory.getLogger(SpeechWebSocketHandler.class);
        
        public SpeechWebSocketHandler(Map<String, WebSocketSession> webSocketSessions) {
            this.webSocketSessions = webSocketSessions;
        }
        
        @Override
        public void afterConnectionEstablished(WebSocketSession session) throws Exception {
            // Extract sessionId from the connection URI's query string.
            // NOTE(review): getQuery().split("=")[1] throws NPE/AIOOBE if the query
            // is absent or not of the form "sessionId=..." — parse defensively.
            String sessionId = session.getUri().getQuery().split("=")[1];
            webSocketSessions.put(sessionId, session);
            logger.info("WebSocket连接建立,sessionId: {}", sessionId);
        }
        
        @Override
        protected void handleTextMessage(WebSocketSession session, TextMessage message) throws Exception {
            // Client-to-server messages are not used; intentionally a no-op.
        }
        
        @Override
        public void afterConnectionClosed(WebSocketSession session, CloseStatus status) throws Exception {
            // Extract sessionId from the connection URI's query string (same caveat as above).
            String sessionId = session.getUri().getQuery().split("=")[1];
            webSocketSessions.remove(sessionId);
            logger.info("WebSocket连接关闭,sessionId: {}", sessionId);
        }
    }

    /**
     * Starts a recognition session: obtains an NLS token, opens a transcriber
     * configured for 16 kHz PCM with intermediate results, and returns the new
     * session id to the caller.
     */
    @PostMapping("/start")
    public Map<String, Object> startRecognition() {
        // Generate a unique session id.
        String sessionId = java.util.UUID.randomUUID().toString();
        try {
            // Clear any stale resources (no-op for a fresh UUID, defensive only).
            cleanupSession(sessionId);
            
            // Obtain an access token and open the NLS client.
            AccessToken accessToken = new AccessToken(accessKeyId, accessKeySecret);
            accessToken.apply();
            NlsClient client = new NlsClient(nlsGatewayUrl, accessToken.getToken());
            nlsClients.put(sessionId, client);

            // Initialize the per-session result buffer.
            transcriptionResults.put(sessionId, new StringBuilder());

            // Initialize the completion latch.
            latches.put(sessionId, new CountDownLatch(1));

            // Create and start the transcriber (16 kHz PCM, intermediate results on).
            SpeechTranscriber transcriber = new SpeechTranscriber(client, getTranscriberListener(sessionId));
            transcriber.setAppKey(appKey);
            transcriber.setFormat(InputFormatEnum.PCM);
            transcriber.setSampleRate(SampleRateEnum.SAMPLE_RATE_16K);
            transcriber.setEnableIntermediateResult(true);
            transcriber.setEnablePunctuation(true);
            transcriber.setEnableITN(false);
            transcriber.start();

            transcribers.put(sessionId, transcriber);

            // Start a heartbeat thread to keep the upstream WebSocket alive.
            startHeartbeatThread(sessionId, transcriber);

            return Map.of("success", true, "message", "语音识别会话已开始", "sessionId", sessionId);
        } catch (Exception e) {
            logger.error("开始语音识别失败", e);
            cleanupSession(sessionId);
            return Map.of("success", false, "message", "开始语音识别失败: " + e.getMessage());
        }
    }

    /**
     * Keeps the NLS connection alive by periodically sending zero-length frames.
     * NOTE(review): this contradicts /stream, which deliberately refuses to send
     * empty data "to avoid sending empty data to Aliyun" — confirm the SDK
     * tolerates zero-length send() calls, or drop this heartbeat.
     */
    private void startHeartbeatThread(String sessionId, SpeechTranscriber transcriber) {
        Thread heartbeatThread = new Thread(() -> {
            try {
                // Loop until the session is removed from the map by cleanupSession().
                while (transcribers.containsKey(sessionId)) {
                    // Send an empty frame to keep the connection open.
                    transcriber.send(new byte[0], 0);
                    Thread.sleep(5000); // one heartbeat every 5 seconds
                }
            } catch (Exception e) {
                logger.error("心跳线程异常", e);
            }
        });
        heartbeatThread.setDaemon(true);
        heartbeatThread.start();
    }

    /**
     * Releases all per-session resources. Safe to call multiple times.
     * NOTE(review): does not remove the session's entry from webSocketSessions.
     */
    private void cleanupSession(String sessionId) {
        try {
            SpeechTranscriber transcriber = transcribers.remove(sessionId);
            if (transcriber != null) {
                transcriber.close();
            }
            NlsClient client = nlsClients.remove(sessionId);
            if (client != null) {
                client.shutdown();
            }
            transcriptionResults.remove(sessionId);
            latches.remove(sessionId);
        } catch (Exception e) {
            logger.error("清理会话资源失败", e);
        }
    }

    /**
     * Receives a chunk of PCM audio (multipart field "audio") and forwards it to
     * the session's transcriber. The sessionId is looked up first in the URL
     * query, then in the multipart form fields.
     */
    @PostMapping("/stream")
    public Map<String, Object> streamAudio(HttpServletRequest request) {
        try {
            // Resolve the sessionId from the request.
            String sessionId = null;
            
            // First try the URL query parameters.
            sessionId = request.getParameter("sessionId");
            
            // Fall back to the multipart form body.
            if (sessionId == null || sessionId.isEmpty()) {
                if (request instanceof MultipartHttpServletRequest) {
                    MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
                    sessionId = multipartRequest.getParameter("sessionId");
                }
            }
            
            if (sessionId == null || sessionId.isEmpty()) {
                logger.error("sessionId参数缺失,请求参数: {}", request.getParameterMap());
                return Map.of("success", false, "message", "请求参数缺失:sessionId");
            }
            
            // Extract the uploaded audio chunk.
            MultipartFile audioFile = null;
            if (request instanceof MultipartHttpServletRequest) {
                MultipartHttpServletRequest multipartRequest = (MultipartHttpServletRequest) request;
                audioFile = multipartRequest.getFile("audio");
            }
            
            SpeechTranscriber transcriber = transcribers.get(sessionId);
            if (transcriber == null) {
                logger.error("会话不存在,sessionId: {}", sessionId);
                return Map.of("success", false, "message", "语音识别会话未开始");
            }

            if (audioFile == null || audioFile.isEmpty()) {
                // Ignore empty uploads: do not forward empty frames to Aliyun.
                logger.info("接收到空音频数据,忽略处理");
                return Map.of("success", true, "message", "音频数据已接收");
            }

            byte[] audioData = audioFile.getBytes();
            if (audioData.length > 0) {
                transcriber.send(audioData, audioData.length);
            } else {
                // Ignore empty payloads: do not forward empty frames to Aliyun.
                logger.info("接收到空音频数据,忽略处理");
            }

            return Map.of("success", true, "message", "音频数据已接收");
        } catch (Exception e) {
            logger.error("处理音频数据失败", e);
            return Map.of("success", false, "message", "处理音频数据失败: " + e.getMessage());
        }
    }

    /**
     * Stops the session's transcriber, waits up to 5 s for the final result,
     * returns the accumulated text, and releases all session resources.
     */
    @PostMapping("/stop")
    public Map<String, Object> stopRecognition(@RequestParam("sessionId") String sessionId) {
        try {
            SpeechTranscriber transcriber = transcribers.get(sessionId);
            CountDownLatch latch = latches.get(sessionId);

            if (transcriber != null) {
                try {
                    transcriber.stop();
                    // Wait (bounded) for onTranscriptionComplete/onFail to fire.
                    if (latch != null) {
                        latch.await(5, TimeUnit.SECONDS);
                    }
                } catch (Exception e) {
                    logger.error("停止transcriber失败", e);
                    // Fall through so resources are still released below.
                }
            }

            // Snapshot the accumulated recognition text.
            StringBuilder result = transcriptionResults.get(sessionId);
            String finalResult = result != null ? result.toString() : "";

            // Release all per-session resources.
            cleanupSession(sessionId);

            return Map.of("success", true, "result", finalResult);
        } catch (Exception e) {
            logger.error("停止语音识别失败", e);
            // Ensure resources are released even on failure.
            cleanupSession(sessionId);
            return Map.of("success", false, "message", "停止语音识别失败: " + e.getMessage());
        }
    }

    /**
     * Builds the per-session NLS listener: pushes intermediate results over the
     * session's WebSocket and appends each finished sentence to the result buffer.
     */
    private SpeechTranscriberListener getTranscriberListener(String sessionId) {
        return new SpeechTranscriberListener() {
            @Override
            public void onTranscriptionResultChange(SpeechTranscriberResponse response) {
                String result = response.getTransSentenceText();
                logger.info("实时识别中间结果: " + result);
                
                // Push the intermediate result to the browser over WebSocket.
                // NOTE(review): result is concatenated into JSON without escaping —
                // a quote or backslash in the text produces invalid JSON. Use a
                // JSON library to build the message.
                WebSocketSession session = webSocketSessions.get(sessionId);
                if (session != null && session.isOpen()) {
                    try {
                        session.sendMessage(new TextMessage("{\"type\":\"intermediate\",\"result\":\"" + result + "\"}"));
                    } catch (Exception e) {
                        logger.error("发送WebSocket消息失败", e);
                    }
                }
            }

            @Override
            public void onTranscriberStart(SpeechTranscriberResponse response) {
                logger.info("语音识别会话开始, task_id: " + response.getTaskId());
            }

            @Override
            public void onSentenceBegin(SpeechTranscriberResponse response) {
                logger.info("开始识别新句子");

            }

            @Override
            public void onSentenceEnd(SpeechTranscriberResponse response) {
                logger.info("句子识别完成, 结果: " + response.getTransSentenceText());
                logger.info("置信度: " + response.getConfidence() + ", 开始时间: " + response.getSentenceBeginTime() + ", 处理时长: " + response.getTransSentenceTime() + "ms");
                // Append the finished sentence to the session's accumulated text.
                StringBuilder result = transcriptionResults.get(sessionId);
                if (result != null) {
                    result.append(response.getTransSentenceText());
                }
            }

            @Override
            public void onTranscriptionComplete(SpeechTranscriberResponse response) {
                logger.info("语音识别会话完成");
                // Unblock /stop, which may be awaiting this latch.
                CountDownLatch latch = latches.get(sessionId);
                if (latch != null) {
                    latch.countDown();
                }
            }

            @Override
            public void onFail(SpeechTranscriberResponse response) {
                logger.error("语音识别失败: " + response.getStatusText() + ", 状态码: " + response.getStatus());
                // Unblock /stop on failure as well, so it doesn't wait the full timeout.
                CountDownLatch latch = latches.get(sessionId);
                if (latch != null) {
                    latch.countDown();
                }
            }
        };
    }
}

+ 206 - 0
byzs-module-ai/src/main/java/cn/iocoder/byzs/module/ai/controller/admin/speech/SpeechRecognitionController.java

@@ -0,0 +1,206 @@
package cn.iocoder.byzs.module.ai.controller.admin.speech;

import cn.iocoder.byzs.framework.common.pojo.CommonResult;
import com.alibaba.nls.client.AccessToken;
import com.alibaba.nls.client.protocol.InputFormatEnum;
import com.alibaba.nls.client.protocol.NlsClient;
import com.alibaba.nls.client.protocol.SampleRateEnum;
import com.alibaba.nls.client.protocol.asr.SpeechTranscriber;
import com.alibaba.nls.client.protocol.asr.SpeechTranscriberListener;
import com.alibaba.nls.client.protocol.asr.SpeechTranscriberResponse;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.*;

import java.util.concurrent.ConcurrentHashMap;

import static cn.iocoder.byzs.framework.common.exception.enums.GlobalErrorCodeConstants.INTERNAL_SERVER_ERROR;
import static cn.iocoder.byzs.framework.common.pojo.CommonResult.success;

/**
 * Real-time speech recognition endpoints backed by Alibaba Cloud NLS.
 * The caller drives a session: /start opens a transcriber, /send streams
 * 16 kHz PCM chunks, /stop tears the session down, /result reads the text.
 * All per-session state lives in concurrent maps keyed by the caller-supplied
 * session id.
 */
@Tag(name = "管理后台 - AI 语音识别")
@RestController
@RequestMapping("/admin/speech/recognition")
@Slf4j
public class SpeechRecognitionController {

    // Active transcriber per session.
    private final ConcurrentHashMap<String, SpeechTranscriber> transcriberMap = new ConcurrentHashMap<>();
    // NlsClient per session; shut down on /stop (or on failed /start).
    private final ConcurrentHashMap<String, NlsClient> clientMap = new ConcurrentHashMap<>();
    // Text currently visible to /result: accumulated final sentences plus the in-flight partial.
    private final ConcurrentHashMap<String, String> resultMap = new ConcurrentHashMap<>();
    // Accumulated *final* sentences per session. BUG FIX: the previous version
    // overwrote resultMap on every sentence end, so a multi-sentence recognition
    // returned only the last sentence.
    private final ConcurrentHashMap<String, StringBuilder> finalTextMap = new ConcurrentHashMap<>();

    // SECURITY NOTE(review): these credentials were committed to source control.
    // They are read from the environment first, falling back to the original
    // literals only so existing deployments keep working — rotate the keys and
    // delete the fallbacks.
    private final String appKey = System.getenv().getOrDefault("NLS_APP_KEY", "4SUOF4LfaU7FekyW");
    private final String accessKeyId = System.getenv().getOrDefault("NLS_ACCESS_KEY_ID", "LTAI5tQhMPLXtSgXiPiWbw6D");
    private final String accessKeySecret = System.getenv().getOrDefault("NLS_ACCESS_KEY_SECRET", "HCXpFYjl4swk0qwfIKa9s2bXx0AWcG");
    private final String url = System.getenv().getOrDefault("NLS_GATEWAY_URL", "wss://nls-gateway-cn-shanghai.aliyuncs.com/ws/v1");

    /**
     * Opens a recognition session: fetches an NLS token, connects a client and
     * starts a transcriber configured for 16 kHz PCM with intermediate results.
     *
     * @param sessionId caller-chosen session id used by the other endpoints
     * @return success message, or INTERNAL_SERVER_ERROR on failure
     */
    @PostMapping("/start")
    @Operation(summary = "开始语音识别")
    @Parameter(name = "sessionId", description = "会话ID", required = true)
    public CommonResult<String> startRecognition(@RequestParam("sessionId") String sessionId) {
        try {
            // Obtain an access token. Do NOT log the token value (it is a credential).
            AccessToken accessToken = new AccessToken(accessKeyId, accessKeySecret);
            accessToken.apply();
            log.info("NLS token acquired, expire time: {}", accessToken.getExpireTime());

            // Connect the NLS client for this session.
            NlsClient client = new NlsClient(url, accessToken.getToken());
            clientMap.put(sessionId, client);

            // Configure and start the transcriber (16 kHz PCM, punctuation on, ITN off).
            SpeechTranscriber transcriber = new SpeechTranscriber(client, getTranscriberListener(sessionId));
            transcriber.setAppKey(appKey);
            transcriber.setFormat(InputFormatEnum.PCM);
            transcriber.setSampleRate(SampleRateEnum.SAMPLE_RATE_16K);
            transcriber.setEnableIntermediateResult(true);
            transcriber.setEnablePunctuation(true);
            transcriber.setEnableITN(false);
            transcriber.start();
            transcriberMap.put(sessionId, transcriber);

            return success("Recognition started");
        } catch (Exception e) {
            log.error("Error starting recognition", e);
            // BUG FIX: release the NlsClient (and any half-started transcriber)
            // instead of leaking them in the maps when start() fails.
            releaseSession(sessionId);
            return CommonResult.error(INTERNAL_SERVER_ERROR.getCode(), "Failed to start recognition");
        }
    }

    /**
     * Forwards a chunk of raw PCM audio to the session's transcriber.
     * Empty chunks are acknowledged but not forwarded upstream.
     *
     * @param sessionId session id passed to /start
     * @param audioData raw 16 kHz mono 16-bit PCM bytes
     */
    @PostMapping("/send")
    @Operation(summary = "发送音频数据")
    @Parameter(name = "sessionId", description = "会话ID", required = true)
    public CommonResult<String> sendAudio(@RequestParam("sessionId") String sessionId, @RequestBody byte[] audioData) {
        try {
            SpeechTranscriber transcriber = transcriberMap.get(sessionId);
            if (transcriber == null) {
                return CommonResult.error(INTERNAL_SERVER_ERROR.getCode(), "Recognition not started");
            }

            // Robustness: skip empty payloads rather than sending zero-length
            // frames (previously this could NPE on a null body).
            if (audioData != null && audioData.length > 0) {
                // The bytes are forwarded as-is; the caller is responsible for
                // supplying audio in the format configured at /start.
                transcriber.send(audioData, audioData.length);
            }
            return success("Audio data received");
        } catch (Exception e) {
            log.error("Error sending audio data", e);
            return CommonResult.error(INTERNAL_SERVER_ERROR.getCode(), "Failed to send audio data");
        }
    }

    /**
     * Stops the session and releases its transcriber and client. The recognized
     * text is intentionally kept in resultMap so the frontend can still fetch it
     * via /result after stopping.
     */
    @PostMapping("/stop")
    @Operation(summary = "停止语音识别")
    @Parameter(name = "sessionId", description = "会话ID", required = true)
    public CommonResult<String> stopRecognition(@RequestParam("sessionId") String sessionId) {
        try {
            // BUG FIX: the previous version skipped close()/shutdown() when
            // transcriber.stop() threw; releaseSession() guarantees both run.
            releaseSession(sessionId);
            return success("Recognition stopped");
        } catch (Exception e) {
            log.error("Error stopping recognition", e);
            return CommonResult.error(INTERNAL_SERVER_ERROR.getCode(), "Failed to stop recognition");
        }
    }

    /**
     * Returns the recognition text for the session: all finished sentences plus
     * the current partial sentence, or "" if nothing has been recognized.
     * NOTE(review): resultMap entries are never evicted — consider a TTL cache
     * if session ids are unbounded.
     */
    @GetMapping("/result")
    @Operation(summary = "获取识别结果")
    @Parameter(name = "sessionId", description = "会话ID", required = true)
    public CommonResult<RecognitionResult> getRecognitionResult(@RequestParam("sessionId") String sessionId) {
        try {
            String result = resultMap.getOrDefault(sessionId, "");
            return success(new RecognitionResult(result));
        } catch (Exception e) {
            log.error("Error getting recognition result", e);
            return CommonResult.error(INTERNAL_SERVER_ERROR.getCode(), "Failed to get recognition result");
        }
    }

    /** Removes and closes the session's transcriber and client; safe to call twice. */
    private void releaseSession(String sessionId) {
        SpeechTranscriber transcriber = transcriberMap.remove(sessionId);
        NlsClient client = clientMap.remove(sessionId);
        try {
            if (transcriber != null) {
                try {
                    transcriber.stop();
                } finally {
                    transcriber.close();
                }
            }
        } catch (Exception e) {
            log.error("Error closing transcriber for session {}", sessionId, e);
        } finally {
            if (client != null) {
                client.shutdown();
            }
        }
    }

    /** Simple DTO wrapping the recognized text for JSON serialization. */
    private static class RecognitionResult {
        private String result;

        public RecognitionResult(String result) {
            this.result = result;
        }

        public String getResult() {
            return result;
        }

        public void setResult(String result) {
            this.result = result;
        }
    }

    /** Returns the accumulated final-sentence text for the session ("" if none). */
    private String finalTextOf(String sessionId) {
        StringBuilder sb = finalTextMap.get(sessionId);
        return sb == null ? "" : sb.toString();
    }

    /**
     * Builds the NLS callback for a session. Final sentences are accumulated in
     * finalTextMap; resultMap always holds accumulated-finals + current partial,
     * which is what /result returns.
     */
    private SpeechTranscriberListener getTranscriberListener(String sessionId) {
        return new SpeechTranscriberListener() {
            @Override
            public void onTranscriptionResultChange(SpeechTranscriberResponse response) {
                String partial = response.getTransSentenceText();
                log.info("Session {} - Intermediate result: {}", sessionId, partial);
                // Show finished sentences plus the in-flight partial sentence.
                resultMap.put(sessionId, finalTextOf(sessionId) + partial);
            }

            @Override
            public void onTranscriberStart(SpeechTranscriberResponse response) {
                log.info("Session {} - Recognition started: {}", sessionId, response.getTaskId());
                // Reset state for a (re)started session.
                finalTextMap.put(sessionId, new StringBuilder());
                resultMap.put(sessionId, "");
            }

            @Override
            public void onSentenceBegin(SpeechTranscriberResponse response) {
                log.info("Session {} - Sentence begun", sessionId);
            }

            @Override
            public void onSentenceEnd(SpeechTranscriberResponse response) {
                String sentence = response.getTransSentenceText();
                log.info("Session {} - Sentence ended: {}", sessionId, sentence);
                // BUG FIX: append the finished sentence instead of overwriting,
                // so multi-sentence recognitions keep all sentences.
                StringBuilder sb = finalTextMap.computeIfAbsent(sessionId, k -> new StringBuilder());
                synchronized (sb) {
                    sb.append(sentence);
                }
                resultMap.put(sessionId, sb.toString());
            }

            @Override
            public void onTranscriptionComplete(SpeechTranscriberResponse response) {
                log.info("Session {} - Recognition completed", sessionId);
            }

            @Override
            public void onFail(SpeechTranscriberResponse response) {
                String errorMessage = response.getStatusText();
                log.error("Session {} - Recognition failed: {} - {}", sessionId, response.getStatus(), errorMessage);
                // Surface the failure to /result callers.
                resultMap.put(sessionId, "Error: " + errorMessage);
            }
        };
    }

    /**
     * Milliseconds of audio represented by {@code dataSize} bytes of 16-bit mono
     * PCM at {@code sampleRate} Hz — used to pace simulated real-time streaming.
     * (dataSize * 10 * 8000) / (160 * sampleRate) simplifies to
     * dataSize * 500 / sampleRate, i.e. dataSize / (2 bytes * sampleRate / 1000).
     */
    private int getSleepDelta(int dataSize, int sampleRate) {
        return (dataSize * 10 * 8000) / (160 * sampleRate);
    }
}

+ 104 - 0
byzs-module-ai/src/test/java/cn/iocoder/byzs/module/ai/framework/ai/core/model/speech/Main.java

@@ -0,0 +1,104 @@
+package cn.iocoder.byzs.module.ai.framework.ai.core.model.speech;
+
+import com.alibaba.dashscope.audio.asr.recognition.Recognition;
+import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
+import com.alibaba.dashscope.audio.asr.recognition.RecognitionResult;
+import com.alibaba.dashscope.common.ResultCallback;
+import com.alibaba.dashscope.utils.Constants;
+
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.TargetDataLine;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+public class Main {
+    //阿里云百炼
+    public static void main(String[] args) throws InterruptedException {
+        // 以下为北京地域url,若使用新加坡地域的模型,需将url替换为:wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference
+        Constants.baseWebsocketApiUrl = "wss://dashscope.aliyuncs.com/api-ws/v1/inference";
+        ExecutorService executorService = Executors.newSingleThreadExecutor();
+        executorService.submit(new RealtimeRecognitionTask());
+        executorService.shutdown();
+        executorService.awaitTermination(1, TimeUnit.MINUTES);
+        System.exit(0);
+    }
+}
+
+class RealtimeRecognitionTask implements Runnable {
+    @Override
+    public void run() {
+        RecognitionParam param = RecognitionParam.builder()
+                .model("fun-asr-realtime")
+                // 新加坡和北京地域的API Key不同。获取API Key:https://help.aliyun.com/zh/model-studio/get-api-key
+                // 若没有配置环境变量,请用百炼API Key将下行替换为:.apiKey("sk-xxx")
+                .apiKey(System.getenv("sk-b8704d060b194674805bbbcfef5f3e68"))
+                .format("pcm")
+                .sampleRate(16000)
+                .build();
+        Recognition recognizer = new Recognition();
+
+        ResultCallback<RecognitionResult> callback = new ResultCallback<RecognitionResult>() {
+            @Override
+            public void onEvent(RecognitionResult result) {
+                if (result.isSentenceEnd()) {
+                    System.out.println("Final Result: " + result.getSentence().getText());
+                } else {
+                    System.out.println("Intermediate Result: " + result.getSentence().getText());
+                }
+            }
+
+            @Override
+            public void onComplete() {
+                System.out.println("Recognition complete");
+            }
+
+            @Override
+            public void onError(Exception e) {
+                System.out.println("RecognitionCallback error: " + e.getMessage());
+            }
+        };
+        try {
+            recognizer.call(param, callback);
+            // 创建音频格式
+            AudioFormat audioFormat = new AudioFormat(16000, 16, 1, true, false);
+            // 根据格式匹配默认录音设备
+            TargetDataLine targetDataLine =
+                    AudioSystem.getTargetDataLine(audioFormat);
+            targetDataLine.open(audioFormat);
+            // 开始录音
+            targetDataLine.start();
+            ByteBuffer buffer = ByteBuffer.allocate(1024);
+            long start = System.currentTimeMillis();
+            // 录音50s并进行实时转写
+            while (System.currentTimeMillis() - start < 50000) {
+                int read = targetDataLine.read(buffer.array(), 0, buffer.capacity());
+                if (read > 0) {
+                    buffer.limit(read);
+                    // 将录音音频数据发送给流式识别服务
+                    recognizer.sendAudioFrame(buffer);
+                    buffer = ByteBuffer.allocate(1024);
+                    // 录音速率有限,防止cpu占用过高,休眠一小会儿
+                    Thread.sleep(20);
+                }
+            }
+            recognizer.stop();
+        } catch (Exception e) {
+            e.printStackTrace();
+        } finally {
+            // 任务结束后关闭 Websocket 连接
+            recognizer.getDuplexApi().close(1000, "bye");
+        }
+
+        System.out.println(
+                "[Metric] requestId: "
+                        + recognizer.getLastRequestId()
+                        + ", first package delay ms: "
+                        + recognizer.getFirstPackageDelay()
+                        + ", last package delay ms: "
+                        + recognizer.getLastPackageDelay());
+    }
+}

+ 1 - 1
byzs-module-ai/src/main/java/cn/iocoder/byzs/module/ai/controller/admin/speech/RealTimeSpeechTranscriberDemo.java → byzs-module-ai/src/test/java/cn/iocoder/byzs/module/ai/framework/ai/core/model/speech/RealTimeSpeechTranscriberDemo.java

@@ -1,4 +1,4 @@
-package cn.iocoder.byzs.module.ai.controller.admin.speech;
+package cn.iocoder.byzs.module.ai.framework.ai.core.model.speech;
 
 import com.alibaba.nls.client.AccessToken;
 import com.alibaba.nls.client.protocol.NlsClient;

+ 1 - 1
byzs-module-ai/src/main/java/cn/iocoder/byzs/module/ai/controller/admin/speech/SpeechTranscriberDemo.java → byzs-module-ai/src/test/java/cn/iocoder/byzs/module/ai/framework/ai/core/model/speech/SpeechTranscriberDemo.java

@@ -1,4 +1,4 @@
-package cn.iocoder.byzs.module.ai.controller.admin.speech;
+package cn.iocoder.byzs.module.ai.framework.ai.core.model.speech;
 
 import java.io.File;
 import java.io.FileInputStream;

+ 0 - 286
byzs-web/src/views/SpeechRecognition.vue

@@ -1,286 +0,0 @@
<template>
  <div class="speech-recognition">
    <h3>实时语音识别</h3>
    <div class="controls">
      <button
          @click="startRecognition"
          :disabled="isRecording"
          class="btn-start"
      >
        {{ isRecording ? '识别中...' : '开始识别' }}
      </button>
      <button
          @click="stopRecognition"
          :disabled="!isRecording"
          class="btn-stop"
      >
        停止识别
      </button>
    </div>
    <div class="result" v-if="intermediateResult || recognitionResult">
      <h4>实时识别结果:</h4>
      <p class="intermediate" v-if="intermediateResult">{{ intermediateResult }}</p>
      <h4 v-if="recognitionResult">最终识别结果:</h4>
      <p class="final" v-if="recognitionResult">{{ recognitionResult }}</p>
    </div>
    <div class="status" v-if="status">
      {{ status }}
    </div>
  </div>
</template>

<script setup>
// Real-time speech recognition panel: captures microphone PCM with WebAudio,
// streams it to the backend over HTTP multipart, and receives intermediate
// results over a WebSocket.
import { ref, onUnmounted } from 'vue';
import axios from '@/utils/request';

const isRecording = ref(false);
const recognitionResult = ref('');
const intermediateResult = ref('');
const status = ref('');
let sessionId = null;
let mediaStream = null;
let audioContext = null;
let mediaStreamSource = null;
let scriptProcessor = null;
let streamInterval = null;
let webSocket = null;

// Start speech recognition: open the backend session, connect the WebSocket,
// then wire microphone -> ScriptProcessor -> sendAudioData.
const startRecognition = async () => {
  try {
    // Open a recognition session on the backend.
    // NOTE(review): `startData.success` assumes the request util returns the
    // raw response body — confirm against the axios interceptor in @/utils/request.
    const startData = await axios({
      url: 'admin/ai/speech/start',
      method: 'POST',
      data: {}
    });

    if (!startData.success) {
      status.value = '错误:' + startData.message;
      return;
    }

    // Keep the session id for the /stream and /stop calls.
    sessionId = startData.sessionId || startData.data?.sessionId;
    
    // Open a WebSocket to receive intermediate results pushed by the backend.
    const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
    const wsUrl = `${wsProtocol}//${window.location.host}/admin/ai/speech/ws?sessionId=${sessionId}`;
    webSocket = new WebSocket(wsUrl);
    
    webSocket.onopen = () => {
      console.log('WebSocket连接已建立');
    };
    
    webSocket.onmessage = (event) => {
      try {
        const data = JSON.parse(event.data);
        if (data.type === 'intermediate') {
          intermediateResult.value = data.result;
        }
      } catch (error) {
        console.error('解析WebSocket消息失败', error);
      }
    };
    
    webSocket.onclose = () => {
      console.log('WebSocket连接已关闭');
    };
    
    webSocket.onerror = (error) => {
      console.error('WebSocket错误', error);
    };

    // Request microphone access.
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });

    // Create the AudioContext.
    // NOTE(review): the context runs at the device rate (typically 44.1/48 kHz)
    // but the backend transcriber is configured for 16 kHz PCM — confirm that
    // resampling happens somewhere, otherwise recognition quality will suffer.
    audioContext = new (window.AudioContext || window.webkitAudioContext)();

    // Wrap the microphone stream as an audio source node.
    mediaStreamSource = audioContext.createMediaStreamSource(mediaStream);

    // NOTE(review): ScriptProcessorNode is deprecated; consider migrating to
    // AudioWorklet for new code.
    scriptProcessor = audioContext.createScriptProcessor(4096, 1, 1);

    // Wire the processing chain: mic -> processor -> destination.
    mediaStreamSource.connect(scriptProcessor);
    scriptProcessor.connect(audioContext.destination);

    // Convert each audio buffer to 16-bit little-endian PCM and upload it.
    // NOTE(review): this fires one HTTP request per 4096-sample buffer
    // (~85 ms at 48 kHz) — consider batching or a WebSocket upload.
    scriptProcessor.onaudioprocess = (event) => {
      const inputData = event.inputBuffer.getChannelData(0);
      // Convert Float32 samples [-1, 1] to Int16.
      const int16Data = new Int16Array(inputData.length);
      for (let i = 0; i < inputData.length; i++) {
        // Scale and clamp to the signed 16-bit range.
        int16Data[i] = Math.max(-32768, Math.min(32767, inputData[i] * 32768));
      }
      // Serialize to bytes, little-endian (low byte first).
      const byteData = new Uint8Array(int16Data.length * 2);
      for (let i = 0; i < int16Data.length; i++) {
        byteData[i * 2] = int16Data[i] & 0xff;
        byteData[i * 2 + 1] = (int16Data[i] >> 8) & 0xff;
      }
      // Upload the chunk.
      sendAudioData(byteData);
    };

    isRecording.value = true;
    status.value = '正在录制...';

  } catch (error) {
    status.value = '错误:' + error.message;
    isRecording.value = false;
    // Close the WebSocket on failure.
    if (webSocket) {
      webSocket.close();
      webSocket = null;
    }
    // Clear any partial intermediate result.
    intermediateResult.value = '';
    // Forget the session id.
    sessionId = null;
  }
};

// Upload one chunk of PCM bytes to the backend streaming endpoint.
const sendAudioData = async (audioData) => {
  try {
    // Guard: a session must be open.
    if (!sessionId) {
      status.value = '错误:会话ID不存在';
      return;
    }

    const formData = new FormData();
    formData.append('audio', new Blob([audioData], { type: 'application/octet-stream' }), 'audio.pcm');
    formData.append('sessionId', sessionId);

    const streamData = await axios({
      url: 'admin/ai/speech/stream',
      method: 'POST',
      data: formData,
      headers: {
        'content-type': 'multipart/form-data'
      }
    });

    if (!streamData.success) {
      status.value = '错误:' + streamData.message;
    }
  } catch (error) {
    status.value = '发送音频数据失败:' + error.message;
  }
};

// Stop recognition: tear down the audio pipeline, then close the backend
// session and display the final result.
// NOTE(review): the WebSocket is not closed here and intermediateResult is not
// cleared — the connection stays open until the server closes it.
const stopRecognition = async () => {
  if (isRecording.value) {
    // Tear down the audio processing chain.
    if (scriptProcessor) {
      scriptProcessor.disconnect();
      scriptProcessor = null;
    }

    if (mediaStreamSource) {
      mediaStreamSource.disconnect();
      mediaStreamSource = null;
    }

    if (mediaStream) {
      mediaStream.getTracks().forEach(track => track.stop());
      mediaStream = null;
    }

    if (audioContext) {
      audioContext.close();
      audioContext = null;
    }

    // Close the backend session and fetch the final result.
    try {
      const stopData = await axios({
        url: `admin/ai/speech/stop?sessionId=${sessionId}`,
        method: 'POST',
        data: {}
      });

      if (stopData.success) {
        recognitionResult.value = stopData.result;
        status.value = '识别完成';
      } else {
        status.value = '错误:' + stopData.message;
      }
    } catch (error) {
      status.value = '停止识别失败:' + error.message;
    }

    isRecording.value = false;
  }
};

// Release everything if the component unmounts mid-recording.
onUnmounted(() => {
  if (isRecording.value) {
    stopRecognition();
  }
});
</script>

<style scoped>
.speech-recognition {
  max-width: 600px;
  margin: 0 auto;
  padding: 20px;
  border: 1px solid #ddd;
  border-radius: 8px;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

.controls {
  margin: 20px 0;
}

button {
  padding: 10px 20px;
  margin-right: 10px;
  border: none;
  border-radius: 4px;
  cursor: pointer;
  font-size: 16px;
}

.btn-start {
  background-color: #4CAF50;
  color: white;
}

.btn-start:disabled {
  background-color: #cccccc;
  cursor: not-allowed;
}

.btn-stop {
  background-color: #f44336;
  color: white;
}

.btn-stop:disabled {
  background-color: #cccccc;
  cursor: not-allowed;
}

.result {
  margin-top: 20px;
  padding: 15px;
  background-color: #f5f5f5;
  border-radius: 4px;
}

.status {
  margin-top: 10px;
  color: #666;
  font-size: 14px;
}
</style>