D:\Confucius3-Math>docker run -e ARK_API_KEY=f6150e6c-422a-4265-8b63-4d941b271220 -p 8827:8827 confucius3

==========
== CUDA ==
==========

CUDA Version 12.2.2

Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

A copy of this license is made available in this container at
/NGC-DL-CONTAINER-LICENSE for your convenience.

WARNING: The NVIDIA Driver was not detected.  GPU functionality will not be available.
   Use the NVIDIA Container Toolkit to start this container with GPU support; see
   https://docs.nvidia.com/datacenter/cloud-native/ .

Starting vLLM model service...
: not found_stream.sh: 2:
: not found_stream.sh: 6:
: not found_stream.sh: 12:
INFO 08-12 04:55:32 [__init__.py:243] No platform detected, vLLM is running on UnspecifiedPlatform
WARNING 08-12 04:55:32 [_custom_ops.py:21] Failed to import from vllm._C with ImportError('libcuda.so.1: cannot open shared object file: No such file or directory')
usage: api_server.py [-h] [--host HOST] [--port PORT]
                     [--uvicorn-log-level {debug,info,warning,error,critical,trace}]
                     [--disable-uvicorn-access-log] [--allow-credentials]
                     [--allowed-origins ALLOWED_ORIGINS]
                     [--allowed-methods ALLOWED_METHODS]
                     [--allowed-headers ALLOWED_HEADERS] [--api-key API_KEY]
                     [--lora-modules LORA_MODULES [LORA_MODULES ...]]
                     [--prompt-adapters PROMPT_ADAPTERS [PROMPT_ADAPTERS ...]]
                     [--chat-template CHAT_TEMPLATE]
                     [--chat-template-content-format {auto,string,openai}]
                     [--response-role RESPONSE_ROLE] [--ssl-keyfile SSL_KEYFILE]
                     [--ssl-certfile SSL_CERTFILE] [--ssl-ca-certs SSL_CA_CERTS]
                     [--enable-ssl-refresh] [--ssl-cert-reqs SSL_CERT_REQS]
                     [--root-path ROOT_PATH] [--middleware MIDDLEWARE]
                     [--return-tokens-as-token-ids]
                     [--disable-frontend-multiprocessing]
                     [--enable-request-id-headers] [--enable-auto-tool-choice]
                     [--tool-call-parser {granite-20b-fc,granite,hermes,internlm,jamba,llama4_json,llama3_json,mistral,phi4_mini_json,pythonic} or name registered in --tool-parser-plugin]
                     [--tool-parser-plugin TOOL_PARSER_PLUGIN] [--model MODEL]
                     [--task {auto,generate,embedding,embed,classify,score,reward,transcription}]
                     [--tokenizer TOKENIZER] [--hf-config-path HF_CONFIG_PATH]
                     [--skip-tokenizer-init] [--revision REVISION]
                     [--code-revision CODE_REVISION]
                     [--tokenizer-revision TOKENIZER_REVISION]
                     [--tokenizer-mode {auto,slow,mistral,custom}]
                     [--trust-remote-code]
                     [--allowed-local-media-path ALLOWED_LOCAL_MEDIA_PATH]
                     [--load-format {auto,pt,safetensors,npcache,dummy,tensorizer,sharded_state,gguf,bitsandbytes,mistral,runai_streamer,runai_streamer_sharded,fastsafetensors}]
                     [--download-dir DOWNLOAD_DIR]
                     [--model-loader-extra-config MODEL_LOADER_EXTRA_CONFIG]
                     [--use-tqdm-on-load | --no-use-tqdm-on-load]
                     [--config-format {auto,hf,mistral}]
                     [--dtype {auto,half,float16,bfloat16,float,float32}]
                     [--max-model-len MAX_MODEL_LEN]
                     [--guided-decoding-backend {auto,guidance,lm-format-enforcer,outlines,xgrammar}]
                     [--reasoning-parser {deepseek_r1,granite}]
                     [--logits-processor-pattern LOGITS_PROCESSOR_PATTERN]
                     [--model-impl {auto,vllm,transformers}]
                     [--distributed-executor-backend {external_launcher,mp,ray,uni,None}]
                     [--pipeline-parallel-size PIPELINE_PARALLEL_SIZE]
                     [--tensor-parallel-size TENSOR_PARALLEL_SIZE]
                     [--data-parallel-size DATA_PARALLEL_SIZE]
                     [--enable-expert-parallel | --no-enable-expert-parallel]
                     [--max-parallel-loading-workers MAX_PARALLEL_LOADING_WORKERS]
                     [--ray-workers-use-nsight | --no-ray-workers-use-nsight]
                     [--disable-custom-all-reduce | --no-disable-custom-all-reduce]
                     [--block-size {1,8,16,32,64,128}]
                     [--gpu-memory-utilization GPU_MEMORY_UTILIZATION]
                     [--swap-space SWAP_SPACE]
                     [--kv-cache-dtype {auto,fp8,fp8_e4m3,fp8_e5m2}]
                     [--num-gpu-blocks-override NUM_GPU_BLOCKS_OVERRIDE]
                     [--enable-prefix-caching | --no-enable-prefix-caching]
                     [--prefix-caching-hash-algo {builtin,sha256}]
                     [--cpu-offload-gb CPU_OFFLOAD_GB]
                     [--calculate-kv-scales | --no-calculate-kv-scales]
                     [--disable-sliding-window] [--use-v2-block-manager]
                     [--seed SEED] [--max-logprobs MAX_LOGPROBS]
                     [--disable-log-stats]
                     [--quantization {aqlm,awq,deepspeedfp,tpu_int8,fp8,ptpc_fp8,fbgemm_fp8,modelopt,nvfp4,marlin,bitblas,gguf,gptq_marlin_24,gptq_marlin,gptq_bitblas,awq_marlin,gptq,compressed-tensors,bitsandbytes,qqq,hqq,experts_int8,neuron_quant,ipex,quark,moe_wna16,torchao,None}]
                     [--rope-scaling ROPE_SCALING] [--rope-theta ROPE_THETA]
                     [--hf-token [HF_TOKEN]] [--hf-overrides HF_OVERRIDES]
                     [--enforce-eager]
                     [--max-seq-len-to-capture MAX_SEQ_LEN_TO_CAPTURE]
                     [--tokenizer-pool-size TOKENIZER_POOL_SIZE]
                     [--tokenizer-pool-type TOKENIZER_POOL_TYPE]
                     [--tokenizer-pool-extra-config TOKENIZER_POOL_EXTRA_CONFIG]
                     [--limit-mm-per-prompt LIMIT_MM_PER_PROMPT]
                     [--mm-processor-kwargs MM_PROCESSOR_KWARGS]
                     [--disable-mm-preprocessor-cache]
                     [--enable-lora | --no-enable-lora]
                     [--enable-lora-bias | --no-enable-lora-bias]
                     [--max-loras MAX_LORAS] [--max-lora-rank MAX_LORA_RANK]
                     [--lora-extra-vocab-size LORA_EXTRA_VOCAB_SIZE]
                     [--lora-dtype {auto,bfloat16,float16}]
                     [--long-lora-scaling-factors LONG_LORA_SCALING_FACTORS [LONG_LORA_SCALING_FACTORS ...]]
                     [--max-cpu-loras MAX_CPU_LORAS]
                     [--fully-sharded-loras | --no-fully-sharded-loras]
                     [--enable-prompt-adapter | --no-enable-prompt-adapter]
                     [--max-prompt-adapters MAX_PROMPT_ADAPTERS]
                     [--max-prompt-adapter-token MAX_PROMPT_ADAPTER_TOKEN]
                     [--device {auto,cpu,cuda,hpu,neuron,tpu,xpu}]
                     [--speculative-config SPECULATIVE_CONFIG]
                     [--ignore-patterns IGNORE_PATTERNS]
                     [--served-model-name SERVED_MODEL_NAME [SERVED_MODEL_NAME ...]]
                     [--qlora-adapter-name-or-path QLORA_ADAPTER_NAME_OR_PATH]
                     [--show-hidden-metrics-for-version SHOW_HIDDEN_METRICS_FOR_VERSION]
                     [--otlp-traces-endpoint OTLP_TRACES_ENDPOINT]
                     [--collect-detailed-traces COLLECT_DETAILED_TRACES]
                     [--disable-async-output-proc]
                     [--max-num-batched-tokens MAX_NUM_BATCHED_TOKENS]
                     [--max-num-seqs MAX_NUM_SEQS]
                     [--max-num-partial-prefills MAX_NUM_PARTIAL_PREFILLS]
                     [--max-long-partial-prefills MAX_LONG_PARTIAL_PREFILLS]
                     [--long-prefill-token-threshold LONG_PREFILL_TOKEN_THRESHOLD]
                     [--num-lookahead-slots NUM_LOOKAHEAD_SLOTS]
                     [--scheduler-delay-factor SCHEDULER_DELAY_FACTOR]
                     [--preemption-mode {recompute,swap,None}]
                     [--num-scheduler-steps NUM_SCHEDULER_STEPS]
                     [--multi-step-stream-outputs | --no-multi-step-stream-outputs]
                     [--scheduling-policy {fcfs,priority}]
                     [--enable-chunked-prefill | --no-enable-chunked-prefill]
                     [--disable-chunked-mm-input | --no-disable-chunked-mm-input]
                     [--scheduler-cls SCHEDULER_CLS]
                     [--override-neuron-config OVERRIDE_NEURON_CONFIG]
                     [--override-pooler-config OVERRIDE_POOLER_CONFIG]
                     [--compilation-config COMPILATION_CONFIG]
                     [--kv-transfer-config KV_TRANSFER_CONFIG]
                     [--worker-cls WORKER_CLS]
                     [--worker-extension-cls WORKER_EXTENSION_CLS]
                     [--generation-config GENERATION_CONFIG]
                     [--override-generation-config OVERRIDE_GENERATION_CONFIG]
                     [--enable-sleep-mode]
                     [--additional-config ADDITIONAL_CONFIG]
                     [--enable-reasoning] [--disable-cascade-attn]
                     [--disable-log-requests] [--max-log-len MAX_LOG_LEN]
                     [--disable-fastapi-docs] [--enable-prompt-tokens-details]
                     [--enable-server-load-tracking]
api_server.py: error: unrecognized arguments:
run_service_stream.sh: 16: --model: not found
run_service_stream.sh: 17: --served-model-name: not found
run_service_stream.sh: 18: --host: not found
run_service_stream.sh: 19: --port: not found
run_service_stream.sh: 20: --dtype: not found
run_service_stream.sh: 21: --tensor-parallel-size: not found
run_service_stream.sh: 22: --max-model-len: not found
run_service_stream.sh: 23: --enforce-eager: not found
: not found_stream.sh: 24:
: not found_stream.sh: 25:
run_service_stream.sh: 24: --trust-remote-code: not found
sleep: invalid time interval '5\r'
Try 'sleep --help' for more information.
: not found_stream.sh: 28:
Starting Web Demo service...
python3: can't open file '/app/web/stream_demo.py\r': [Errno 2] No such file or directory

D:\Confucius3-Math>docker run --gpus all -e ARK_API_KEY=f6150e6c-422a-4265-8b63-4d941b271220 -p 8827:8827 confucius3

==========
== CUDA ==
==========

CUDA Version 12.2.2

Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

A copy of this license is made available in this container at
/NGC-DL-CONTAINER-LICENSE for your convenience.

: not found_stream.sh: 2:
: not found_stream.sh: 6:
: not found_stream.sh: 12:
Starting vLLM model service...
INFO 08-12 04:56:53 [__init__.py:239] Automatically detected platform cuda.
[... api_server.py prints the same usage text as in the first run ...]
api_server.py: error: unrecognized arguments:
run_service_stream.sh: 16: --model: not found
run_service_stream.sh: 17: --served-model-name: not found
run_service_stream.sh: 18: --host: not found
run_service_stream.sh: 19: --port: not found
run_service_stream.sh: 20: --dtype: not found
run_service_stream.sh: 21: --tensor-parallel-size: not found
run_service_stream.sh: 22: --max-model-len: not found
run_service_stream.sh: 23: --enforce-eager: not found
: not found_stream.sh: 24:
: not found_stream.sh: 25:
run_service_stream.sh: 24: --trust-remote-code: not found
sleep: invalid time interval '5\r'
Try 'sleep --help' for more information.
: not found_stream.sh: 28:
Starting Web Demo service...
python3: can't open file '/app/web/stream_demo.py\r': [Errno 2] No such file or directory

D:\Confucius3-Math>
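
Every failure inside the container is the same bug: the literal \r in sleep: invalid time interval '5\r' and '/app/web/stream_demo.py\r' shows that run_service_stream.sh (and apparently the other files copied into the image) has Windows CRLF line endings. The container's /bin/sh keeps the trailing carriage return on every line, so backslash continuations break, each --model/--host flag is executed as its own command, and the script name is overprinted into the ": not found_stream.sh" lines above. One way to fix it, sketched below, is to normalize line endings in the build context before rebuilding; the file paths are taken from the log, while the presence of Git Bash/WSL, dos2unix, and a web/ directory in the checkout are assumptions about this setup:

# From the D:\Confucius3-Math checkout, e.g. in Git Bash or WSL:
sed -i 's/\r$//' run_service_stream.sh     # strip the trailing CR from the startup script
sed -i 's/\r$//' web/stream_demo.py        # same for the web demo entry point (path assumed from the log)
# or, if dos2unix is installed:
#   dos2unix run_service_stream.sh web/stream_demo.py

# If the files came from a Git clone, keep Git from re-converting them to CRLF on Windows:
git config core.autocrlf input

# Rebuild so the cleaned files are the ones copied into the image:
docker build -t confucius3 .

A more robust variant is to normalize inside the image itself, e.g. a hypothetical RUN sed -i 's/\r$//' /app/run_service_stream.sh step in the Dockerfile right after the COPY, so the image works no matter how the host checkout was configured.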
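The first run's separate errors (No platform detected, missing libcuda.so.1) only mean the container saw no GPU, and the second run already fixes that with --gpus all. If GPU passthrough itself is ever in doubt, a standard sanity check is to run nvidia-smi in a bare CUDA container; the image tag below is an assumption chosen to match the CUDA 12.2.2 shown in the banner:

# Should print the host GPU table if the NVIDIA Container Toolkit is set up correctly
docker run --rm --gpus all nvidia/cuda:12.2.2-base-ubuntu22.04 nvidia-smi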