318 lines
20 KiB
Plaintext
D:\Confucius3-Math>docker run -e ARK_API_KEY=f6150e6c-422a-4265-8b63-4d941b271220 -p 8827:8827 confucius3
|
|
|
|
==========
|
|
== CUDA ==
|
|
==========
|
|
|
|
CUDA Version 12.2.2
|
|
|
|
Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
|
|
This container image and its contents are governed by the NVIDIA Deep Learning Container License.
|
|
By pulling and using the container, you accept the terms and conditions of this license:
|
|
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license
|
|
|
|
A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.
|
|
|
|
WARNING: The NVIDIA Driver was not detected. GPU functionality will not be available.
|
|
Use the NVIDIA Container Toolkit to start this container with GPU support; see
|
|
https://docs.nvidia.com/datacenter/cloud-native/ .
|
|
|
|
正在启动vLLM模型服务...
|
|
: not found_stream.sh: 2:
|
|
: not found_stream.sh: 6:
|
|
: not found_stream.sh: 12:
|
|
INFO 08-12 04:55:32 [__init__.py:243] No platform detected, vLLM is running on UnspecifiedPlatform
|
|
WARNING 08-12 04:55:32 [_custom_ops.py:21] Failed to import from vllm._C with ImportError('libcuda.so.1: cannot open shared object file: No such file or directory')
|
|
usage: api_server.py [-h] [--host HOST] [--port PORT]
|
|
[--uvicorn-log-level {debug,info,warning,error,critical,trace}]
|
|
[--disable-uvicorn-access-log] [--allow-credentials]
|
|
[--allowed-origins ALLOWED_ORIGINS]
|
|
[--allowed-methods ALLOWED_METHODS]
|
|
[--allowed-headers ALLOWED_HEADERS] [--api-key API_KEY]
|
|
[--lora-modules LORA_MODULES [LORA_MODULES ...]]
|
|
[--prompt-adapters PROMPT_ADAPTERS [PROMPT_ADAPTERS ...]]
|
|
[--chat-template CHAT_TEMPLATE]
|
|
[--chat-template-content-format {auto,string,openai}]
|
|
[--response-role RESPONSE_ROLE]
|
|
[--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
|
|
[--ssl-ca-certs SSL_CA_CERTS] [--enable-ssl-refresh]
|
|
[--ssl-cert-reqs SSL_CERT_REQS] [--root-path ROOT_PATH]
|
|
[--middleware MIDDLEWARE] [--return-tokens-as-token-ids]
|
|
[--disable-frontend-multiprocessing]
|
|
[--enable-request-id-headers] [--enable-auto-tool-choice]
|
|
[--tool-call-parser {granite-20b-fc,granite,hermes,internlm,jamba,llama4_json,llama3_json,mistral,phi4_mini_json,pythonic} or name registered in --tool-parser-plugin]
|
|
[--tool-parser-plugin TOOL_PARSER_PLUGIN] [--model MODEL]
|
|
[--task {auto,generate,embedding,embed,classify,score,reward,transcription}]
|
|
[--tokenizer TOKENIZER] [--hf-config-path HF_CONFIG_PATH]
|
|
[--skip-tokenizer-init] [--revision REVISION]
|
|
[--code-revision CODE_REVISION]
|
|
[--tokenizer-revision TOKENIZER_REVISION]
|
|
[--tokenizer-mode {auto,slow,mistral,custom}]
|
|
[--trust-remote-code]
|
|
[--allowed-local-media-path ALLOWED_LOCAL_MEDIA_PATH]
|
|
[--load-format {auto,pt,safetensors,npcache,dummy,tensorizer,sharded_state,gguf,bitsandbytes,mistral,runai_streamer,runai_streamer_sharded,fastsafetensors}]
|
|
[--download-dir DOWNLOAD_DIR]
|
|
[--model-loader-extra-config MODEL_LOADER_EXTRA_CONFIG]
|
|
[--use-tqdm-on-load | --no-use-tqdm-on-load]
|
|
[--config-format {auto,hf,mistral}]
|
|
[--dtype {auto,half,float16,bfloat16,float,float32}]
|
|
[--max-model-len MAX_MODEL_LEN]
|
|
[--guided-decoding-backend {auto,guidance,lm-format-enforcer,outlines,xgrammar}]
|
|
[--reasoning-parser {deepseek_r1,granite}]
|
|
[--logits-processor-pattern LOGITS_PROCESSOR_PATTERN]
|
|
[--model-impl {auto,vllm,transformers}]
|
|
[--distributed-executor-backend {external_launcher,mp,ray,uni,None}]
|
|
[--pipeline-parallel-size PIPELINE_PARALLEL_SIZE]
|
|
[--tensor-parallel-size TENSOR_PARALLEL_SIZE]
|
|
[--data-parallel-size DATA_PARALLEL_SIZE]
|
|
[--enable-expert-parallel | --no-enable-expert-parallel]
|
|
[--max-parallel-loading-workers MAX_PARALLEL_LOADING_WORKERS]
|
|
[--ray-workers-use-nsight | --no-ray-workers-use-nsight]
|
|
[--disable-custom-all-reduce | --no-disable-custom-all-reduce]
|
|
[--block-size {1,8,16,32,64,128}]
|
|
[--gpu-memory-utilization GPU_MEMORY_UTILIZATION]
|
|
[--swap-space SWAP_SPACE]
|
|
[--kv-cache-dtype {auto,fp8,fp8_e4m3,fp8_e5m2}]
|
|
[--num-gpu-blocks-override NUM_GPU_BLOCKS_OVERRIDE]
|
|
[--enable-prefix-caching | --no-enable-prefix-caching]
|
|
[--prefix-caching-hash-algo {builtin,sha256}]
|
|
[--cpu-offload-gb CPU_OFFLOAD_GB]
|
|
[--calculate-kv-scales | --no-calculate-kv-scales]
|
|
[--disable-sliding-window] [--use-v2-block-manager]
|
|
[--seed SEED] [--max-logprobs MAX_LOGPROBS]
|
|
[--disable-log-stats]
|
|
[--quantization {aqlm,awq,deepspeedfp,tpu_int8,fp8,ptpc_fp8,fbgemm_fp8,modelopt,nvfp4,marlin,bitblas,gguf,gptq_marlin_24,gptq_marlin,gptq_bitblas,awq_marlin,gptq,compressed-tensors,bitsandbytes,qqq,hqq,experts_int8,neuron_quant,ipex,quark,moe_wna16,torchao,None}]
|
|
[--rope-scaling ROPE_SCALING] [--rope-theta ROPE_THETA]
|
|
[--hf-token [HF_TOKEN]] [--hf-overrides HF_OVERRIDES]
|
|
[--enforce-eager]
|
|
[--max-seq-len-to-capture MAX_SEQ_LEN_TO_CAPTURE]
|
|
[--tokenizer-pool-size TOKENIZER_POOL_SIZE]
|
|
[--tokenizer-pool-type TOKENIZER_POOL_TYPE]
|
|
[--tokenizer-pool-extra-config TOKENIZER_POOL_EXTRA_CONFIG]
|
|
[--limit-mm-per-prompt LIMIT_MM_PER_PROMPT]
|
|
[--mm-processor-kwargs MM_PROCESSOR_KWARGS]
|
|
[--disable-mm-preprocessor-cache]
|
|
[--enable-lora | --no-enable-lora]
|
|
[--enable-lora-bias | --no-enable-lora-bias]
|
|
[--max-loras MAX_LORAS] [--max-lora-rank MAX_LORA_RANK]
|
|
[--lora-extra-vocab-size LORA_EXTRA_VOCAB_SIZE]
|
|
[--lora-dtype {auto,bfloat16,float16}]
|
|
[--long-lora-scaling-factors LONG_LORA_SCALING_FACTORS [LONG_LORA_SCALING_FACTORS ...]]
|
|
[--max-cpu-loras MAX_CPU_LORAS]
|
|
[--fully-sharded-loras | --no-fully-sharded-loras]
|
|
[--enable-prompt-adapter | --no-enable-prompt-adapter]
|
|
[--max-prompt-adapters MAX_PROMPT_ADAPTERS]
|
|
[--max-prompt-adapter-token MAX_PROMPT_ADAPTER_TOKEN]
|
|
[--device {auto,cpu,cuda,hpu,neuron,tpu,xpu}]
|
|
[--speculative-config SPECULATIVE_CONFIG]
|
|
[--ignore-patterns IGNORE_PATTERNS]
|
|
[--served-model-name SERVED_MODEL_NAME [SERVED_MODEL_NAME ...]]
|
|
[--qlora-adapter-name-or-path QLORA_ADAPTER_NAME_OR_PATH]
|
|
[--show-hidden-metrics-for-version SHOW_HIDDEN_METRICS_FOR_VERSION]
|
|
[--otlp-traces-endpoint OTLP_TRACES_ENDPOINT]
|
|
[--collect-detailed-traces COLLECT_DETAILED_TRACES]
|
|
[--disable-async-output-proc]
|
|
[--max-num-batched-tokens MAX_NUM_BATCHED_TOKENS]
|
|
[--max-num-seqs MAX_NUM_SEQS]
|
|
[--max-num-partial-prefills MAX_NUM_PARTIAL_PREFILLS]
|
|
[--max-long-partial-prefills MAX_LONG_PARTIAL_PREFILLS]
|
|
[--long-prefill-token-threshold LONG_PREFILL_TOKEN_THRESHOLD]
|
|
[--num-lookahead-slots NUM_LOOKAHEAD_SLOTS]
|
|
[--scheduler-delay-factor SCHEDULER_DELAY_FACTOR]
|
|
[--preemption-mode {recompute,swap,None}]
|
|
[--num-scheduler-steps NUM_SCHEDULER_STEPS]
|
|
[--multi-step-stream-outputs | --no-multi-step-stream-outputs]
|
|
[--scheduling-policy {fcfs,priority}]
|
|
[--enable-chunked-prefill | --no-enable-chunked-prefill]
|
|
[--disable-chunked-mm-input | --no-disable-chunked-mm-input]
|
|
[--scheduler-cls SCHEDULER_CLS]
|
|
[--override-neuron-config OVERRIDE_NEURON_CONFIG]
|
|
[--override-pooler-config OVERRIDE_POOLER_CONFIG]
|
|
[--compilation-config COMPILATION_CONFIG]
|
|
[--kv-transfer-config KV_TRANSFER_CONFIG]
|
|
[--worker-cls WORKER_CLS]
|
|
[--worker-extension-cls WORKER_EXTENSION_CLS]
|
|
[--generation-config GENERATION_CONFIG]
|
|
[--override-generation-config OVERRIDE_GENERATION_CONFIG]
|
|
[--enable-sleep-mode]
|
|
[--additional-config ADDITIONAL_CONFIG]
|
|
[--enable-reasoning] [--disable-cascade-attn]
|
|
[--disable-log-requests] [--max-log-len MAX_LOG_LEN]
|
|
[--disable-fastapi-docs] [--enable-prompt-tokens-details]
|
|
[--enable-server-load-tracking]
|
|
api_server.py: error: unrecognized arguments:
|
|
run_service_stream.sh: 16: --model: not found
|
|
run_service_stream.sh: 17: --served-model-name: not found
|
|
run_service_stream.sh: 18: --host: not found
|
|
run_service_stream.sh: 19: --port: not found
|
|
run_service_stream.sh: 20: --dtype: not found
|
|
run_service_stream.sh: 21: --tensor-parallel-size: not found
|
|
run_service_stream.sh: 22: --max-model-len: not found
|
|
run_service_stream.sh: 23: --enforce-eager: not found
|
|
: not found_stream.sh: 24:
|
|
: not found_stream.sh: 25:
|
|
run_service_stream.sh: 24: --trust-remote-code: not found
|
|
sleep: invalid time interval '5\r'
|
|
Try 'sleep --help' for more information.
|
|
: not found_stream.sh: 28:
|
|
正在启动Web Demo服务...
|
|
python3: can't open file '/app/web/stream_demo.py\r': [Errno 2] No such file or directory
|
|
|
|
D:\Confucius3-Math>docker run --gpus all -e ARK_API_KEY=f6150e6c-422a-4265-8b63-4d941b271220 -p 8827:8827 confucius3
|
|
|
|
==========
|
|
== CUDA ==
|
|
==========
|
|
|
|
CUDA Version 12.2.2
|
|
|
|
Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
|
|
This container image and its contents are governed by the NVIDIA Deep Learning Container License.
|
|
By pulling and using the container, you accept the terms and conditions of this license:
|
|
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license
|
|
|
|
A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.
|
|
|
|
: not found_stream.sh: 2:
|
|
: not found_stream.sh: 6:
|
|
: not found_stream.sh: 12:
|
|
正在启动vLLM模型服务...
|
|
INFO 08-12 04:56:53 [__init__.py:239] Automatically detected platform cuda.
|
|
usage: api_server.py [-h] [--host HOST] [--port PORT]
|
|
[--uvicorn-log-level {debug,info,warning,error,critical,trace}]
|
|
[--disable-uvicorn-access-log] [--allow-credentials]
|
|
[--allowed-origins ALLOWED_ORIGINS]
|
|
[--allowed-methods ALLOWED_METHODS]
|
|
[--allowed-headers ALLOWED_HEADERS] [--api-key API_KEY]
|
|
[--lora-modules LORA_MODULES [LORA_MODULES ...]]
|
|
[--prompt-adapters PROMPT_ADAPTERS [PROMPT_ADAPTERS ...]]
|
|
[--chat-template CHAT_TEMPLATE]
|
|
[--chat-template-content-format {auto,string,openai}]
|
|
[--response-role RESPONSE_ROLE]
|
|
[--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
|
|
[--ssl-ca-certs SSL_CA_CERTS] [--enable-ssl-refresh]
|
|
[--ssl-cert-reqs SSL_CERT_REQS] [--root-path ROOT_PATH]
|
|
[--middleware MIDDLEWARE] [--return-tokens-as-token-ids]
|
|
[--disable-frontend-multiprocessing]
|
|
[--enable-request-id-headers] [--enable-auto-tool-choice]
|
|
[--tool-call-parser {granite-20b-fc,granite,hermes,internlm,jamba,llama4_json,llama3_json,mistral,phi4_mini_json,pythonic} or name registered in --tool-parser-plugin]
|
|
[--tool-parser-plugin TOOL_PARSER_PLUGIN] [--model MODEL]
|
|
[--task {auto,generate,embedding,embed,classify,score,reward,transcription}]
|
|
[--tokenizer TOKENIZER] [--hf-config-path HF_CONFIG_PATH]
|
|
[--skip-tokenizer-init] [--revision REVISION]
|
|
[--code-revision CODE_REVISION]
|
|
[--tokenizer-revision TOKENIZER_REVISION]
|
|
[--tokenizer-mode {auto,slow,mistral,custom}]
|
|
[--trust-remote-code]
|
|
[--allowed-local-media-path ALLOWED_LOCAL_MEDIA_PATH]
|
|
[--load-format {auto,pt,safetensors,npcache,dummy,tensorizer,sharded_state,gguf,bitsandbytes,mistral,runai_streamer,runai_streamer_sharded,fastsafetensors}]
|
|
[--download-dir DOWNLOAD_DIR]
|
|
[--model-loader-extra-config MODEL_LOADER_EXTRA_CONFIG]
|
|
[--use-tqdm-on-load | --no-use-tqdm-on-load]
|
|
[--config-format {auto,hf,mistral}]
|
|
[--dtype {auto,half,float16,bfloat16,float,float32}]
|
|
[--max-model-len MAX_MODEL_LEN]
|
|
[--guided-decoding-backend {auto,guidance,xgrammar}]
|
|
[--reasoning-parser {deepseek_r1,granite}]
|
|
[--logits-processor-pattern LOGITS_PROCESSOR_PATTERN]
|
|
[--model-impl {auto,vllm,transformers}]
|
|
[--distributed-executor-backend {external_launcher,mp,ray,uni,None}]
|
|
[--pipeline-parallel-size PIPELINE_PARALLEL_SIZE]
|
|
[--tensor-parallel-size TENSOR_PARALLEL_SIZE]
|
|
[--data-parallel-size DATA_PARALLEL_SIZE]
|
|
[--enable-expert-parallel | --no-enable-expert-parallel]
|
|
[--max-parallel-loading-workers MAX_PARALLEL_LOADING_WORKERS]
|
|
[--ray-workers-use-nsight | --no-ray-workers-use-nsight]
|
|
[--disable-custom-all-reduce | --no-disable-custom-all-reduce]
|
|
[--block-size {1,8,16,32,64,128}]
|
|
[--gpu-memory-utilization GPU_MEMORY_UTILIZATION]
|
|
[--swap-space SWAP_SPACE]
|
|
[--kv-cache-dtype {auto,fp8,fp8_e4m3,fp8_e5m2}]
|
|
[--num-gpu-blocks-override NUM_GPU_BLOCKS_OVERRIDE]
|
|
[--enable-prefix-caching | --no-enable-prefix-caching]
|
|
[--prefix-caching-hash-algo {builtin,sha256}]
|
|
[--cpu-offload-gb CPU_OFFLOAD_GB]
|
|
[--calculate-kv-scales | --no-calculate-kv-scales]
|
|
[--disable-sliding-window] [--use-v2-block-manager]
|
|
[--seed SEED] [--max-logprobs MAX_LOGPROBS]
|
|
[--disable-log-stats]
|
|
[--quantization {aqlm,awq,deepspeedfp,tpu_int8,fp8,ptpc_fp8,fbgemm_fp8,modelopt,nvfp4,marlin,bitblas,gguf,gptq_marlin_24,gptq_marlin,gptq_bitblas,awq_marlin,gptq,compressed-tensors,bitsandbytes,qqq,hqq,experts_int8,neuron_quant,ipex,quark,moe_wna16,torchao,None}]
|
|
[--rope-scaling ROPE_SCALING] [--rope-theta ROPE_THETA]
|
|
[--hf-token [HF_TOKEN]] [--hf-overrides HF_OVERRIDES]
|
|
[--enforce-eager]
|
|
[--max-seq-len-to-capture MAX_SEQ_LEN_TO_CAPTURE]
|
|
[--tokenizer-pool-size TOKENIZER_POOL_SIZE]
|
|
[--tokenizer-pool-type TOKENIZER_POOL_TYPE]
|
|
[--tokenizer-pool-extra-config TOKENIZER_POOL_EXTRA_CONFIG]
|
|
[--limit-mm-per-prompt LIMIT_MM_PER_PROMPT]
|
|
[--mm-processor-kwargs MM_PROCESSOR_KWARGS]
|
|
[--disable-mm-preprocessor-cache]
|
|
[--enable-lora | --no-enable-lora]
|
|
[--enable-lora-bias | --no-enable-lora-bias]
|
|
[--max-loras MAX_LORAS] [--max-lora-rank MAX_LORA_RANK]
|
|
[--lora-extra-vocab-size LORA_EXTRA_VOCAB_SIZE]
|
|
[--lora-dtype {auto,bfloat16,float16}]
|
|
[--long-lora-scaling-factors LONG_LORA_SCALING_FACTORS [LONG_LORA_SCALING_FACTORS ...]]
|
|
[--max-cpu-loras MAX_CPU_LORAS]
|
|
[--fully-sharded-loras | --no-fully-sharded-loras]
|
|
[--enable-prompt-adapter | --no-enable-prompt-adapter]
|
|
[--max-prompt-adapters MAX_PROMPT_ADAPTERS]
|
|
[--max-prompt-adapter-token MAX_PROMPT_ADAPTER_TOKEN]
|
|
[--device {auto,cpu,cuda,hpu,neuron,tpu,xpu}]
|
|
[--speculative-config SPECULATIVE_CONFIG]
|
|
[--ignore-patterns IGNORE_PATTERNS]
|
|
[--served-model-name SERVED_MODEL_NAME [SERVED_MODEL_NAME ...]]
|
|
[--qlora-adapter-name-or-path QLORA_ADAPTER_NAME_OR_PATH]
|
|
[--show-hidden-metrics-for-version SHOW_HIDDEN_METRICS_FOR_VERSION]
|
|
[--otlp-traces-endpoint OTLP_TRACES_ENDPOINT]
|
|
[--collect-detailed-traces COLLECT_DETAILED_TRACES]
|
|
[--disable-async-output-proc]
|
|
[--max-num-batched-tokens MAX_NUM_BATCHED_TOKENS]
|
|
[--max-num-seqs MAX_NUM_SEQS]
|
|
[--max-num-partial-prefills MAX_NUM_PARTIAL_PREFILLS]
|
|
[--max-long-partial-prefills MAX_LONG_PARTIAL_PREFILLS]
|
|
[--long-prefill-token-threshold LONG_PREFILL_TOKEN_THRESHOLD]
|
|
[--num-lookahead-slots NUM_LOOKAHEAD_SLOTS]
|
|
[--scheduler-delay-factor SCHEDULER_DELAY_FACTOR]
|
|
[--preemption-mode {recompute,swap,None}]
|
|
[--num-scheduler-steps NUM_SCHEDULER_STEPS]
|
|
[--multi-step-stream-outputs | --no-multi-step-stream-outputs]
|
|
[--scheduling-policy {fcfs,priority}]
|
|
[--enable-chunked-prefill | --no-enable-chunked-prefill]
|
|
[--disable-chunked-mm-input | --no-disable-chunked-mm-input]
|
|
[--scheduler-cls SCHEDULER_CLS]
|
|
[--override-neuron-config OVERRIDE_NEURON_CONFIG]
|
|
[--override-pooler-config OVERRIDE_POOLER_CONFIG]
|
|
[--compilation-config COMPILATION_CONFIG]
|
|
[--kv-transfer-config KV_TRANSFER_CONFIG]
|
|
[--worker-cls WORKER_CLS]
|
|
[--worker-extension-cls WORKER_EXTENSION_CLS]
|
|
[--generation-config GENERATION_CONFIG]
|
|
[--override-generation-config OVERRIDE_GENERATION_CONFIG]
|
|
[--enable-sleep-mode]
|
|
[--additional-config ADDITIONAL_CONFIG]
|
|
[--enable-reasoning] [--disable-cascade-attn]
|
|
[--disable-log-requests] [--max-log-len MAX_LOG_LEN]
|
|
[--disable-fastapi-docs] [--enable-prompt-tokens-details]
|
|
[--enable-server-load-tracking]
|
|
api_server.py: error: unrecognized arguments:
|
|
run_service_stream.sh: 16: --model: not found
|
|
run_service_stream.sh: 17: --served-model-name: not found
|
|
run_service_stream.sh: 18: --host: not found
|
|
run_service_stream.sh: 19: --port: not found
|
|
run_service_stream.sh: 20: --dtype: not found
|
|
run_service_stream.sh: 21: --tensor-parallel-size: not found
|
|
run_service_stream.sh: 22: --max-model-len: not found
|
|
run_service_stream.sh: 23: --enforce-eager: not found
|
|
: not found_stream.sh: 24:
|
|
: not found_stream.sh: 25:
|
|
run_service_stream.sh: 24: --trust-remote-code: not found
|
|
sleep: invalid time interval '5\r'
|
|
Try 'sleep --help' for more information.
|
|
: not found_stream.sh: 28:
|
|
正在启动Web Demo服务...
|
|
python3: can't open file '/app/web/stream_demo.py\r': [Errno 2] No such file or directory
|
|
|
|
D:\Confucius3-Math> |