Skip to content

Commit 90c2fb5

Browse files
committed
feat: update services to support cache aware routing.
1 parent 4b5e3ae commit 90c2fb5

File tree

10 files changed

+214
-211
lines changed

10 files changed

+214
-211
lines changed

xllm_service/common/global_gflags.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,6 @@ DEFINE_int32(http_server_max_concurrency,
1818
128,
1919
"Limit number of requests processed in parallel");
2020

21-
DEFINE_string(rpc_server_host,
22-
"",
23-
"Rpc server listen address, may be IPV4/IPV6/UDS."
24-
" If this is set, the flag port will be ignored");
25-
2621
DEFINE_int32(rpc_server_port, 8889, "Port for xllm rpc service to listen on");
2722

2823
DEFINE_int32(rpc_server_idle_timeout_s,

xllm_service/common/global_gflags.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@ DECLARE_int32(http_server_num_threads);
1212

1313
DECLARE_int32(http_server_max_concurrency);
1414

15-
DECLARE_string(rpc_server_host);
16-
1715
DECLARE_int32(rpc_server_port);
1816

1917
DECLARE_int32(rpc_server_idle_timeout_s);

xllm_service/http_service/service.cpp

Lines changed: 64 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -48,56 +48,6 @@ XllmHttpServiceImpl::XllmHttpServiceImpl(const HttpServiceConfig& config)
4848

4949
XllmHttpServiceImpl::~XllmHttpServiceImpl() {}
5050

51-
bool XllmHttpServiceImpl::create_channel(const std::string& target_uri) {
52-
std::lock_guard<std::mutex> guard(channel_mutex_);
53-
if (cached_channels_.find(target_uri) == cached_channels_.end()) {
54-
brpc::Channel* channel = new brpc::Channel();
55-
brpc::ChannelOptions options;
56-
// Add to params
57-
options.protocol = "http";
58-
options.timeout_ms = config_.timeout_ms; /*milliseconds*/
59-
options.max_retry = 3;
60-
std::string load_balancer = "";
61-
if (channel->Init(target_uri.c_str(), load_balancer.c_str(), &options) !=
62-
0) {
63-
LOG(ERROR) << "Fail to initialize channel for " << target_uri;
64-
return false;
65-
}
66-
cached_channels_[target_uri] = channel;
67-
}
68-
69-
return true;
70-
}
71-
72-
std::string XllmHttpServiceImpl::get_redirect_uri(bool only_prefill) {
73-
std::string target_instance_addr;
74-
if (!rpc_service_) {
75-
// for testing
76-
if (config_.test_instance_addr.empty()) {
77-
LOG(ERROR) << "Rpc service is not start.";
78-
return "";
79-
}
80-
target_instance_addr = config_.test_instance_addr;
81-
} else {
82-
InstancesPair instances_pair =
83-
rpc_service_->select_instances_pair(only_prefill);
84-
if (instances_pair.prefill_instance_http_addr.empty()) {
85-
LOG(ERROR) << "No prefill instance available.";
86-
return "";
87-
}
88-
target_instance_addr = instances_pair.prefill_instance_http_addr;
89-
90-
if (!only_prefill) {
91-
if (instances_pair.decode_instance_http_addr.empty()) {
92-
// TODO:
93-
}
94-
// TODO: add instances_pair.decode_instance_http_addr to request?
95-
}
96-
}
97-
98-
return target_instance_addr;
99-
}
100-
10151
void XllmHttpServiceImpl::Hello(::google::protobuf::RpcController* controller,
10252
const proto::HttpHelloRequest* request,
10353
proto::HttpHelloResponse* response,
@@ -198,7 +148,8 @@ void XllmHttpServiceImpl::handle(std::shared_ptr<T> call_data,
198148

199149
// async redistribute the request and wait the response
200150
// TODO: optimize the thread pool to async mode.
201-
auto channel_ptr = cached_channels_[target_uri];
151+
brpc::Channel* channel_ptr = rpc_service_->get_channel(target_uri).get();
152+
202153
// send request to prefill instance.
203154
thread_pool_->schedule([this,
204155
service_request_id,
@@ -360,24 +311,6 @@ void XllmHttpServiceImpl::post_serving(
360311
// create xllm_service request_id: service_request_id
361312
std::string service_request_id = generate_service_request_id(serving_method);
362313
json_value["service_request_id"] = service_request_id;
363-
std::string req_attachment = json_value.dump();
364-
request_tracer_->log(service_request_id, req_attachment);
365-
366-
// redistribute the request to the correct P/D instance
367-
// TODO: redistribute policy to select the instance
368-
std::string target_uri = get_redirect_uri();
369-
if (target_uri.empty()) {
370-
cntl->SetFailed(
371-
"Internal runtime error, can not found a running instance.");
372-
return;
373-
}
374-
if (cached_channels_.find(target_uri) == cached_channels_.end()) {
375-
if (!create_channel(target_uri)) {
376-
LOG(ERROR) << "Create channel failed, target_uri is " << target_uri;
377-
cntl->SetFailed("Internal runtime error.");
378-
return;
379-
}
380-
}
381314

382315
std::function<void(const std::string&)> trace_callback;
383316
if (config_.enable_request_trace) {
@@ -388,33 +321,82 @@ void XllmHttpServiceImpl::post_serving(
388321
trace_callback = nullptr;
389322
}
390323

324+
SchduleResult schedule_res;
391325
if (serving_method == "/v1/completions") {
326+
if (json_value.contains("prompt")) {
327+
if (!rpc_service_->schedule(json_value.at("prompt").get<std::string>(),
328+
&schedule_res)) {
329+
cntl->SetFailed("Schedule fail!");
330+
LOG(ERROR) << "XllmRpcServiceImpl::schedule error!";
331+
return;
332+
}
333+
} else {
334+
cntl->SetFailed("Input has no prompt!");
335+
LOG(ERROR) << "Input has no prompt!";
336+
return;
337+
}
338+
json_value["token_ids"] = schedule_res.token_ids;
339+
json_value["routing"] = schedule_res.routing.serialize_to_json();
340+
341+
std::string req_attachment = json_value.dump();
392342
auto arena = response->GetArena();
393343
auto resp_pb =
394344
google::protobuf::Arena::CreateMessage<llm::proto::CompletionResponse>(
395345
arena);
396346
auto call_data = std::make_shared<CompletionCallData>(
397-
cntl, stream, done_guard.release(), resp_pb, trace_callback);
347+
cntl, stream, done_guard.release(), resp_pb);
398348
handle_v1_completions(call_data,
399349
req_attachment,
400350
service_request_id,
401351
stream,
402352
model,
403353
include_usage,
404-
target_uri);
354+
schedule_res.routing.prefill_name);
405355
} else if (serving_method == "/v1/chat/completions") {
356+
if (json_value.contains("messages") && json_value["messages"].is_array()) {
357+
ChatMessages messages;
358+
try {
359+
const auto& msgs = json_value["messages"];
360+
messages.reserve(msgs.size());
361+
for (const auto& msg : msgs) {
362+
if (msg.contains("role") && msg["role"].is_string() &&
363+
msg.contains("content") && msg["content"].is_string()) {
364+
messages.emplace_back(msg["role"].get<std::string>(),
365+
msg["content"].get<std::string>());
366+
}
367+
}
368+
} catch (const nlohmann::json::exception& e) {
369+
cntl->SetFailed("Parse request fail, Invalid messages!");
370+
LOG(ERROR) << "Parse request fail, Invalid messages!";
371+
return;
372+
}
373+
374+
if (!rpc_service_->schedule(messages, &schedule_res)) {
375+
cntl->SetFailed("Schedule fail!");
376+
LOG(ERROR) << "XllmRpcServiceImpl::schedule error!";
377+
return;
378+
}
379+
} else {
380+
cntl->SetFailed("Input has no messages!");
381+
LOG(ERROR) << "Input has no messages!";
382+
return;
383+
}
384+
json_value["token_ids"] = schedule_res.token_ids;
385+
json_value["routing"] = schedule_res.routing.serialize_to_json();
386+
387+
std::string req_attachment = json_value.dump();
406388
auto arena = response->GetArena();
407389
auto resp_pb =
408390
google::protobuf::Arena::CreateMessage<llm::proto::ChatResponse>(arena);
409391
auto call_data = std::make_shared<ChatCallData>(
410-
cntl, stream, done_guard.release(), resp_pb, trace_callback);
392+
cntl, stream, done_guard.release(), resp_pb);
411393
handle_v1_chat_completions(call_data,
412394
req_attachment,
413395
service_request_id,
414396
stream,
415397
model,
416398
include_usage,
417-
target_uri);
399+
schedule_res.routing.prefill_name);
418400
} else {
419401
LOG(ERROR) << "Not supported method: " << serving_method;
420402
cntl->SetFailed("Not supported method: " + serving_method);
@@ -456,22 +438,18 @@ void XllmHttpServiceImpl::get_serving(
456438
// done_guard.release());
457439
auto call_data = std::make_shared<CompletionCallData>(
458440
cntl, false, done_guard.release(), nullptr);
459-
std::string target_uri = get_redirect_uri(true /*only_prefill*/);
460-
if (target_uri.empty()) {
461-
cntl->SetFailed(
462-
"Internal runtime error, can not found a running instance.");
441+
442+
SchduleResult schedule_res;
443+
if (!rpc_service_->schedule("", &schedule_res)) {
444+
cntl->SetFailed("Schedule fail!");
445+
LOG(ERROR) << "XllmRpcServiceImpl::schedule error!";
463446
return;
464447
}
465-
if (cached_channels_.find(target_uri) == cached_channels_.end()) {
466-
if (!create_channel(target_uri)) {
467-
LOG(ERROR) << "Create channel failed, target_uri is " << target_uri;
468-
cntl->SetFailed("Internal runtime error.");
469-
return;
470-
}
471-
}
472448

473-
auto channel_ptr = cached_channels_[target_uri];
474-
target_uri += serving_method;
449+
brpc::Channel* channel_ptr =
450+
rpc_service_->get_channel(schedule_res.routing.prefill_name).get();
451+
std::string target_uri = schedule_res.routing.prefill_name + serving_method;
452+
475453
thread_pool_->schedule(
476454
[/*req_attachment, */ call_data, cntl, channel_ptr, target_uri]() {
477455
brpc::Controller* redirect_cntl = new brpc::Controller();

xllm_service/http_service/service.h

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,7 @@ class XllmHttpServiceImpl : public proto::XllmHttpService {
6262

6363
private:
6464
bool create_channel(const std::string& target_uri);
65-
// only prefill is true means only prefill instance is returned
66-
std::string get_redirect_uri(bool only_prefill = false);
65+
6766
void post_serving(const std::string& serving_method,
6867
::google::protobuf::RpcController* controller,
6968
const proto::HttpRequest* request,
@@ -109,13 +108,8 @@ class XllmHttpServiceImpl : public proto::XllmHttpService {
109108
std::shared_ptr<XllmRpcServiceImpl> rpc_service_;
110109

111110
std::unique_ptr<RequestTracer> request_tracer_;
112-
// uri -> channel
113-
// e.g. 127.0.0.1:9999/v1/completions -> channel1
114-
// 127.0.0.1:9999/v1/chat/completions -> channel2
115-
// NOTE: different methods to one instance has different channels
116-
std::unordered_map<std::string, brpc::Channel*> cached_channels_;
111+
117112
std::unique_ptr<ThreadPool> thread_pool_;
118-
std::mutex channel_mutex_;
119113

120114
// In disagg pd mode, we support receive generated token from
121115
// prefill or from decode directly.

xllm_service/master.cpp

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "master.h"
22

3+
#include <boost/asio.hpp>
34
#include <csignal>
45

56
#include "common/global_gflags.h"
@@ -20,14 +21,26 @@ Master::Master(const ServerOptions& server_options)
2021
rpc_config.detect_disconnected_instance_interval =
2122
server_options.detect_disconnected_instance_interval;
2223

23-
rpc_service_impl_ =
24-
std::make_shared<xllm_service::XllmRpcServiceImpl>(rpc_config);
25-
rpc_service_ =
26-
std::make_unique<xllm_service::XllmRpcService>(rpc_service_impl_);
24+
rpc_config.service_name = server_options_.rpc_server_host + ":" +
25+
std::to_string(server_options_.rpc_port);
26+
27+
ModelConfig model_config;
28+
model_config.block_size = server_options.block_size;
29+
model_config.model_type = server_options.model_type;
30+
model_config.tokenizer_path = server_options.tokenizer_path;
2731

28-
HttpServiceConfig http_config;
32+
xllm_service::HttpServiceConfig http_config;
2933
http_config.num_threads = server_options.http_num_threads;
34+
http_config.timeout_ms = server_options.timeout_ms;
35+
http_config.test_instance_addr = server_options.test_instance_addr;
3036
http_config.enable_request_trace = server_options.enable_request_trace;
37+
38+
rpc_service_impl_ = std::make_shared<xllm_service::XllmRpcServiceImpl>(
39+
rpc_config, model_config, http_config);
40+
41+
rpc_service_ =
42+
std::make_unique<xllm_service::XllmRpcService>(rpc_service_impl_);
43+
3144
http_service_ = std::make_unique<xllm_service::XllmHttpServiceImpl>(
3245
rpc_service_impl_, http_config);
3346
}
@@ -145,13 +158,37 @@ void shutdown_handler(int signal) {
145158
exit(1);
146159
}
147160

161+
std::string get_local_ip() {
162+
using namespace boost::asio;
163+
io_service io;
164+
ip::tcp::resolver resolver(io);
165+
ip::tcp::resolver::query query(ip::host_name(), "");
166+
ip::tcp::resolver::iterator iter = resolver.resolve(query);
167+
ip::tcp::resolver::iterator end;
168+
169+
while (iter != end) {
170+
ip::address addr = iter->endpoint().address();
171+
if (!addr.is_loopback() && addr.is_v4()) {
172+
return addr.to_string();
173+
}
174+
++iter;
175+
}
176+
177+
LOG(FATAL) << "Get local ip fail!";
178+
return "";
179+
}
180+
148181
int main(int argc, char* argv[]) {
149182
// Initialize gflags
150183
gflags::ParseCommandLineFlags(&argc, &argv, true);
151184

152185
// Initialize glog
153186
google::InitGoogleLogging(argv[0]);
154-
FLAGS_logtostderr = true;
187+
// FLAGS_logtostderr = true;
188+
189+
LOG(INFO) << "Dump all gflags: " << std::endl
190+
<< google::CommandlineFlagsIntoString();
191+
google::FlushLogFiles(google::INFO);
155192

156193
LOG(INFO) << "Starting xllm master service.";
157194

@@ -176,7 +213,7 @@ int main(int argc, char* argv[]) {
176213
server_options.http_idle_timeout_s = FLAGS_http_server_idle_timeout_s;
177214
server_options.http_num_threads = FLAGS_http_server_num_threads;
178215
server_options.http_max_concurrency = FLAGS_http_server_max_concurrency;
179-
server_options.rpc_server_host = FLAGS_rpc_server_host;
216+
server_options.rpc_server_host = get_local_ip();
180217
server_options.rpc_port = FLAGS_rpc_server_port;
181218
server_options.rpc_idle_timeout_s = FLAGS_rpc_server_idle_timeout_s;
182219
server_options.rpc_num_threads = FLAGS_rpc_server_num_threads;
@@ -186,10 +223,16 @@ int main(int argc, char* argv[]) {
186223
server_options.detect_disconnected_instance_interval =
187224
FLAGS_detect_disconnected_instance_interval;
188225
server_options.enable_request_trace = FLAGS_enable_request_trace;
226+
227+
server_options.tokenizer_path = FLAGS_tokenizer_path;
189228
server_options.block_size = FLAGS_block_size;
190229
server_options.model_type = FLAGS_model_type;
191230
server_options.tokenizer_path = FLAGS_tokenizer_path;
192231

232+
server_options.num_threads = FLAGS_num_threads;
233+
server_options.timeout_ms = FLAGS_timeout_ms;
234+
server_options.test_instance_addr = FLAGS_test_instance_addr;
235+
193236
xllm_service::Master master(server_options);
194237

195238
if (!master.start()) {

xllm_service/master.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ struct ServerOptions {
1717
int32_t http_num_threads = 32;
1818
int32_t http_max_concurrency = 128;
1919
bool enable_request_trace = false;
20+
int num_threads = 16;
21+
int timeout_ms = -1;
22+
std::string test_instance_addr = "";
2023

2124
// rpc server options
2225
std::string rpc_server_host = "";

0 commit comments

Comments
 (0)