21365024824

Committed 26 Jan 2026 07:10AM UTC coverage: 88.654% (-0.05%) from 88.704%

Build # 21365024824

Build Type

push

github

Committed by

grencez

Commit Message

Update localserv to support chat interface and CLI args

Run Details

2141 of 2415 relevant lines covered (88.65%)

522.29 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

59.92

/src/language/inference.cc

#include "src/language/inference.hh"

#include <algorithm>
#include <cassert>
#include <cstring>
#include <limits>
#include <stdexcept>
#include <thread>
#include <vector>

#include <fildesh/fildesh.h>
#include <fildesh/ostream.hh>

#include "src/chat/display.hh"
#include "src/chat/guide.hh"
#include "src/chat/opt.hh"
#include "src/chat/trajectory.hh"
#include "src/language/vocabulary.hh"

using rendezllama::ChatDisplay;
using rendezllama::ChatGuide;
using rendezllama::ChatOptions;
using rendezllama::ChatTrajectory;
using rendezllama::Inference;
using rendezllama::Vocabulary;
using rendezllama::inference::AdjustViaKind;

// Constructs an Inference whose `vocabulary_` member is initialized from
// the given `vocabulary`. No sampler is created here.
Inference::Inference(const Vocabulary& vocabulary)
  : vocabulary_(vocabulary)
{}
// Releases the sampler chain (when one was created) and the token batch.
Inference::~Inference()
{
  if (smpl_ != nullptr) {
    llama_sampler_free(smpl_);
  }
  llama_batch_free(batch_);
}

  // Finds an antiprompt that `text` ends with.
  //
  // Visits `antiprompts` in the set's iteration order and returns a reference
  // to the first element that is a suffix of `text`. When nothing matches,
  // a reference to a function-local static empty string is returned instead.
  const std::string&
rendezllama::antiprompt_suffix(
    std::string_view text,
    const std::set<std::string>& antiprompts)
{
  static const std::string no_match;
  for (const std::string& candidate : antiprompts) {
    if (candidate.size() > text.size()) {
      // Too long to be a suffix.
      continue;
    }
    const std::string_view tail = text.substr(text.size() - candidate.size());
    if (tail == candidate) {
      return candidate;
    }
  }
  return no_match;
}

// Strips every trailing ' ' character from `s` in place.
// Returns true when at least one space was removed.
static bool maybe_trim_endspace(std::string& s)
{
  const std::string::size_type last_kept = s.find_last_not_of(' ');
  const std::string::size_type new_size =
    (last_kept == std::string::npos) ? 0 : last_kept + 1;
  const bool trimmed = (new_size < s.size());
  s.resize(new_size);
  return trimmed;
}

  // Interprets one line of chat input and appends it to the trajectory.
  //
  // The leading characters of `s` select the behavior:
  //  - a literal backslash followed by 'n': end the current turn, begin a
  //    turn for the last entry of `opt.message_opts`, and append the rest.
  //  - a newline character: yield the turn to the name that follows it
  //    (this form comes from /yield).
  //  - a space: append the text to the current message as-is.
  //  - anything else (including an empty string): append the text as a
  //    message for `opt.message_opts[0]`, then yield the turn.
  //
  // `prevent_subsequent_newline` is always overwritten. It becomes true when
  // trailing spaces were trimmed from `s`, or unconditionally in the last case.
  //
  // NOTE(review): `opt.message_opts` is indexed without a size check, so it
  // is assumed to be non-empty here. Confirm against callers.
  void
rendezllama::augment_tokenize_chat_input(
    ChatGuide& chat_guide,
    ChatTrajectory& chat_traj,
    bool& prevent_subsequent_newline,
    std::string s,
    const Vocabulary& vocabulary,
    const ChatOptions& opt)
{
  // When linespace mode is on and the message prefix ends with a newline,
  // the text gets a leading space unless it is empty or already has one.
  // The emptiness checks avoid calling back()/front() on empty strings.
  const auto add_linespace = [&opt](const std::string& prefix, std::string& text) {
    const bool prefix_ends_line = (!prefix.empty() && prefix.back() == '\n');
    if (prefix_ends_line && opt.linespace_on) {
      if (!text.empty() && text.front() != ' ') {
        text.insert(0, " ");
      }
    }
  };

  prevent_subsequent_newline = false;
  if (s.size() >= 2 && s[0] == '\\' && s[1] == 'n') {
    chat_guide.end_turn();
    chat_guide.begin_turn(opt.message_opts.size()-1);
    s.erase(0, 2);
    prevent_subsequent_newline = maybe_trim_endspace(s);
    add_linespace(opt.message_opts.back().prefix, s);
    chat_traj.tokenize_append(s, vocabulary);
  }
  else if (!s.empty() && s.front() == '\n') {
    // This is from /yield.
    chat_guide.yield_turn(s.substr(1));
  }
  else if (!s.empty() && s.front() == ' ') {
    prevent_subsequent_newline = maybe_trim_endspace(s);
    chat_traj.tokenize_append(s, vocabulary);
  }
  else {
    chat_guide.yield_turn(0);
    add_linespace(opt.message_opts[0].prefix, s);
    chat_traj.tokenize_append(s, vocabulary);
    chat_guide.yield_turn();
    chat_traj.display_token_count_ = chat_traj.rfind_message_prefix_begin_at(
        chat_traj.token_count()-1);
    prevent_subsequent_newline = true;
  }
}

  // Loads the model named by `opt.model_filename` and creates a context for it.
  //
  // The model is loaded twice. The first load only reads training-time
  // metadata, which fills in `opt.model_token_limit` and
  // `opt.context_token_limit` when they are zero and derives the RoPE
  // frequency scale. The second load happens after `llama_params_fit` has had
  // a chance to adjust the model and context parameters.
  //
  // Returns (model, context) on success or (nullptr, nullptr) on failure.
  // A model whose context could not be created is freed before returning.
  std::tuple<struct llama_model*, struct llama_context*>
rendezllama::make_llama_context(rendezllama::ChatOptions& opt)
{
  // Default model parameters with the user's memory options applied.
  const auto fresh_model_params = [&opt]() {
    llama_model_params params = llama_model_default_params();
    params.use_mlock = opt.mlock_on;
    params.use_mmap = opt.mmap_on;
    return params;
  };
  const char* const model_path = opt.model_filename.c_str();

  struct llama_model* model = llama_model_load_from_file(
      model_path, fresh_model_params());
  if (model == nullptr) {
    fildesh_log_error("Failed to open model.");
    return std::make_tuple(nullptr, nullptr);
  }

  if (opt.model_token_limit == 0) {
    opt.model_token_limit = llama_model_n_ctx_train(model);
  }
  if (opt.context_token_limit == 0) {
    opt.context_token_limit = opt.model_token_limit;
  }

  // Start from the trained scale (or 1 when it is not positive) and halve it
  // until the scaled model limit covers the requested context size.
  float rope_freq_scale = llama_model_rope_freq_scale_train(model);
  if (rope_freq_scale <= 0.0) {
    rope_freq_scale = 1.0f;
  }
  while (static_cast<unsigned>(opt.model_token_limit / rope_freq_scale)
         < opt.context_token_limit)
  {
    rope_freq_scale /= 2;
  }

  // Metadata has been read; drop this instance before fitting parameters.
  llama_model_free(model);
  model = nullptr;

  llama_model_params model_params = fresh_model_params();

  llama_context_params ctx_params = llama_context_default_params();
  ctx_params.n_ctx = opt.context_token_limit;
  ctx_params.n_batch = opt.batch_count;
  ctx_params.rope_freq_scale = rope_freq_scale;

  // Scratch buffers handed to llama_params_fit.
  std::vector<float> tensor_split(llama_max_devices());
  std::vector<llama_model_tensor_buft_override> tensor_buft_overrides(llama_max_tensor_buft_overrides());
  std::vector<size_t> margins(llama_max_devices(), 0);

  // Auto-tune parameters if possible (and not manually overridden by user yet).
  // This helps avoid OOM crashes on Vulkan/GPU by fitting layers to available memory.
  const auto fit_status = llama_params_fit(
      model_path,
      &model_params,
      &ctx_params,
      tensor_split.data(),
      tensor_buft_overrides.data(),
      margins.data(),
      /*n_ctx_min=*/0,
      GGML_LOG_LEVEL_ERROR);
  if (fit_status != 0) {
    // Not fatal: continue with whatever parameters we have.
    fildesh_log_warning("llama_params_fit failed");
  }

  model = llama_model_load_from_file(model_path, model_params);
  if (model == nullptr) {
    fildesh_log_error("Failed to open model.");
    return std::make_tuple(nullptr, nullptr);
  }

  struct llama_context* ctx = llama_init_from_model(model, ctx_params);
  if (ctx == nullptr) {
    llama_model_free(model);
    fildesh_log_error("Failed to create context.");
    return std::make_tuple(nullptr, nullptr);
  }
  return std::make_tuple(model, ctx);
}

// Produces a seed from the current time.
// Masking with INT_MAX keeps the low bits, so the result is never negative.
static
  int
new_sampling_seed()
{
  const auto now = time(nullptr);
  return static_cast<int>(now & INT_MAX);
}

// Appends the sampler for one adjustment step to the sampler chain `smpl`
// and writes a description of that step's settings to `eout`.
//
// `adjust_via` holds exactly one kind of adjustment; the matching branch
// creates the corresponding llama sampler and adds it to the chain.
// `model` is only consulted by the DRY sampler (vocabulary and training
// context size). `seed` is only consulted by the XTC sampler.
//
// Every created sampler must be passed to llama_sampler_chain_add(),
// otherwise it is leaked and has no effect on sampling.
static
  void
apply_sampler_chain(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdjustVia& adjust_via,
    const struct llama_model* model,
    unsigned seed,
    std::ostream& eout)
{
  const unsigned keep_one = 1;

  if (const auto* dry = std::get_if<AdjustViaKind::dry>(&adjust_via)) {
    // Sequences that reset DRY's repetition matching.
    static const char* seq_breakers[] = {
      "\n", ":",
    };
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_dry(
            llama_model_get_vocab(model),
            llama_model_n_ctx_train(model),
            dry->multiplier,
            dry->base,
            dry->allowed_length,
            dry->window_length,
            seq_breakers,
            sizeof(seq_breakers)/sizeof(*seq_breakers)));
    eout << "dry:"
      << "\n  multiplier: " << dry->multiplier
      << "\n  base: " << dry->base
      << "\n  allowed_length: " << dry->allowed_length
      << "\n  window_length: " << dry->window_length
      << "\n";
  }
  if (const auto* min_p = std::get_if<AdjustViaKind::min_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(*min_p, keep_one));
    eout << "min_p: " << *min_p << "\n";
  }
  if (const auto* penalize_with = std::get_if<AdjustViaKind::penalize_with>(&adjust_via)) {
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_penalties(
            penalize_with->window_length,
            penalize_with->repetition,
            penalize_with->frequency,
            penalize_with->presence));
    eout << "penalties:"
      << "\n  window_length: " << penalize_with->window_length
      << "\n  repetition: " << penalize_with->repetition
      << "\n  frequency: " << penalize_with->frequency
      << "\n  presence: " << penalize_with->presence
      << "\n";
  }
  if (const auto* temperature = std::get_if<AdjustViaKind::temperature>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_temp(*temperature));
    eout << "temperature: " << *temperature << "\n";
  }
  if (const auto* top_k = std::get_if<AdjustViaKind::top_k>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(*top_k));
    eout << "top_k: " << *top_k << "\n";
  }
  if (const auto* top_p = std::get_if<AdjustViaKind::top_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(*top_p, keep_one));
    eout << "top_p: " << *top_p << "\n";
  }
  if (const auto* typical_p = std::get_if<AdjustViaKind::typical_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_typical(*typical_p, keep_one));
    eout << "typical_p: " << *typical_p << "\n";
  }
  if (const auto* xtc = std::get_if<AdjustViaKind::xtc>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_xtc(xtc->probability, xtc->threshold, keep_one, seed));
    eout << "xtc: "
      << "\n  probability: " << xtc->probability
      << "\n  threshold: " << xtc->threshold
      << "\n";
  }
}

// Adds an adaptive-p sampler, configured from `adaptive_p` and `seed`,
// to the end of the chain `smpl`.
static
  void
adaptive_p_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdaptiveP& adaptive_p,
    unsigned seed)
{
  struct llama_sampler* const picker = llama_sampler_init_adaptive_p(
      adaptive_p.target,
      adaptive_p.decay,
      seed);
  llama_sampler_chain_add(smpl, picker);
}

// Adds a Mirostat sampler to the end of the chain `smpl`.
//
// Version 1 needs the vocabulary size and uses a fixed `m` of 100.
// Version 2 only needs the seed, tau, and eta.
// Any other version value adds nothing to the chain.
static
  void
mirostat_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::Mirostat& mirostat,
    unsigned seed,
    const rendezllama::Vocabulary& vocabulary)
{
  if (mirostat.version == 2) {
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_mirostat_v2(
            seed, mirostat.tau, mirostat.eta));
    return;
  }
  if (mirostat.version != 1) {
    return;
  }
  const int mirostat_m = 100;
  llama_sampler_chain_add(
      smpl,
      llama_sampler_init_mirostat(
          vocabulary.cardinality(), seed,
          mirostat.tau, mirostat.eta, mirostat_m));
}

// Chooses thread counts for inference as (thread_count, batch_thread_count).
//
// Nonzero values in `opt` are used as given. A zero `opt.thread_count`
// becomes half of the reported hardware concurrency (at least 1), except
// that x86 builds reporting 2 to 4 hardware threads use all of them.
// A zero `opt.batch_thread_count` becomes the reported hardware concurrency.
static
  std::tuple<unsigned, unsigned>
infer_thread_counts(const rendezllama::ChatOptions& opt)
{
  const unsigned hw_threads = std::thread::hardware_concurrency();

  unsigned decode_threads = opt.thread_count;
  if (decode_threads == 0) {
    const unsigned half = hw_threads / 2;
    decode_threads = (half == 0) ? 1 : half;
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
    if (hw_threads >= 2 && hw_threads <= 4) {
      decode_threads = hw_threads;
    }
#endif
  }

  unsigned batch_threads = opt.batch_thread_count;
  if (batch_threads == 0) {
    batch_threads = hw_threads;
  }
  return std::make_tuple(decode_threads, batch_threads);
}

  void
Inference::reinitialize(const ChatOptions& opt, const struct llama_model* model)
{
  fildesh::ofstream eout("/dev/stderr");

  const auto* sampling = std::get_if<rendezllama::inference::Sampling>(&opt.infer_via);
  assert(sampling);
  auto seed = sampling->seed;
  if (smpl_ || seed < 0) {
    // We're retrying or just don't have a fixed seed, so we should reseed.
    seed = new_sampling_seed();
  }
  std::tie(thread_count_, batch_thread_count_) = infer_thread_counts(opt);
  if (smpl_) {
    llama_sampler_free(smpl_);
    eout.open("/dev/null");
  }
  token_count_ = 0;
  auto smpl_param = llama_sampler_chain_default_params();
  smpl_ = llama_sampler_chain_init(smpl_param);

  for (const auto& adjust_via : sampling->adjust_thru) {
    apply_sampler_chain(smpl_, adjust_via, model, seed, eout);
  }

  if (std::get_if<rendezllama::inference::Probability>(&sampling->pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_dist(seed));
  }
  else if (std::get_if<rendezllama::inference::Determinism>(&sampling->pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());
  }
  else if (const auto* adaptive_p = std::get_if<rendezllama::inference::AdaptiveP>(&sampling->pick_via)) {
    adaptive_p_sample(smpl_, *adaptive_p, seed);
  }
  else if (const auto* mirostat = std::get_if<rendezllama::inference::Mirostat>(&sampling->pick_via)) {
    mirostat_sample(smpl_, *mirostat, seed, vocabulary_);
  }
  else {
    fildesh_log_error("Missing pick method? Using greedy.");
    llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());
  }
}

  bool
Inference::commit_to_context(
    struct llama_context* ctx,
    ChatDisplay& chat_disp,
    ChatTrajectory& chat_traj,
    const ChatOptions& opt,
    const llama_model* model)
{
  assert(!chat_traj.erased_since_eval_ ||
         chat_traj.context_token_count_ < chat_traj.token_count());
  if (chat_traj.erased_since_eval_ || !smpl_) {
    this->reinitialize(opt, model);
  }
  if (chat_traj.context_token_count_ == chat_traj.token_count()) {
    return true;
  }

  chat_traj.maybe_rollforget_within_limit(opt.context_token_limit, vocabulary_);

  // Reset thread count just in case the user reconfigured it.
  llama_set_n_threads(ctx, thread_count_, batch_thread_count_);

  // Clear KV cache past current position just in case the user deleted tokens.
  llama_memory_seq_rm(
      llama_get_memory(ctx),
      0, chat_traj.context_token_count_, -1);

  while (chat_traj.context_token_count_ < chat_traj.token_count()) {
    const unsigned n = std::min(
        opt.batch_count,
        chat_traj.token_count() - chat_traj.context_token_count_);

    chat_disp.show_new(chat_traj.context_token_count_ + n, chat_traj, vocabulary_);

    if (!batch_.token || (unsigned)batch_.n_tokens < n) {
      llama_batch_free(batch_);
      unsigned n_alloc = n;
      if (n_alloc < opt.batch_count) {n_alloc = opt.batch_count;}
      batch_ = llama_batch_init(n_alloc, /*embd=*/0, /*n_seq_max=*/1);
    }
    batch_.n_tokens = n;
    for (unsigned i = 0; i < n; ++i) {
      batch_.token[i] = chat_traj.tokens()[chat_traj.context_token_count_ + i];
      batch_.pos[i] = chat_traj.context_token_count_ + i;
      batch_.n_seq_id[i] = 1;
      batch_.seq_id[i][0] = 0;
      batch_.logits[i] = (i == n - 1);
    }

    const int istat = llama_decode(ctx, batch_);

    if (istat != 0) {
      fildesh_log_error("Failed to eval.");
      chat_traj.context_token_count_ = 0;
      return false;
    }
    else {
      chat_traj.context_token_count_ += n;
    }
  }
  assert(chat_traj.context_token_count_ == chat_traj.token_count());
  chat_traj.erased_since_eval_ = false;
  while (token_count_ < chat_traj.token_count()) {
    Vocabulary::Token_id token_id = chat_traj.token_at(token_count_);
    llama_sampler_accept(smpl_, token_id);
    token_count_ += 1;
  }
  return true;
}

  // Samples one token from the context's current logits and appends it to
  // the trajectory.
  //
  // When `preventing_newline` is set, the end-of-sequence and newline tokens
  // are excluded by forcing their logits to negative infinity. This edits the
  // logits buffer returned by llama_get_logits() in place. A logit of 0 would
  // not exclude them, since 0 is an ordinary finite score.
  //
  // The chosen token is pushed onto `chat_traj` and reported to the sampler
  // via llama_sampler_accept().
  void
Inference::sample_to_trajectory(
    ChatTrajectory& chat_traj,
    struct llama_context* ctx,
    bool preventing_newline)
{
  float* logits = llama_get_logits(ctx);
  if (preventing_newline) {
    // Zero probability for message-ending tokens when requested.
    const float never_pick = -std::numeric_limits<float>::infinity();
    logits[vocabulary_.eos_token_id()] = never_pick;
    logits[vocabulary_.newline_token_id()] = never_pick;
  }

  // Copy the logits into the candidate array that samplers operate on.
  std::vector<llama_token_data> candidates;
  candidates.resize(vocabulary_.cardinality());
  for (llama_token i = 0; i < (llama_token)candidates.size(); ++i) {
    candidates[i] = llama_token_data{
      i, logits[i], 0.0f,
    };
  }
  logits = nullptr;
  llama_token_data_array candidates_data[1] = {{
    candidates.data(),
    candidates.size(),
    /*selected=*/0,
    /*sorted=*/false,
  }};
  llama_sampler_apply(smpl_, candidates_data);
  chat_traj.push_back(candidates[candidates_data->selected].id);
  llama_sampler_accept(smpl_, chat_traj.token());
  token_count_ += 1;
}

1	#include "src/language/inference.hh"
2
3	#include <algorithm>
4	#include <cassert>
5	#include <cstring>
6	#include <stdexcept>
7	#include <thread>
8	#include <vector>
9
10	#include <fildesh/fildesh.h>
11	#include <fildesh/ostream.hh>
12
13	#include "src/chat/display.hh"
14	#include "src/chat/guide.hh"
15	#include "src/chat/opt.hh"
16	#include "src/chat/trajectory.hh"
17	#include "src/language/vocabulary.hh"
18
19	using rendezllama::ChatDisplay;
20	using rendezllama::ChatGuide;
21	using rendezllama::ChatOptions;
22	using rendezllama::ChatTrajectory;
23	using rendezllama::Inference;
24	using rendezllama::Vocabulary;
25	using rendezllama::inference::AdjustViaKind;
26
27	Inference::Inference(const Vocabulary& vocabulary)	2✔
28	: vocabulary_(vocabulary)	2✔
29	{}	2✔
30	Inference::~Inference() {	2✔
31	if (smpl_) {llama_sampler_free(smpl_);}	2✔
32	llama_batch_free(batch_);	2✔
33	}	2✔
34
35	const std::string&
36	rendezllama::antiprompt_suffix(	5✔
37	std::string_view text,
38	const std::set<std::string>& antiprompts)
39	{
40	static const std::string empty_string;	5✔
41	for (const std::string& s : antiprompts) {	11✔
42	if (text.size() >= s.size()) {	9✔
43	const size_t offset = text.size() - s.size();	6✔
44	if (0 == memcmp(&text[offset], &s[0], s.size())) {	6✔
45	return s;	3✔
46	}
47	}
48	}
49	return empty_string;	2✔
50	}
51
52	static bool maybe_trim_endspace(std::string& s)	×
53	{
54	bool result = false;	×
55	while (!s.empty() && s.back() == ' ') {	×
56	s.pop_back();	×
57	result = true;	×
58	}
59	return result;	×
60	}
61
62	void
63	rendezllama::augment_tokenize_chat_input(	×
64	ChatGuide& chat_guide,
65	ChatTrajectory& chat_traj,
66	bool& prevent_subsequent_newline,
67	std::string s,
68	const Vocabulary& vocabulary,
69	const ChatOptions& opt)
70	{
71	prevent_subsequent_newline = false;	×
72	if (s.size() >= 2 && s[0] == '\\' && s[1] == 'n') {	×
73	chat_guide.end_turn();	×
74	chat_guide.begin_turn(opt.message_opts.size()-1);	×
75	s.erase(0, 2);	×
76	prevent_subsequent_newline = maybe_trim_endspace(s);	×
77	if (opt.message_opts.back().prefix.back() == '\n' && opt.linespace_on) {	×
78	if (!s.empty() && s.front() != ' ') {	×
79	s.insert(0, " ");	×
80	}
81	}
82	chat_traj.tokenize_append(s, vocabulary);	×
83	}
84	else if (s.front() == '\n') {	×
85	// This is from /yield.
86	chat_guide.yield_turn(s.substr(1));	×
87	}
88	else if (s.front() == ' ') {	×
89	prevent_subsequent_newline = maybe_trim_endspace(s);	×
90	chat_traj.tokenize_append(s, vocabulary);	×
91	}
92	else {
93	chat_guide.yield_turn(0);	×
94	if (opt.message_opts[0].prefix.back() == '\n' && opt.linespace_on) {	×
95	if (!s.empty() && s.front() != ' ') {	×
96	s.insert(0, " ");	×
97	}
98	}
99	chat_traj.tokenize_append(s, vocabulary);	×
100	chat_guide.yield_turn();	×
101	chat_traj.display_token_count_ = chat_traj.rfind_message_prefix_begin_at(	×
102	chat_traj.token_count()-1);	×
103	prevent_subsequent_newline = true;	×
104	}
105	}	×
106
107	std::tuple<struct llama_model*, struct llama_context*>
108	rendezllama::make_llama_context(rendezllama::ChatOptions& opt)	2✔
109	{
110	llama_model_params model_params = llama_model_default_params();	2✔
111	model_params.use_mlock = opt.mlock_on;	2✔
112	model_params.use_mmap = opt.mmap_on;	2✔
113
114	struct llama_model* model = llama_model_load_from_file(	2✔
115	opt.model_filename.c_str(), model_params);	2✔
116	if (!model) {	2✔
117	fildesh_log_error("Failed to open model.");	×
118	return std::make_tuple(nullptr, nullptr);	×
119	}
120
121	if (opt.model_token_limit == 0) {	2✔
122	opt.model_token_limit = llama_model_n_ctx_train(model);	2✔
123	}
124	if (opt.context_token_limit == 0) {	2✔
125	opt.context_token_limit = opt.model_token_limit;	1✔
126	}
127	float rope_freq_scale = llama_model_rope_freq_scale_train(model);	2✔
128	if (rope_freq_scale <= 0.0) {	2✔
129	rope_freq_scale = 1.0f;	×
130	}
131	while (
132	(unsigned)(opt.model_token_limit / rope_freq_scale)	2✔
133	<	2✔
134	opt.context_token_limit)	2✔
135	{
136	rope_freq_scale /= 2;	×
137	}
138	llama_model_free(model);	2✔
139	model = nullptr;	2✔
140
141
142	model_params = llama_model_default_params();	2✔
143	model_params.use_mlock = opt.mlock_on;	2✔
144	model_params.use_mmap = opt.mmap_on;	2✔
145
146	llama_context_params ctx_params = llama_context_default_params();	2✔
147	ctx_params.n_ctx = opt.context_token_limit;	2✔
148	ctx_params.n_batch = opt.batch_count;	2✔
149	ctx_params.rope_freq_scale = rope_freq_scale;	2✔
150
151	std::vector<float> tensor_split(llama_max_devices());	2✔
152	std::vector<llama_model_tensor_buft_override> tensor_buft_overrides(llama_max_tensor_buft_overrides());	2✔
153	std::vector<size_t> margins(llama_max_devices(), 0);	2✔
154
155	// Auto-tune parameters if possible (and not manually overridden by user yet).
156	// This helps avoid OOM crashes on Vulkan/GPU by fitting layers to available memory.
157	auto status = llama_params_fit(	2✔
158	opt.model_filename.c_str(),
159	&model_params,
160	&ctx_params,
161	tensor_split.data(),
162	tensor_buft_overrides.data(),
163	margins.data(),
164	/*n_ctx_min=*/0,
165	GGML_LOG_LEVEL_ERROR);
166
167	if (status != 0) {	2✔
168	fildesh_log_warning("llama_params_fit failed");	×
169	}
170
171	model = llama_model_load_from_file(	2✔
172	opt.model_filename.c_str(), model_params);
173	if (!model) {	2✔
174	fildesh_log_error("Failed to open model.");	×
175	return std::make_tuple(nullptr, nullptr);	×
176	}
177
178	struct llama_context* ctx = llama_init_from_model(model, ctx_params);	2✔
179	if (!ctx) {	2✔
180	llama_model_free(model);	×
181	fildesh_log_error("Failed to create context.");	×
182	return std::make_tuple(nullptr, nullptr);	×
183	}
184	return std::make_tuple(model, ctx);	2✔
185	}	2✔
186
187	static
188	int
189	new_sampling_seed()	2✔
190	{
191	return static_cast<int>(INT_MAX & time(NULL));	2✔
192	}
193
194	static
195	void
196	apply_sampler_chain(	3✔
197	struct llama_sampler* smpl,
198	const rendezllama::inference::AdjustVia& adjust_via,
199	const struct llama_model* model,
200	unsigned seed,
201	std::ostream& eout)
202	{
203	const unsigned keep_one = 1;	3✔
204
205	if (const auto* dry = std::get_if<AdjustViaKind::dry>(&adjust_via)) {	3✔
206	static const char* seq_breakers[] = {	×
207	"\n", ":",
208	};
209	llama_sampler_init_dry(	×
210	llama_model_get_vocab(model),
211	llama_model_n_ctx_train(model),
212	dry->multiplier,	×
213	dry->base,	×
214	dry->allowed_length,	×
215	dry->window_length,	×
216	seq_breakers,
217	sizeof(seq_breakers)/sizeof(*seq_breakers));
218	eout << "dry:"	×
219	<< "\n multiplier: " << dry->multiplier	×
220	<< "\n base: " << dry->base	×
221	<< "\n allowed_length: " << dry->allowed_length	×
222	<< "\n window_length: " << dry->window_length	×
223	<< "\n";	×
224	}
225	if (const auto* min_p = std::get_if<AdjustViaKind::min_p>(&adjust_via)) {	3✔
226	llama_sampler_chain_add(smpl, llama_sampler_init_min_p(*min_p, keep_one));	1✔
227	eout << "min_p: " << *min_p << "\n";	1✔
228	}
229	if (const auto* penalize_with = std::get_if<AdjustViaKind::penalize_with>(&adjust_via)) {	3✔
230	llama_sampler_init_penalties(	×
231	penalize_with->window_length,	×
232	penalize_with->repetition,	×
233	penalize_with->frequency,	×
234	penalize_with->presence);	×
235	eout << "penalties:"	×
236	<< "\n window_length: " << penalize_with->window_length	×
237	<< "\n repetition: " << penalize_with->repetition	×
238	<< "\n frequency: " << penalize_with->frequency	×
239	<< "\n presence: " << penalize_with->presence	×
240	<< "\n";	×
241	}
242	if (const auto* temperature = std::get_if<AdjustViaKind::temperature>(&adjust_via)) {	3✔
243	llama_sampler_chain_add(smpl, llama_sampler_init_temp(*temperature));	2✔
244	eout << "temperature: " << *temperature << "\n";	2✔
245	}
246	if (const auto* top_k = std::get_if<AdjustViaKind::top_k>(&adjust_via)) {	3✔
247	llama_sampler_chain_add(smpl, llama_sampler_init_top_k(*top_k));	×
248	eout << "top_k: " << *top_k << "\n";	×
249	}
250	if (const auto* top_p = std::get_if<AdjustViaKind::top_p>(&adjust_via)) {	3✔
251	llama_sampler_chain_add(smpl, llama_sampler_init_top_p(*top_p, keep_one));	×
252	eout << "top_p: " << *top_p << "\n";	×
253	}
254	if (const auto* typical_p = std::get_if<AdjustViaKind::typical_p>(&adjust_via)) {	3✔
255	llama_sampler_chain_add(smpl, llama_sampler_init_typical(*typical_p, keep_one));	×
256	eout << "typical_p: " << *typical_p << "\n";	×
257	}
258	if (const auto* xtc = std::get_if<AdjustViaKind::xtc>(&adjust_via)) {	3✔
259	llama_sampler_chain_add(smpl, llama_sampler_init_xtc(xtc->probability, xtc->threshold, keep_one, seed));	×
260	eout << "xtc: "	×
261	<< "\n probability: " << xtc->probability	×
262	<< "\n threshold: " << xtc->threshold	×
263	<< "\n";	×
264	}
265	}	3✔
266
267	static
268	void
269	adaptive_p_sample(	×
270	struct llama_sampler* smpl,
271	const rendezllama::inference::AdaptiveP& adaptive_p,
272	unsigned seed)
273	{
274	llama_sampler_chain_add(	×
275	smpl,
276	llama_sampler_init_adaptive_p(
277	adaptive_p.target,	×
278	adaptive_p.decay,	×
279	seed));
280	}	×
281
282	static
283	void
284	mirostat_sample(	×
285	struct llama_sampler* smpl,
286	const rendezllama::inference::Mirostat& mirostat,
287	unsigned seed,
288	const rendezllama::Vocabulary& vocabulary)
289	{
290	if (mirostat.version == 1) {	×
291	const int mirostat_m = 100;	×
292	llama_sampler_chain_add(	×
293	smpl,
294	llama_sampler_init_mirostat(
295	vocabulary.cardinality(), seed,	×
296	mirostat.tau, mirostat.eta, mirostat_m));	×
297	}
298	else if (mirostat.version == 2) {	×
299	llama_sampler_chain_add(	×
300	smpl,
301	llama_sampler_init_mirostat_v2(
302	seed, mirostat.tau, mirostat.eta));	×
303	}
304	}	×
305
306	static
307	std::tuple<unsigned, unsigned>
308	infer_thread_counts(const rendezllama::ChatOptions& opt)	2✔
309	{
310	unsigned thread_count = opt.thread_count;	2✔
311	unsigned batch_thread_count = opt.batch_thread_count;	2✔
312	const unsigned n = std::thread::hardware_concurrency();	2✔
313	if (thread_count == 0) {	2✔
314	thread_count = n / 2;	2✔
315	if (thread_count == 0) {	2✔
316	thread_count = 1;	×
317	}
318	#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
319	if (2 <= n && n <= 4) {	2✔
320	thread_count = n;	2✔
321	}
322	#endif
323	}
324	if (batch_thread_count == 0) {	2✔
325	batch_thread_count = n;	2✔
326	}
327	return std::make_tuple(thread_count, batch_thread_count);	2✔
328	}
329
330	void
331	Inference::reinitialize(const ChatOptions& opt, const struct llama_model* model)	2✔
332	{
333	fildesh::ofstream eout("/dev/stderr");	2✔
334
335	const auto* sampling = std::get_if<rendezllama::inference::Sampling>(&opt.infer_via);	2✔
336	assert(sampling);	×
337	auto seed = sampling->seed;	2✔
338	if (smpl_ || seed < 0) {	2✔
339	// We're retrying or just don't have a fixed seed, so we should reseed.
340	seed = new_sampling_seed();	2✔
341	}
342	std::tie(thread_count_, batch_thread_count_) = infer_thread_counts(opt);	2✔
343	if (smpl_) {	2✔
344	llama_sampler_free(smpl_);	×
345	eout.open("/dev/null");	×
346	}
347	token_count_ = 0;	2✔
348	auto smpl_param = llama_sampler_chain_default_params();	2✔
349	smpl_ = llama_sampler_chain_init(smpl_param);	2✔
350
351	for (const auto& adjust_via : sampling->adjust_thru) {	5✔
352	apply_sampler_chain(smpl_, adjust_via, model, seed, eout);	3✔
353	}
354
355	if (std::get_if<rendezllama::inference::Probability>(&sampling->pick_via)) {	2✔
356	llama_sampler_chain_add(smpl_, llama_sampler_init_dist(seed));	×
357	}
358	else if (std::get_if<rendezllama::inference::Determinism>(&sampling->pick_via)) {	2✔
359	llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());	1✔
360	}
361	else if (const auto* adaptive_p = std::get_if<rendezllama::inference::AdaptiveP>(&sampling->pick_via)) {	1✔
362	adaptive_p_sample(smpl_, *adaptive_p, seed);	×
363	}
364	else if (const auto* mirostat = std::get_if<rendezllama::inference::Mirostat>(&sampling->pick_via)) {	1✔
365	mirostat_sample(smpl_, *mirostat, seed, vocabulary_);	×
366	}
367	else {
368	fildesh_log_error("Missing pick method? Using greedy.");	1✔
369	llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());	1✔
370	}
371	}	2✔
372
373	bool
374	Inference::commit_to_context(	21✔
375	struct llama_context* ctx,
376	ChatDisplay& chat_disp,
377	ChatTrajectory& chat_traj,
378	const ChatOptions& opt,
379	const llama_model* model)
380	{
381	assert(!chat_traj.erased_since_eval_ ||	21✔
382	chat_traj.context_token_count_ < chat_traj.token_count());
383	if (chat_traj.erased_since_eval_ || !smpl_) {	21✔
384	this->reinitialize(opt, model);	2✔
385	}
386	if (chat_traj.context_token_count_ == chat_traj.token_count()) {	21✔
387	return true;
388	}
389
390	chat_traj.maybe_rollforget_within_limit(opt.context_token_limit, vocabulary_);	21✔
391
392	// Reset thread count just in case the user reconfigured it.
393	llama_set_n_threads(ctx, thread_count_, batch_thread_count_);	21✔
394
395	// Clear KV cache past current position just in case the user deleted tokens.
396	llama_memory_seq_rm(	42✔
397	llama_get_memory(ctx),
398	0, chat_traj.context_token_count_, -1);	21✔
399
400	while (chat_traj.context_token_count_ < chat_traj.token_count()) {	63✔
401	const unsigned n = std::min(	21✔
402	opt.batch_count,	21✔
403	chat_traj.token_count() - chat_traj.context_token_count_);	21✔
404
405	chat_disp.show_new(chat_traj.context_token_count_ + n, chat_traj, vocabulary_);	21✔
406
407	if (!batch_.token || (unsigned)batch_.n_tokens < n) {	21✔
408	llama_batch_free(batch_);	2✔
409	unsigned n_alloc = n;	2✔
410	if (n_alloc < opt.batch_count) {n_alloc = opt.batch_count;}	2✔
411	batch_ = llama_batch_init(n_alloc, /*embd=*/0, /*n_seq_max=*/1);	2✔
412	}
413	batch_.n_tokens = n;	21✔
414	for (unsigned i = 0; i < n; ++i) {	59✔
415	batch_.token[i] = chat_traj.tokens()[chat_traj.context_token_count_ + i];	38✔
416	batch_.pos[i] = chat_traj.context_token_count_ + i;	38✔
417	batch_.n_seq_id[i] = 1;	38✔
418	batch_.seq_id[i][0] = 0;	38✔
419	batch_.logits[i] = (i == n - 1);	38✔
420	}
421
422	const int istat = llama_decode(ctx, batch_);	21✔
423
424	if (istat != 0) {	21✔
425	fildesh_log_error("Failed to eval.");	×
426	chat_traj.context_token_count_ = 0;	×
427	return false;	×
428	}
429	else {
430	chat_traj.context_token_count_ += n;	21✔
431	}
432	}
433	assert(chat_traj.context_token_count_ == chat_traj.token_count());	21✔
434	chat_traj.erased_since_eval_ = false;	21✔
435	while (token_count_ < chat_traj.token_count()) {	40✔
436	Vocabulary::Token_id token_id = chat_traj.token_at(token_count_);	19✔
437	llama_sampler_accept(smpl_, token_id);	19✔
438	token_count_ += 1;	19✔
439	}
440	return true;
441	}
442
443	void
444	Inference::sample_to_trajectory(	21✔
445	ChatTrajectory& chat_traj,
446	struct llama_context* ctx,
447	bool preventing_newline)
448	{
449	float* logits = llama_get_logits(ctx);	21✔
450	if (preventing_newline) {	21✔
451	// Zero probability for message-ending tokens when requested.
452	logits[vocabulary_.eos_token_id()] = 0;	×
453	logits[vocabulary_.newline_token_id()] = 0;	×
454	}
455
456	std::vector<llama_token_data> candidates;	21✔
457	candidates.resize(vocabulary_.cardinality());	21✔
458	for (llama_token i = 0; i < (llama_token)candidates.size(); ++i) {	350,229✔
459	candidates[i] = llama_token_data{	350,208✔
460	i, logits[i], 0.0f,	350,208✔
461	};
462	}
463	logits = NULL;	21✔
464	llama_token_data_array candidates_data[1] = {{	21✔
465	candidates.data(),	21✔
466	candidates.size(),	21✔
467	/*selected=*/0,
468	/*sorted=*/false,
469	}};	21✔
470	llama_sampler_apply(smpl_, candidates_data);	21✔
471	chat_traj.push_back(candidates[candidates_data->selected].id);	21✔
472	llama_sampler_accept(smpl_, chat_traj.token());	21✔
473	token_count_ += 1;	21✔
474	}	21✔

rendezqueue / rendezllama / 21365024824

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous