21341853687

Committed 25 Jan 2026 11:58PM UTC coverage: 88.75% (+0.5%) from 88.225%

Build # 21341853687

Build Type

push

github

Committed by

grencez

Commit Message

feat(eg): assistant_cli

Run Details

110 of 121 new or added lines in 4 files covered. (90.91%)

58 existing lines in 3 files now uncovered.

2130 of 2400 relevant lines covered (88.75%)

523.62 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

59.09

/src/language/inference.cc

#include "src/language/inference.hh"

#include <algorithm>
#include <cassert>
#include <cstring>
#include <ctime>
#include <limits>
#include <random>
#include <thread>

#include <fildesh/fildesh.h>
#include <fildesh/ostream.hh>

#include "src/chat/display.hh"
#include "src/chat/guide.hh"
#include "src/chat/opt.hh"
#include "src/chat/trajectory.hh"
#include "src/language/vocabulary.hh"

using rendezllama::ChatDisplay;
using rendezllama::ChatGuide;
using rendezllama::ChatOptions;
using rendezllama::ChatTrajectory;
using rendezllama::Inference;
using rendezllama::Vocabulary;
using rendezllama::inference::AdjustViaKind;

// Initializes vocabulary_ from the given vocabulary.
// No sampler chain is built here; commit_to_context() calls reinitialize()
// when it finds that none exists yet.
Inference::Inference(const Vocabulary& vocabulary)
  : vocabulary_(vocabulary)
{
}
// Frees the sampler chain, if one was ever created, and then the decode batch.
Inference::~Inference()
{
  if (smpl_ != nullptr) {
    llama_sampler_free(smpl_);
  }
  llama_batch_free(batch_);
}

  // Finds the antiprompt that `text` ends with.
  //
  // Antiprompts are tried in the set's iteration order and the first one that
  // is a suffix of `text` wins. Empty antiprompts are skipped: they would
  // trivially match any text, and because they come first in the set's
  // iteration order, they would hide every real match while looking exactly
  // like "no match" to the caller.
  //
  // Returns a reference to the matching element of `antiprompts` (valid only
  // as long as that element lives), or a reference to a static empty string
  // when nothing matches.
  const std::string&
rendezllama::antiprompt_suffix(
    std::string_view text,
    const std::set<std::string>& antiprompts)
{
  static const std::string empty_string;
  for (const std::string& s : antiprompts) {
    if (s.empty() || s.size() > text.size()) {
      continue;
    }
    // substr() never indexes past the end, unlike &text[offset].
    const std::string_view tail = text.substr(text.size() - s.size());
    if (tail == std::string_view(s)) {
      return s;
    }
  }
  return empty_string;
}

// Removes every trailing ' ' character from `s` in place.
// Returns true when at least one space was removed.
static bool maybe_trim_endspace(std::string& s)
{
  // find_last_not_of() yields npos for an empty or all-space string;
  // npos + 1 wraps around to 0, which means "keep nothing".
  const std::string::size_type keep = s.find_last_not_of(' ') + 1;
  const bool trimmed = (keep != s.size());
  s.erase(keep);
  return trimmed;
}

  // Interprets one piece of chat input `s` and applies it to the chat state.
  //
  // The leading characters of `s` select what happens:
  //   - a literal backslash followed by 'n': end the current turn, begin a
  //     turn for the last entry of opt.message_opts, and append the rest.
  //   - a newline character: yield the turn to the name after it (from /yield).
  //   - a space: append the text to the trajectory as-is (minus end spaces).
  //   - anything else (including empty input): yield to message_opts[0],
  //     append the text, yield again, and move the display position back to
  //     where the last message prefix begins.
  //
  // `prevent_subsequent_newline` is an output: it is reset to false on entry
  // and set according to the branch taken (true when trailing spaces were
  // trimmed in the appending branches, always true in the default branch).
  //
  // Empty `s` and empty message prefixes are checked explicitly so that
  // front()/back() are never called on an empty string.
  void
rendezllama::augment_tokenize_chat_input(
    ChatGuide& chat_guide,
    ChatTrajectory& chat_traj,
    bool& prevent_subsequent_newline,
    std::string s,
    const Vocabulary& vocabulary,
    const ChatOptions& opt)
{
  // When linespace is on and the message prefix ends in a newline,
  // make sure non-empty text starts with a space.
  const auto apply_linespace = [&opt](std::string& text, const auto& prefix) {
    if (!opt.linespace_on || prefix.empty() || prefix.back() != '\n') {
      return;
    }
    if (!text.empty() && text.front() != ' ') {
      text.insert(0, " ");
    }
  };

  // '\0' stands in for "no first character" so that empty input falls
  // through to the default branch without calling front() on it.
  const char first = s.empty() ? '\0' : s.front();

  prevent_subsequent_newline = false;
  if (s.size() >= 2 && s[0] == '\\' && s[1] == 'n') {
    chat_guide.end_turn();
    chat_guide.begin_turn(opt.message_opts.size()-1);
    s.erase(0, 2);
    prevent_subsequent_newline = maybe_trim_endspace(s);
    apply_linespace(s, opt.message_opts.back().prefix);
    chat_traj.tokenize_append(s, vocabulary);
  }
  else if (first == '\n') {
    // This is from /yield.
    chat_guide.yield_turn(s.substr(1));
  }
  else if (first == ' ') {
    prevent_subsequent_newline = maybe_trim_endspace(s);
    chat_traj.tokenize_append(s, vocabulary);
  }
  else {
    chat_guide.yield_turn(0);
    apply_linespace(s, opt.message_opts[0].prefix);
    chat_traj.tokenize_append(s, vocabulary);
    chat_guide.yield_turn();
    chat_traj.display_token_count_ = chat_traj.rfind_message_prefix_begin_at(
        chat_traj.token_count()-1);
    prevent_subsequent_newline = true;
  }
}

  // Loads the model named by opt.model_filename and creates a context for it.
  //
  // Side effects on `opt`: a zero model_token_limit is replaced by the value
  // of llama_model_n_ctx_train(), and a zero context_token_limit is replaced
  // by the (possibly just-filled) model_token_limit.
  //
  // Returns {model, ctx} on success. On failure an error is logged and
  // {nullptr, nullptr} is returned; a model that was loaded before the
  // context creation failed is freed first.
  std::tuple<struct llama_model*, struct llama_context*>
rendezllama::make_llama_context(rendezllama::ChatOptions& opt)
{
  llama_model_params model_params = llama_model_default_params();
  model_params.use_mlock = opt.mlock_on;
  model_params.use_mmap = opt.mmap_on;

  struct llama_model* model = llama_model_load_from_file(
      opt.model_filename.c_str(), model_params);
  if (!model) {
    fildesh_log_error("Failed to open model.");
    return std::make_tuple(nullptr, nullptr);
  }

  if (opt.model_token_limit == 0) {
    opt.model_token_limit = llama_model_n_ctx_train(model);
  }
  if (opt.context_token_limit == 0) {
    opt.context_token_limit = opt.model_token_limit;
  }

  llama_context_params ctx_params = llama_context_default_params();
  ctx_params.n_ctx = opt.context_token_limit;
  ctx_params.n_batch = opt.batch_count;
  ctx_params.rope_freq_scale = llama_model_rope_freq_scale_train(model);
  assert(ctx_params.rope_freq_scale > 0.0);
  // Halve the RoPE frequency scale until model_token_limit divided by it
  // is at least the requested context size.
  while (
      static_cast<unsigned>(opt.model_token_limit / ctx_params.rope_freq_scale)
      <
      opt.context_token_limit)
  {
    ctx_params.rope_freq_scale /= 2;
  }

  struct llama_context* ctx = llama_init_from_model(model, ctx_params);
  if (!ctx) {
    llama_model_free(model);
    fildesh_log_error("Failed to create context.");
    return std::make_tuple(nullptr, nullptr);
  }
  return std::make_tuple(model, ctx);
}

// Produces a fresh non-negative sampling seed.
//
// The wall clock alone only changes once per second, so two reseeds within
// the same second would repeat the same seed. The clock is therefore mixed
// with a value from std::random_device. The result is masked to the
// non-negative range of int.
static
  int
new_sampling_seed()
{
  const unsigned entropy = std::random_device{}();
  const unsigned clock_bits = static_cast<unsigned>(std::time(nullptr));
  const unsigned nonnegative_mask =
    static_cast<unsigned>(std::numeric_limits<int>::max());
  return static_cast<int>((entropy ^ clock_bits) & nonnegative_mask);
}

// Appends the sampler for one logit-adjustment step to the chain `smpl`
// and writes a description of it to `eout`.
//
// `adjust_via` holds exactly one kind of adjustment; the matching branch
// builds the corresponding llama sampler and adds it to the chain.
// `model` is consulted only by the DRY sampler (vocabulary and training
// context length). `seed` is consulted only by the XTC sampler.
//
// Every sampler that is constructed is handed to llama_sampler_chain_add();
// a sampler that is built but never added would neither take effect nor be
// freed with the chain.
static
  void
apply_sampler_chain(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdjustVia& adjust_via,
    const struct llama_model* model,
    unsigned seed,
    std::ostream& eout)
{
  const unsigned keep_one = 1;

  if (const auto* dry = std::get_if<AdjustViaKind::dry>(&adjust_via)) {
    static const char* seq_breakers[] = {
      "\n", ":",
    };
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_dry(
            llama_model_get_vocab(model),
            llama_model_n_ctx_train(model),
            dry->multiplier,
            dry->base,
            dry->allowed_length,
            dry->window_length,
            seq_breakers,
            sizeof(seq_breakers)/sizeof(*seq_breakers)));
    eout << "dry:"
      << "\n  multiplier: " << dry->multiplier
      << "\n  base: " << dry->base
      << "\n  allowed_length: " << dry->allowed_length
      << "\n  window_length: " << dry->window_length
      << "\n";
  }
  if (const auto* min_p = std::get_if<AdjustViaKind::min_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(*min_p, keep_one));
    eout << "min_p: " << *min_p << "\n";
  }
  if (const auto* penalize_with = std::get_if<AdjustViaKind::penalize_with>(&adjust_via)) {
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_penalties(
            penalize_with->window_length,
            penalize_with->repetition,
            penalize_with->frequency,
            penalize_with->presence));
    eout << "penalties:"
      << "\n  window_length: " << penalize_with->window_length
      << "\n  repetition: " << penalize_with->repetition
      << "\n  frequency: " << penalize_with->frequency
      << "\n  presence: " << penalize_with->presence
      << "\n";
  }
  if (const auto* temperature = std::get_if<AdjustViaKind::temperature>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_temp(*temperature));
    eout << "temperature: " << *temperature << "\n";
  }
  if (const auto* top_k = std::get_if<AdjustViaKind::top_k>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(*top_k));
    eout << "top_k: " << *top_k << "\n";
  }
  if (const auto* top_p = std::get_if<AdjustViaKind::top_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(*top_p, keep_one));
    eout << "top_p: " << *top_p << "\n";
  }
  if (const auto* typical_p = std::get_if<AdjustViaKind::typical_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_typical(*typical_p, keep_one));
    eout << "typical_p: " << *typical_p << "\n";
  }
  if (const auto* xtc = std::get_if<AdjustViaKind::xtc>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_xtc(xtc->probability, xtc->threshold, keep_one, seed));
    eout << "xtc: "
      << "\n  probability: " << xtc->probability
      << "\n  threshold: " << xtc->threshold
      << "\n";
  }
}

// Adds an adaptive-p sampler, configured with the given target, decay and
// seed, to the end of the chain `smpl`.
static
  void
adaptive_p_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdaptiveP& adaptive_p,
    unsigned seed)
{
  struct llama_sampler* const picker = llama_sampler_init_adaptive_p(
      adaptive_p.target, adaptive_p.decay, seed);
  llama_sampler_chain_add(smpl, picker);
}

// Adds a Mirostat sampler of the requested version to the chain `smpl`.
//
// Version 1 additionally needs the vocabulary size and uses a fixed
// `m` of 100; version 2 needs only seed, tau and eta.
// Any other version adds nothing to the chain.
static
  void
mirostat_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::Mirostat& mirostat,
    unsigned seed,
    const rendezllama::Vocabulary& vocabulary)
{
  if (mirostat.version == 2) {
    struct llama_sampler* const picker = llama_sampler_init_mirostat_v2(
        seed, mirostat.tau, mirostat.eta);
    llama_sampler_chain_add(smpl, picker);
    return;
  }
  if (mirostat.version != 1) {
    return;
  }
  const int mirostat_m = 100;
  struct llama_sampler* const picker = llama_sampler_init_mirostat(
      vocabulary.cardinality(), seed,
      mirostat.tau, mirostat.eta, mirostat_m);
  llama_sampler_chain_add(smpl, picker);
}

// Chooses the thread counts for generation and for batch evaluation.
//
// Non-zero values in `opt` are used as given. A zero thread_count becomes
// half the hardware concurrency (at least 1), except that on x86 targets
// with 2 to 4 hardware threads all of them are used. A zero
// batch_thread_count becomes the hardware concurrency.
//
// std::thread::hardware_concurrency() may return 0 when the value cannot be
// determined, so both results are kept at 1 or more in that case.
//
// Returns {thread_count, batch_thread_count}.
static
  std::tuple<unsigned, unsigned>
infer_thread_counts(const rendezllama::ChatOptions& opt)
{
  unsigned thread_count = opt.thread_count;
  unsigned batch_thread_count = opt.batch_thread_count;
  const unsigned n = std::thread::hardware_concurrency();
  if (thread_count == 0) {
    thread_count = n / 2;
    if (thread_count == 0) {
      thread_count = 1;
    }
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
    if (2 <= n && n <= 4) {
      thread_count = n;
    }
#endif
  }
  if (batch_thread_count == 0) {
    batch_thread_count = (n > 0 ? n : 1);
  }
  return std::make_tuple(thread_count, batch_thread_count);
}

  // Rebuilds the sampler chain from the sampling options in `opt`.
  //
  // A new seed is drawn when a chain already exists (this is a retry) or
  // when the configured seed is negative. On a retry the old chain is freed
  // and the per-sampler description goes to /dev/null instead of stderr.
  // Thread counts are recomputed and the accepted-token counter is reset.
  // The chain ends with the configured pick method; greedy picking is used,
  // with an error logged, when none of the known methods is set.
  void
Inference::reinitialize(const ChatOptions& opt, const struct llama_model* model)
{
  fildesh::ofstream eout("/dev/stderr");

  const auto* sampling = std::get_if<rendezllama::inference::Sampling>(&opt.infer_via);
  assert(sampling);

  const bool retrying = (smpl_ != nullptr);
  auto seed = sampling->seed;
  if (retrying || seed < 0) {
    // We're retrying or just don't have a fixed seed, so we should reseed.
    seed = new_sampling_seed();
  }
  std::tie(thread_count_, batch_thread_count_) = infer_thread_counts(opt);
  if (retrying) {
    llama_sampler_free(smpl_);
    eout.open("/dev/null");
  }
  token_count_ = 0;
  smpl_ = llama_sampler_chain_init(llama_sampler_chain_default_params());

  // Adjustment samplers first, in configured order.
  for (const auto& adjust_via : sampling->adjust_thru) {
    apply_sampler_chain(smpl_, adjust_via, model, seed, eout);
  }

  // Then exactly one picking sampler.
  const auto& pick_via = sampling->pick_via;
  if (std::get_if<rendezllama::inference::Probability>(&pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_dist(seed));
    return;
  }
  if (std::get_if<rendezllama::inference::Determinism>(&pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());
    return;
  }
  if (const auto* adaptive_p = std::get_if<rendezllama::inference::AdaptiveP>(&pick_via)) {
    adaptive_p_sample(smpl_, *adaptive_p, seed);
    return;
  }
  if (const auto* mirostat = std::get_if<rendezllama::inference::Mirostat>(&pick_via)) {
    mirostat_sample(smpl_, *mirostat, seed, vocabulary_);
    return;
  }
  fildesh_log_error("Missing pick method? Using greedy.");
  llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());
}

  // Evaluates every trajectory token that the llama context has not seen yet.
  //
  // Steps:
  //   1. Rebuild the sampler chain if tokens were erased since the last
  //      evaluation or if no chain exists yet.
  //   2. Return early when the context is already caught up.
  //   3. Let the trajectory drop old tokens to fit opt.context_token_limit.
  //   4. Decode the remaining tokens in batches of at most opt.batch_count,
  //      showing each batch via `chat_disp` before it is decoded.
  //   5. Feed the sampler every trajectory token it has not accepted yet.
  //
  // Returns true on success (including the early return). Returns false when
  // llama_decode() reports a non-zero status; in that case
  // chat_traj.context_token_count_ is reset to 0.
  bool
Inference::commit_to_context(
    struct llama_context* ctx,
    ChatDisplay& chat_disp,
    ChatTrajectory& chat_traj,
    const ChatOptions& opt,
    const llama_model* model)
{
  // If tokens were erased, there must be something left to re-evaluate.
  assert(!chat_traj.erased_since_eval_ ||
         chat_traj.context_token_count_ < chat_traj.token_count());
  if (chat_traj.erased_since_eval_ || !smpl_) {
    this->reinitialize(opt, model);
  }
  // Nothing new to evaluate.
  if (chat_traj.context_token_count_ == chat_traj.token_count()) {
    return true;
  }

  chat_traj.maybe_rollforget_within_limit(opt.context_token_limit, vocabulary_);

  // Reset thread count just in case the user reconfigured it.
  llama_set_n_threads(ctx, thread_count_, batch_thread_count_);

  // Clear KV cache past current position just in case the user deleted tokens.
  llama_memory_seq_rm(
      llama_get_memory(ctx),
      0, chat_traj.context_token_count_, -1);

  while (chat_traj.context_token_count_ < chat_traj.token_count()) {
    // Size of this batch: whatever remains, capped at opt.batch_count.
    const unsigned n = std::min(
        opt.batch_count,
        chat_traj.token_count() - chat_traj.context_token_count_);

#if LLAMA_OPENBLAS_ON
    // With OpenBLAS, batches of 32 or more tokens use one batch thread.
    if (n < 32) {
      llama_set_n_threads(ctx, thread_count_, batch_thread_count_);
    }
    else {
      llama_set_n_threads(ctx, thread_count_, 1);
    }
#endif
    chat_disp.show_new(chat_traj.context_token_count_ + n, chat_traj, vocabulary_);

    // (Re)allocate the batch when there is none or the previous one held
    // fewer tokens than needed now.
    // NOTE(review): batch_.n_tokens is the size of the last batch, not the
    // allocated capacity, so this can reallocate even though the existing
    // buffer (sized to at least opt.batch_count) would fit.
    if (!batch_.token || (unsigned)batch_.n_tokens < n) {
      llama_batch_free(batch_);
      unsigned n_alloc = n;
      if (n_alloc < opt.batch_count) {n_alloc = opt.batch_count;}
      batch_ = llama_batch_init(n_alloc, /*embd=*/0, /*n_seq_max=*/1);
    }
    batch_.n_tokens = n;
    for (unsigned i = 0; i < n; ++i) {
      batch_.token[i] = chat_traj.tokens()[chat_traj.context_token_count_ + i];
      batch_.pos[i] = chat_traj.context_token_count_ + i;
      // Every token belongs to the single sequence 0.
      batch_.n_seq_id[i] = 1;
      batch_.seq_id[i][0] = 0;
      // Only the last token of the batch requests logits.
      batch_.logits[i] = (i == n - 1);
    }

    const int istat = llama_decode(ctx, batch_);

    if (istat != 0) {
      fildesh_log_error("Failed to eval.");
      // Mark the whole trajectory as not evaluated.
      chat_traj.context_token_count_ = 0;
      return false;
    }
    else {
      chat_traj.context_token_count_ += n;
    }
  }
  assert(chat_traj.context_token_count_ == chat_traj.token_count());
  chat_traj.erased_since_eval_ = false;
  // Bring the sampler's accepted-token history up to date with the trajectory.
  while (token_count_ < chat_traj.token_count()) {
    Vocabulary::Token_id token_id = chat_traj.token_at(token_count_);
    llama_sampler_accept(smpl_, token_id);
    token_count_ += 1;
  }
  return true;
}

  // Samples the next token from the context's current logits and appends it
  // to the trajectory.
  //
  // When `preventing_newline` is set, the end-of-sequence and newline tokens
  // are made unpickable by setting their logits to negative infinity in the
  // context's logits buffer. (A logit of 0 would not do that: it is an
  // ordinary value that can rank above other tokens.)
  //
  // The chosen token is also accepted by the sampler chain and counted in
  // token_count_.
  void
Inference::sample_to_trajectory(
    ChatTrajectory& chat_traj,
    struct llama_context* ctx,
    bool preventing_newline)
{
  float* logits = llama_get_logits(ctx);
  if (preventing_newline) {
    // Zero probability for message-ending tokens when requested.
    const float never = -std::numeric_limits<float>::infinity();
    logits[vocabulary_.eos_token_id()] = never;
    logits[vocabulary_.newline_token_id()] = never;
  }

  // One candidate per vocabulary entry, carrying its raw logit.
  std::vector<llama_token_data> candidates;
  candidates.resize(vocabulary_.cardinality());
  for (llama_token i = 0; i < (llama_token)candidates.size(); ++i) {
    candidates[i] = llama_token_data{
      i, logits[i], 0.0f,
    };
  }
  logits = nullptr;
  llama_token_data_array candidates_data[1] = {{
    candidates.data(),
    candidates.size(),
    /*selected=*/0,
    /*sorted=*/false,
  }};
  llama_sampler_apply(smpl_, candidates_data);
  chat_traj.push_back(candidates[candidates_data->selected].id);
  llama_sampler_accept(smpl_, chat_traj.token());
  token_count_ += 1;
}


1	#include "src/language/inference.hh"
2
3	#include <algorithm>
4	#include <cassert>
5	#include <cstring>
6	#include <thread>
7
8	#include <fildesh/fildesh.h>
9	#include <fildesh/ostream.hh>
10
11	#include "src/chat/display.hh"
12	#include "src/chat/guide.hh"
13	#include "src/chat/opt.hh"
14	#include "src/chat/trajectory.hh"
15	#include "src/language/vocabulary.hh"
16
17	using rendezllama::ChatDisplay;
18	using rendezllama::ChatGuide;
19	using rendezllama::ChatOptions;
20	using rendezllama::ChatTrajectory;
21	using rendezllama::Inference;
22	using rendezllama::Vocabulary;
23	using rendezllama::inference::AdjustViaKind;
24
25	Inference::Inference(const Vocabulary& vocabulary)	2✔
26	: vocabulary_(vocabulary)	2✔
27	{}	2✔
28	Inference::~Inference() {	2✔
29	if (smpl_) {llama_sampler_free(smpl_);}	2✔
30	llama_batch_free(batch_);	2✔
31	}	2✔
32
33	const std::string&
34	rendezllama::antiprompt_suffix(	5✔
35	std::string_view text,
36	const std::set<std::string>& antiprompts)
37	{
38	static const std::string empty_string;	5✔
39	for (const std::string& s : antiprompts) {	11✔
40	if (text.size() >= s.size()) {	9✔
41	const size_t offset = text.size() - s.size();	6✔
42	if (0 == memcmp(&text[offset], &s[0], s.size())) {	6✔
43	return s;	3✔
44	}
45	}
46	}
47	return empty_string;	2✔
48	}
49
UNCOV 50	static bool maybe_trim_endspace(std::string& s)	×
51	{
UNCOV 52	bool result = false;	×
53	while (!s.empty() && s.back() == ' ') {	×
54	s.pop_back();	×
55	result = true;	×
56	}
UNCOV 57	return result;	×
58	}
59
60	void
UNCOV 61	rendezllama::augment_tokenize_chat_input(	×
62	ChatGuide& chat_guide,
63	ChatTrajectory& chat_traj,
64	bool& prevent_subsequent_newline,
65	std::string s,
66	const Vocabulary& vocabulary,
67	const ChatOptions& opt)
68	{
UNCOV 69	prevent_subsequent_newline = false;	×
70	if (s.size() >= 2 && s[0] == '\\' && s[1] == 'n') {	×
71	chat_guide.end_turn();	×
72	chat_guide.begin_turn(opt.message_opts.size()-1);	×
73	s.erase(0, 2);	×
74	prevent_subsequent_newline = maybe_trim_endspace(s);	×
75	if (opt.message_opts.back().prefix.back() == '\n' && opt.linespace_on) {	×
76	if (!s.empty() && s.front() != ' ') {	×
77	s.insert(0, " ");	×
78	}
79	}
UNCOV 80	chat_traj.tokenize_append(s, vocabulary);	×
81	}
UNCOV 82	else if (s.front() == '\n') {	×
83	// This is from /yield.
UNCOV 84	chat_guide.yield_turn(s.substr(1));	×
85	}
UNCOV 86	else if (s.front() == ' ') {	×
87	prevent_subsequent_newline = maybe_trim_endspace(s);	×
88	chat_traj.tokenize_append(s, vocabulary);	×
89	}
90	else {
UNCOV 91	chat_guide.yield_turn(0);	×
92	if (opt.message_opts[0].prefix.back() == '\n' && opt.linespace_on) {	×
93	if (!s.empty() && s.front() != ' ') {	×
94	s.insert(0, " ");	×
95	}
96	}
UNCOV 97	chat_traj.tokenize_append(s, vocabulary);	×
98	chat_guide.yield_turn();	×
99	chat_traj.display_token_count_ = chat_traj.rfind_message_prefix_begin_at(	×
100	chat_traj.token_count()-1);	×
101	prevent_subsequent_newline = true;	×
102	}
UNCOV 103	}	×
104
105	std::tuple<struct llama_model, struct llama_context>
106	rendezllama::make_llama_context(rendezllama::ChatOptions& opt)	2✔
107	{
108	llama_model_params model_params = llama_model_default_params();	2✔
109	model_params.use_mlock = opt.mlock_on;	2✔
110	model_params.use_mmap = opt.mmap_on;	2✔
111
112	struct llama_model* model = llama_model_load_from_file(	2✔
113	opt.model_filename.c_str(), model_params);	2✔
114	if (!model) {	2✔
UNCOV 115	fildesh_log_error("Failed to open model.");	×
116	return std::make_tuple(nullptr, nullptr);	×
117	}
118
119	if (opt.model_token_limit == 0) {	2✔
120	opt.model_token_limit = llama_model_n_ctx_train(model);	2✔
121	}
122	if (opt.context_token_limit == 0) {	2✔
123	opt.context_token_limit = opt.model_token_limit;	1✔
124	}
125
126	model_params = llama_model_default_params();	2✔
127	model_params.use_mlock = opt.mlock_on;	2✔
128	model_params.use_mmap = opt.mmap_on;	2✔
129
130	llama_context_params ctx_params = llama_context_default_params();	2✔
131	ctx_params.n_ctx = opt.context_token_limit;	2✔
132	ctx_params.n_batch = opt.batch_count;	2✔
133	ctx_params.rope_freq_scale = llama_model_rope_freq_scale_train(model);	2✔
134	assert(ctx_params.rope_freq_scale > 0.0);	2✔
135	while (
136	(unsigned)(opt.model_token_limit / ctx_params.rope_freq_scale)	2✔
137	<	2✔
138	opt.context_token_limit)	2✔
139	{
UNCOV 140	ctx_params.rope_freq_scale /= 2;	×
141	}
142
143	struct llama_context* ctx = llama_init_from_model(model, ctx_params);	2✔
144	if (!ctx) {	2✔
UNCOV 145	llama_model_free(model);	×
UNCOV 146	fildesh_log_error("Failed to create context.");	×
147	return std::make_tuple(nullptr, nullptr);	×
148	}
149	return std::make_tuple(model, ctx);	2✔
150	}
151
152	static
153	int
154	new_sampling_seed()	2✔
155	{
156	return static_cast<int>(INT_MAX & time(NULL));	2✔
157	}
158
159	static
160	void
161	apply_sampler_chain(	3✔
162	struct llama_sampler* smpl,
163	const rendezllama::inference::AdjustVia& adjust_via,
164	const struct llama_model* model,
165	unsigned seed,
166	std::ostream& eout)
167	{
168	const unsigned keep_one = 1;	3✔
169
170	if (const auto* dry = std::get_if<AdjustViaKind::dry>(&adjust_via)) {	3✔
UNCOV 171	static const char* seq_breakers[] = {	×
172	"\n", ":",
173	};
UNCOV 174	llama_sampler_init_dry(	×
175	llama_model_get_vocab(model),
176	llama_model_n_ctx_train(model),
UNCOV 177	dry->multiplier,	×
UNCOV 178	dry->base,	×
179	dry->allowed_length,	×
180	dry->window_length,	×
181	seq_breakers,
182	sizeof(seq_breakers)/sizeof(*seq_breakers));
UNCOV 183	eout << "dry:"	×
UNCOV 184	<< "\n multiplier: " << dry->multiplier	×
185	<< "\n base: " << dry->base	×
186	<< "\n allowed_length: " << dry->allowed_length	×
187	<< "\n window_length: " << dry->window_length	×
188	<< "\n";	×
189	}
190	if (const auto* min_p = std::get_if<AdjustViaKind::min_p>(&adjust_via)) {	3✔
191	llama_sampler_chain_add(smpl, llama_sampler_init_min_p(*min_p, keep_one));	1✔
192	eout << "min_p: " << *min_p << "\n";	1✔
193	}
194	if (const auto* penalize_with = std::get_if<AdjustViaKind::penalize_with>(&adjust_via)) {	3✔
UNCOV 195	llama_sampler_init_penalties(	×
UNCOV 196	penalize_with->window_length,	×
197	penalize_with->repetition,	×
198	penalize_with->frequency,	×
199	penalize_with->presence);	×
200	eout << "penalties:"	×
201	<< "\n window_length: " << penalize_with->window_length	×
202	<< "\n repetition: " << penalize_with->repetition	×
203	<< "\n frequency: " << penalize_with->frequency	×
204	<< "\n presence: " << penalize_with->presence	×
205	<< "\n";	×
206	}
207	if (const auto* temperature = std::get_if<AdjustViaKind::temperature>(&adjust_via)) {	3✔
208	llama_sampler_chain_add(smpl, llama_sampler_init_temp(*temperature));	2✔
209	eout << "temperature: " << *temperature << "\n";	2✔
210	}
211	if (const auto* top_k = std::get_if<AdjustViaKind::top_k>(&adjust_via)) {	3✔
UNCOV 212	llama_sampler_chain_add(smpl, llama_sampler_init_top_k(*top_k));	×
UNCOV 213	eout << "top_k: " << *top_k << "\n";	×
214	}
215	if (const auto* top_p = std::get_if<AdjustViaKind::top_p>(&adjust_via)) {	3✔
UNCOV 216	llama_sampler_chain_add(smpl, llama_sampler_init_top_p(*top_p, keep_one));	×
UNCOV 217	eout << "top_p: " << *top_p << "\n";	×
218	}
219	if (const auto* typical_p = std::get_if<AdjustViaKind::typical_p>(&adjust_via)) {	3✔
UNCOV 220	llama_sampler_chain_add(smpl, llama_sampler_init_typical(*typical_p, keep_one));	×
UNCOV 221	eout << "typical_p: " << *typical_p << "\n";	×
222	}
223	if (const auto* xtc = std::get_if<AdjustViaKind::xtc>(&adjust_via)) {	3✔
UNCOV 224	llama_sampler_chain_add(smpl, llama_sampler_init_xtc(xtc->probability, xtc->threshold, keep_one, seed));	×
UNCOV 225	eout << "xtc: "	×
226	<< "\n probability: " << xtc->probability	×
227	<< "\n threshold: " << xtc->threshold	×
228	<< "\n";	×
229	}
230	}	3✔
231
232	static
233	void
UNCOV 234	adaptive_p_sample(	×
235	struct llama_sampler* smpl,
236	const rendezllama::inference::AdaptiveP& adaptive_p,
237	unsigned seed)
238	{
UNCOV 239	llama_sampler_chain_add(	×
240	smpl,
241	llama_sampler_init_adaptive_p(
UNCOV 242	adaptive_p.target,	×
UNCOV 243	adaptive_p.decay,	×
244	seed));
245	}	×
246
247	static
248	void
UNCOV 249	mirostat_sample(	×
250	struct llama_sampler* smpl,
251	const rendezllama::inference::Mirostat& mirostat,
252	unsigned seed,
253	const rendezllama::Vocabulary& vocabulary)
254	{
UNCOV 255	if (mirostat.version == 1) {	×
UNCOV 256	const int mirostat_m = 100;	×
257	llama_sampler_chain_add(	×
258	smpl,
259	llama_sampler_init_mirostat(
UNCOV 260	vocabulary.cardinality(), seed,	×
UNCOV 261	mirostat.tau, mirostat.eta, mirostat_m));	×
262	}
263	else if (mirostat.version == 2) {	×
UNCOV 264	llama_sampler_chain_add(	×
265	smpl,
266	llama_sampler_init_mirostat_v2(
UNCOV 267	seed, mirostat.tau, mirostat.eta));	×
268	}
269	}	×
270
271	static
272	std::tuple<unsigned, unsigned>
273	infer_thread_counts(const rendezllama::ChatOptions& opt)	2✔
274	{
275	unsigned thread_count = opt.thread_count;	2✔
276	unsigned batch_thread_count = opt.batch_thread_count;	2✔
277	const unsigned n = std::thread::hardware_concurrency();	2✔
278	if (thread_count == 0) {	2✔
279	thread_count = n / 2;	2✔
280	if (thread_count == 0) {	2✔
UNCOV 281	thread_count = 1;	×
282	}
283	#if defined(__x86_64__) \|\| defined(_M_X64) \|\| defined(__i386__) \|\| defined(_M_IX86)
284	if (2 <= n && n <= 4) {	2✔
285	thread_count = n;	2✔
286	}
287	#endif
288	}
289	if (batch_thread_count == 0) {	2✔
290	batch_thread_count = n;	2✔
291	}
292	return std::make_tuple(thread_count, batch_thread_count);	2✔
293	}
294
295	void
296	Inference::reinitialize(const ChatOptions& opt, const struct llama_model* model)	2✔
297	{
298	fildesh::ofstream eout("/dev/stderr");	2✔
299
300	const auto* sampling = std::get_if<rendezllama::inference::Sampling>(&opt.infer_via);	2✔
UNCOV 301	assert(sampling);	×
302	auto seed = sampling->seed;	2✔
303	if (smpl_ \|\| seed < 0) {	2✔
304	// We're retrying or just don't have a fixed seed, so we should reseed.
305	seed = new_sampling_seed();	2✔
306	}
307	std::tie(thread_count_, batch_thread_count_) = infer_thread_counts(opt);	2✔
308	if (smpl_) {	2✔
UNCOV 309	llama_sampler_free(smpl_);	×
UNCOV 310	eout.open("/dev/null");	×
311	}
312	token_count_ = 0;	2✔
313	auto smpl_param = llama_sampler_chain_default_params();	2✔
314	smpl_ = llama_sampler_chain_init(smpl_param);	2✔
315
316	for (const auto& adjust_via : sampling->adjust_thru) {	5✔
317	apply_sampler_chain(smpl_, adjust_via, model, seed, eout);	3✔
318	}
319
320	if (std::get_if<rendezllama::inference::Probability>(&sampling->pick_via)) {	2✔
UNCOV 321	llama_sampler_chain_add(smpl_, llama_sampler_init_dist(seed));	×
322	}
323	else if (std::get_if<rendezllama::inference::Determinism>(&sampling->pick_via)) {	2✔
324	llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());	1✔
325	}
326	else if (const auto* adaptive_p = std::get_if<rendezllama::inference::AdaptiveP>(&sampling->pick_via)) {	1✔
UNCOV 327	adaptive_p_sample(smpl_, *adaptive_p, seed);	×
328	}
329	else if (const auto* mirostat = std::get_if<rendezllama::inference::Mirostat>(&sampling->pick_via)) {	1✔
UNCOV 330	mirostat_sample(smpl_, *mirostat, seed, vocabulary_);	×
331	}
332	else {
333	fildesh_log_error("Missing pick method? Using greedy.");	1✔
334	llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());	1✔
335	}
336	}	2✔
337
338	bool
339	Inference::commit_to_context(	21✔
340	struct llama_context* ctx,
341	ChatDisplay& chat_disp,
342	ChatTrajectory& chat_traj,
343	const ChatOptions& opt,
344	const llama_model* model)
345	{
346	assert(!chat_traj.erased_since_eval_ \|\|	21✔
347	chat_traj.context_token_count_ < chat_traj.token_count());
348	if (chat_traj.erased_since_eval_ \|\| !smpl_) {	21✔
349	this->reinitialize(opt, model);	2✔
350	}
351	if (chat_traj.context_token_count_ == chat_traj.token_count()) {	21✔
352	return true;
353	}
354
355	chat_traj.maybe_rollforget_within_limit(opt.context_token_limit, vocabulary_);	21✔
356
357	// Reset thread count just in case the user reconfigured it.
358	llama_set_n_threads(ctx, thread_count_, batch_thread_count_);	21✔
359
360	// Clear KV cache past current position just in case the user deleted tokens.
361	llama_memory_seq_rm(	42✔
362	llama_get_memory(ctx),
363	0, chat_traj.context_token_count_, -1);	21✔
364
365	while (chat_traj.context_token_count_ < chat_traj.token_count()) {	63✔
366	const unsigned n = std::min(	21✔
367	opt.batch_count,	21✔
368	chat_traj.token_count() - chat_traj.context_token_count_);	21✔
369
370	#if LLAMA_OPENBLAS_ON
371	if (n < 32) {
372	llama_set_n_threads(ctx, thread_count_, batch_thread_count_);
373	}
374	else {
375	llama_set_n_threads(ctx, thread_count_, 1);
376	}
377	#endif
378	chat_disp.show_new(chat_traj.context_token_count_ + n, chat_traj, vocabulary_);	21✔
379
380	if (!batch_.token \|\| (unsigned)batch_.n_tokens < n) {	21✔
381	llama_batch_free(batch_);	2✔
382	unsigned n_alloc = n;	2✔
383	if (n_alloc < opt.batch_count) {n_alloc = opt.batch_count;}	2✔
384	batch_ = llama_batch_init(n_alloc, /embd=/0, /n_seq_max=/1);	2✔
385	}
386	batch_.n_tokens = n;	21✔
387	for (unsigned i = 0; i < n; ++i) {	59✔
388	batch_.token[i] = chat_traj.tokens()[chat_traj.context_token_count_ + i];	38✔
389	batch_.pos[i] = chat_traj.context_token_count_ + i;	38✔
390	batch_.n_seq_id[i] = 1;	38✔
391	batch_.seq_id[i][0] = 0;	38✔
392	batch_.logits[i] = (i == n - 1);	38✔
393	}
394
395	const int istat = llama_decode(ctx, batch_);	21✔
396
397	if (istat != 0) {	21✔
398	fildesh_log_error("Failed to eval.");	×
UNCOV 399	chat_traj.context_token_count_ = 0;	×
UNCOV 400	return false;	×
401	}
402	else {
403	chat_traj.context_token_count_ += n;	21✔
404	}
405	}
406	assert(chat_traj.context_token_count_ == chat_traj.token_count());	21✔
407	chat_traj.erased_since_eval_ = false;	21✔
408	while (token_count_ < chat_traj.token_count()) {	40✔
409	Vocabulary::Token_id token_id = chat_traj.token_at(token_count_);	19✔
410	llama_sampler_accept(smpl_, token_id);	19✔
411	token_count_ += 1;	19✔
412	}
413	return true;
414	}
415
416	void
417	Inference::sample_to_trajectory(	21✔
418	ChatTrajectory& chat_traj,
419	struct llama_context* ctx,
420	bool preventing_newline)
421	{
422	float* logits = llama_get_logits(ctx);	21✔
423	if (preventing_newline) {	21✔
424	// Zero probability for message-ending tokens when requested.
UNCOV 425	logits[vocabulary_.eos_token_id()] = 0;	×
UNCOV 426	logits[vocabulary_.newline_token_id()] = 0;	×
427	}
428
429	std::vector<llama_token_data> candidates;	21✔
430	candidates.resize(vocabulary_.cardinality());	21✔
431	for (llama_token i = 0; i < (llama_token)candidates.size(); ++i) {	350,229✔
432	candidates[i] = llama_token_data{	350,208✔
433	i, logits[i], 0.0f,	350,208✔
434	};
435	}
436	logits = NULL;	21✔
437	llama_token_data_array candidates_data[1] = {{	21✔
438	candidates.data(),	21✔
439	candidates.size(),	21✔
440	/selected=/0,
441	/sorted=/false,
442	}};	21✔
443	llama_sampler_apply(smpl_, candidates_data);	21✔
444	chat_traj.push_back(candidates[candidates_data->selected].id);	21✔
445	llama_sampler_accept(smpl_, chat_traj.token());	21✔
446	token_count_ += 1;	21✔
447	}	21✔
448

rendezqueue / rendezllama / 21341853687

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous