rendezqueue / rendezllama / build 21199465653

21 Jan 2026 05:07AM UTC coverage: 88.225% (-2.1%) from 90.342%
Triggered by a push from grencez (github):
"Update localserv to support chat interface and CLI args"

2098 of 2378 relevant lines covered (88.23%)
21310.64 hits per line

Source File

/src/language/inference.cc (57.02% of lines covered)
#include "src/language/inference.hh"

#include <algorithm>
#include <cassert>
#include <cstring>
#include <thread>

#include <fildesh/fildesh.h>
#include <fildesh/ostream.hh>

#include "src/chat/display.hh"
#include "src/chat/guide.hh"
#include "src/chat/opt.hh"
#include "src/chat/trajectory.hh"
#include "src/language/vocabulary.hh"

using rendezllama::ChatDisplay;
using rendezllama::ChatGuide;
using rendezllama::ChatOptions;
using rendezllama::ChatTrajectory;
using rendezllama::Inference;
using rendezllama::Vocabulary;
using rendezllama::inference::AdjustViaKind;

Inference::Inference(const Vocabulary& vocabulary)
  : vocabulary_(vocabulary)
{}
Inference::~Inference() {
  if (smpl_) {llama_sampler_free(smpl_);}
  llama_batch_free(batch_);
}

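// Returns the antiprompt that `text` ends with, or a reference to a static
// empty string when no antiprompt matches the end of `text`.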
  const std::string&
rendezllama::antiprompt_suffix(
    std::string_view text,
    const std::set<std::string>& antiprompts)
{
  static const std::string empty_string;
  for (const std::string& s : antiprompts) {
    if (text.size() >= s.size()) {
      const size_t offset = text.size() - s.size();
      if (0 == memcmp(&text[offset], &s[0], s.size())) {
        return s;
      }
    }
  }
  return empty_string;
}

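// Trims trailing spaces from `s` in place and reports whether any were removed.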
static bool maybe_trim_endspace(std::string& s)
{
  bool result = false;
  while (!s.empty() && s.back() == ' ') {
    s.pop_back();
    result = true;
  }
  return result;
}

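// Routes freshly-read chat input into the trajectory. A leading "\n" escape
// closes the current turn and opens one for the last configured role, a
// leading newline (from /yield) yields the turn to the named role, a leading
// space continues the current line, and anything else is appended as a full
// message for role 0 before yielding the turn onward.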
  void
rendezllama::augment_tokenize_chat_input(
    ChatGuide& chat_guide,
    ChatTrajectory& chat_traj,
    bool& prevent_subsequent_newline,
    std::string s,
    const Vocabulary& vocabulary,
    const ChatOptions& opt)
{
  prevent_subsequent_newline = false;
  if (s.size() >= 2 && s[0] == '\\' && s[1] == 'n') {
    chat_guide.end_turn();
    chat_guide.begin_turn(opt.message_opts.size()-1);
    s.erase(0, 2);
    prevent_subsequent_newline = maybe_trim_endspace(s);
    if (opt.message_opts.back().prefix.back() == '\n' && opt.linespace_on) {
      if (!s.empty() && s.front() != ' ') {
        s.insert(0, " ");
      }
    }
    chat_traj.tokenize_append(s, vocabulary);
  }
  else if (s.front() == '\n') {
    // This is from /yield.
    chat_guide.yield_turn(s.substr(1));
  }
  else if (s.front() == ' ') {
    prevent_subsequent_newline = maybe_trim_endspace(s);
    chat_traj.tokenize_append(s, vocabulary);
  }
  else {
    chat_guide.yield_turn(0);
    if (opt.message_opts[0].prefix.back() == '\n' && opt.linespace_on) {
      if (!s.empty() && s.front() != ' ') {
        s.insert(0, " ");
      }
    }
    chat_traj.tokenize_append(s, vocabulary);
    chat_guide.yield_turn();
    chat_traj.display_token_count_ = chat_traj.rfind_message_prefix_begin_at(
        chat_traj.token_count()-1);
    prevent_subsequent_newline = true;
  }
}

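// Loads the model named in `opt` and builds a llama context sized to the
// requested token limits, halving rope_freq_scale until the scaled training
// context covers opt.context_token_limit. Returns {nullptr, nullptr} on failure.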
  std::tuple<struct llama_model*, struct llama_context*>
rendezllama::make_llama_context(rendezllama::ChatOptions& opt)
{
  llama_model_params model_params = llama_model_default_params();
  model_params.use_mlock = opt.mlock_on;
  model_params.use_mmap = opt.mmap_on;

  struct llama_model* model = llama_model_load_from_file(
      opt.model_filename.c_str(), model_params);
  if (!model) {
    fildesh_log_error("Failed to open model.");
    return std::make_tuple(nullptr, nullptr);
  }

  if (opt.model_token_limit == 0) {
    opt.model_token_limit = llama_model_n_ctx_train(model);
  }
  if (opt.context_token_limit == 0) {
    opt.context_token_limit = opt.model_token_limit;
  }

  model_params = llama_model_default_params();
  model_params.use_mlock = opt.mlock_on;
  model_params.use_mmap = opt.mmap_on;

  llama_context_params ctx_params = llama_context_default_params();
  ctx_params.n_ctx = opt.context_token_limit;
  ctx_params.n_threads = opt.thread_count;
  ctx_params.n_batch = opt.batch_count;
  ctx_params.rope_freq_scale = llama_model_rope_freq_scale_train(model);
  assert(ctx_params.rope_freq_scale > 0.0);
  while (
      (unsigned)(opt.model_token_limit / ctx_params.rope_freq_scale)
      <
      opt.context_token_limit)
  {
    ctx_params.rope_freq_scale /= 2;
  }

  struct llama_context* ctx = llama_init_from_model(model, ctx_params);
  if (!ctx) {
    llama_model_free(model);
    fildesh_log_error("Failed to create context.");
    return std::make_tuple(nullptr, nullptr);
  }
  return std::make_tuple(model, ctx);
}

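// Derives a non-negative sampling seed from the current time.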
static
  int
new_sampling_seed()
{
  return static_cast<int>(INT_MAX & time(NULL));
}

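// Adds the sampler that corresponds to the active AdjustVia alternative
// (dry, min_p, penalties, temperature, top_k, top_p, typical_p, or xtc)
// to the chain and logs its parameters to `eout`.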
static
  void
apply_sampler_chain(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdjustVia& adjust_via,
    const struct llama_model* model,
    unsigned seed,
    std::ostream& eout)
{
  const unsigned keep_one = 1;

  if (const auto* dry = std::get_if<AdjustViaKind::dry>(&adjust_via)) {
    static const char* seq_breakers[] = {
      "\n", ":",
    };
    llama_sampler_chain_add(smpl, llama_sampler_init_dry(
        llama_model_get_vocab(model),
        llama_model_n_ctx_train(model),
        dry->multiplier,
        dry->base,
        dry->allowed_length,
        dry->window_length,
        seq_breakers,
        sizeof(seq_breakers)/sizeof(*seq_breakers)));
    eout << "dry:"
      << "\n  multiplier: " << dry->multiplier
      << "\n  base: " << dry->base
      << "\n  allowed_length: " << dry->allowed_length
      << "\n  window_length: " << dry->window_length
      << "\n";
  }
  if (const auto* min_p = std::get_if<AdjustViaKind::min_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(*min_p, keep_one));
    eout << "min_p: " << *min_p << "\n";
  }
  if (const auto* penalize_with = std::get_if<AdjustViaKind::penalize_with>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_penalties(
        penalize_with->window_length,
        penalize_with->repetition,
        penalize_with->frequency,
        penalize_with->presence));
    eout << "penalties:"
      << "\n  window_length: " << penalize_with->window_length
      << "\n  repetition: " << penalize_with->repetition
      << "\n  frequency: " << penalize_with->frequency
      << "\n  presence: " << penalize_with->presence
      << "\n";
  }
  if (const auto* temperature = std::get_if<AdjustViaKind::temperature>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_temp(*temperature));
    eout << "temperature: " << *temperature << "\n";
  }
  if (const auto* top_k = std::get_if<AdjustViaKind::top_k>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(*top_k));
    eout << "top_k: " << *top_k << "\n";
  }
  if (const auto* top_p = std::get_if<AdjustViaKind::top_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(*top_p, keep_one));
    eout << "top_p: " << *top_p << "\n";
  }
  if (const auto* typical_p = std::get_if<AdjustViaKind::typical_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_typical(*typical_p, keep_one));
    eout << "typical_p: " << *typical_p << "\n";
  }
  if (const auto* xtc = std::get_if<AdjustViaKind::xtc>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_xtc(xtc->probability, xtc->threshold, keep_one, seed));
    eout << "xtc: "
      << "\n  probability: " << xtc->probability
      << "\n  threshold: " << xtc->threshold
      << "\n";
  }
}

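// Appends the adaptive-p picker to the sampler chain with the configured
// target and decay, using the given seed.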
static
  void
adaptive_p_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdaptiveP& adaptive_p,
    unsigned seed)
{
  llama_sampler_chain_add(
      smpl,
      llama_sampler_init_adaptive_p(
          adaptive_p.target,
          adaptive_p.decay,
          seed));
}

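// Appends a Mirostat v1 or v2 sampler to the chain according to the
// configured version; any other version leaves the chain unchanged.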
static
  void
mirostat_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::Mirostat& mirostat,
    unsigned seed,
    const rendezllama::Vocabulary& vocabulary)
{
  if (mirostat.version == 1) {
    const int mirostat_m = 100;
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_mirostat(
            vocabulary.cardinality(), seed,
            mirostat.tau, mirostat.eta, mirostat_m));
  }
  else if (mirostat.version == 2) {
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_mirostat_v2(
            seed, mirostat.tau, mirostat.eta));
  }
}

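// Rebuilds the sampler chain from opt.infer_via. Reseeds when retrying or
// when no fixed seed was configured, logs the sampler settings to stderr only
// on the first build, and finishes the chain with the configured picker
// (distribution, greedy, adaptive-p, or Mirostat), defaulting to greedy.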
  void
Inference::reinitialize(const ChatOptions& opt, const struct llama_model* model)
{
  fildesh::ofstream eout("/dev/stderr");

  const auto* sampling = std::get_if<rendezllama::inference::Sampling>(&opt.infer_via);
  assert(sampling);
  auto seed = sampling->seed;
  if (smpl_ || seed < 0) {
    // We're retrying or just don't have a fixed seed, so we should reseed.
    seed = new_sampling_seed();
  }
  if (smpl_) {
    llama_sampler_free(smpl_);
    eout.open("/dev/null");
  }
  token_count_ = 0;
  auto smpl_param = llama_sampler_chain_default_params();
  smpl_ = llama_sampler_chain_init(smpl_param);

  for (const auto& adjust_via : sampling->adjust_thru) {
    apply_sampler_chain(smpl_, adjust_via, model, seed, eout);
  }

  if (std::get_if<rendezllama::inference::Probability>(&sampling->pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_dist(seed));
  }
  else if (std::get_if<rendezllama::inference::Determinism>(&sampling->pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());
  }
  else if (const auto* adaptive_p = std::get_if<rendezllama::inference::AdaptiveP>(&sampling->pick_via)) {
    adaptive_p_sample(smpl_, *adaptive_p, seed);
  }
  else if (const auto* mirostat = std::get_if<rendezllama::inference::Mirostat>(&sampling->pick_via)) {
    mirostat_sample(smpl_, *mirostat, seed, vocabulary_);
  }
  else {
    fildesh_log_error("Missing pick method? Using greedy.");
    llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());
  }
}

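// Feeds any trajectory tokens that the llama context has not yet seen:
// reinitializes sampling if tokens were erased, rolls the context forward
// within the token limit, drops stale KV-cache entries, then decodes the
// pending tokens in batches and accepts them into the sampler.
// Returns false if decoding fails.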
  bool
Inference::commit_to_context(
    struct llama_context* ctx,
    ChatDisplay& chat_disp,
    ChatTrajectory& chat_traj,
    const ChatOptions& opt,
    const llama_model* model)
{
  assert(!chat_traj.erased_since_eval_ ||
         chat_traj.context_token_count_ < chat_traj.token_count());
  if (chat_traj.erased_since_eval_ || !smpl_) {
    this->reinitialize(opt, model);
  }
  if (chat_traj.context_token_count_ == chat_traj.token_count()) {
    return true;
  }

  chat_traj.maybe_rollforget_within_limit(opt.context_token_limit, vocabulary_);

  // Reset thread count just in case the user reconfigured it.
  const unsigned thread_count = opt.thread_count;
  unsigned batch_thread_count = opt.batch_thread_count;
  if (batch_thread_count == 0) {
    batch_thread_count = std::thread::hardware_concurrency();
  }
  if (batch_thread_count == 0) {
    batch_thread_count = thread_count;
  }
  llama_set_n_threads(ctx, thread_count, batch_thread_count);

  // Clear KV cache past current position just in case the user deleted tokens.
  llama_memory_seq_rm(
      llama_get_memory(ctx),
      0, chat_traj.context_token_count_, -1);

  while (chat_traj.context_token_count_ < chat_traj.token_count()) {
    const unsigned n = std::min(
        opt.batch_count,
        chat_traj.token_count() - chat_traj.context_token_count_);

#if LLAMA_OPENBLAS_ON
    if (n < 32) {
      llama_set_n_threads(ctx, thread_count, batch_thread_count);
    }
    else {
      llama_set_n_threads(ctx, thread_count, 1);
    }
#endif
    chat_disp.show_new(chat_traj.context_token_count_ + n, chat_traj, vocabulary_);

    if (!batch_.token || (unsigned)batch_.n_tokens < n) {
      llama_batch_free(batch_);
      unsigned n_alloc = n;
      if (n_alloc < opt.batch_count) {n_alloc = opt.batch_count;}
      batch_ = llama_batch_init(n_alloc, /*embd=*/0, /*n_seq_max=*/1);
    }
    batch_.n_tokens = n;
    for (unsigned i = 0; i < n; ++i) {
      batch_.token[i] = chat_traj.tokens()[chat_traj.context_token_count_ + i];
      batch_.pos[i] = chat_traj.context_token_count_ + i;
      batch_.n_seq_id[i] = 1;
      batch_.seq_id[i][0] = 0;
      batch_.logits[i] = (i == n - 1);
    }

    const int istat = llama_decode(ctx, batch_);

    if (istat != 0) {
      fildesh_log_error("Failed to eval.");
      chat_traj.context_token_count_ = 0;
      return false;
    }
    else {
      chat_traj.context_token_count_ += n;
    }
  }
  assert(chat_traj.context_token_count_ == chat_traj.token_count());
  chat_traj.erased_since_eval_ = false;
  while (token_count_ < chat_traj.token_count()) {
    Vocabulary::Token_id token_id = chat_traj.token_at(token_count_);
    llama_sampler_accept(smpl_, token_id);
    token_count_ += 1;
  }
  return true;
}

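// Samples one token from the current logits and appends it to the trajectory.
// When `preventing_newline` is set, the EOS and newline logits are zeroed
// before the candidate list is built.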
  void
Inference::sample_to_trajectory(
    ChatTrajectory& chat_traj,
    struct llama_context* ctx,
    bool preventing_newline)
{
  float* logits = llama_get_logits(ctx);
  if (preventing_newline) {
    // Zero probability for message-ending tokens when requested.
    logits[vocabulary_.eos_token_id()] = 0;
    logits[vocabulary_.newline_token_id()] = 0;
  }

  std::vector<llama_token_data> candidates;
  candidates.resize(vocabulary_.cardinality());
  for (llama_token i = 0; i < (llama_token)candidates.size(); ++i) {
    candidates[i] = llama_token_data{
      i, logits[i], 0.0f,
    };
  }
  logits = NULL;
  llama_token_data_array candidates_data[1] = {{
    candidates.data(),
    candidates.size(),
    /*selected=*/0,
    /*sorted=*/false,
  }};
  llama_sampler_apply(smpl_, candidates_data);
  chat_traj.push_back(candidates[candidates_data->selected].id);
  llama_sampler_accept(smpl_, chat_traj.token());
  token_count_ += 1;
}
