21157153067

Committed 20 Jan 2026 02:13AM UTC coverage: 90.342% (+0.4%) from 89.931%

Build # 21157153067

Build Type

push

github

Committed by

grencez

Commit Message

Update localserv to support chat interface and CLI args

Run Details

2142 of 2371 relevant lines covered (90.34%)

280.81 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

77.88

/src/language/inference.cc

#include "src/language/inference.hh"

#include <algorithm>
#include <cassert>
#include <cstring>
#include <limits>
#include <thread>

#include <fildesh/fildesh.h>
#include <fildesh/ostream.hh>

#include "src/chat/display.hh"
#include "src/chat/guide.hh"
#include "src/chat/opt.hh"
#include "src/chat/trajectory.hh"
#include "src/language/vocabulary.hh"

using rendezllama::ChatDisplay;
using rendezllama::ChatGuide;
using rendezllama::ChatOptions;
using rendezllama::ChatTrajectory;
using rendezllama::Inference;
using rendezllama::Vocabulary;
using rendezllama::inference::AdjustViaKind;

// Construct an Inference whose `vocabulary_` member is initialized from
// `vocabulary`. The constructor body is empty: no sampler chain is created
// here (that happens in reinitialize()).
Inference::Inference(const Vocabulary& vocabulary)
  : vocabulary_(vocabulary)
{}
// Free the sampler chain held in `smpl_`, if there is one.
Inference::~Inference()
{
  if (!smpl_) {
    return;
  }
  llama_sampler_free(smpl_);
}

  // Find an antiprompt that `text` ends with.
  //
  // The candidates are visited in the set's iteration order and the first one
  // that is a suffix of `text` is returned by reference (so the result refers
  // to an element of `antiprompts`). When nothing matches, a reference to a
  // function-local static empty string is returned.
  //
  // Note that an empty antiprompt is a suffix of every text, and the returned
  // (empty) element is then indistinguishable by value from "no match".
  const std::string&
rendezllama::antiprompt_suffix(
    std::string_view text,
    const std::set<std::string>& antiprompts)
{
  static const std::string empty_string;
  for (const std::string& s : antiprompts) {
    if (text.size() < s.size()) {
      // Too long to be a suffix.
      continue;
    }
    // Compare via substr() rather than taking &text[offset]:
    // when `s` is empty the offset equals text.size(), and indexing a
    // string_view at size() is not permitted. substr() accepts that offset.
    const std::string_view tail = text.substr(text.size() - s.size());
    if (tail == s) {
      return s;
    }
  }
  return empty_string;
}

// Remove every trailing ' ' character from `s` (only the space character,
// not other whitespace).
// Returns true when at least one space was removed, false otherwise.
static bool maybe_trim_endspace(std::string& s)
{
  const std::string::size_type last_kept = s.find_last_not_of(' ');
  const std::string::size_type trimmed_size =
    (last_kept == std::string::npos) ? 0 : last_kept + 1;
  if (trimmed_size == s.size()) {
    return false;
  }
  s.resize(trimmed_size);
  return true;
}

  // Interpret one piece of user input `s` and apply it to the chat state.
  //
  // The leading characters of `s` select what happens:
  //  - A literal backslash followed by 'n': end the current turn, begin a turn
  //    for the last entry of `opt.message_opts`, and append the rest of `s`.
  //  - A newline character: yield the turn to the name that follows it.
  //  - A space: append `s` to the trajectory as a continuation.
  //  - Anything else (including empty input): yield to turn 0, append `s`,
  //    yield again, and move `chat_traj.display_token_count_` to the beginning
  //    of the latest message prefix.
  //
  // `prevent_subsequent_newline` is always overwritten. In the first and third
  // cases it reports whether trailing spaces were trimmed from `s`; in the
  // last case it is set to true; otherwise it is false.
  //
  // NOTE(review): the first and last cases index `opt.message_opts` and so
  // assume that it is non-empty -- confirm against callers.
  void
rendezllama::augment_tokenize_chat_input(
    ChatGuide& chat_guide,
    ChatTrajectory& chat_traj,
    bool& prevent_subsequent_newline,
    std::string s,
    const Vocabulary& vocabulary,
    const ChatOptions& opt)
{
  // When linespace is on and the message prefix ends with a newline,
  // non-empty text gets exactly one leading space.
  // An empty prefix is treated as not ending with a newline
  // instead of calling back() on it.
  const auto maybe_prepend_linespace = [&opt](std::string& text, const auto& prefix) {
    const bool prefix_ends_line = (!prefix.empty() && prefix.back() == '\n');
    if (prefix_ends_line && opt.linespace_on) {
      if (!text.empty() && text.front() != ' ') {
        text.insert(0, " ");
      }
    }
  };

  prevent_subsequent_newline = false;
  // Avoid front() on an empty string; empty input falls to the final case.
  const char first = s.empty() ? '\0' : s.front();

  if (s.size() >= 2 && s[0] == '\\' && s[1] == 'n') {
    chat_guide.end_turn();
    chat_guide.begin_turn(opt.message_opts.size()-1);
    s.erase(0, 2);
    prevent_subsequent_newline = maybe_trim_endspace(s);
    maybe_prepend_linespace(s, opt.message_opts.back().prefix);
    chat_traj.tokenize_append(s, vocabulary);
  }
  else if (first == '\n') {
    // This is from /yield.
    chat_guide.yield_turn(s.substr(1));
  }
  else if (first == ' ') {
    prevent_subsequent_newline = maybe_trim_endspace(s);
    chat_traj.tokenize_append(s, vocabulary);
  }
  else {
    chat_guide.yield_turn(0);
    maybe_prepend_linespace(s, opt.message_opts[0].prefix);
    chat_traj.tokenize_append(s, vocabulary);
    chat_guide.yield_turn();
    chat_traj.display_token_count_ = chat_traj.rfind_message_prefix_begin_at(
        chat_traj.token_count()-1);
    prevent_subsequent_newline = true;
  }
}

  // Load the model named by `opt.model_filename` and create a context for it.
  //
  // Side effects on `opt`:
  //  - A zero `model_token_limit` is replaced by llama_model_n_ctx_train(model).
  //  - A zero `context_token_limit` is replaced by `model_token_limit`.
  //
  // Returns {model, ctx} on success. On failure an error is logged and
  // {nullptr, nullptr} is returned; a model that was loaded before the
  // context failed to initialize is freed first.
  std::tuple<struct llama_model*, struct llama_context*>
rendezllama::make_llama_context(rendezllama::ChatOptions& opt)
{
  llama_model_params model_params = llama_model_default_params();
  model_params.use_mlock = opt.mlock_on;
  model_params.use_mmap = opt.mmap_on;

  struct llama_model* model = llama_model_load_from_file(
      opt.model_filename.c_str(), model_params);
  if (!model) {
    fildesh_log_error("Failed to open model.");
    return std::make_tuple(nullptr, nullptr);
  }

  if (opt.model_token_limit == 0) {
    opt.model_token_limit = llama_model_n_ctx_train(model);
  }
  if (opt.context_token_limit == 0) {
    opt.context_token_limit = opt.model_token_limit;
  }

  // `model_params` is not needed past this point; it was only an input to
  // llama_model_load_from_file() above.

  llama_context_params ctx_params = llama_context_default_params();
  ctx_params.n_ctx = opt.context_token_limit;
  ctx_params.n_threads = opt.thread_count;
  ctx_params.n_batch = opt.batch_count;
  ctx_params.rope_freq_scale = llama_model_rope_freq_scale_train(model);
  assert(ctx_params.rope_freq_scale > 0.0);
  // Halve the RoPE frequency scale until the model's token limit divided by
  // the scale reaches the requested context size.
  while (
      static_cast<unsigned>(opt.model_token_limit / ctx_params.rope_freq_scale)
      <
      opt.context_token_limit)
  {
    ctx_params.rope_freq_scale /= 2;
  }

  struct llama_context* ctx = llama_init_from_model(model, ctx_params);
  if (!ctx) {
    llama_model_free(model);
    fildesh_log_error("Failed to create context.");
    return std::make_tuple(nullptr, nullptr);
  }
  return std::make_tuple(model, ctx);
}

// Produce a seed from the current time.
// The time value is masked with INT_MAX, so the result is never negative.
static
  int
new_sampling_seed()
{
  const auto now = time(nullptr);
  return static_cast<int>(now & INT_MAX);
}

// Append the sampler described by `adjust_via` to the chain `smpl`
// and write a short summary of its settings to `eout`.
//
// `model` is consulted only for the DRY sampler (vocabulary and training
// context size). `seed` is used only by the XTC sampler.
//
// Every sampler created here is handed to llama_sampler_chain_add().
// A sampler that is created but not added would have no effect on sampling
// and would never be freed.
static
  void
apply_sampler_chain(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdjustVia& adjust_via,
    const struct llama_model* model,
    unsigned seed,
    std::ostream& eout)
{
  const unsigned keep_one = 1;

  if (const auto* dry = std::get_if<AdjustViaKind::dry>(&adjust_via)) {
    static const char* seq_breakers[] = {
      "\n", ":",
    };
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_dry(
            llama_model_get_vocab(model),
            llama_model_n_ctx_train(model),
            dry->multiplier,
            dry->base,
            dry->allowed_length,
            dry->window_length,
            seq_breakers,
            sizeof(seq_breakers)/sizeof(*seq_breakers)));
    eout << "dry:"
      << "\n  multiplier: " << dry->multiplier
      << "\n  base: " << dry->base
      << "\n  allowed_length: " << dry->allowed_length
      << "\n  window_length: " << dry->window_length
      << "\n";
  }
  if (const auto* min_p = std::get_if<AdjustViaKind::min_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(*min_p, keep_one));
    eout << "min_p: " << *min_p << "\n";
  }
  if (const auto* penalize_with = std::get_if<AdjustViaKind::penalize_with>(&adjust_via)) {
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_penalties(
            penalize_with->window_length,
            penalize_with->repetition,
            penalize_with->frequency,
            penalize_with->presence));
    eout << "penalties:"
      << "\n  window_length: " << penalize_with->window_length
      << "\n  repetition: " << penalize_with->repetition
      << "\n  frequency: " << penalize_with->frequency
      << "\n  presence: " << penalize_with->presence
      << "\n";
  }
  if (const auto* temperature = std::get_if<AdjustViaKind::temperature>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_temp(*temperature));
    eout << "temperature: " << *temperature << "\n";
  }
  if (const auto* top_k = std::get_if<AdjustViaKind::top_k>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(*top_k));
    eout << "top_k: " << *top_k << "\n";
  }
  if (const auto* top_p = std::get_if<AdjustViaKind::top_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(*top_p, keep_one));
    eout << "top_p: " << *top_p << "\n";
  }
  if (const auto* typical_p = std::get_if<AdjustViaKind::typical_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_typical(*typical_p, keep_one));
    eout << "typical_p: " << *typical_p << "\n";
  }
  if (const auto* xtc = std::get_if<AdjustViaKind::xtc>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_xtc(xtc->probability, xtc->threshold, keep_one, seed));
    eout << "xtc: "
      << "\n  probability: " << xtc->probability
      << "\n  threshold: " << xtc->threshold
      << "\n";
  }
}

// Create an adaptive-p sampler from `adaptive_p` (target, decay) and `seed`,
// then append it to the chain `smpl`.
static
  void
adaptive_p_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdaptiveP& adaptive_p,
    unsigned seed)
{
  struct llama_sampler* const picker = llama_sampler_init_adaptive_p(
      adaptive_p.target,
      adaptive_p.decay,
      seed);
  llama_sampler_chain_add(smpl, picker);
}

// Append a mirostat sampler to the chain `smpl`.
//
// Version 1 is created from the vocabulary size, `seed`, tau, eta, and a
// fixed m of 100. Version 2 is created from `seed`, tau, and eta.
// For any other version value, nothing is appended.
static
  void
mirostat_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::Mirostat& mirostat,
    unsigned seed,
    const rendezllama::Vocabulary& vocabulary)
{
  if (mirostat.version == 1) {
    const int mirostat_m = 100;
    struct llama_sampler* const picker = llama_sampler_init_mirostat(
        vocabulary.cardinality(), seed,
        mirostat.tau, mirostat.eta, mirostat_m);
    llama_sampler_chain_add(smpl, picker);
    return;
  }
  if (mirostat.version == 2) {
    struct llama_sampler* const picker = llama_sampler_init_mirostat_v2(
        seed, mirostat.tau, mirostat.eta);
    llama_sampler_chain_add(smpl, picker);
    return;
  }
}

  void
Inference::reinitialize(const ChatOptions& opt, const struct llama_model* model)
{
  fildesh::ofstream eout("/dev/stderr");

  const auto* sampling = std::get_if<rendezllama::inference::Sampling>(&opt.infer_via);
  assert(sampling);
  auto seed = sampling->seed;
  if (smpl_ || seed < 0) {
    // We're retrying or just don't have a fixed seed, so we should reseed.
    seed = new_sampling_seed();
  }
  if (smpl_) {
    llama_sampler_free(smpl_);
    eout.open("/dev/null");
  }
  token_count_ = 0;
  auto smpl_param = llama_sampler_chain_default_params();
  smpl_ = llama_sampler_chain_init(smpl_param);

  for (const auto& adjust_via : sampling->adjust_thru) {
    apply_sampler_chain(smpl_, adjust_via, model, seed, eout);
  }

  if (std::get_if<rendezllama::inference::Probability>(&sampling->pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_dist(seed));
  }
  else if (std::get_if<rendezllama::inference::Determinism>(&sampling->pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());
  }
  else if (const auto* adaptive_p = std::get_if<rendezllama::inference::AdaptiveP>(&sampling->pick_via)) {
    adaptive_p_sample(smpl_, *adaptive_p, seed);
  }
  else if (const auto* mirostat = std::get_if<rendezllama::inference::Mirostat>(&sampling->pick_via)) {
    mirostat_sample(smpl_, *mirostat, seed, vocabulary_);
    eout << "mirostat:"
      << "\n  version: " << mirostat->version
      << "\n";
  }
  else {
    fildesh_log_error("Missing pick method?");
  }
}

  // Decode every trajectory token that is not yet in the llama context.
  //
  // When the trajectory has tokens beyond `chat_traj.context_token_count_`,
  // the sampler chain is rebuilt via reinitialize(), the pending tokens are
  // decoded in batches of at most `opt.batch_count`, and then every
  // trajectory token from `token_count_` onward is passed to
  // llama_sampler_accept().
  //
  // Returns true when the context already matched the trajectory or when all
  // pending tokens were decoded. Returns false when llama_decode() reports a
  // failure; in that case `chat_traj.context_token_count_` is reset to 0.
  bool
Inference::commit_to_context(
    struct llama_context* ctx,
    ChatDisplay& chat_disp,
    ChatTrajectory& chat_traj,
    const ChatOptions& opt,
    const llama_model* model)
{
  // If tokens were erased since the last eval, there must be something left
  // to evaluate.
  assert(!chat_traj.erased_since_eval_ ||
         chat_traj.context_token_count_ < chat_traj.token_count());
  if (chat_traj.context_token_count_ < chat_traj.token_count()) {
    // New tokens are pending, so start over with a fresh sampler chain.
    // This also resets `token_count_` to zero.
    this->reinitialize(opt, model);
  }
  if (chat_traj.context_token_count_ == chat_traj.token_count()) {
    // Nothing to decode.
    return true;
  }

  chat_traj.maybe_rollforget_within_limit(opt.context_token_limit, vocabulary_);

  // Reset thread count just in case the user reconfigured it.
  const unsigned thread_count = opt.thread_count;
  unsigned batch_thread_count = opt.batch_thread_count;
  // Zero means unspecified: prefer the hardware's reported concurrency,
  // and fall back to `thread_count` if that is also reported as zero.
  if (batch_thread_count == 0) {
    batch_thread_count = std::thread::hardware_concurrency();
  }
  if (batch_thread_count == 0) {
    batch_thread_count = thread_count;
  }
  llama_set_n_threads(ctx, thread_count, batch_thread_count);

  // Clear KV cache past current position just in case the user deleted tokens.
  llama_memory_seq_rm(
      llama_get_memory(ctx),
      0, chat_traj.context_token_count_, -1);

  while (chat_traj.context_token_count_ < chat_traj.token_count()) {
    // Size of this batch: the configured batch size or whatever remains.
    const unsigned n = std::min(
        opt.batch_count,
        chat_traj.token_count() - chat_traj.context_token_count_);

#if LLAMA_OPENBLAS_ON
    // With OpenBLAS enabled, batches of 32 or more tokens use 1 batch thread.
    if (n < 32) {
      llama_set_n_threads(ctx, thread_count, batch_thread_count);
    }
    else {
      llama_set_n_threads(ctx, thread_count, 1);
    }
#endif
    // The display is updated up to the end of this batch before decoding it.
    chat_disp.show_new(chat_traj.context_token_count_ + n, chat_traj, vocabulary_);

    llama_batch batch = llama_batch_get_one(
        const_cast<int*>(&chat_traj.tokens()[chat_traj.context_token_count_]),
        n);
    const int istat = llama_decode(ctx, batch);
    if (istat != 0) {
      fildesh_log_error("Failed to eval.");
      // Mark the whole trajectory as not yet in the context.
      chat_traj.context_token_count_ = 0;
      return false;
    }
    else {
      chat_traj.context_token_count_ += n;
    }
  }
  assert(chat_traj.context_token_count_ == chat_traj.token_count());
  chat_traj.erased_since_eval_ = false;
  // Inform the sampler chain of every token it has not yet accepted.
  while (token_count_ < chat_traj.token_count()) {
    Vocabulary::Token_id token_id = chat_traj.token_at(token_count_);
    llama_sampler_accept(smpl_, token_id);
    token_count_ += 1;
  }
  return true;
}

  // Sample one token from the context's current logits with the sampler chain
  // `smpl_`, append it to `chat_traj`, and let the sampler chain accept it.
  //
  // When `preventing_newline` is true, the end-of-sequence and newline tokens
  // are excluded by overwriting their logits in the buffer returned by
  // llama_get_logits() before sampling.
  //
  // `token_count_` is incremented by one.
  void
Inference::sample_to_trajectory(
    ChatTrajectory& chat_traj,
    struct llama_context* ctx,
    bool preventing_newline)
{
  float* logits = llama_get_logits(ctx);
  if (preventing_newline) {
    // Zero probability for message-ending tokens when requested.
    // Logits are unnormalized log-probabilities, so zero probability is
    // negative infinity; a logit of 0 would still carry weight exp(0).
    const float never = -std::numeric_limits<float>::infinity();
    logits[vocabulary_.eos_token_id()] = never;
    logits[vocabulary_.newline_token_id()] = never;
  }

  std::vector<llama_token_data> candidates(vocabulary_.cardinality());
  const llama_token candidate_count = static_cast<llama_token>(candidates.size());
  for (llama_token i = 0; i < candidate_count; ++i) {
    candidates[i] = llama_token_data{
      i, logits[i], 0.0f,
    };
  }

  llama_token_data_array candidates_data = {
    candidates.data(),
    candidates.size(),
    /*selected=*/0,
    /*sorted=*/false,
  };
  llama_sampler_apply(smpl_, &candidates_data);
  // `selected` is an index relative to `candidates_data.data`, and a sampler
  // may move that pointer forward within the buffer (llama.cpp's XTC sampler
  // does), so read the result through `data` rather than the vector.
  chat_traj.push_back(candidates_data.data[candidates_data.selected].id);
  llama_sampler_accept(smpl_, chat_traj.token());
  token_count_ += 1;
}


1	#include "src/language/inference.hh"
2
3	#include <algorithm>
4	#include <cassert>
5	#include <cstring>
6	#include <thread>
7
8	#include <fildesh/fildesh.h>
9	#include <fildesh/ostream.hh>
10
11	#include "src/chat/display.hh"
12	#include "src/chat/guide.hh"
13	#include "src/chat/opt.hh"
14	#include "src/chat/trajectory.hh"
15	#include "src/language/vocabulary.hh"
16
17	using rendezllama::ChatDisplay;
18	using rendezllama::ChatGuide;
19	using rendezllama::ChatOptions;
20	using rendezllama::ChatTrajectory;
21	using rendezllama::Inference;
22	using rendezllama::Vocabulary;
23	using rendezllama::inference::AdjustViaKind;
24
25	Inference::Inference(const Vocabulary& vocabulary)	2✔
26	: vocabulary_(vocabulary)	2✔
27	{}	2✔
28	Inference::~Inference() {	2✔
29	if (smpl_) {llama_sampler_free(smpl_);}	2✔
30	}	2✔
31
32	const std::string&
33	rendezllama::antiprompt_suffix(	5✔
34	std::string_view text,
35	const std::set<std::string>& antiprompts)
36	{
37	static const std::string empty_string;	5✔
38	for (const std::string& s : antiprompts) {	11✔
39	if (text.size() >= s.size()) {	9✔
40	const size_t offset = text.size() - s.size();	6✔
41	if (0 == memcmp(&text[offset], &s[0], s.size())) {	6✔
42	return s;	3✔
43	}
44	}
45	}
46	return empty_string;	2✔
47	}
48
49	static bool maybe_trim_endspace(std::string& s)	×
50	{
51	bool result = false;	×
52	while (!s.empty() && s.back() == ' ') {	×
53	s.pop_back();	×
54	result = true;	×
55	}
56	return result;	×
57	}
58
59	void
60	rendezllama::augment_tokenize_chat_input(	×
61	ChatGuide& chat_guide,
62	ChatTrajectory& chat_traj,
63	bool& prevent_subsequent_newline,
64	std::string s,
65	const Vocabulary& vocabulary,
66	const ChatOptions& opt)
67	{
68	prevent_subsequent_newline = false;	×
69	if (s.size() >= 2 && s[0] == '\\' && s[1] == 'n') {	×
70	chat_guide.end_turn();	×
71	chat_guide.begin_turn(opt.message_opts.size()-1);	×
72	s.erase(0, 2);	×
73	prevent_subsequent_newline = maybe_trim_endspace(s);	×
74	if (opt.message_opts.back().prefix.back() == '\n' && opt.linespace_on) {	×
75	if (!s.empty() && s.front() != ' ') {	×
76	s.insert(0, " ");	×
77	}
78	}
79	chat_traj.tokenize_append(s, vocabulary);	×
80	}
81	else if (s.front() == '\n') {	×
82	// This is from /yield.
83	chat_guide.yield_turn(s.substr(1));	×
84	}
85	else if (s.front() == ' ') {	×
86	prevent_subsequent_newline = maybe_trim_endspace(s);	×
87	chat_traj.tokenize_append(s, vocabulary);	×
88	}
89	else {
90	chat_guide.yield_turn(0);	×
91	if (opt.message_opts[0].prefix.back() == '\n' && opt.linespace_on) {	×
92	if (!s.empty() && s.front() != ' ') {	×
93	s.insert(0, " ");	×
94	}
95	}
96	chat_traj.tokenize_append(s, vocabulary);	×
97	chat_guide.yield_turn();	×
98	chat_traj.display_token_count_ = chat_traj.rfind_message_prefix_begin_at(	×
99	chat_traj.token_count()-1);	×
100	prevent_subsequent_newline = true;	×
101	}
102	}	×
103
104	std::tuple<struct llama_model*, struct llama_context*>
105	rendezllama::make_llama_context(rendezllama::ChatOptions& opt)	2✔
106	{
107	llama_model_params model_params = llama_model_default_params();	2✔
108	model_params.use_mlock = opt.mlock_on;	2✔
109	model_params.use_mmap = opt.mmap_on;	2✔
110
111	struct llama_model* model = llama_model_load_from_file(	2✔
112	opt.model_filename.c_str(), model_params);	2✔
113	if (!model) {	2✔
114	fildesh_log_error("Failed to open model.");	×
115	return std::make_tuple(nullptr, nullptr);	×
116	}
117
118	if (opt.model_token_limit == 0) {	2✔
119	opt.model_token_limit = llama_model_n_ctx_train(model);	2✔
120	}
121	if (opt.context_token_limit == 0) {	2✔
122	opt.context_token_limit = opt.model_token_limit;	2✔
123	}
124
125	model_params = llama_model_default_params();	2✔
126	model_params.use_mlock = opt.mlock_on;	2✔
127	model_params.use_mmap = opt.mmap_on;	2✔
128
129	llama_context_params ctx_params = llama_context_default_params();	2✔
130	ctx_params.n_ctx = opt.context_token_limit;	2✔
131	ctx_params.n_threads = opt.thread_count;	2✔
132	ctx_params.n_batch = opt.batch_count;	2✔
133	ctx_params.rope_freq_scale = llama_model_rope_freq_scale_train(model);	2✔
134	assert(ctx_params.rope_freq_scale > 0.0);	2✔
135	while (
136	(unsigned)(opt.model_token_limit / ctx_params.rope_freq_scale)	2✔
137	<	2✔
138	opt.context_token_limit)	2✔
139	{
140	ctx_params.rope_freq_scale /= 2;	×
141	}
142
143	struct llama_context* ctx = llama_init_from_model(model, ctx_params);	2✔
144	if (!ctx) {	2✔
145	llama_model_free(model);	×
146	fildesh_log_error("Failed to create context.");	×
147	return std::make_tuple(nullptr, nullptr);	×
148	}
149	return std::make_tuple(model, ctx);	2✔
150	}
151
152	static
153	int
154	new_sampling_seed()	15✔
155	{
156	return static_cast<int>(INT_MAX & time(NULL));	15✔
157	}
158
159	static
160	void
161	apply_sampler_chain(	16✔
162	struct llama_sampler* smpl,
163	const rendezllama::inference::AdjustVia& adjust_via,
164	const struct llama_model* model,
165	unsigned seed,
166	std::ostream& eout)
167	{
168	const unsigned keep_one = 1;	16✔
169
170	if (const auto* dry = std::get_if<AdjustViaKind::dry>(&adjust_via)) {	16✔
171	static const char* seq_breakers[] = {	1✔
172	"\n", ":",
173	};
174	llama_sampler_init_dry(	1✔
175	llama_model_get_vocab(model),
176	llama_model_n_ctx_train(model),
177	dry->multiplier,	1✔
178	dry->base,	1✔
179	dry->allowed_length,	1✔
180	dry->window_length,	1✔
181	seq_breakers,
182	sizeof(seq_breakers)/sizeof(*seq_breakers));
183	eout << "dry:"	1✔
184	<< "\n multiplier: " << dry->multiplier	1✔
185	<< "\n base: " << dry->base	1✔
186	<< "\n allowed_length: " << dry->allowed_length	1✔
187	<< "\n window_length: " << dry->window_length	1✔
188	<< "\n";	1✔
189	}
190	if (const auto* min_p = std::get_if<AdjustViaKind::min_p>(&adjust_via)) {	16✔
191	llama_sampler_chain_add(smpl, llama_sampler_init_min_p(*min_p, keep_one));	5✔
192	eout << "min_p: " << *min_p << "\n";	5✔
193	}
194	if (const auto* penalize_with = std::get_if<AdjustViaKind::penalize_with>(&adjust_via)) {	16✔
195	llama_sampler_init_penalties(	1✔
196	penalize_with->window_length,	1✔
197	penalize_with->repetition,	1✔
198	penalize_with->frequency,	1✔
199	penalize_with->presence);	1✔
200	eout << "penalties:"	1✔
201	<< "\n window_length: " << penalize_with->window_length	1✔
202	<< "\n repetition: " << penalize_with->repetition	1✔
203	<< "\n frequency: " << penalize_with->frequency	1✔
204	<< "\n presence: " << penalize_with->presence	1✔
205	<< "\n";	1✔
206	}
207	if (const auto* temperature = std::get_if<AdjustViaKind::temperature>(&adjust_via)) {	16✔
208	llama_sampler_chain_add(smpl, llama_sampler_init_temp(*temperature));	5✔
209	eout << "temperature: " << *temperature << "\n";	5✔
210	}
211	if (const auto* top_k = std::get_if<AdjustViaKind::top_k>(&adjust_via)) {	16✔
212	llama_sampler_chain_add(smpl, llama_sampler_init_top_k(*top_k));	1✔
213	eout << "top_k: " << *top_k << "\n";	1✔
214	}
215	if (const auto* top_p = std::get_if<AdjustViaKind::top_p>(&adjust_via)) {	16✔
216	llama_sampler_chain_add(smpl, llama_sampler_init_top_p(*top_p, keep_one));	1✔
217	eout << "top_p: " << *top_p << "\n";	1✔
218	}
219	if (const auto* typical_p = std::get_if<AdjustViaKind::typical_p>(&adjust_via)) {	16✔
220	llama_sampler_chain_add(smpl, llama_sampler_init_typical(*typical_p, keep_one));	1✔
221	eout << "typical_p: " << *typical_p << "\n";	1✔
222	}
223	if (const auto* xtc = std::get_if<AdjustViaKind::xtc>(&adjust_via)) {	16✔
224	llama_sampler_chain_add(smpl, llama_sampler_init_xtc(xtc->probability, xtc->threshold, keep_one, seed));	1✔
225	eout << "xtc: "	1✔
226	<< "\n probability: " << xtc->probability	1✔
227	<< "\n threshold: " << xtc->threshold	1✔
228	<< "\n";	1✔
229	}
230	}	16✔
231
232	static
233	void
234	adaptive_p_sample(	1✔
235	struct llama_sampler* smpl,
236	const rendezllama::inference::AdaptiveP& adaptive_p,
237	unsigned seed)
238	{
239	llama_sampler_chain_add(	1✔
240	smpl,
241	llama_sampler_init_adaptive_p(
242	adaptive_p.target,	1✔
243	adaptive_p.decay,	1✔
244	seed));
245	}	1✔
246
247	static
248	void
249	mirostat_sample(	1✔
250	struct llama_sampler* smpl,
251	const rendezllama::inference::Mirostat& mirostat,
252	unsigned seed,
253	const rendezllama::Vocabulary& vocabulary)
254	{
255	if (mirostat.version == 1) {	1✔
256	const int mirostat_m = 100;	×
257	llama_sampler_chain_add(	×
258	smpl,
259	llama_sampler_init_mirostat(
260	vocabulary.cardinality(), seed,	×
261	mirostat.tau, mirostat.eta, mirostat_m));	×
262	}
263	else if (mirostat.version == 2) {	1✔
264	llama_sampler_chain_add(	1✔
265	smpl,
266	llama_sampler_init_mirostat_v2(
267	seed, mirostat.tau, mirostat.eta));	1✔
268	}
269	}	1✔
270
271	void
272	Inference::reinitialize(const ChatOptions& opt, const struct llama_model* model)	15✔
273	{
274	fildesh::ofstream eout("/dev/stderr");	15✔
275
276	const auto* sampling = std::get_if<rendezllama::inference::Sampling>(&opt.infer_via);	15✔
277	assert(sampling);	×
278	auto seed = sampling->seed;	15✔
279	if (smpl_ || seed < 0) {	15✔
280	// We're retrying or just don't have a fixed seed, so we should reseed.
281	seed = new_sampling_seed();	15✔
282	}
283	if (smpl_) {	15✔
284	llama_sampler_free(smpl_);	13✔
285	eout.open("/dev/null");	13✔
286	}
287	token_count_ = 0;	15✔
288	auto smpl_param = llama_sampler_chain_default_params();	15✔
289	smpl_ = llama_sampler_chain_init(smpl_param);	15✔
290
291	for (const auto& adjust_via : sampling->adjust_thru) {	31✔
292	apply_sampler_chain(smpl_, adjust_via, model, seed, eout);	16✔
293	}
294
295	if (std::get_if<rendezllama::inference::Probability>(&sampling->pick_via)) {	15✔
296	llama_sampler_chain_add(smpl_, llama_sampler_init_dist(seed));	×
297	}
298	else if (std::get_if<rendezllama::inference::Determinism>(&sampling->pick_via)) {	15✔
299	llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());	1✔
300	}
301	else if (const auto* adaptive_p = std::get_if<rendezllama::inference::AdaptiveP>(&sampling->pick_via)) {	14✔
302	adaptive_p_sample(smpl_, *adaptive_p, seed);	1✔
303	}
304	else if (const auto* mirostat = std::get_if<rendezllama::inference::Mirostat>(&sampling->pick_via)) {	13✔
305	mirostat_sample(smpl_, *mirostat, seed, vocabulary_);	1✔
306	eout << "mirostat:"	1✔
307	<< "\n version: " << mirostat->version	1✔
308	<< "\n";	1✔
309	}
310	else {
311	fildesh_log_error("Missing pick method?");	12✔
312	}
313	}	15✔
314
315	bool
316	Inference::commit_to_context(	15✔
317	struct llama_context* ctx,
318	ChatDisplay& chat_disp,
319	ChatTrajectory& chat_traj,
320	const ChatOptions& opt,
321	const llama_model* model)
322	{
323	assert(!chat_traj.erased_since_eval_ ||	15✔
324	chat_traj.context_token_count_ < chat_traj.token_count());
325	if (chat_traj.context_token_count_ < chat_traj.token_count()) {	15✔
326	this->reinitialize(opt, model);	15✔
327	}
328	if (chat_traj.context_token_count_ == chat_traj.token_count()) {	15✔
329	return true;
330	}
331
332	chat_traj.maybe_rollforget_within_limit(opt.context_token_limit, vocabulary_);	15✔
333
334	// Reset thread count just in case the user reconfigured it.
335	const unsigned thread_count = opt.thread_count;	15✔
336	unsigned batch_thread_count = opt.batch_thread_count;	15✔
337	if (batch_thread_count == 0) {	15✔
338	batch_thread_count = std::thread::hardware_concurrency();	15✔
339	}
340	if (batch_thread_count == 0) {	15✔
341	batch_thread_count = thread_count;	×
342	}
343	llama_set_n_threads(ctx, thread_count, batch_thread_count);	15✔
344
345	// Clear KV cache past current position just in case the user deleted tokens.
346	llama_memory_seq_rm(	30✔
347	llama_get_memory(ctx),
348	0, chat_traj.context_token_count_, -1);	15✔
349
350	while (chat_traj.context_token_count_ < chat_traj.token_count()) {	45✔
351	const unsigned n = std::min(	15✔
352	opt.batch_count,	15✔
353	chat_traj.token_count() - chat_traj.context_token_count_);	15✔
354
355	#if LLAMA_OPENBLAS_ON
356	if (n < 32) {
357	llama_set_n_threads(ctx, thread_count, batch_thread_count);
358	}
359	else {
360	llama_set_n_threads(ctx, thread_count, 1);
361	}
362	#endif
363	chat_disp.show_new(chat_traj.context_token_count_ + n, chat_traj, vocabulary_);	15✔
364
365	llama_batch batch = llama_batch_get_one(	15✔
366	const_cast<int*>(&chat_traj.tokens()[chat_traj.context_token_count_]),	15✔
367	n);
368	const int istat = llama_decode(ctx, batch);	15✔
369	if (istat != 0) {	15✔
370	fildesh_log_error("Failed to eval.");	×
371	chat_traj.context_token_count_ = 0;	×
372	return false;	×
373	}
374	else {
375	chat_traj.context_token_count_ += n;	15✔
376	}
377	}
378	assert(chat_traj.context_token_count_ == chat_traj.token_count());	15✔
379	chat_traj.erased_since_eval_ = false;	15✔
380	while (token_count_ < chat_traj.token_count()) {	211✔
381	Vocabulary::Token_id token_id = chat_traj.token_at(token_count_);	196✔
382	llama_sampler_accept(smpl_, token_id);	196✔
383	token_count_ += 1;	196✔
384	}
385	return true;
386	}
387
388	void
389	Inference::sample_to_trajectory(	15✔
390	ChatTrajectory& chat_traj,
391	struct llama_context* ctx,
392	bool preventing_newline)
393	{
394	float* logits = llama_get_logits(ctx);	15✔
395	if (preventing_newline) {	15✔
396	// Zero probability for message-ending tokens when requested.
397	logits[vocabulary_.eos_token_id()] = 0;	×
398	logits[vocabulary_.newline_token_id()] = 0;	×
399	}
400
401	std::vector<llama_token_data> candidates;	15✔
402	candidates.resize(vocabulary_.cardinality());	15✔
403	for (llama_token i = 0; i < (llama_token)candidates.size(); ++i) {	153,615✔
404	candidates[i] = llama_token_data{	153,600✔
405	i, logits[i], 0.0f,	153,600✔
406	};
407	}
408	logits = NULL;	15✔
409	llama_token_data_array candidates_data[1] = {{	15✔
410	candidates.data(),	15✔
411	candidates.size(),	15✔
412	/*selected=*/0,
413	/*sorted=*/false,
414	}};	15✔
415	llama_sampler_apply(smpl_, candidates_data);	15✔
416	chat_traj.push_back(candidates[candidates_data->selected].id);	15✔
417	llama_sampler_accept(smpl_, chat_traj.token());	15✔
418	token_count_ += 1;	15✔
419	}	15✔
420

rendezqueue / rendezllama / 21157153067

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous