rendezqueue / rendezllama / build 21199465653

21 Jan 2026 05:07AM UTC coverage: 88.225% (-2.1%) from 90.342%
Triggered by a push from grencez (github):
"Update localserv to support chat interface and CLI args"

2098 of 2378 relevant lines covered (88.23%)
21310.64 hits per line

Source File

/src/language/inference.cc (57.02% of lines covered)
#include "src/language/inference.hh"

#include <algorithm>
#include <cassert>
#include <cstring>
#include <thread>

#include <fildesh/fildesh.h>
#include <fildesh/ostream.hh>

#include "src/chat/display.hh"
#include "src/chat/guide.hh"
#include "src/chat/opt.hh"
#include "src/chat/trajectory.hh"
#include "src/language/vocabulary.hh"

using rendezllama::ChatDisplay;
using rendezllama::ChatGuide;
using rendezllama::ChatOptions;
using rendezllama::ChatTrajectory;
using rendezllama::Inference;
using rendezllama::Vocabulary;
using rendezllama::inference::AdjustViaKind;

Inference::Inference(const Vocabulary& vocabulary)
  : vocabulary_(vocabulary)
{}
Inference::~Inference() {
  if (smpl_) {llama_sampler_free(smpl_);}
  llama_batch_free(batch_);
}

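// Returns the antiprompt that `text` ends with, or a reference to a static
// empty string when no antiprompt matches the end of `text`.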
  const std::string&
rendezllama::antiprompt_suffix(
    std::string_view text,
    const std::set<std::string>& antiprompts)
{
  static const std::string empty_string;
  for (const std::string& s : antiprompts) {
    if (text.size() >= s.size()) {
      const size_t offset = text.size() - s.size();
      if (0 == memcmp(&text[offset], &s[0], s.size())) {
        return s;
      }
    }
  }
  return empty_string;
}

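// Trims trailing spaces from `s` in place and reports whether any were removed.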
static bool maybe_trim_endspace(std::string& s)
{
  bool result = false;
  while (!s.empty() && s.back() == ' ') {
    s.pop_back();
    result = true;
  }
  return result;
}

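// Routes freshly-read chat input into the trajectory. A leading "\n" escape
// closes the current turn and opens one for the last configured role, a
// leading newline (from /yield) yields the turn to the named role, a leading
// space continues the current line, and anything else is appended as a full
// message for role 0 before yielding the turn onward.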
  void
rendezllama::augment_tokenize_chat_input(
    ChatGuide& chat_guide,
    ChatTrajectory& chat_traj,
    bool& prevent_subsequent_newline,
    std::string s,
    const Vocabulary& vocabulary,
    const ChatOptions& opt)
{
  prevent_subsequent_newline = false;
  if (s.size() >= 2 && s[0] == '\\' && s[1] == 'n') {
    chat_guide.end_turn();
    chat_guide.begin_turn(opt.message_opts.size()-1);
    s.erase(0, 2);
    prevent_subsequent_newline = maybe_trim_endspace(s);
    if (opt.message_opts.back().prefix.back() == '\n' && opt.linespace_on) {
      if (!s.empty() && s.front() != ' ') {
        s.insert(0, " ");
      }
    }
    chat_traj.tokenize_append(s, vocabulary);
  }
  else if (s.front() == '\n') {
    // This is from /yield.
    chat_guide.yield_turn(s.substr(1));
  }
  else if (s.front() == ' ') {
    prevent_subsequent_newline = maybe_trim_endspace(s);
    chat_traj.tokenize_append(s, vocabulary);
  }
  else {
    chat_guide.yield_turn(0);
    if (opt.message_opts[0].prefix.back() == '\n' && opt.linespace_on) {
      if (!s.empty() && s.front() != ' ') {
        s.insert(0, " ");
      }
    }
    chat_traj.tokenize_append(s, vocabulary);
    chat_guide.yield_turn();
    chat_traj.display_token_count_ = chat_traj.rfind_message_prefix_begin_at(
        chat_traj.token_count()-1);
    prevent_subsequent_newline = true;
  }
}

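// Loads the model named in `opt` and builds a llama context sized to the
// requested token limits, halving rope_freq_scale until the scaled training
// context covers opt.context_token_limit. Returns {nullptr, nullptr} on failure.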
  std::tuple<struct llama_model*, struct llama_context*>
rendezllama::make_llama_context(rendezllama::ChatOptions& opt)
{
  llama_model_params model_params = llama_model_default_params();
  model_params.use_mlock = opt.mlock_on;
  model_params.use_mmap = opt.mmap_on;

  struct llama_model* model = llama_model_load_from_file(
      opt.model_filename.c_str(), model_params);
  if (!model) {
    fildesh_log_error("Failed to open model.");
    return std::make_tuple(nullptr, nullptr);
  }

  if (opt.model_token_limit == 0) {
    opt.model_token_limit = llama_model_n_ctx_train(model);
  }
  if (opt.context_token_limit == 0) {
    opt.context_token_limit = opt.model_token_limit;
  }

  model_params = llama_model_default_params();
  model_params.use_mlock = opt.mlock_on;
  model_params.use_mmap = opt.mmap_on;

  llama_context_params ctx_params = llama_context_default_params();
  ctx_params.n_ctx = opt.context_token_limit;
  ctx_params.n_threads = opt.thread_count;
  ctx_params.n_batch = opt.batch_count;
  ctx_params.rope_freq_scale = llama_model_rope_freq_scale_train(model);
  assert(ctx_params.rope_freq_scale > 0.0);
  while (
      (unsigned)(opt.model_token_limit / ctx_params.rope_freq_scale)
      <
      opt.context_token_limit)
  {
    ctx_params.rope_freq_scale /= 2;
  }

  struct llama_context* ctx = llama_init_from_model(model, ctx_params);
  if (!ctx) {
    llama_model_free(model);
    fildesh_log_error("Failed to create context.");
    return std::make_tuple(nullptr, nullptr);
  }
  return std::make_tuple(model, ctx);
}

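// Derives a non-negative sampling seed from the current time.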
static
  int
new_sampling_seed()
{
  return static_cast<int>(INT_MAX & time(NULL));
}

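// Adds the sampler that corresponds to the active AdjustVia alternative
// (dry, min_p, penalties, temperature, top_k, top_p, typical_p, or xtc)
// to the chain and logs its parameters to `eout`.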
static
  void
apply_sampler_chain(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdjustVia& adjust_via,
    const struct llama_model* model,
    unsigned seed,
    std::ostream& eout)
{
  const unsigned keep_one = 1;

  if (const auto* dry = std::get_if<AdjustViaKind::dry>(&adjust_via)) {
    static const char* seq_breakers[] = {
      "\n", ":",
    };
    llama_sampler_chain_add(smpl, llama_sampler_init_dry(
        llama_model_get_vocab(model),
        llama_model_n_ctx_train(model),
        dry->multiplier,
        dry->base,
        dry->allowed_length,
        dry->window_length,
        seq_breakers,
        sizeof(seq_breakers)/sizeof(*seq_breakers)));
    eout << "dry:"
      << "\n  multiplier: " << dry->multiplier
      << "\n  base: " << dry->base
      << "\n  allowed_length: " << dry->allowed_length
      << "\n  window_length: " << dry->window_length
      << "\n";
  }
  if (const auto* min_p = std::get_if<AdjustViaKind::min_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(*min_p, keep_one));
    eout << "min_p: " << *min_p << "\n";
  }
  if (const auto* penalize_with = std::get_if<AdjustViaKind::penalize_with>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_penalties(
        penalize_with->window_length,
        penalize_with->repetition,
        penalize_with->frequency,
        penalize_with->presence));
    eout << "penalties:"
      << "\n  window_length: " << penalize_with->window_length
      << "\n  repetition: " << penalize_with->repetition
      << "\n  frequency: " << penalize_with->frequency
      << "\n  presence: " << penalize_with->presence
      << "\n";
  }
  if (const auto* temperature = std::get_if<AdjustViaKind::temperature>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_temp(*temperature));
    eout << "temperature: " << *temperature << "\n";
  }
  if (const auto* top_k = std::get_if<AdjustViaKind::top_k>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(*top_k));
    eout << "top_k: " << *top_k << "\n";
  }
  if (const auto* top_p = std::get_if<AdjustViaKind::top_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(*top_p, keep_one));
    eout << "top_p: " << *top_p << "\n";
  }
  if (const auto* typical_p = std::get_if<AdjustViaKind::typical_p>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_typical(*typical_p, keep_one));
    eout << "typical_p: " << *typical_p << "\n";
  }
  if (const auto* xtc = std::get_if<AdjustViaKind::xtc>(&adjust_via)) {
    llama_sampler_chain_add(smpl, llama_sampler_init_xtc(xtc->probability, xtc->threshold, keep_one, seed));
    eout << "xtc: "
      << "\n  probability: " << xtc->probability
      << "\n  threshold: " << xtc->threshold
      << "\n";
  }
}

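// Appends the adaptive-p picker to the sampler chain with the configured
// target and decay, using the given seed.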
static
  void
adaptive_p_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::AdaptiveP& adaptive_p,
    unsigned seed)
{
  llama_sampler_chain_add(
      smpl,
      llama_sampler_init_adaptive_p(
          adaptive_p.target,
          adaptive_p.decay,
          seed));
}

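// Appends a Mirostat v1 or v2 sampler to the chain according to the
// configured version; any other version leaves the chain unchanged.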
static
  void
mirostat_sample(
    struct llama_sampler* smpl,
    const rendezllama::inference::Mirostat& mirostat,
    unsigned seed,
    const rendezllama::Vocabulary& vocabulary)
{
  if (mirostat.version == 1) {
    const int mirostat_m = 100;
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_mirostat(
            vocabulary.cardinality(), seed,
            mirostat.tau, mirostat.eta, mirostat_m));
  }
  else if (mirostat.version == 2) {
    llama_sampler_chain_add(
        smpl,
        llama_sampler_init_mirostat_v2(
            seed, mirostat.tau, mirostat.eta));
  }
}

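// Rebuilds the sampler chain from opt.infer_via. Reseeds when retrying or
// when no fixed seed was configured, logs the sampler settings to stderr only
// on the first build, and finishes the chain with the configured picker
// (distribution, greedy, adaptive-p, or Mirostat), defaulting to greedy.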
  void
Inference::reinitialize(const ChatOptions& opt, const struct llama_model* model)
{
  fildesh::ofstream eout("/dev/stderr");

  const auto* sampling = std::get_if<rendezllama::inference::Sampling>(&opt.infer_via);
  assert(sampling);
  auto seed = sampling->seed;
  if (smpl_ || seed < 0) {
    // We're retrying or just don't have a fixed seed, so we should reseed.
    seed = new_sampling_seed();
  }
  if (smpl_) {
    llama_sampler_free(smpl_);
    eout.open("/dev/null");
  }
  token_count_ = 0;
  auto smpl_param = llama_sampler_chain_default_params();
  smpl_ = llama_sampler_chain_init(smpl_param);

  for (const auto& adjust_via : sampling->adjust_thru) {
    apply_sampler_chain(smpl_, adjust_via, model, seed, eout);
  }

  if (std::get_if<rendezllama::inference::Probability>(&sampling->pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_dist(seed));
  }
  else if (std::get_if<rendezllama::inference::Determinism>(&sampling->pick_via)) {
    llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());
  }
  else if (const auto* adaptive_p = std::get_if<rendezllama::inference::AdaptiveP>(&sampling->pick_via)) {
    adaptive_p_sample(smpl_, *adaptive_p, seed);
  }
  else if (const auto* mirostat = std::get_if<rendezllama::inference::Mirostat>(&sampling->pick_via)) {
    mirostat_sample(smpl_, *mirostat, seed, vocabulary_);
  }
  else {
    fildesh_log_error("Missing pick method? Using greedy.");
    llama_sampler_chain_add(smpl_, llama_sampler_init_greedy());
  }
}

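// Feeds any trajectory tokens that the llama context has not yet seen:
// reinitializes sampling if tokens were erased, rolls the context forward
// within the token limit, drops stale KV-cache entries, then decodes the
// pending tokens in batches and accepts them into the sampler.
// Returns false if decoding fails.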
  bool
Inference::commit_to_context(
    struct llama_context* ctx,
    ChatDisplay& chat_disp,
    ChatTrajectory& chat_traj,
    const ChatOptions& opt,
    const llama_model* model)
{
  assert(!chat_traj.erased_since_eval_ ||
         chat_traj.context_token_count_ < chat_traj.token_count());
  if (chat_traj.erased_since_eval_ || !smpl_) {
    this->reinitialize(opt, model);
  }
  if (chat_traj.context_token_count_ == chat_traj.token_count()) {
    return true;
  }

  chat_traj.maybe_rollforget_within_limit(opt.context_token_limit, vocabulary_);

  // Reset thread count just in case the user reconfigured it.
  const unsigned thread_count = opt.thread_count;
  unsigned batch_thread_count = opt.batch_thread_count;
  if (batch_thread_count == 0) {
    batch_thread_count = std::thread::hardware_concurrency();
  }
  if (batch_thread_count == 0) {
    batch_thread_count = thread_count;
  }
  llama_set_n_threads(ctx, thread_count, batch_thread_count);

  // Clear KV cache past current position just in case the user deleted tokens.
  llama_memory_seq_rm(
      llama_get_memory(ctx),
      0, chat_traj.context_token_count_, -1);

  while (chat_traj.context_token_count_ < chat_traj.token_count()) {
    const unsigned n = std::min(
        opt.batch_count,
        chat_traj.token_count() - chat_traj.context_token_count_);

#if LLAMA_OPENBLAS_ON
    if (n < 32) {
      llama_set_n_threads(ctx, thread_count, batch_thread_count);
    }
    else {
      llama_set_n_threads(ctx, thread_count, 1);
    }
#endif
    chat_disp.show_new(chat_traj.context_token_count_ + n, chat_traj, vocabulary_);

    if (!batch_.token || (unsigned)batch_.n_tokens < n) {
      llama_batch_free(batch_);
      unsigned n_alloc = n;
      if (n_alloc < opt.batch_count) {n_alloc = opt.batch_count;}
      batch_ = llama_batch_init(n_alloc, /*embd=*/0, /*n_seq_max=*/1);
    }
    batch_.n_tokens = n;
    for (unsigned i = 0; i < n; ++i) {
      batch_.token[i] = chat_traj.tokens()[chat_traj.context_token_count_ + i];
      batch_.pos[i] = chat_traj.context_token_count_ + i;
      batch_.n_seq_id[i] = 1;
      batch_.seq_id[i][0] = 0;
      batch_.logits[i] = (i == n - 1);
    }

    const int istat = llama_decode(ctx, batch_);

    if (istat != 0) {
      fildesh_log_error("Failed to eval.");
      chat_traj.context_token_count_ = 0;
      return false;
    }
    else {
      chat_traj.context_token_count_ += n;
    }
  }
  assert(chat_traj.context_token_count_ == chat_traj.token_count());
  chat_traj.erased_since_eval_ = false;
  while (token_count_ < chat_traj.token_count()) {
    Vocabulary::Token_id token_id = chat_traj.token_at(token_count_);
    llama_sampler_accept(smpl_, token_id);
    token_count_ += 1;
  }
  return true;
}

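// Samples one token from the current logits and appends it to the trajectory.
// When `preventing_newline` is set, the EOS and newline logits are zeroed
// before the candidate list is built.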
  void
Inference::sample_to_trajectory(
    ChatTrajectory& chat_traj,
    struct llama_context* ctx,
    bool preventing_newline)
{
  float* logits = llama_get_logits(ctx);
  if (preventing_newline) {
    // Zero probability for message-ending tokens when requested.
    logits[vocabulary_.eos_token_id()] = 0;
    logits[vocabulary_.newline_token_id()] = 0;
  }

  std::vector<llama_token_data> candidates;
  candidates.resize(vocabulary_.cardinality());
  for (llama_token i = 0; i < (llama_token)candidates.size(); ++i) {
    candidates[i] = llama_token_data{
      i, logits[i], 0.0f,
    };
  }
  logits = NULL;
  llama_token_data_array candidates_data[1] = {{
    candidates.data(),
    candidates.size(),
    /*selected=*/0,
    /*sorted=*/false,
  }};
  llama_sampler_apply(smpl_, candidates_data);
  chat_traj.push_back(candidates[candidates_data->selected].id);
  llama_sampler_accept(smpl_, chat_traj.token());
  token_count_ += 1;
}
