• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cofacts / rumors-api / 6621739611

24 Oct 2023 02:51AM UTC coverage: 88.365% (-0.06%) from 88.423%
6621739611

push

github

web-flow
Merge pull request #323 from cofacts/stt

feat(graphql): remove apparent hallucination

757 of 915 branches covered (0.0%)

Branch coverage included in aggregate %.

8 of 8 new or added lines in 1 file covered. (100.0%)

1529 of 1672 relevant lines covered (91.45%)

22.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.52
/src/graphql/util.js
1
import { ImageAnnotatorClient } from '@google-cloud/vision';
2
import {
3
  GraphQLInputObjectType,
4
  GraphQLObjectType,
5
  GraphQLString,
6
  GraphQLInt,
7
  GraphQLList,
8
  GraphQLEnumType,
9
  GraphQLFloat,
10
  GraphQLNonNull,
11
  GraphQLID,
12
  GraphQLBoolean,
13
} from 'graphql';
14
import fetch from 'node-fetch';
15
import ffmpeg from 'fluent-ffmpeg';
16

17
import Connection from './interfaces/Connection';
18
import Edge from './interfaces/Edge';
19
import PageInfo from './interfaces/PageInfo';
20
import Highlights from './models/Highlights';
21
import client from 'util/client';
22
import delayForMs from 'util/delayForMs';
23
import openai from 'util/openai';
24

25
// https://www.graph.cool/docs/tutorials/designing-powerful-apis-with-graphql-query-parameters-aing7uech3
26
//
27
// Filtering args definition & parsing
28
//
29

30
/**
 * Builds a GraphQL input object type exposing the comparison operators
 * LT, LTE, GT, GTE and EQ, each accepting a value of `argType`.
 *
 * @param {string} typeName - name of the generated input type
 * @param {GraphQLScalarType} argType - type shared by every operator field
 * @param {string} description - description attached to the generated type
 * @returns {GraphQLInputObjectType}
 */
function getArithmeticExpressionType(typeName, argType, description) {
  const operators = ['LT', 'LTE', 'GT', 'GTE', 'EQ'];

  // Every operator gets its own `{ type }` config object.
  const fields = Object.fromEntries(
    operators.map(operator => [operator, { type: argType }])
  );

  return new GraphQLInputObjectType({ name: typeName, description, fields });
}
49

50
/**
 * Arithmetic-expression input whose operands are strings; used for
 * time-valued fields.
 */
export const timeRangeInput = getArithmeticExpressionType(
  'TimeRangeInput',
  GraphQLString,
  [
    'List only the entries that were created between the specific time range.',
    'The time range value is in elasticsearch date format (https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html)',
  ].join(' ')
);

/**
 * Arithmetic-expression input whose operands are integers.
 */
export const intRangeInput = getArithmeticExpressionType(
  'RangeInput',
  GraphQLInt,
  'List only the entries whose field match the criteria.'
);
61

62
/**
 * Converts an arithmetic filter object into the parameter object of an
 * Elasticsearch range query.
 *
 * @param {object} arithmeticFilterObj - {LT, LTE, GT, GTE, EQ}, the structure returned by getArithmeticExpressionType
 * @returns {object} Elasticsearch range filter param
 * @throws {Error} when no operator is given at all
 * @see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html#range-query-field-params
 */
export function getRangeFieldParamFromArithmeticExpression(
  arithmeticFilterObj
) {
  const { EQ } = arithmeticFilterObj;

  // Equality is expressed as the closed range [EQ, EQ]; when EQ is given it
  // overrides all other operators.
  if (EQ !== undefined) return { gte: EQ, lte: EQ };

  const operators = Object.keys(arithmeticFilterObj);
  if (operators.length === 0) throw new Error('Invalid Expression!');

  // The range query uses the lower-cased operator names as keys.
  return Object.fromEntries(
    operators.map(operator => [
      operator.toLowerCase(),
      arithmeticFilterObj[operator],
    ])
  );
}
86

87
/**
 * GraphQL input type carrying the arguments of an Elasticsearch
 * more_like_this query: the text to compare against and the
 * "minimum_should_match" setting.
 */
export const moreLikeThisInput = new GraphQLInputObjectType({
  name: 'MoreLikeThisInput',
  description: [
    'Parameters for Elasticsearch more_like_this query.',
    'See: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-mlt-query.html',
  ].join('\n'),
  fields: {
    like: {
      type: GraphQLString,
      description: 'The text string to search for.',
    },
    minimumShouldMatch: {
      type: GraphQLString,
      description: [
        `more_like_this query's "minimum_should_match" query param.`,
        'See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-minimum-should-match.html for possible values.',
      ].join('\n'),
    },
  },
});
105

106
/**
 * Creates the GraphQL input object type used for a `filter` argument.
 *
 * @param {string} typeName - name of the generated input type
 * @param {object} args - field config map; copied into the type's fields
 * @returns {GraphQLInputObjectType}
 */
export function createFilterType(typeName, args) {
  // Fields are handed over as a function returning a fresh copy of `args`.
  // TODO: converting nested AND / OR to elasticsearch, i.e. adding `AND` and
  // `OR` fields typed as a list of this very filter type.
  const resolveFields = () => ({ ...args });

  return new GraphQLInputObjectType({ name: typeName, fields: resolveFields });
}
118

119
//
120
// Sort args definition & parsing
121
//
122

123
/**
 * GraphQL enum for a sort direction. The enum names ASC / DESC carry the
 * lower-case values `asc` / `desc`.
 */
const SortOrderEnum = new GraphQLEnumType({
  name: 'SortOrderEnum',
  values: {
    ASC: { value: 'asc' },
    DESC: { value: 'desc' },
  },
});
130

131
/**
 * Creates the type of an `orderBy` argument: a list of input objects in which
 * every sortable field is an optional SortOrderEnum.
 *
 * @param {string} typeName - name of the generated input object type
 * @param {Array<string|{name: string, description: string}>} filterableFields
 *   sortable fields, given either as a bare field name or as a name with a
 *   description
 * @returns {GraphQLList<GraphQLInputObjectType>} sort input type for a field input argument.
 */
export function createSortType(typeName, filterableFields = []) {
  // Build the field map in a single pass. Spreading the accumulator inside a
  // reduce() re-copies every earlier field on each step.
  const fields = Object.fromEntries(
    filterableFields.map(field => {
      const isBareName = typeof field === 'string';

      return [
        isBareName ? field : field.name,
        {
          type: SortOrderEnum,
          description: isBareName ? undefined : field.description,
        },
      ];
    })
  );

  return new GraphQLList(
    new GraphQLInputObjectType({
      name: typeName,
      description:
        'An entry of orderBy argument. Specifies field name and the sort order. Only one field name is allowd per entry.',
      fields,
    })
  );
}
155

156
/**
 * GraphQL arguments for cursor-based paging: the page size (`first`,
 * defaulting to 10) and the mutually exclusive `after` / `before` cursors.
 */
export const pagingArgs = {
  first: {
    type: GraphQLInt,
    description: 'Returns only first <first> results',
    defaultValue: 10,
  },

  after: {
    type: GraphQLString,
    description:
      'Specify a cursor, returns results after this cursor. cannot be used with "before".',
  },

  before: {
    type: GraphQLString,
    description:
      'Specify a cursor, returns results before this cursor. cannot be used with "after".',
  },
};
173

174
/**
 * Translates an `orderBy` argument into the `sort` array of an Elasticsearch
 * query body, always ending with a descending `_id` entry.
 *
 * @param {object[]} orderBy - sort input object type
 * @param {{[string]: (order: object) => object}} fieldFnMap - Defines one elasticsearch sort argument entry for a field
 * @returns {Array<{[string]: {order: string}}>} Elasticsearch sort argument in query body
 */
export function getSortArgs(orderBy, fieldFnMap = {}) {
  const sortEntries = orderBy.map(entry => {
    // Only the first key of each entry is used as the field to sort by.
    const [fieldName] = Object.keys(entry);
    const direction = entry[fieldName];
    const toSortEntry =
      fieldFnMap[fieldName] || (o => ({ [fieldName]: { order: o } }));

    return toSortEntry(direction);
  });

  // enforce at least 1 sort order for pagination
  return [...sortEntries, { _id: { order: 'desc' } }];
}
190

191
/**
 * Flips the direction of every entry of an Elasticsearch sort argument.
 * Any other setting of an entry is kept as is.
 *
 * @param {Array<{[string]: {order: string}}>} [sort] - e.g.
 *   [{fieldName: {order: 'desc'}}, {fieldName2: {order: 'desc'}}, ...]
 * @returns {Array<{[string]: {order: string}}>|undefined} reversed sort
 *   argument, or undefined when no sort is given
 */
function reverseSortArgs(sort) {
  if (!sort) return undefined;

  return sort.map(entry => {
    const [fieldName] = Object.keys(entry);
    const fieldConfig = entry[fieldName];

    // Anything that is not 'desc' becomes 'desc'.
    const flippedOrder = fieldConfig.order === 'desc' ? 'asc' : 'desc';

    return { [fieldName]: { ...fieldConfig, order: flippedOrder } };
  });
}
207

208
/**
 * Encodes a cursor value as a base64 string of its JSON representation.
 * Exported for custom resolveEdges() and resolveLastCursor().
 *
 * @param {*} cursor - JSON-serializable cursor value
 * @returns {string} base64-encoded cursor
 */
export function getCursor(cursor) {
  const serialized = JSON.stringify(cursor);

  return Buffer.from(serialized).toString('base64');
}
213

214
/**
 * Decodes a base64 cursor back into the JSON value it was created from.
 *
 * @param {string} [cursor] - base64-encoded cursor
 * @returns {*} the decoded value, or undefined when no cursor is given
 */
export function getSearchAfterFromCursor(cursor) {
  if (cursor) {
    const json = Buffer.from(cursor, 'base64').toString('utf8');
    return JSON.parse(json);
  }

  return undefined;
}
218

219
/**
 * Default `totalCount` resolver: counts the documents matching the search
 * context's query, ignoring the paging arguments.
 *
 * @param {object} searchContext - search params plus first / before / after
 * @returns {Promise<number>} `body.count` of the count response
 */
async function defaultResolveTotalCount({
  first, // eslint-disable-line no-unused-vars
  before, // eslint-disable-line no-unused-vars
  after, // eslint-disable-line no-unused-vars
  ...searchContext
}) {
  try {
    const countParams = {
      ...searchContext,
      // count API only supports "query"
      body: { query: searchContext.body.query },
    };
    const response = await client.count(countParams);

    return response.body.count;
  } catch (e) /* istanbul ignore next */ {
    console.error('[defaultResolveTotalCount]', JSON.stringify(e));
    throw e;
  }
}
238

239
/**
 * Default `edges` resolver: loads one page of search results through
 * `loaders.searchResultLoader` and wraps every hit into an edge.
 *
 * @param {object} searchContext - search params plus first / before / after
 * @param {object} args - unused
 * @param {object} context - must provide `loaders.searchResultLoader`
 * @returns {Promise<object[]>} edges of shape
 *   {node, cursor, score, highlight, inner_hits}
 * @throws {Error} when both `before` and `after` are given
 */
export async function defaultResolveEdges(
  { first, before, after, ...searchContext },
  args,
  { loaders }
) {
  if (before && after) {
    throw new Error('Use of before & after is prohibited.');
  }

  const { body } = searchContext;

  // Highlight settings shared by the `text` and `reference` fields.
  const singleFragment = () => ({
    number_of_fragments: 1, // Return only 1 piece highlight text
    fragment_size: 200, // size limit of the highlighted fragment
    type: 'plain',
  });

  const nodes = await loaders.searchResultLoader.load({
    ...searchContext,
    body: {
      ...body,
      size: first,
      search_after: getSearchAfterFromCursor(before || after),

      // if "before" is given, reverse the sort order and later reverse back
      sort: before ? reverseSortArgs(body.sort) : body.sort,
      highlight: {
        order: 'score',
        fields: {
          text: singleFragment(),
          reference: singleFragment(),
        },
        pre_tags: ['<HIGHLIGHT>'],
        post_tags: ['</HIGHLIGHT>'],
      },
    },
  });

  // Undo the reversed sort order so edges come out in the requested order.
  if (before) {
    nodes.reverse();
  }

  return nodes.map(hit => {
    const { _score, highlight, inner_hits, _cursor, ...node } = hit;

    return {
      node,
      cursor: getCursor(_cursor),
      score: _score,
      highlight,
      inner_hits,
    };
  });
}
294

295
/**
 * Default `lastCursor` resolver: runs the search with the sort order
 * reversed and returns the cursor of the first hit it gets back.
 *
 * @param {object} searchContext - search params plus first / before / after
 * @param {object} args - unused
 * @param {object} context - must provide `loaders.searchResultLoader`
 * @returns {Promise<string|undefined>} cursor of the last node; undefined
 *   when the search has no result
 */
async function defaultResolveLastCursor(
  {
    first, // eslint-disable-line no-unused-vars
    before, // eslint-disable-line no-unused-vars
    after, // eslint-disable-line no-unused-vars
    ...searchContext
  },
  args,
  { loaders }
) {
  const { body } = searchContext;

  const nodes = await loaders.searchResultLoader.load({
    ...searchContext,
    body: { ...body, sort: reverseSortArgs(body.sort) },
    size: 1,
  });
  const lastNode = nodes[0];

  return lastNode && getCursor(lastNode._cursor);
}
316

317
/**
 * Default `firstCursor` resolver: runs the search as is and returns the
 * cursor of the first hit.
 *
 * @param {object} searchContext - search params plus first / before / after
 * @param {object} args - unused
 * @param {object} context - must provide `loaders.searchResultLoader`
 * @returns {Promise<string|undefined>} cursor of the first node; undefined
 *   when the search has no result
 */
async function defaultResolveFirstCursor(
  {
    first, // eslint-disable-line no-unused-vars
    before, // eslint-disable-line no-unused-vars
    after, // eslint-disable-line no-unused-vars
    ...searchContext
  },
  args,
  { loaders }
) {
  const nodes = await loaders.searchResultLoader.load({
    ...searchContext,
    size: 1,
  });
  const firstNode = nodes[0];

  return firstNode && getCursor(firstNode._cursor);
}
334

335
/**
 * Default `highlight` resolver of an edge. Picks the first highlighted
 * fragment of `text` and `reference`, and of the title / summary of each
 * `hyperlinks` inner hit.
 *
 * Missing pieces resolve to undefined instead of throwing: an edge without
 * highlight (undefined or null), or with `inner_hits` that has no
 * `hyperlinks` entry, is valid input.
 *
 * @param {object} edge - edge as produced by the edges resolver
 * @param {object} [edge.highlight] - Elasticsearch highlight of the hit
 * @param {object} [edge.inner_hits] - Elasticsearch inner hits of the hit
 * @returns {Promise<{text: (string|undefined), reference: (string|undefined), hyperlinks: (object[]|undefined)}>}
 */
async function defaultResolveHighlights(edge) {
  const { highlight, inner_hits } = edge;
  const { text, reference } = highlight || {};

  // Every level is optional; an earlier version threw a TypeError when
  // `inner_hits` was present without a `hyperlinks` key.
  const hyperlinks = inner_hits?.hyperlinks?.hits?.hits?.map(
    ({ _source: { url }, highlight: hyperlinkHighlight }) => {
      const {
        'hyperlinks.title': title,
        'hyperlinks.summary': summary,
      } = hyperlinkHighlight || {};

      return {
        url,
        title: title ? title[0] : undefined,
        summary: summary ? summary[0] : undefined,
      };
    }
  );

  // Elasticsearch highlight returns an array because it can be multiple fragments,
  // We directly return the first element (text, title, summary) here; the
  // edges resolver requests a single fragment per field.
  return {
    text: text ? text[0] : undefined,
    reference: reference ? reference[0] : undefined,
    hyperlinks,
  };
}
360

361
// All search
362
//
363
/**
 * Creates a connection object type named `typeName` with `totalCount`,
 * `edges` and `pageInfo` fields, plus the accompanying `${typeName}Edge` and
 * `${typeName}PageInfo` object types.
 *
 * @param {string} typeName - name of the connection type
 * @param {GraphQLObjectType} nodeType - type of each edge's `node`
 * @param {object} [options]
 * @param {Function} [options.resolveTotalCount] - resolver of `totalCount`
 * @param {Function} [options.resolveEdges] - resolver of `edges`
 * @param {Function} [options.resolveLastCursor] - resolver of `pageInfo.lastCursor`
 * @param {Function} [options.resolveFirstCursor] - resolver of `pageInfo.firstCursor`
 * @param {Function} [options.resolveHighlights] - resolver of an edge's `highlight`
 * @param {object} [options.extraEdgeFields] - additional fields for the edge type
 * @returns {GraphQLObjectType}
 */
export function createConnectionType(
  typeName,
  nodeType,
  {
    // Default resolvers
    resolveTotalCount = defaultResolveTotalCount,
    resolveEdges = defaultResolveEdges,
    resolveLastCursor = defaultResolveLastCursor,
    resolveFirstCursor = defaultResolveFirstCursor,
    resolveHighlights = defaultResolveHighlights,
    extraEdgeFields = {},
  } = {}
) {
  // All nested types are created inside this function, i.e. only once the
  // connection's `fields` function is invoked.
  const buildFields = () => {
    const totalCount = {
      type: new GraphQLNonNull(GraphQLInt),
      description:
        'The total count of the entire collection, regardless of "before", "after".',
      resolve: resolveTotalCount,
    };

    const edgeType = new GraphQLObjectType({
      name: `${typeName}Edge`,
      interfaces: [Edge],
      fields: {
        node: { type: new GraphQLNonNull(nodeType) },
        cursor: { type: new GraphQLNonNull(GraphQLString) },
        score: { type: GraphQLFloat },
        highlight: { type: Highlights, resolve: resolveHighlights },
        ...extraEdgeFields,
      },
    });

    const edges = {
      type: new GraphQLNonNull(
        new GraphQLList(new GraphQLNonNull(edgeType))
      ),
      resolve: resolveEdges,
    };

    const pageInfoType = new GraphQLObjectType({
      name: `${typeName}PageInfo`,
      interfaces: [PageInfo],
      fields: {
        lastCursor: { type: GraphQLString, resolve: resolveLastCursor },
        firstCursor: { type: GraphQLString, resolve: resolveFirstCursor },
      },
    });

    const pageInfo = {
      type: new GraphQLNonNull(pageInfoType),
      // Hand the parent value over unchanged to the cursor resolvers.
      resolve: params => params,
    };

    return { totalCount, edges, pageInfo };
  };

  return new GraphQLObjectType({
    name: typeName,
    interfaces: [Connection],
    fields: buildFields,
  });
}
431

432
/**
 * Keeps the entries whose `status` is one of `statuses`. Falsy entries
 * (e.g. null) are dropped.
 *
 * @param {{status: T}[]} entriesWithStatus - list of objects with "status" field
 * @param {T[]} statuses - list of status to keep
 * @returns {Object[]}
 */
export function filterByStatuses(entriesWithStatus, statuses) {
  return entriesWithStatus.filter(
    entry => Boolean(entry) && statuses.includes(entry.status)
  );
}
442

443
// Default status lists, one per entity kind. Each currently holds only
// 'NORMAL', and every export is a separate array instance.

/** Default statuses for articles. */
export const DEFAULT_ARTICLE_STATUSES = ['NORMAL'];

/** Default statuses for article replies. */
export const DEFAULT_ARTICLE_REPLY_STATUSES = ['NORMAL'];

/** Default statuses for article categories. */
export const DEFAULT_ARTICLE_CATEGORY_STATUSES = ['NORMAL'];

/** Default statuses for reply requests. */
export const DEFAULT_REPLY_REQUEST_STATUSES = ['NORMAL'];

/** Default statuses for article reply feedbacks. */
export const DEFAULT_ARTICLE_REPLY_FEEDBACK_STATUSES = ['NORMAL'];
44✔
448

449
/**
 * Creates the filter fields shared by list queries: appId, userId, userIds,
 * createdAt, ids and selfOnly.
 *
 * @param {string} pluralEntityName - the name to display on argument description
 * @returns {object} GraphQL args for common list filters
 */
export function createCommonListFilter(pluralEntityName) {
  // List of non-null items of the given type
  const listOf = itemType => new GraphQLList(new GraphQLNonNull(itemType));

  const appId = {
    type: GraphQLString,
    description: `Show only ${pluralEntityName} created by a specific app.`,
  };
  const userId = {
    type: GraphQLString,
    description: `Show only ${pluralEntityName} created by the specific user.`,
  };
  const userIds = {
    type: listOf(GraphQLString),
    description: `Show only ${pluralEntityName} created by the specified users.`,
  };
  const createdAt = {
    type: timeRangeInput,
    description: `List only the ${pluralEntityName} that were created between the specific time range.`,
  };
  const ids = {
    type: listOf(GraphQLID),
    description: `If given, only list out ${pluralEntityName} with specific IDs`,
  };
  const selfOnly = {
    type: GraphQLBoolean,
    description: `Only list the ${pluralEntityName} created by the currently logged in user`,
  };

  return { appId, userId, userIds, createdAt, ids, selfOnly };
}
481

482
/**
 * Appends (mutates) Elasticsearch query objects to filterQueries according to
 * args.filter of a GraphQL resolver.
 *
 * @param {Array<Object>} filterQueries - list of filter queries of Elasticsearch bool query
 * @param {object} filter - args.filter in resolver
 * @param {string} userId - userId for the currently logged in user
 * @param {string} appId - appId for the currently logged in user
 * @param {string?} fieldPrefix - If given, filters fields will be prefixed with the given string. Disables handling of `ids`.
 * @throws {Error} when `filter.selfOnly` is set but `userId` is falsy
 */
export function attachCommonListFilter(
  filterQueries,
  filter,
  userId,
  appId,
  fieldPrefix = ''
) {
  const prefixed = fieldName => `${fieldPrefix}${fieldName}`;

  for (const field of ['userId', 'appId']) {
    if (filter[field]) {
      filterQueries.push({ term: { [prefixed(field)]: filter[field] } });
    }
  }

  if (filter.userIds) {
    filterQueries.push({ terms: { [prefixed('userId')]: filter.userIds } });
  }

  if (filter.createdAt) {
    const range = getRangeFieldParamFromArithmeticExpression(filter.createdAt);
    filterQueries.push({ range: { [prefixed('createdAt')]: range } });
  }

  // `ids` is only handled when no prefix is given
  if (!fieldPrefix && filter.ids) {
    filterQueries.push({ ids: { values: filter.ids } });
  }

  if (filter.selfOnly) {
    if (!userId) throw new Error('selfOnly can be set only after log in');

    // The current user is matched by both userId and appId
    filterQueries.push(
      { term: { [prefixed('userId')]: userId } },
      { term: { [prefixed('appId')]: appId } }
    );
  }
}
529

530
/**
 * Reads the latest successful AI response of a given `type` and `docId`.
 * If there is none, it waits as long as there is a loading AI response
 * created within the last minute, re-checking after each wait.
 * Returns null if there is no successful nor recent loading AI response.
 *
 * @param {object} param
 * @param {'AI_REPLY'} param.type
 * @param {string} param.docId
 * @returns {AIReponse | null}
 */
export async function getAIResponse({ type, docId }) {
  // Clauses selecting the AI responses of this type & doc in a given status
  const clausesForStatus = status => [
    { term: { type } },
    { term: { docId } },
    { term: { status } },
  ];

  let hasLoadingResponse;

  do {
    // First, find latest successful airesponse. Return if found.
    //
    const searchResult = await client.search({
      index: 'airesponses',
      type: 'doc',
      body: {
        query: { bool: { must: clausesForStatus('SUCCESS') } },
        sort: { createdAt: 'desc' },
        size: 1,
      },
    });
    const [successfulAiResponse] = searchResult.body.hits.hits;

    if (successfulAiResponse) {
      return {
        id: successfulAiResponse._id,
        ...successfulAiResponse._source,
      };
    }

    // If no successful AI responses, find loading responses created within 1 min.
    //
    const countResult = await client.count({
      index: 'airesponses',
      type: 'doc',
      body: {
        query: {
          bool: {
            must: [
              ...clausesForStatus('LOADING'),
              // loading document created within 1 min
              { range: { createdAt: { gte: 'now-1m' } } },
            ],
          },
        },
      },
    });

    hasLoadingResponse = countResult.body.count !== 0;

    if (hasLoadingResponse) {
      // Wait a bit to search for successful AI response again.
      // If any loading AI response becomes successful during the wait,
      // it will be picked up by the next iteration.
      await delayForMs(1000);
    }
  } while (hasLoadingResponse);

  // Nothing is found
  return null;
}
623

624
/**
 * Creates a loading AI Response.
 * Returns an updater function that can be used to record the real AI response.
 *
 * The loading document is indexed right away; the updater waits for that
 * indexing to finish before it updates the document.
 *
 * @param {object} loadingResponseBody
 * @param {string} loadingResponseBody.request
 * @param {string} loadingResponseBody.type
 * @param {string} loadingResponseBody.docId
 * @param {object} loadingResponseBody.user
 *
 * @returns {(responseBody) => Promise<AIResponse>} updater function that updates the created AI
 *   response and returns the updated result
 */
export function createAIResponse({ user, ...loadingResponseBody }) {
  const { id: userId, appId } = user;

  // Fields of loadingResponseBody win over the defaults listed before them.
  const loadingDoc = {
    userId,
    appId,
    status: 'LOADING',
    createdAt: new Date(),
    ...loadingResponseBody,
  };

  // Picks the ID of the freshly created document out of the index response.
  const pickCreatedId = ({ body: { result, _id } }) => {
    /* istanbul ignore if */
    if (result !== 'created') {
      throw new Error(`Cannot create AI response: ${result}`);
    }
    return _id;
  };

  // Resolves to the ID of the loading AI Response.
  const createdIdPromise = client
    .index({ index: 'airesponses', type: 'doc', body: loadingDoc })
    .then(pickCreatedId);

  // Update using the created document's ID according to responseBody
  return async function update(responseBody) {
    const aiResponseId = await createdIdPromise;

    const updateResult = await client.update({
      index: 'airesponses',
      type: 'doc',
      id: aiResponseId,
      _source: true,
      body: {
        doc: { updatedAt: new Date(), ...responseBody },
      },
    });

    return { id: aiResponseId, ...updateResult.body.get._source };
  };
}
691

692
// Google Cloud Vision client, created once when this module is loaded.
const imageAnnotator = new ImageAnnotatorClient();

// Paragraphs whose OCR confidence is below this value are left out of the
// extracted text.
const OCR_CONFIDENCE_THRESHOLD = 0.75;
44✔
694

695
/**
 * Concatenates the symbols of the first page's paragraphs, inserting the
 * detected breaks (newline or space) before or after each symbol.
 *
 * @param {ITextAnnotation} fullTextAnnotation - The fullTextAnnotation returned by client.documentTextDetection
 * @returns {string} The extracted text that is comprised of paragraphs passing OCR_CONFIDENCE_THRESHOLD
 */
function extractTextFromFullTextAnnotation(fullTextAnnotation) {
  // Only the first page is read.
  const [{ blocks }] = fullTextAnnotation.pages;

  // Word break type described in
  // http://googleapis.github.io/googleapis/java/grpc-google-cloud-vision-v1/0.1.5/apidocs/com/google/cloud/vision/v1/TextAnnotation.DetectedBreak.BreakType.html#UNKNOWN
  const lineBreakTypes = ['EOL_SURE_SPACE', 'HYPHEN', 'LINE_BREAK'];

  const symbolToText = ({ text, property }) => {
    if (!property || !property.detectedBreak) return text;

    const { type, isPrefix } = property.detectedBreak;
    const breakStr = lineBreakTypes.includes(type) ? '\n' : ' ';

    return isPrefix ? `${breakStr}${text}` : `${text}${breakStr}`;
  };

  // Hierarchy described in https://cloud.google.com/vision/docs/fulltext-annotations#annotating_an_image_using_document_text_ocr
  //
  const pieces = [];
  for (const { paragraphs } of blocks) {
    for (const { confidence, words } of paragraphs) {
      // Skip paragraphs that do not reach the threshold
      if (!(confidence >= OCR_CONFIDENCE_THRESHOLD)) continue;

      for (const { symbols } of words) {
        for (const symbol of symbols) {
          pieces.push(symbolToText(symbol));
        }
      }
    }
  }

  return pieces.join('');
}
733

734
/**
 * Runs document text detection on an image and returns the text of the
 * paragraphs that pass the OCR confidence threshold.
 *
 * @param {object} queryInfo - only `id` is read, for logging
 * @param {string} fileUrl - the image to process
 * @returns {Promise<string>} extracted text; '' when no text is detected
 */
async function transcribeImage(queryInfo, fileUrl) {
  const [
    { fullTextAnnotation },
  ] = await imageAnnotator.documentTextDetection(fileUrl);

  console.log('[createTranscript]', queryInfo.id, fullTextAnnotation);

  // This should not happen, but just in case
  //
  if (
    !fullTextAnnotation ||
    !fullTextAnnotation.pages ||
    fullTextAnnotation.pages.length === 0
  ) {
    // No text detected
    return '';
  }

  return extractTextFromFullTextAnnotation(fullTextAnnotation);
}

/**
 * Downloads an audio / video file, converts it to an mp3 stream and has it
 * transcribed by the `whisper-1` model.
 *
 * @param {object} queryInfo - only `id` is read, for logging
 * @param {string} fileUrl - the audio or video file to process
 * @returns {Promise<string>} transcript, one line per segment
 * @throws {Error} when the file cannot be downloaded
 */
async function transcribeAudio(queryInfo, fileUrl) {
  const fileResp = await fetch(fileUrl);

  // Do not feed the body of an HTTP error page to ffmpeg.
  // The URL is left out of the message because the message is stored.
  if (!fileResp.ok) {
    throw new Error(
      `Cannot fetch media file: ${fileResp.status} ${fileResp.statusText}`
    );
  }

  // Ref: https://github.com/openai/openai-node/issues/77#issuecomment-1500899486
  const audio = ffmpeg(fileResp.body)
    .noVideo()
    .format('mp3')
    .pipe();

  // Hack it to make openai library work
  // Ref: https://github.com/openai/openai-node/issues/77#issuecomment-1455247809
  audio.path = 'file.mp4';

  const { data } = await openai.createTranscription(
    audio,
    'whisper-1',
    '接下來,是一則在網際網路上傳播的影片的逐字稿。內容如下:',
    'verbose_json',
    0,
    undefined,
    // Make axios happy
    // Ref: https://github.com/openai/openai-node/issues/77#issuecomment-1500899486
    //
    { maxContentLength: Infinity, maxBodyLength: Infinity }
  );

  // Remove tokens keep only useful fields
  const dataToLog = data.segments.map(
    ({
      start,
      end,
      seek,
      text,
      avg_logprob,
      compression_ratio,
      no_speech_prob,
    }) => ({
      start,
      end,
      seek,
      text,
      avg_logprob,
      compression_ratio,
      no_speech_prob,
    })
  );

  console.log('[createTranscript]', queryInfo.id, dataToLog);

  return dataToLog
    .filter((segment, idx) => {
      if (idx === 0) return true;

      // Ignore segments with identical text & prob with previous segment.
      // This is apparently hallucination.
      const prevSegment = dataToLog[idx - 1];
      return !(
        prevSegment.text === segment.text &&
        prevSegment.avg_logprob === segment.avg_logprob
      );
    })
    .map(({ text }) => text)
    .join('\n')
    .trim();
}

/**
 * Creates a TRANSCRIPT AI response for a media file and fills it with the
 * text extracted from the file. Any error raised while extracting the text,
 * including an unsupported `queryInfo.type` or a failed download, is logged
 * and recorded as an ERROR response instead of being thrown.
 *
 * @param {object} queryInfo - contains the `type` ('image', 'video' or
 *   'audio') and the media entry `id` of the file
 * @param {string} fileUrl - the audio, image or video file to process
 * @param {object} user - the user who requested the transcription
 * @returns {Promise<AIResponse>} the updated AI response, with status
 *   SUCCESS or ERROR
 * @throws {Error} when `user` is not given
 */
export async function createTranscript(queryInfo, fileUrl, user) {
  if (!user) throw new Error('[createTranscript] user is required');

  const update = createAIResponse({
    user,
    type: 'TRANSCRIPT',
    docId: queryInfo.id,
  });

  let text;
  try {
    switch (queryInfo.type) {
      case 'image':
        text = await transcribeImage(queryInfo, fileUrl);
        break;

      case 'video':
      case 'audio':
        text = await transcribeAudio(queryInfo, fileUrl);
        break;

      default:
        throw new Error(`Type ${queryInfo.type} not supported`);
    }
  } catch (e) {
    console.error('[createTranscript]', e);
    return update({
      status: 'ERROR',
      text: e.toString(),
    });
  }

  // Called outside of the try block: a failing update is passed on to the
  // caller rather than turned into a second, ERROR update.
  return update({ status: 'SUCCESS', text });
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc