• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

inclusion-numerique / coop-mediation-numerique / 624c8220-3b75-4588-af5c-02533eb4e889

21 May 2026 12:12PM UTC coverage: 6.955% (-0.05%) from 7.009%
624c8220-3b75-4588-af5c-02533eb4e889

push

circleci

web-flow
MEP 2026-05-21

469 of 10876 branches covered (4.31%)

Branch coverage included in aggregate %.

0 of 1486 new or added lines in 32 files covered. (0.0%)

68 existing lines in 9 files now uncovered.

1466 of 16944 relevant lines covered (8.65%)

35.41 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/apps/web/src/jobs/detect-duplicate-structures/executeDetectDuplicateStructures.ts
1
import { writeFile } from 'node:fs/promises'
2
import { getAuditOutputPath } from '@app/web/jobs/audit-output'
3
import { output } from '@app/web/jobs/output'
4
import { prismaClient } from '@app/web/prismaClient'
5
import type { DetectDuplicateStructuresJob } from './detectDuplicateStructuresJob'
6

7
/**
 * Flattened view of a structure holding only what the duplicate detection
 * needs: identity, raw and normalized name/address, location, contact data
 * and a few usage counters reported in the exports.
 */
type StructureLight = {
  id: string

  // Raw name and the normalized form used for similarity scoring
  nom: string
  nomNormalise: string

  siret: string | null

  // Raw address and the normalized form used for similarity scoring
  adresse: string
  adresseNormalisee: string
  commune: string
  codePostal: string
  codeInsee: string | null

  telephone: string | null

  // Coordinates; either may be missing
  latitude: number | null
  longitude: number | null

  visiblePourCartographieNationale: boolean

  // Counters copied into the exports for each side of a pair
  activitesCount: number
  emploisCount: number
  mediateursCount: number
}
25

26
/**
 * A candidate duplicate pair: structures A and B sharing an INSEE code,
 * together with the per-signal scores and the weighted total. This is the
 * row shape written to the pairs CSV export.
 */
type PaireDoublon = {
  // Structure A
  idA: string
  nomA: string
  siretA: string
  adresseA: string
  communeA: string
  activitesA: number
  emploisA: number
  mediateursA: number
  visibleCartoA: boolean

  // Structure B
  idB: string
  nomB: string
  siretB: string
  adresseB: string
  communeB: string
  activitesB: number
  emploisB: number
  mediateursB: number
  visibleCartoB: boolean

  // INSEE code of the group in which the pair was compared
  codeInsee: string

  // Individual signal scores and their weighted total
  scoreNom: number
  scoreAdresse: number
  scoreGeo: number
  scoreSiret: number
  scoreTelephone: number
  scoreTotal: number
}
53

54
// ── Pondérations ──
55

NEW
56
// Weight of each signal in the total score. The five weights add up to 1.
const POIDS_NOM = 0.35 // name similarity
const POIDS_ADRESSE = 0.25 // address similarity
const POIDS_GEO = 0.2 // geographic proximity
const POIDS_SIRET = 0.15 // identical SIRET
const POIDS_TELEPHONE = 0.05 // identical phone number
×
61

62
// ── Utilitaires ──
63

NEW
64
/**
 * Removes combining diacritical marks after canonical decomposition
 * ("é" → "e", "ç" → "c").
 */
const stripDiacritics = (s: string) =>
  s.normalize('NFD').replace(/[\u0300-\u036f]/g, '')

/**
 * Base normalization shared by names and addresses: strips accents,
 * lowercases, and reduces the text to `[a-z0-9]` tokens separated by single
 * spaces.
 *
 * Punctuation acts as a token separator ("d'agglomération" →
 * "d agglomeration", "saint-denis" → "saint denis"). The prefix patterns and
 * the stop-word removal applied afterwards rely on elided articles ("l", "d")
 * being standalone tokens; deleting the apostrophe glued them to the
 * following word.
 *
 * Dots are the exception and are simply dropped, so dotted acronyms stay in
 * one piece ("C.C.A.S." → "ccas").
 */
const baseNormalize = (s: string) =>
  stripDiacritics(s)
    .toLowerCase()
    // NFD does not decompose ligatures; expand them instead of losing them
    .replace(/œ/g, 'oe')
    .replace(/æ/g, 'ae')
    .replace(/\./g, '')
    .replace(/[^a-z0-9\s]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
73

74
// ── Normalisation des noms de structures ──
75

76
/**
 * Interchangeable administrative prefixes, rewritten to a canonical token
 * instead of being removed:
 * - "commune de X", "mairie de X", "ville de X" → "ville X", so they match
 *   each other;
 * - "conseil départemental de X", "département de X" → "departement X";
 * - "communauté de communes …", "communauté d'agglomération …", etc.
 *
 * A name such as "EPN X" is left untouched and therefore does not match
 * "ville X".
 *
 * The patterns are anchored at the start of the string and are written for
 * text that is already lowercased and accent-free.
 */
const NOM_PREFIXES_NORMALIZATIONS: [RegExp, string][] = [
  // Municipalities (de/du/des/de la/de l')
  [/^commune (?:de(?:s)?|du|de la|de l)\s+/, 'ville '],
  [/^com (?:de(?:s)?|du|de la|de l)\s+/, 'ville '],
  [/^mairie (?:de(?:s)?|du|de la|de l)\s+/, 'ville '],
  [/^ville (?:de(?:s)?|du|de la|de l)\s+/, 'ville '],
  // Departments
  [/^conseil departemental (?:de(?:s)?|du|de la|de l)\s+/, 'departement '],
  [/^departement (?:de(?:s)?|du|de la|de l)\s+/, 'departement '],
  // Inter-municipal bodies
  [/^communaute de communes?\s+/, 'cc '],
  // The space after the elided "d" is optional so the pattern matches whether
  // the apostrophe was turned into a space or simply deleted upstream.
  [/^communaute d ?agglomeration\s+/, 'cagglo '],
  [/^communaute com\s+/, 'cc '],
  // Regions
  [/^conseil regional (?:de(?:s)?|du|de la|de l)\s+/, 'region '],
  [/^region\s+/, 'region '],
]
102

103
// Keywords naming a specific service of a larger entity (town hall,
// municipality, etc.). If one structure has them and the other does not, the
// two are NOT the same entity, even when they share a SIRET.
const SERVICE_KEYWORDS = [
  'epn',
  'mediatheque',
  'bibliotheque',
  'ccas',
  'cias',
  'centre social',
  'maison quartier',
  'maison de quartier',
  'france services',
  'mjc',
  'espace numerique',
  'cyber espace',
  'cyberbase',
  'pole emploi',
  'mission locale',
  'point information',
  'point info',
  'fablab',
]

// One pattern per keyword, compiled once. Keywords are matched as whole
// words, with an optional plural "s", so a keyword buried inside an unrelated
// word is not reported ("occasion" contains "ccas").
const SERVICE_KEYWORD_PATTERNS: [string, RegExp][] = SERVICE_KEYWORDS.map(
  (kw): [string, RegExp] => [kw, new RegExp(`\\b${kw}s?\\b`)],
)

/**
 * Returns the service keywords present in `s`.
 *
 * `s` is expected to be lowercase and accent-free, like the keywords.
 */
const detectServiceKeywords = (s: string): Set<string> => {
  const found = new Set<string>()
  for (const [kw, pattern] of SERVICE_KEYWORD_PATTERNS) {
    if (pattern.test(s)) found.add(kw)
  }
  return found
}

/**
 * Returns true if `a` and `b` reference different service kinds, i.e. their
 * sets of detected service keywords are not identical. A name with a keyword
 * compared to a name without any is asymmetric too.
 *
 * Used to prevent merging e.g. an EPN with its parent town hall even when
 * they share a SIRET and an address.
 */
const hasAsymmetricServiceKeyword = (a: string, b: string): boolean => {
  const ka = detectServiceKeywords(a)
  const kb = detectServiceKeywords(b)
  // Same size and every keyword of one found in the other ⇒ identical sets.
  if (ka.size !== kb.size) return true
  for (const k of ka) if (!kb.has(k)) return true
  return false
}
149

150
/**
 * Common abbreviations found in structure names, expanded to their full
 * form. Patterns match whole words in lowercase, accent-free text.
 */
const NOM_ABBREVIATIONS: [RegExp, string][] = [
  [/\bst\b/g, 'saint'],
  [/\bste\b/g, 'sainte'],
  [/\basse?\b/g, 'association'],
  // "asso" is the usual short form and was not covered by the pattern above
  [/\basso\b/g, 'association'],
  [/\bassoc\b/g, 'association'],
  [/\bfed\b/g, 'federation'],
  [/\bfeder\b/g, 'federation'],
  [/\bdepart\b/g, 'departementale'],
  [/\bamic\b/g, 'amicale'],
  [/\bmdf\b/g, 'maison de la famille'],
]
164

NEW
165
/**
 * Normalizes a structure name for similarity scoring: base normalization,
 * then canonical administrative prefixes, then abbreviation expansion, and
 * finally removal of stop-words (articles, prepositions, "et", "en").
 */
const normalizeNom = (s: string): string => {
  // Prefix rules run before abbreviation rules, each list in declared order.
  const rewritten = [
    ...NOM_PREFIXES_NORMALIZATIONS,
    ...NOM_ABBREVIATIONS,
  ].reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    baseNormalize(s),
  )

  // Stop-words are removed last, then the leftover whitespace is collapsed.
  const withoutStopWords = rewritten.replace(
    /\b(de|du|des|le|la|les|l|d|et|en)\b/g,
    '',
  )
  return withoutStopWords.replace(/\s+/g, ' ').trim()
}
184

185
// ── Normalisation des adresses ──
186

NEW
187
/**
 * Street-type abbreviations expanded to their full form, plus removal of the
 * repetition suffix of a street number ("12bis" → "12"). Patterns match
 * whole words in lowercase, accent-free text.
 */
const ADRESSE_ABBREVIATIONS: [RegExp, string][] = [
  [/\bav\b/g, 'avenue'],
  [/\bbd\b/g, 'boulevard'],
  [/\bblvd\b/g, 'boulevard'],
  [/\bpl\b/g, 'place'],
  [/\bimp\b/g, 'impasse'],
  [/\bche\b/g, 'chemin'],
  [/\bch\b/g, 'chemin'],
  [/\bsq\b/g, 'square'],
  [/\brte\b/g, 'route'],
  [/\brt\b/g, 'route'],
  [/\bres\b/g, 'residence'],
  [/\bvc\b/g, 'voie communale'],
  [/\bzi\b/g, 'zone industrielle'],
  [/\bza\b/g, 'zone artisanale'],
  // The suffix may be glued to the number or separated by a space; both
  // spellings must reduce to the same text ("12bis" and "12 bis" → "12").
  [/\b(\d+) ?bis\b/g, '$1'],
  [/\b(\d+) ?ter\b/g, '$1'],
  [/\b(\d+)b\b/g, '$1'],
]
206

NEW
207
/**
 * Normalizes a postal address for similarity scoring: base normalization,
 * abbreviation expansion, then removal of articles and prepositions.
 */
const normalizeAdresse = (s: string): string => {
  const expanded = ADRESSE_ABBREVIATIONS.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    baseNormalize(s),
  )

  // Drop stop-words, then collapse the whitespace they leave behind.
  const withoutStopWords = expanded.replace(/\b(de|du|des|le|la|les|l|d)\b/g, '')
  return withoutStopWords.replace(/\s+/g, ' ').trim()
}
222

223
// ── Similarité ──
224

NEW
225
/**
 * Counts the character bigrams of `s`: maps each two-character substring to
 * its number of occurrences.
 */
const bigrams = (s: string) => {
  const counts = new Map<string, number>()
  for (let i = 0; i < s.length - 1; i++) {
    const bigram = s.slice(i, i + 2)
    counts.set(bigram, (counts.get(bigram) ?? 0) + 1)
  }
  return counts
}

/**
 * Sørensen–Dice similarity on character bigrams, between 0 and 1.
 *
 * - An empty string never matches anything, not even another empty string:
 *   missing data must not count as a perfect match.
 * - Identical non-empty strings score 1.
 * - Otherwise a string shorter than two characters has no bigram and
 *   scores 0.
 */
const diceSimilarity = (a: string, b: string): number => {
  if (a.length === 0 || b.length === 0) return 0
  if (a === b) return 1
  if (a.length < 2 || b.length < 2) return 0

  const bigramsA = bigrams(a)
  const bigramsB = bigrams(b)

  // Multiset intersection: a repeated bigram counts as often as it appears
  // in both strings.
  let intersection = 0
  for (const [bigram, countA] of bigramsA) {
    const countB = bigramsB.get(bigram) ?? 0
    intersection += Math.min(countA, countB)
  }

  // A string of length n has n - 1 bigrams.
  return (2 * intersection) / (a.length - 1 + b.length - 1)
}
249

NEW
250
/**
 * Great-circle distance between two points given in decimal degrees, using
 * the haversine formula with a sphere radius of 6,371,000 (so the result is
 * in the same unit, metres).
 */
const haversineDistance = (
  lat1: number,
  lon1: number,
  lat2: number,
  lon2: number,
): number => {
  const R = 6_371_000
  const toRad = (deg: number) => (deg * Math.PI) / 180

  const halfDLat = toRad(lat2 - lat1) / 2
  const halfDLon = toRad(lon2 - lon1) / 2
  const cosProduct = Math.cos(toRad(lat1)) * Math.cos(toRad(lat2))

  const a = Math.sin(halfDLat) ** 2 + cosProduct * Math.sin(halfDLon) ** 2
  const centralAngleHalf = Math.atan2(Math.sqrt(a), Math.sqrt(1 - a))

  return R * 2 * centralAngleHalf
}
265

266
/**
 * Geographic score: 1.0 below 50 m, then a linear decrease down to 0 at
 * 500 m. Returns 0 when any coordinate of either structure is missing.
 */
const scoreGeo = (a: StructureLight, b: StructureLight): number => {
  const { latitude: latA, longitude: lonA } = a
  const { latitude: latB, longitude: lonB } = b
  if (latA == null || lonA == null || latB == null || lonB == null) {
    return 0
  }

  const metres = haversineDistance(latA, lonA, latB, lonB)

  if (metres < 50) return 1
  if (metres > 500) return 0
  // Linear ramp over the 450 m between the two thresholds
  return 1 - (metres - 50) / 450
}
290

NEW
291
/**
 * SIRET score: 1 when both structures have a SIRET and the two are strictly
 * equal, 0 otherwise (including when either one is missing or empty).
 */
const scoreSiret = (a: StructureLight, b: StructureLight): number => {
  const bothPresent = Boolean(a.siret) && Boolean(b.siret)
  return bothPresent && a.siret === b.siret ? 1 : 0
}
295

NEW
296
/**
 * Reduces a phone number to a comparable string: separators (spaces, dots,
 * dashes, parentheses) are removed and the French international prefix is
 * rewritten to the national leading "0".
 *
 * Both "+33" and "0033" are recognized. The prefix has to be handled before
 * the "+" is dropped, otherwise "+33 1 …" becomes "331…" and never matches
 * its national spelling. A redundant trunk zero, as in "+33 (0)1 …", is
 * absorbed.
 *
 * Returns an empty string when `tel` is null or empty.
 */
const normalizeTelephone = (tel: string | null): string => {
  if (!tel) return ''
  return (
    tel
      .replace(/[\s.\-()]/g, '')
      // "0" is swallowed only when exactly nine digits follow it, i.e. when
      // it is the optional trunk zero of a complete national number.
      .replace(/^(?:\+|00)33(?:0(?=\d{9}$))?/, '0')
      .replace(/\+/g, '')
  )
}
300

NEW
301
/**
 * Phone score: 1 when both structures have a phone number and the normalized
 * numbers are identical, 0 otherwise.
 */
const scoreTelephone = (a: StructureLight, b: StructureLight): number => {
  const [telA, telB] = [a.telephone, b.telephone].map(normalizeTelephone)
  // An empty normalized number means "no phone": never a match
  const comparable = telA !== '' && telB !== ''
  return comparable && telA === telB ? 1 : 0
}
307

308
// Min length for the "contained" address heuristic: avoids matching on
// generic tokens like "rue" or "place" alone.
const MIN_CONTAINED_ADRESSE_LENGTH = 5

/**
 * Address score between 0 and 1.
 *
 * When both addresses are long enough and one is made of consecutive whole
 * words of the other ("rue paris" within "12 rue paris"), the score is 1.
 * Otherwise it is the bigram similarity of the two strings.
 *
 * Containment is checked on word boundaries: a raw substring test would make
 * "1 rue x" a perfect match for "11 rue x", since the shorter street number
 * is a suffix of the longer one.
 */
const scoreAdresse = (a: string, b: string): number => {
  if (
    a.length >= MIN_CONTAINED_ADRESSE_LENGTH &&
    b.length >= MIN_CONTAINED_ADRESSE_LENGTH
  ) {
    // Padding both sides with a space anchors the match on word boundaries,
    // including at the very start and end of the address.
    const paddedA = ` ${a} `
    const paddedB = ` ${b} `
    if (paddedA.includes(paddedB) || paddedB.includes(paddedA)) return 1
  }
  return diceSimilarity(a, b)
}
321

NEW
322
/**
 * Scores a pair of structures on every signal (name, address, geography,
 * SIRET, phone) and combines them into a weighted total.
 *
 * The geographic signal is left out, and the remaining weights rescaled,
 * when coordinates are missing or when the address already indicates the
 * same place (score >= 0.85). It is always kept when the names point to
 * different kinds of service.
 */
const computeScore = (
  a: StructureLight,
  b: StructureLight,
): {
  scoreNom: number
  scoreAdresse: number
  scoreGeo: number
  scoreSiret: number
  scoreTelephone: number
  scoreTotal: number
} => {
  const nomScore = diceSimilarity(a.nomNormalise, b.nomNormalise)

  // A service keyword (EPN, médiathèque, CCAS...) present on one side only
  // marks two distinct entities, even with the same SIRET and address. The
  // "address contained" shortcut is not used for such pairs.
  const differentService = hasAsymmetricServiceKeyword(
    a.nomNormalise,
    b.nomNormalise,
  )
  const compareAdresses = differentService ? diceSimilarity : scoreAdresse
  const adresseScore = compareAdresses(a.adresseNormalisee, b.adresseNormalisee)

  const geoScore = scoreGeo(a, b)
  const siretScore = scoreSiret(a, b)
  const telephoneScore = scoreTelephone(a, b)

  // Missing or wrong coordinates must not penalize a clear address match,
  // but distance stays a valuable signal when the services differ.
  const geoAvailable = [a.latitude, a.longitude, b.latitude, b.longitude].every(
    (coordinate) => coordinate != null,
  )
  const ignoreGeo =
    !differentService && (!geoAvailable || adresseScore >= 0.85)

  const geoContribution = ignoreGeo ? 0 : geoScore * POIDS_GEO
  const weightsSum = ignoreGeo
    ? POIDS_NOM + POIDS_ADRESSE + POIDS_SIRET + POIDS_TELEPHONE
    : 1

  const weighted =
    nomScore * POIDS_NOM +
    adresseScore * POIDS_ADRESSE +
    geoContribution +
    siretScore * POIDS_SIRET +
    telephoneScore * POIDS_TELEPHONE

  return {
    scoreNom: nomScore,
    scoreAdresse: adresseScore,
    scoreGeo: geoScore,
    scoreSiret: siretScore,
    scoreTelephone: telephoneScore,
    scoreTotal: weighted / weightsSum,
  }
}
380

381
// ── CSV ──
382

NEW
383
/**
 * Quotes a field for the semicolon-separated exports when it contains the
 * delimiter, a double quote or a line break (LF or CR); embedded double
 * quotes are doubled. Other values are returned unchanged.
 */
const escapeCsvField = (value: string) =>
  /[;"\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value
387

NEW
388
// Header row of the pairs export: the columns of structure A, the same
// columns for structure B, then the shared INSEE code and the scores.
const csvHeader = ['a', 'b']
  .flatMap((side) =>
    [
      'id',
      'nom',
      'siret',
      'adresse',
      'commune',
      'activites',
      'emplois',
      'mediateurs',
      'visible_carto',
    ].map((column) => `${column}_${side}`),
  )
  .concat([
    'code_insee',
    'score_nom',
    'score_adresse',
    'score_geo',
    'score_siret',
    'score_telephone',
    'score_total',
  ])
  .join(';')
415

NEW
416
/**
 * Serializes a pair as one semicolon-separated row, in the column order of
 * the header. Names, addresses and communes are escaped; scores are written
 * with three decimals and the visibility flags as "oui" / "non".
 */
const paireToCsv = (p: PaireDoublon): string => {
  const ouiNon = (flag: boolean) => (flag ? 'oui' : 'non')

  const sideA = [
    p.idA,
    escapeCsvField(p.nomA),
    p.siretA,
    escapeCsvField(p.adresseA),
    escapeCsvField(p.communeA),
    p.activitesA,
    p.emploisA,
    p.mediateursA,
    ouiNon(p.visibleCartoA),
  ]
  const sideB = [
    p.idB,
    escapeCsvField(p.nomB),
    p.siretB,
    escapeCsvField(p.adresseB),
    escapeCsvField(p.communeB),
    p.activitesB,
    p.emploisB,
    p.mediateursB,
    ouiNon(p.visibleCartoB),
  ]
  const scores = [
    p.scoreNom,
    p.scoreAdresse,
    p.scoreGeo,
    p.scoreSiret,
    p.scoreTelephone,
    p.scoreTotal,
  ].map((score) => score.toFixed(3))

  return [...sideA, ...sideB, p.codeInsee, ...scores].join(';')
}
444

445
// ── Job ──
446

NEW
447
/**
 * Creates a union-find (disjoint-set) structure over string ids.
 * `find` registers unknown ids on first use and compresses paths;
 * `union` merges the sets of its two arguments.
 */
const createUnionFind = () => {
  const parent = new Map<string, string>()

  const find = (id: string): string => {
    if (!parent.has(id)) parent.set(id, id)
    let root = id
    while (parent.get(root) !== root) root = parent.get(root) as string
    // Path compression
    let current = id
    while (current !== root) {
      const next = parent.get(current) as string
      parent.set(current, root)
      current = next
    }
    return root
  }

  const union = (a: string, b: string) => {
    const ra = find(a)
    const rb = find(b)
    if (ra !== rb) parent.set(ra, rb)
  }

  return { find, union }
}

/**
 * Detects probable duplicate structures.
 *
 * Non-deleted structures with an INSEE code are loaded and grouped by INSEE
 * code; every pair inside a group is scored, and pairs reaching the threshold
 * (`job.payload.seuilScore`, default 0.6) are kept. Pairs are then merged
 * into clusters (connected components), and each cluster is classified by
 * the number of distinct physical places it spans.
 *
 * `job.payload.limit`, when set, caps the number of INSEE groups processed.
 *
 * Side effects: writes two CSV files (pairs and clusters) to the audit
 * output path and logs a report through `output`.
 *
 * @returns summary counters and the paths of both exports
 */
export const executeDetectDuplicateStructures = async (
  job: DetectDuplicateStructuresJob,
) => {
  const seuilScore = job.payload?.seuilScore ?? 0.6
  const limit = job.payload?.limit

  output.log(
    `detect-duplicate-structures: starting (seuil: ${seuilScore})${limit ? ` (limit: ${limit} codes INSEE)` : ''}...`,
  )

  const structures = await prismaClient.structure.findMany({
    where: {
      suppression: null,
      codeInsee: { not: null },
    },
    select: {
      id: true,
      nom: true,
      siret: true,
      adresse: true,
      commune: true,
      codePostal: true,
      codeInsee: true,
      telephone: true,
      latitude: true,
      longitude: true,
      visiblePourCartographieNationale: true,
      activitesCount: true,
      _count: {
        select: {
          emplois: true,
          mediateursEnActivite: true,
        },
      },
    },
  })

  // Group by INSEE code; names and addresses are normalized once here
  const parCodeInsee = new Map<string, StructureLight[]>()
  for (const s of structures) {
    // The query filters out null INSEE codes
    const codeInsee = s.codeInsee as string
    const light: StructureLight = {
      id: s.id,
      nom: s.nom,
      nomNormalise: normalizeNom(s.nom),
      siret: s.siret,
      adresse: s.adresse,
      adresseNormalisee: normalizeAdresse(s.adresse),
      commune: s.commune,
      codePostal: s.codePostal,
      codeInsee: s.codeInsee,
      telephone: s.telephone,
      latitude: s.latitude,
      longitude: s.longitude,
      visiblePourCartographieNationale: s.visiblePourCartographieNationale,
      activitesCount: s.activitesCount,
      emploisCount: s._count.emplois,
      mediateursCount: s._count.mediateursEnActivite,
    }
    const group = parCodeInsee.get(codeInsee)
    if (group) {
      group.push(light)
    } else {
      parCodeInsee.set(codeInsee, [light])
    }
  }

  // Keep only groups with at least 2 structures
  const groupesComparables = [...parCodeInsee.entries()].filter(
    ([, group]) => group.length >= 2,
  )

  const groupesATraiter = limit
    ? groupesComparables.slice(0, limit)
    : groupesComparables

  const totalComparaisons = groupesATraiter.reduce(
    (sum, [, group]) => sum + (group.length * (group.length - 1)) / 2,
    0,
  )

  output.log(
    `detect-duplicate-structures: ${structures.length} structures, ${groupesComparables.length} codes INSEE avec ≥2 structures, ${totalComparaisons} comparaisons à effectuer`,
  )

  const paires: PaireDoublon[] = []
  let comparaisonsEffectuees = 0

  // Progress is reported each time the counter crosses a multiple of the
  // step. Testing for an exact multiple at group boundaries would almost
  // never fire, since the counter jumps by a whole group at a time.
  const PROGRESS_STEP = 50_000
  let nextProgressLog = PROGRESS_STEP

  for (const [codeInsee, group] of groupesATraiter) {
    for (let i = 0; i < group.length; i++) {
      for (let j = i + 1; j < group.length; j++) {
        const a = group[i]
        const b = group[j]
        comparaisonsEffectuees++

        const scores = computeScore(a, b)

        if (scores.scoreTotal >= seuilScore) {
          paires.push({
            idA: a.id,
            nomA: a.nom,
            siretA: a.siret ?? '',
            adresseA: a.adresse,
            communeA: a.commune,
            activitesA: a.activitesCount,
            emploisA: a.emploisCount,
            mediateursA: a.mediateursCount,
            visibleCartoA: a.visiblePourCartographieNationale,
            idB: b.id,
            nomB: b.nom,
            siretB: b.siret ?? '',
            adresseB: b.adresse,
            communeB: b.commune,
            activitesB: b.activitesCount,
            emploisB: b.emploisCount,
            mediateursB: b.mediateursCount,
            visibleCartoB: b.visiblePourCartographieNationale,
            codeInsee,
            ...scores,
          })
        }
      }
    }

    if (comparaisonsEffectuees >= nextProgressLog) {
      output.log(
        `detect-duplicate-structures: progress ${comparaisonsEffectuees}/${totalComparaisons} comparaisons, ${paires.length} paires trouvées`,
      )
      nextProgressLog =
        (Math.floor(comparaisonsEffectuees / PROGRESS_STEP) + 1) * PROGRESS_STEP
    }
  }

  // Sort by descending score
  paires.sort((a, b) => b.scoreTotal - a.scoreTotal)

  // ── CSV export ──

  const csvLines = [csvHeader, ...paires.map(paireToCsv)]
  const filePath = getAuditOutputPath('detect-duplicate-structures.csv')
  await writeFile(filePath, csvLines.join('\n'), 'utf-8')

  // ── Console report ──

  output.log(`\n=== DÉTECTION DOUBLONS FLOUS - RÉSULTATS ===`)
  output.log(`Structures analysées: ${structures.length}`)
  output.log(`Codes INSEE avec ≥2 structures: ${groupesComparables.length}`)
  output.log(`Comparaisons effectuées: ${comparaisonsEffectuees}`)
  output.log(`Paires détectées (score ≥ ${seuilScore}): ${paires.length}`)

  // Score distribution
  if (paires.length > 0) {
    const ranges = [
      // Upper bound above 1 so that a perfect score lands in the top bucket
      { label: '0.9 - 1.0', min: 0.9, max: 1.01 },
      { label: '0.8 - 0.9', min: 0.8, max: 0.9 },
      { label: '0.7 - 0.8', min: 0.7, max: 0.8 },
      { label: '0.6 - 0.7', min: 0.6, max: 0.7 },
      { label: '< 0.6', min: 0, max: 0.6 },
    ]
    output.log(`\n--- Distribution des scores ---`)
    for (const range of ranges) {
      const count = paires.filter(
        (p) => p.scoreTotal >= range.min && p.scoreTotal < range.max,
      ).length
      if (count > 0) {
        output.log(`  ${range.label}: ${count}`)
      }
    }
  }

  // ── Clustering (connected components) ──

  const { find, union } = createUnionFind()

  for (const p of paires) {
    union(p.idA, p.idB)
  }

  // Build the clusters

  type ClusterType = 'doublon_certain' | 'multi_site' | 'mixte'

  type Cluster = {
    ids: Set<string>
    paires: PaireDoublon[]
    scoreMax: number
    type: ClusterType
    nbLieuxDistincts: number
  }

  const clusterMap = new Map<string, Cluster>()

  for (const p of paires) {
    const root = find(p.idA)
    let cluster = clusterMap.get(root)
    if (!cluster) {
      cluster = {
        ids: new Set(),
        paires: [],
        scoreMax: 0,
        type: 'doublon_certain',
        nbLieuxDistincts: 1,
      }
      clusterMap.set(root, cluster)
    }
    cluster.ids.add(p.idA)
    cluster.ids.add(p.idB)
    cluster.paires.push(p)
    if (p.scoreTotal > cluster.scoreMax) cluster.scoreMax = p.scoreTotal
  }

  // ── Cluster classification by physical place ──
  // Two structures are at the same place when:
  // - scoreGeo ≥ 0.7 (~200m): geographic proximity
  // - OR scoreAdresse ≥ 0.85: near-identical address (fallback when
  //   coordinates are missing)

  const SEUIL_MEME_LIEU_GEO = 0.7
  const SEUIL_MEME_LIEU_ADRESSE = 0.85

  const isSameLieu = (p: PaireDoublon): boolean =>
    p.scoreGeo >= SEUIL_MEME_LIEU_GEO ||
    p.scoreAdresse >= SEUIL_MEME_LIEU_ADRESSE

  for (const cluster of clusterMap.values()) {
    // Local union-find grouping the cluster's structures by physical place
    const lieux = createUnionFind()

    // Register every id so isolated structures count as their own place
    for (const id of cluster.ids) lieux.find(id)

    // Merge structures that are close geographically or by address
    for (const p of cluster.paires) {
      if (isSameLieu(p)) {
        lieux.union(p.idA, p.idB)
      }
    }

    // Count distinct places
    const lieuxDistincts = new Set<string>()
    for (const id of cluster.ids) lieuxDistincts.add(lieux.find(id))

    cluster.nbLieuxDistincts = lieuxDistincts.size

    if (lieuxDistincts.size === 1) {
      cluster.type = 'doublon_certain'
    } else if (lieuxDistincts.size === cluster.ids.size) {
      cluster.type = 'multi_site'
    } else {
      cluster.type = 'mixte'
    }
  }

  const clusters = [...clusterMap.values()].sort(
    (a, b) => b.ids.size - a.ids.size,
  )

  const structuresImpliquees = new Set<string>()
  for (const c of clusters) {
    for (const id of c.ids) structuresImpliquees.add(id)
  }

  // ── Cluster report ──

  output.log(`\n--- Clusters ---`)
  output.log(`Clusters trouvés: ${clusters.length}`)

  const clusterSizeDistrib = new Map<number, number>()
  for (const c of clusters) {
    const size = c.ids.size
    clusterSizeDistrib.set(size, (clusterSizeDistrib.get(size) ?? 0) + 1)
  }
  output.log(`Distribution par taille:`)
  for (const [size, count] of [...clusterSizeDistrib.entries()].sort(
    (a, b) => b[0] - a[0],
  )) {
    output.log(`  ${size} structures: ${count} clusters`)
  }

  // Classification by type
  const typeDistrib: Record<ClusterType, number> = {
    doublon_certain: 0,
    multi_site: 0,
    mixte: 0,
  }
  const structuresParType: Record<ClusterType, number> = {
    doublon_certain: 0,
    multi_site: 0,
    mixte: 0,
  }
  for (const c of clusters) {
    typeDistrib[c.type]++
    structuresParType[c.type] += c.ids.size
  }

  output.log(`\n--- Classification ---`)
  for (const type of ['doublon_certain', 'multi_site', 'mixte'] as const) {
    output.log(
      `  ${type}: ${typeDistrib[type]} clusters (${structuresParType[type]} structures)`,
    )
  }

  // Top 10 largest clusters
  output.log(`\n--- Top 10 plus gros clusters ---`)
  for (const cluster of clusters.slice(0, 10)) {
    const exemple = cluster.paires[0]
    const noms = new Set<string>()
    const sirets = new Set<string>()
    for (const p of cluster.paires) {
      noms.add(p.nomA)
      noms.add(p.nomB)
      if (p.siretA) sirets.add(p.siretA)
      if (p.siretB) sirets.add(p.siretB)
    }
    output.log(
      `  [${cluster.type}] [${cluster.ids.size} structures, ${cluster.nbLieuxDistincts} lieux, score_max=${cluster.scoreMax.toFixed(3)}]`,
    )
    output.log(
      `    Noms: ${[...noms]
        .slice(0, 3)
        .map((n) => `"${n}"`)
        .join(', ')}${noms.size > 3 ? ` (+${noms.size - 3})` : ''}`,
    )
    output.log(
      `    Commune: ${exemple.communeA} | SIRETs: ${sirets.size > 0 ? [...sirets].join(', ') : '—'}`,
    )
  }

  // Top 10 most interesting pairs (strict duplicates excluded)
  const pairesInteressantes = paires.filter(
    (p) =>
      !(p.scoreNom >= 0.95 && p.scoreAdresse >= 0.95 && p.scoreSiret === 1),
  )

  // Keep one pair per cluster to vary the examples
  const clustersVus = new Set<string>()
  const pairesVariees: PaireDoublon[] = []
  for (const p of pairesInteressantes) {
    const root = find(p.idA)
    if (!clustersVus.has(root)) {
      clustersVus.add(root)
      pairesVariees.push(p)
      if (pairesVariees.length >= 10) break
    }
  }

  output.log(`\n--- Top 10 paires (hors doublons stricts, 1 par cluster) ---`)
  for (const p of pairesVariees) {
    output.log(
      `  score=${p.scoreTotal.toFixed(3)} [nom=${p.scoreNom.toFixed(2)} adr=${p.scoreAdresse.toFixed(2)} geo=${p.scoreGeo.toFixed(2)} siret=${p.scoreSiret.toFixed(0)}]`,
    )
    output.log(`    "${p.nomA}" ↔ "${p.nomB}" | ${p.communeA}`)
    output.log(
      `    "${p.adresseA}" ↔ "${p.adresseB}" | SIRET: ${p.siretA || '—'} / ${p.siretB || '—'}`,
    )
  }

  output.log(`\nStructures uniques impliquées: ${structuresImpliquees.size}`)
  output.log(`Export: ${filePath} (${paires.length} paires)`)

  // ── Clusters CSV export ──

  const clustersCsvHeader = [
    'cluster_id',
    'type',
    'taille',
    'nb_lieux_distincts',
    'paires',
    'score_max',
    'noms',
    'sirets',
    'commune',
    'ids',
  ].join(';')

  const clustersCsvLines = [
    clustersCsvHeader,
    ...clusters.map((c, i) => {
      const noms = new Set<string>()
      const sirets = new Set<string>()
      const communes = new Set<string>()
      for (const p of c.paires) {
        noms.add(p.nomA)
        noms.add(p.nomB)
        if (p.siretA) sirets.add(p.siretA)
        if (p.siretB) sirets.add(p.siretB)
        communes.add(p.communeA)
      }
      return [
        i + 1,
        c.type,
        c.ids.size,
        c.nbLieuxDistincts,
        c.paires.length,
        c.scoreMax.toFixed(3),
        escapeCsvField([...noms].join(' | ')),
        [...sirets].join(' | '),
        escapeCsvField([...communes].join(' | ')),
        [...c.ids].join(' | '),
      ].join(';')
    }),
  ]

  const clustersFilePath = getAuditOutputPath('detect-duplicate-clusters.csv')
  await writeFile(clustersFilePath, clustersCsvLines.join('\n'), 'utf-8')

  output.log(
    `Export clusters: ${clustersFilePath} (${clusters.length} clusters)`,
  )

  output.log(`\ndetect-duplicate-structures: terminé`)

  return {
    structuresAnalysees: structures.length,
    codesInseeAvecDoublons: groupesComparables.length,
    comparaisonsEffectuees,
    pairesDetectees: paires.length,
    clusters: {
      total: clusters.length,
      doublon_certain: typeDistrib.doublon_certain,
      multi_site: typeDistrib.multi_site,
      mixte: typeDistrib.mixte,
    },
    structuresImpliquees: structuresImpliquees.size,
    seuilScore,
    exports: {
      paires: filePath,
      clusters: clustersFilePath,
    },
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc