• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

inclusion-numerique / coop-mediation-numerique / 605d24cb-457b-4888-9d6f-f1777ed8bf22

18 May 2026 07:26PM UTC coverage: 7.009% (-0.5%) from 7.539%
605d24cb-457b-4888-9d6f-f1777ed8bf22

push

circleci

web-flow
Merge pull request #492 from inclusion-numerique/feat/deduplicate-structures

Feat/deduplicate structures

469 of 10788 branches covered (4.35%)

Branch coverage included in aggregate %.

0 of 1361 new or added lines in 32 files covered. (0.0%)

1 existing line in 1 file now uncovered.

1466 of 16819 relevant lines covered (8.72%)

35.68 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/apps/web/src/jobs/detect-duplicate-structures/executeDetectDuplicateStructures.ts
1
import { writeFile } from 'node:fs/promises'
2
import { getAuditOutputPath } from '@app/web/jobs/audit-output'
3
import { output } from '@app/web/jobs/output'
4
import { prismaClient } from '@app/web/prismaClient'
5
import type { DetectDuplicateStructuresJob } from './detectDuplicateStructuresJob'
6

7
/**
 * Flattened view of a structure used during duplicate detection: the raw
 * database fields, the normalised name/address compared by the similarity
 * scoring, and the usage counters copied into the exports.
 */
type StructureLight = {
  id: string

  // Name as stored, and its normalised form
  nom: string
  nomNormalise: string

  siret: string | null

  // Address as stored, and its normalised form
  adresse: string
  adresseNormalisee: string

  commune: string
  codePostal: string
  codeInsee: string | null
  telephone: string | null

  // Coordinates; both must be present for the geographic score to apply
  latitude: number | null
  longitude: number | null

  visiblePourCartographieNationale: boolean

  // Usage counters
  activitesCount: number
  emploisCount: number
  mediateursCount: number
}
25

26
/**
 * One candidate duplicate pair (structure A / structure B) together with the
 * per-criterion scores and the weighted total. This is the row shape of the
 * pairs CSV export.
 */
type PaireDoublon = {
  // Structure A — identity (siret is '' when the structure has none)
  idA: string
  nomA: string
  siretA: string
  adresseA: string
  communeA: string

  // Structure B — identity
  idB: string
  nomB: string
  siretB: string
  adresseB: string
  communeB: string

  // INSEE code of the group in which the two structures were compared
  codeInsee: string

  // Individual criterion scores and their weighted sum
  scoreNom: number
  scoreAdresse: number
  scoreGeo: number
  scoreSiret: number
  scoreTelephone: number
  scoreTotal: number

  // Usage counters for each side
  activitesA: number
  activitesB: number
  emploisA: number
  emploisB: number
  mediateursA: number
  mediateursB: number

  // Whether each side is flagged visible for the national cartography
  visibleCartoA: boolean
  visibleCartoB: boolean
}
53

54
// ── Pondérations ──
55

NEW
56
// Weight of each criterion in the total score; the five weights sum to 1.

/** Weight of the name similarity. */
const POIDS_NOM = 0.35
/** Weight of the address similarity. */
const POIDS_ADRESSE = 0.25
/** Weight of the geographic proximity score. */
const POIDS_GEO = 0.2
/** Weight of an exact SIRET match. */
const POIDS_SIRET = 0.15
/** Weight of an exact phone-number match. */
const POIDS_TELEPHONE = 0.05
×
61

62
// ── Utilitaires ──
63

NEW
64
/**
 * Removes accents by decomposing to NFD and dropping the combining marks
 * (U+0300–U+036F), e.g. "é" → "e".
 */
const stripDiacritics = (s: string) =>
  s.normalize('NFD').replace(/[\u0300-\u036f]/g, '')

/**
 * Common normalisation applied to names and addresses before comparison:
 * accent-free, lowercase, only `[a-z0-9]` words separated by single spaces.
 *
 * Punctuation is replaced by a SPACE rather than deleted, so that elided
 * articles and hyphenated words stay separate tokens
 * ("d'agglomération" → "d agglomeration", "St-Étienne" → "st etienne").
 * Deleting it would glue the tokens together ("dagglomeration", "stetienne")
 * and word-boundary based patterns could no longer match them.
 *
 * @param s raw text
 * @returns the normalised text ('' when nothing usable remains)
 */
const baseNormalize = (s: string) =>
  stripDiacritics(s)
    .toLowerCase()
    // NFD does not decompose ligatures; expand them so "cœur" → "coeur"
    .replace(/œ/g, 'oe')
    .replace(/æ/g, 'ae')
    // Anything that is not a letter, digit or whitespace becomes a separator
    .replace(/[^a-z0-9\s]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
73

74
// ── Normalisation des noms de structures ──
75

76
/**
 * Interchangeable administrative prefixes removed from the start of a name.
 *   "commune de X", "mairie de X", "ville de X"           → "X"
 *   "conseil départemental de X", "département de X"      → "X"
 *   "communauté de communes X", "communauté d'agglomération X", …
 *
 * Every pattern is anchored at the start and written in lowercase without
 * accents, so it must be applied to an already-normalised name.
 * NOTE(review): patterns such as `d agglomeration` and `de l` only match if
 * the normaliser turns apostrophes into spaces — confirm.
 */
const NOM_PREFIXES_TO_STRIP: RegExp[] = [
  // Municipalities
  /^commune de\s+/,
  /^com de\s+/,
  /^mairie de\s+/,
  /^ville de\s+/,

  // Departments
  /^conseil departemental de(?:s)?\s+/,
  /^conseil departemental du\s+/,
  /^conseil departemental de l\s*/,
  /^departement de(?:s)?\s+/,
  /^departement du\s+/,
  /^departement de l\s*/,

  // Inter-municipal bodies
  /^communaute de communes?\s+/,
  /^communaute d agglomeration\s+/,
  /^communaute com\s+/,

  // Regions
  /^conseil regional de\s+/,
  /^region\s+/,
]
103

104
/**
 * Common abbreviations found in structure names, as
 * `[whole-word pattern, expansion]` pairs. Patterns are lowercase and
 * accent-free, so they must be applied to an already-normalised name.
 */
const NOM_ABBREVIATIONS: [RegExp, string][] = [
  // Saints
  [/\bst\b/g, 'saint'],
  [/\bste\b/g, 'sainte'],

  // Associations and federations
  [/\basse?\b/g, 'association'],
  [/\bassoc\b/g, 'association'],
  [/\bfed\b/g, 'federation'],
  [/\bfeder\b/g, 'federation'],

  // Miscellaneous
  [/\bdepart\b/g, 'departementale'],
  [/\bamic\b/g, 'amicale'],
  [/\bmdf\b/g, 'maison de la famille'],
]
118

NEW
119
/**
 * Normalises a structure name for similarity comparison.
 *
 * Steps, in order: base normalisation, removal of administrative prefixes,
 * expansion of abbreviations, then removal of stop-words followed by
 * whitespace collapsing.
 *
 * @param s raw structure name
 * @returns the normalised name (may be empty)
 */
const normalizeNom = (s: string): string => {
  const sansPrefixe = NOM_PREFIXES_TO_STRIP.reduce(
    (acc, prefix) => acc.replace(prefix, ''),
    baseNormalize(s),
  )

  const developpe = NOM_ABBREVIATIONS.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    sansPrefixe,
  )

  // Drop stop-words last, so abbreviation expansions are cleaned too
  return developpe
    .replace(/\b(de|du|des|le|la|les|l|d|et|en)\b/g, '')
    .replace(/\s+/g, ' ')
    .trim()
}
138

139
// ── Normalisation des adresses ──
140

NEW
141
/**
 * Address rewrites as `[whole-word pattern, replacement]` pairs: street-type
 * abbreviations are expanded, and a "bis" / "ter" / "b" suffix glued to a
 * street number is dropped (only the number is kept).
 * Patterns are lowercase, so they must be applied to a normalised address.
 */
const ADRESSE_ABBREVIATIONS: [RegExp, string][] = [
  // Street types
  [/\bav\b/g, 'avenue'],
  [/\bbd\b/g, 'boulevard'],
  [/\bblvd\b/g, 'boulevard'],
  [/\bpl\b/g, 'place'],
  [/\bimp\b/g, 'impasse'],
  [/\bche\b/g, 'chemin'],
  [/\bsq\b/g, 'square'],
  [/\brte\b/g, 'route'],
  [/\bres\b/g, 'residence'],

  // Street-number suffixes: "12bis" → "12"
  [/\b(\d+)bis\b/g, '$1'],
  [/\b(\d+)ter\b/g, '$1'],
  [/\b(\d+)b\b/g, '$1'],
]
155

NEW
156
/**
 * Normalises a postal address for similarity comparison: base normalisation,
 * address abbreviation rewrites, then stop-word removal and whitespace
 * collapsing.
 *
 * @param s raw address
 * @returns the normalised address (may be empty)
 */
const normalizeAdresse = (s: string): string => {
  const developpee = ADRESSE_ABBREVIATIONS.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    baseNormalize(s),
  )

  // Drop stop-words
  return developpee
    .replace(/\b(de|du|des|le|la|les|l|d)\b/g, '')
    .replace(/\s+/g, ' ')
    .trim()
}
171

172
// ── Similarité ──
173

NEW
174
/**
 * Counts the character bigrams of `s`.
 *
 * @returns a map from bigram to its number of occurrences (empty when `s`
 *          has fewer than two characters)
 */
const bigrams = (s: string) => {
  const counts = new Map<string, number>()
  for (let i = 0; i < s.length - 1; i++) {
    const bigram = s.slice(i, i + 2)
    counts.set(bigram, (counts.get(bigram) ?? 0) + 1)
  }
  return counts
}

/**
 * Sørensen–Dice similarity of two strings over their character bigram
 * multisets: `2 * |A ∩ B| / (|A| + |B|)`.
 *
 * @returns a score in [0, 1]:
 *   - 0 when either string is empty — a missing value is not evidence of a
 *     match, so two empty strings do NOT score 1;
 *   - 1 when the strings are equal (including equal single characters);
 *   - 0 when either string is too short to have a bigram.
 */
const diceSimilarity = (a: string, b: string): number => {
  // Must come before the equality shortcut: '' === '' would otherwise be 1
  if (a.length === 0 || b.length === 0) return 0
  if (a === b) return 1
  if (a.length < 2 || b.length < 2) return 0

  const bigramsA = bigrams(a)
  const bigramsB = bigrams(b)

  // Multiset intersection: a repeated bigram counts as often as it appears
  // in both strings
  let intersection = 0
  for (const [bigram, countA] of bigramsA) {
    const countB = bigramsB.get(bigram) ?? 0
    intersection += Math.min(countA, countB)
  }

  // A string of length n has n - 1 bigrams
  return (2 * intersection) / (a.length - 1 + b.length - 1)
}
198

NEW
199
/**
 * Great-circle distance between two points given in decimal degrees,
 * computed with the haversine formula on a sphere of radius 6,371,000
 * (i.e. the result is in metres).
 *
 * @param lat1 latitude of the first point, in degrees
 * @param lon1 longitude of the first point, in degrees
 * @param lat2 latitude of the second point, in degrees
 * @param lon2 longitude of the second point, in degrees
 * @returns the distance between the two points
 */
const haversineDistance = (
  lat1: number,
  lon1: number,
  lat2: number,
  lon2: number,
): number => {
  const earthRadius = 6_371_000
  const toRad = (deg: number) => (deg * Math.PI) / 180

  const sinHalfDeltaLat = Math.sin(toRad(lat2 - lat1) / 2)
  const sinHalfDeltaLon = Math.sin(toRad(lon2 - lon1) / 2)

  const haversine =
    sinHalfDeltaLat ** 2 +
    Math.cos(toRad(lat1)) * Math.cos(toRad(lat2)) * sinHalfDeltaLon ** 2

  return (
    earthRadius *
    2 *
    Math.atan2(Math.sqrt(haversine), Math.sqrt(1 - haversine))
  )
}
214

215
/**
 * Geographic score: 1.0 below 50 m, then a linear decrease down to 0 at
 * 500 m. Returns 0 when either structure lacks a latitude or a longitude.
 */
const scoreGeo = (a: StructureLight, b: StructureLight): number => {
  const { latitude: latA, longitude: lonA } = a
  const { latitude: latB, longitude: lonB } = b

  // Without both pairs of coordinates there is nothing to compare
  if (latA == null || lonA == null || latB == null || lonB == null) {
    return 0
  }

  const distance = haversineDistance(latA, lonA, latB, lonB)

  if (distance < 50) return 1
  if (distance > 500) return 0

  // Linear ramp over the 450 m between the two thresholds
  return 1 - (distance - 50) / 450
}
239

NEW
240
/**
 * SIRET score: 1 when both structures have a SIRET and the two are strictly
 * equal, 0 otherwise (including when either SIRET is null or empty).
 */
const scoreSiret = (a: StructureLight, b: StructureLight): number =>
  a.siret && b.siret && a.siret === b.siret ? 1 : 0
244

NEW
245
/**
 * Normalises a phone number so that the same French number written in
 * different ways compares equal.
 *
 * Separators (whitespace, dots, dashes, parentheses) are removed, and a
 * French international prefix — `+33` or `0033`, optionally followed by the
 * "(0)" trunk digit — is rewritten to the national leading `0`:
 *   "+33 1 23 45 67 89", "0033123456789", "+33 (0)1 23 45 67 89",
 *   "01.23.45.67.89"  →  "0123456789"
 *
 * Any other `+` is simply dropped, so non-French international numbers keep
 * their digits unchanged.
 *
 * @param tel raw phone number, or null
 * @returns the normalised number, or '' when `tel` is null or empty
 */
const normalizeTelephone = (tel: string | null): string => {
  if (!tel) return ''
  return (
    tel
      .replace(/[\s.\-()]/g, '')
      // Must run while the leading "+" is still present, otherwise "+33…"
      // can no longer be told apart from a number starting with 33
      .replace(/^(?:\+|00)330?/, '0')
      .replace(/\+/g, '')
  )
}
249

NEW
250
/**
 * Phone score: 1 when both structures have a phone number and the two are
 * equal after normalisation, 0 otherwise (including when either is missing).
 */
const scoreTelephone = (a: StructureLight, b: StructureLight): number => {
  const normaliseA = normalizeTelephone(a.telephone)
  const normaliseB = normalizeTelephone(b.telephone)

  // Equality with a non-empty A implies B is non-empty as well
  return normaliseA !== '' && normaliseA === normaliseB ? 1 : 0
}
256

NEW
257
/**
 * Scores how likely two structures are duplicates of each other.
 *
 * Each criterion (name, address, geography, SIRET, phone) is scored
 * separately, and `scoreTotal` is their sum weighted by the `POIDS_*`
 * constants.
 *
 * @param a first structure
 * @param b second structure
 * @returns the five individual scores and the weighted total
 */
const computeScore = (
  a: StructureLight,
  b: StructureLight,
): {
  scoreNom: number
  scoreAdresse: number
  scoreGeo: number
  scoreSiret: number
  scoreTelephone: number
  scoreTotal: number
} => {
  const details = {
    scoreNom: diceSimilarity(a.nomNormalise, b.nomNormalise),
    scoreAdresse: diceSimilarity(a.adresseNormalisee, b.adresseNormalisee),
    scoreGeo: scoreGeo(a, b),
    scoreSiret: scoreSiret(a, b),
    scoreTelephone: scoreTelephone(a, b),
  }

  const scoreTotal =
    details.scoreNom * POIDS_NOM +
    details.scoreAdresse * POIDS_ADRESSE +
    details.scoreGeo * POIDS_GEO +
    details.scoreSiret * POIDS_SIRET +
    details.scoreTelephone * POIDS_TELEPHONE

  return { ...details, scoreTotal }
}
290

291
// ── CSV ──
292

NEW
293
/**
 * Escapes a value for a semicolon-separated CSV file.
 *
 * The value is wrapped in double quotes — with embedded quotes doubled —
 * when it contains the separator, a double quote, or a line break. A bare
 * carriage return counts as a line break too: left unquoted it can split
 * the row in CSV readers.
 *
 * @param value raw field value
 * @returns the value, quoted only when necessary
 */
const escapeCsvField = (value: string) =>
  /[;"\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value
297

NEW
298
/**
 * Header row of the pairs CSV export (semicolon-separated). The column
 * order must stay in sync with the row serialiser.
 */
const csvHeader = [
  // Structure A
  'id_a',
  'nom_a',
  'siret_a',
  'adresse_a',
  'commune_a',
  'activites_a',
  'emplois_a',
  'mediateurs_a',
  'visible_carto_a',

  // Structure B
  'id_b',
  'nom_b',
  'siret_b',
  'adresse_b',
  'commune_b',
  'activites_b',
  'emplois_b',
  'mediateurs_b',
  'visible_carto_b',

  // Shared INSEE code, then the scores
  'code_insee',
  'score_nom',
  'score_adresse',
  'score_geo',
  'score_siret',
  'score_telephone',
  'score_total',
].join(';')
325

NEW
326
/**
 * Serialises one duplicate pair as a semicolon-separated CSV row, in the
 * column order of `csvHeader`.
 *
 * Names, addresses and communes go through `escapeCsvField`; booleans are
 * written as 'oui' / 'non' and scores with three decimals.
 *
 * @param p the pair to serialise
 * @returns the CSV row, without a trailing line break
 */
const paireToCsv = (p: PaireDoublon): string => {
  const ouiNon = (flag: boolean) => (flag ? 'oui' : 'non')

  const colonnesA = [
    p.idA,
    escapeCsvField(p.nomA),
    p.siretA,
    escapeCsvField(p.adresseA),
    escapeCsvField(p.communeA),
    p.activitesA,
    p.emploisA,
    p.mediateursA,
    ouiNon(p.visibleCartoA),
  ]

  const colonnesB = [
    p.idB,
    escapeCsvField(p.nomB),
    p.siretB,
    escapeCsvField(p.adresseB),
    escapeCsvField(p.communeB),
    p.activitesB,
    p.emploisB,
    p.mediateursB,
    ouiNon(p.visibleCartoB),
  ]

  const colonnesScores = [
    p.scoreNom,
    p.scoreAdresse,
    p.scoreGeo,
    p.scoreSiret,
    p.scoreTelephone,
    p.scoreTotal,
  ].map((score) => score.toFixed(3))

  return [...colonnesA, ...colonnesB, p.codeInsee, ...colonnesScores].join(';')
}
354

355
// ── Job ──
356

NEW
357
/**
 * Minimal union-find (disjoint-set) over string ids, with path compression.
 * Ids are registered lazily the first time they are passed to `find`.
 */
const createUnionFind = () => {
  const parent = new Map<string, string>()

  /** Returns the representative id of the set containing `id`. */
  const find = (id: string): string => {
    if (!parent.has(id)) parent.set(id, id)
    let root = id
    while (parent.get(root) !== root) root = parent.get(root) as string
    // Path compression: point every node on the walked path at the root
    let current = id
    while (current !== root) {
      const next = parent.get(current) as string
      parent.set(current, root)
      current = next
    }
    return root
  }

  /** Merges the sets containing `a` and `b`. */
  const union = (a: string, b: string) => {
    const ra = find(a)
    const rb = find(b)
    if (ra !== rb) parent.set(ra, rb)
  }

  return { find, union }
}

/**
 * Detects probable duplicate structures and exports the findings.
 *
 * Steps:
 *  1. load every non-deleted structure that has an INSEE code;
 *  2. group them by INSEE code and score every pair inside each group
 *     (`computeScore`), keeping pairs whose total reaches the threshold;
 *  3. write the pairs, sorted by decreasing score, to
 *     `detect-duplicate-structures.csv`;
 *  4. group the pairs into clusters (connected components) and classify each
 *     cluster by the number of distinct physical places it spans;
 *  5. log a report and write the clusters to `detect-duplicate-clusters.csv`.
 *
 * @param job job whose optional payload provides `seuilScore` (minimum total
 *   score for a pair to be kept, default 0.6) and `limit` (maximum number of
 *   INSEE groups to process; a falsy value, including 0, means no limit)
 * @returns summary counters, the threshold used and the two export paths
 */
export const executeDetectDuplicateStructures = async (
  job: DetectDuplicateStructuresJob,
) => {
  const seuilScore = job.payload?.seuilScore ?? 0.6
  const limit = job.payload?.limit

  // A progress line is logged every time this many comparisons are done
  const PROGRESS_LOG_INTERVAL = 50_000

  output.log(
    `detect-duplicate-structures: starting (seuil: ${seuilScore})${limit ? ` (limit: ${limit} codes INSEE)` : ''}...`,
  )

  const structures = await prismaClient.structure.findMany({
    where: {
      suppression: null,
      codeInsee: { not: null },
    },
    select: {
      id: true,
      nom: true,
      siret: true,
      adresse: true,
      commune: true,
      codePostal: true,
      codeInsee: true,
      telephone: true,
      latitude: true,
      longitude: true,
      visiblePourCartographieNationale: true,
      activitesCount: true,
      _count: {
        select: {
          emplois: true,
          mediateursEnActivite: true,
        },
      },
    },
  })

  // Group by INSEE code
  const parCodeInsee = new Map<string, StructureLight[]>()
  for (const s of structures) {
    const { codeInsee } = s
    // The query already excludes null codes; this guard narrows the type
    // without a type assertion
    if (codeInsee == null) continue

    const light: StructureLight = {
      id: s.id,
      nom: s.nom,
      nomNormalise: normalizeNom(s.nom),
      siret: s.siret,
      adresse: s.adresse,
      adresseNormalisee: normalizeAdresse(s.adresse),
      commune: s.commune,
      codePostal: s.codePostal,
      codeInsee: s.codeInsee,
      telephone: s.telephone,
      latitude: s.latitude,
      longitude: s.longitude,
      visiblePourCartographieNationale: s.visiblePourCartographieNationale,
      activitesCount: s.activitesCount,
      emploisCount: s._count.emplois,
      mediateursCount: s._count.mediateursEnActivite,
    }
    const group = parCodeInsee.get(codeInsee)
    if (group) {
      group.push(light)
    } else {
      parCodeInsee.set(codeInsee, [light])
    }
  }

  // Only groups with at least 2 structures can contain a pair
  const groupesComparables = [...parCodeInsee.entries()].filter(
    ([, group]) => group.length >= 2,
  )

  const groupesATraiter = limit
    ? groupesComparables.slice(0, limit)
    : groupesComparables

  // n structures in a group give n * (n - 1) / 2 pairs
  const totalComparaisons = groupesATraiter.reduce(
    (sum, [, group]) => sum + (group.length * (group.length - 1)) / 2,
    0,
  )

  output.log(
    `detect-duplicate-structures: ${structures.length} structures, ${groupesComparables.length} codes INSEE avec ≥2 structures, ${totalComparaisons} comparaisons à effectuer`,
  )

  const paires: PaireDoublon[] = []
  let comparaisonsEffectuees = 0

  for (const [codeInsee, group] of groupesATraiter) {
    for (let i = 0; i < group.length; i++) {
      for (let j = i + 1; j < group.length; j++) {
        const a = group[i]
        const b = group[j]
        comparaisonsEffectuees++

        const scores = computeScore(a, b)

        if (scores.scoreTotal >= seuilScore) {
          paires.push({
            idA: a.id,
            nomA: a.nom,
            siretA: a.siret ?? '',
            adresseA: a.adresse,
            communeA: a.commune,
            activitesA: a.activitesCount,
            emploisA: a.emploisCount,
            mediateursA: a.mediateursCount,
            visibleCartoA: a.visiblePourCartographieNationale,
            idB: b.id,
            nomB: b.nom,
            siretB: b.siret ?? '',
            adresseB: b.adresse,
            communeB: b.commune,
            activitesB: b.activitesCount,
            emploisB: b.emploisCount,
            mediateursB: b.mediateursCount,
            visibleCartoB: b.visiblePourCartographieNationale,
            codeInsee,
            ...scores,
          })
        }

        // Checked after every comparison: testing only once per group would
        // log only if a group happened to end exactly on a multiple of the
        // interval, i.e. almost never
        if (comparaisonsEffectuees % PROGRESS_LOG_INTERVAL === 0) {
          output.log(
            `detect-duplicate-structures: progress ${comparaisonsEffectuees}/${totalComparaisons} comparaisons, ${paires.length} paires trouvées`,
          )
        }
      }
    }
  }

  // Sort by decreasing score
  paires.sort((a, b) => b.scoreTotal - a.scoreTotal)

  // ── Pairs CSV export ──

  const csvLines = [csvHeader, ...paires.map(paireToCsv)]
  const filePath = getAuditOutputPath('detect-duplicate-structures.csv')
  await writeFile(filePath, csvLines.join('\n'), 'utf-8')

  // ── Console report ──

  output.log(`\n=== DÉTECTION DOUBLONS FLOUS - RÉSULTATS ===`)
  output.log(`Structures analysées: ${structures.length}`)
  output.log(`Codes INSEE avec ≥2 structures: ${groupesComparables.length}`)
  output.log(`Comparaisons effectuées: ${comparaisonsEffectuees}`)
  output.log(`Paires détectées (score ≥ ${seuilScore}): ${paires.length}`)

  // Score distribution
  if (paires.length > 0) {
    const ranges = [
      // Upper bound slightly above 1 so that a score of exactly 1 is counted
      { label: '0.9 - 1.0', min: 0.9, max: 1.01 },
      { label: '0.8 - 0.9', min: 0.8, max: 0.9 },
      { label: '0.7 - 0.8', min: 0.7, max: 0.8 },
      { label: '0.6 - 0.7', min: 0.6, max: 0.7 },
      { label: '< 0.6', min: 0, max: 0.6 },
    ]
    output.log(`\n--- Distribution des scores ---`)
    for (const range of ranges) {
      const count = paires.filter(
        (p) => p.scoreTotal >= range.min && p.scoreTotal < range.max,
      ).length
      if (count > 0) {
        output.log(`  ${range.label}: ${count}`)
      }
    }
  }

  // ── Clustering (connected components of the pair graph) ──

  const { find, union } = createUnionFind()

  for (const p of paires) {
    union(p.idA, p.idB)
  }

  // Build the clusters

  type ClusterType = 'doublon_certain' | 'multi_site' | 'mixte'

  type Cluster = {
    ids: Set<string>
    paires: PaireDoublon[]
    scoreMax: number
    type: ClusterType
    nbLieuxDistincts: number
  }

  const clusterMap = new Map<string, Cluster>()

  for (const p of paires) {
    const root = find(p.idA)
    let cluster = clusterMap.get(root)
    if (!cluster) {
      cluster = {
        ids: new Set(),
        paires: [],
        scoreMax: 0,
        type: 'doublon_certain',
        nbLieuxDistincts: 1,
      }
      clusterMap.set(root, cluster)
    }
    cluster.ids.add(p.idA)
    cluster.ids.add(p.idB)
    cluster.paires.push(p)
    if (p.scoreTotal > cluster.scoreMax) cluster.scoreMax = p.scoreTotal
  }

  // ── Cluster classification by physical place ──
  // Two structures are considered to be at the same place when:
  // - scoreGeo ≥ 0.7 (~200 m): geographic proximity
  // - OR scoreAdresse ≥ 0.85: near-identical address (fallback when
  //   coordinates are missing)

  const SEUIL_MEME_LIEU_GEO = 0.7
  const SEUIL_MEME_LIEU_ADRESSE = 0.85

  const isSameLieu = (p: PaireDoublon): boolean =>
    p.scoreGeo >= SEUIL_MEME_LIEU_GEO ||
    p.scoreAdresse >= SEUIL_MEME_LIEU_ADRESSE

  for (const cluster of clusterMap.values()) {
    // Cluster-local union-find grouping the structures by physical place
    const lieuxUnionFind = createUnionFind()

    // Register every id, so isolated structures count as their own place
    for (const id of cluster.ids) lieuxUnionFind.find(id)

    // Merge structures that are close geographically or by address
    for (const p of cluster.paires) {
      if (isSameLieu(p)) {
        lieuxUnionFind.union(p.idA, p.idB)
      }
    }

    // Count the distinct places
    const lieux = new Set<string>()
    for (const id of cluster.ids) lieux.add(lieuxUnionFind.find(id))

    cluster.nbLieuxDistincts = lieux.size

    if (lieux.size === 1) {
      // Every structure is at the same place
      cluster.type = 'doublon_certain'
    } else if (lieux.size === cluster.ids.size) {
      // Every structure is at its own place
      cluster.type = 'multi_site'
    } else {
      cluster.type = 'mixte'
    }
  }

  // Largest clusters first
  const clusters = [...clusterMap.values()].sort(
    (a, b) => b.ids.size - a.ids.size,
  )

  const structuresImpliquees = new Set<string>()
  for (const c of clusters) {
    for (const id of c.ids) structuresImpliquees.add(id)
  }

  // ── Cluster report ──

  output.log(`\n--- Clusters ---`)
  output.log(`Clusters trouvés: ${clusters.length}`)

  const clusterSizeDistrib = new Map<number, number>()
  for (const c of clusters) {
    const size = c.ids.size
    clusterSizeDistrib.set(size, (clusterSizeDistrib.get(size) ?? 0) + 1)
  }
  output.log(`Distribution par taille:`)
  for (const [size, count] of [...clusterSizeDistrib.entries()].sort(
    (a, b) => b[0] - a[0],
  )) {
    output.log(`  ${size} structures: ${count} clusters`)
  }

  // Breakdown by cluster type
  const typeDistrib: Record<ClusterType, number> = {
    doublon_certain: 0,
    multi_site: 0,
    mixte: 0,
  }
  const structuresParType: Record<ClusterType, number> = {
    doublon_certain: 0,
    multi_site: 0,
    mixte: 0,
  }
  for (const c of clusters) {
    typeDistrib[c.type]++
    structuresParType[c.type] += c.ids.size
  }

  output.log(`\n--- Classification ---`)
  for (const type of ['doublon_certain', 'multi_site', 'mixte'] as const) {
    output.log(
      `  ${type}: ${typeDistrib[type]} clusters (${structuresParType[type]} structures)`,
    )
  }

  // Top 10 largest clusters
  output.log(`\n--- Top 10 plus gros clusters ---`)
  for (const cluster of clusters.slice(0, 10)) {
    // A cluster is only created when a pair is added, so it has at least one
    const exemple = cluster.paires[0]
    const noms = new Set<string>()
    const sirets = new Set<string>()
    for (const p of cluster.paires) {
      noms.add(p.nomA)
      noms.add(p.nomB)
      if (p.siretA) sirets.add(p.siretA)
      if (p.siretB) sirets.add(p.siretB)
    }
    output.log(
      `  [${cluster.type}] [${cluster.ids.size} structures, ${cluster.nbLieuxDistincts} lieux, score_max=${cluster.scoreMax.toFixed(3)}]`,
    )
    output.log(
      `    Noms: ${[...noms]
        .slice(0, 3)
        .map((n) => `"${n}"`)
        .join(', ')}${noms.size > 3 ? ` (+${noms.size - 3})` : ''}`,
    )
    output.log(
      `    Commune: ${exemple.communeA} | SIRETs: ${sirets.size > 0 ? [...sirets].join(', ') : '—'}`,
    )
  }

  // Top 10 most interesting pairs (excluding strict duplicates)
  const pairesInteressantes = paires.filter(
    (p) =>
      !(p.scoreNom >= 0.95 && p.scoreAdresse >= 0.95 && p.scoreSiret === 1),
  )

  // Keep at most one pair per cluster so the examples are varied
  const clustersVus = new Set<string>()
  const pairesVariees: PaireDoublon[] = []
  for (const p of pairesInteressantes) {
    const root = find(p.idA)
    if (!clustersVus.has(root)) {
      clustersVus.add(root)
      pairesVariees.push(p)
      if (pairesVariees.length >= 10) break
    }
  }

  output.log(`\n--- Top 10 paires (hors doublons stricts, 1 par cluster) ---`)
  for (const p of pairesVariees) {
    output.log(
      `  score=${p.scoreTotal.toFixed(3)} [nom=${p.scoreNom.toFixed(2)} adr=${p.scoreAdresse.toFixed(2)} geo=${p.scoreGeo.toFixed(2)} siret=${p.scoreSiret.toFixed(0)}]`,
    )
    output.log(`    "${p.nomA}" ↔ "${p.nomB}" | ${p.communeA}`)
    output.log(
      `    "${p.adresseA}" ↔ "${p.adresseB}" | SIRET: ${p.siretA || '—'} / ${p.siretB || '—'}`,
    )
  }

  output.log(`\nStructures uniques impliquées: ${structuresImpliquees.size}`)
  output.log(`Export: ${filePath} (${paires.length} paires)`)

  // ── Clusters CSV export ──

  const clustersCsvHeader = [
    'cluster_id',
    'type',
    'taille',
    'nb_lieux_distincts',
    'paires',
    'score_max',
    'noms',
    'sirets',
    'commune',
    'ids',
  ].join(';')

  const clustersCsvLines = [
    clustersCsvHeader,
    ...clusters.map((c, i) => {
      const noms = new Set<string>()
      const sirets = new Set<string>()
      const communes = new Set<string>()
      for (const p of c.paires) {
        noms.add(p.nomA)
        noms.add(p.nomB)
        if (p.siretA) sirets.add(p.siretA)
        if (p.siretB) sirets.add(p.siretB)
        communes.add(p.communeA)
      }
      return [
        // 1-based rank in the size-sorted cluster list
        i + 1,
        c.type,
        c.ids.size,
        c.nbLieuxDistincts,
        c.paires.length,
        c.scoreMax.toFixed(3),
        escapeCsvField([...noms].join(' | ')),
        [...sirets].join(' | '),
        escapeCsvField([...communes].join(' | ')),
        [...c.ids].join(' | '),
      ].join(';')
    }),
  ]

  const clustersFilePath = getAuditOutputPath('detect-duplicate-clusters.csv')
  await writeFile(clustersFilePath, clustersCsvLines.join('\n'), 'utf-8')

  output.log(
    `Export clusters: ${clustersFilePath} (${clusters.length} clusters)`,
  )

  output.log(`\ndetect-duplicate-structures: terminé`)

  return {
    structuresAnalysees: structures.length,
    codesInseeAvecDoublons: groupesComparables.length,
    comparaisonsEffectuees,
    pairesDetectees: paires.length,
    clusters: {
      total: clusters.length,
      doublon_certain: typeDistrib.doublon_certain,
      multi_site: typeDistrib.multi_site,
      mixte: typeDistrib.mixte,
    },
    structuresImpliquees: structuresImpliquees.size,
    seuilScore,
    exports: {
      paires: filePath,
      clusters: clustersFilePath,
    },
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc