• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

inclusion-numerique / coop-mediation-numerique / d8de6e05-24d8-4280-aae0-2589f3d40ae1

19 May 2026 04:35PM UTC coverage: 9.989% (+3.0%) from 7.008%
d8de6e05-24d8-4280-aae0-2589f3d40ae1

Pull #497

circleci

marc-gavanier
feat: improve structure fusion scoring and review export

Significantly reduces the manual review burden by detecting more
true duplicates automatically and avoiding false positives.

Scoring improvements (detect-duplicate-structures, generate-structures-action-plan):
- Treat clusters of type 'mixte' like 'doublon_certain' with per-pair
  scoring (instead of bulk verification_manuelle), uncovering hundreds
  of auto/probable fusions previously hidden in mixed clusters.
- Boost address score to 1.0 when one normalized address is contained
  in the other (e.g. "Lupino" vs "LUPINO PARVIS NOTRE DAME VICTOIRE").
- Add address abbreviations: VC (voie communale), RT (route), ZA, ZI, CH.
- Redistribute geo weight when coords are unavailable, OR when address
  strongly indicates the same place (>=0.85): prevents penalizing
  structures with missing or erroneous coords.
- Normalize "commune de/du", "mairie de/du", "ville de/du" to a single
  "ville" canonical token so variants match.
- Detect "service keywords" (EPN, médiathèque, CCAS, France services,
  MJC, etc.): when one name has such a keyword and the other does not,
  they are distinct entities even with shared SIRET/address. Disables
  the address-contained heuristic and keeps geo in the score.

Sync resilience (findOrCreateStructure):
- After strict siret+codeInsee miss, fall back to siret-only with
  normalized contained-name match. This catches Dataspace structures
  whose codeInsee diverges from the coop's, without merging an EPN
  with its parent town hall (asymmetric-service-keyword guard).

Review output:
- generate-structures-action-plan: structures-fusion-review.csv now
  uses cluster-grouped format (CIBLE + sources + empty line between
  clusters, sorted by ascending score), matching the existing format
  Tim uses for his manual reviews.
- export-duplicate-sirets: cluster-grouped CSV (empty line between
  SIRETs) and exclude empty-string siret. Enrich each row with
  nom_api, adresse_api, corre... (continued)
Pull Request #497: feat: improve structure fusion scoring and review export

688 of 10878 branches covered (6.32%)

Branch coverage included in aggregate %.

26 of 153 new or added lines in 4 files covered. (16.99%)

1111 existing lines in 95 files now uncovered.

2111 of 17142 relevant lines covered (12.31%)

1.95 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/apps/web/src/jobs/detect-duplicate-structures/executeDetectDuplicateStructures.ts
1
import { writeFile } from 'node:fs/promises'
2
import { getAuditOutputPath } from '@app/web/jobs/audit-output'
3
import { output } from '@app/web/jobs/output'
4
import { prismaClient } from '@app/web/prismaClient'
5
import type { DetectDuplicateStructuresJob } from './detectDuplicateStructuresJob'
6

7
/**
 * Lightweight view of a structure used by the duplicate detection:
 * the raw database fields, their normalized counterparts and a few
 * usage counters reported in the exports.
 */
type StructureLight = {
  // ── Identity ──
  id: string
  nom: string
  /** `nom` after `normalizeNom`. */
  nomNormalise: string
  siret: string | null

  // ── Location ──
  adresse: string
  /** `adresse` after `normalizeAdresse`. */
  adresseNormalisee: string
  commune: string
  codePostal: string
  codeInsee: string | null
  telephone: string | null
  latitude: number | null
  longitude: number | null

  // ── Usage ──
  visiblePourCartographieNationale: boolean
  activitesCount: number
  emploisCount: number
  mediateursCount: number
}
25

26
/**
 * A candidate duplicate pair (A, B) found within one INSEE code,
 * with the per-criterion scores and the weighted total.
 * This is the row shape of the pairs CSV export.
 */
type PaireDoublon = {
  // ── Structure A ──
  idA: string
  nomA: string
  siretA: string
  adresseA: string
  communeA: string
  activitesA: number
  emploisA: number
  mediateursA: number
  visibleCartoA: boolean

  // ── Structure B ──
  idB: string
  nomB: string
  siretB: string
  adresseB: string
  communeB: string
  activitesB: number
  emploisB: number
  mediateursB: number
  visibleCartoB: boolean

  // ── Shared by both structures ──
  codeInsee: string

  // ── Scores ──
  scoreNom: number
  scoreAdresse: number
  scoreGeo: number
  scoreSiret: number
  scoreTelephone: number
  scoreTotal: number
}
53

54
// ── Pondérations ──
55

56
// Relative weight of each criterion in the total score.
// The five weights add up to 1.
const POIDS_NOM = 0.35 // name similarity
const POIDS_ADRESSE = 0.25 // address similarity
const POIDS_GEO = 0.2 // geographic proximity
const POIDS_SIRET = 0.15 // identical SIRET
const POIDS_TELEPHONE = 0.05 // identical phone number
×
61

62
// ── Utilitaires ──
63

64
/** Removes combining diacritical marks after NFD decomposition ("é" → "e"). */
const stripDiacritics = (s: string) =>
  s.normalize('NFD').replace(/[\u0300-\u036f]/g, '')

/**
 * Base normalization shared by names and addresses: strips accents,
 * lowercases, keeps only `a-z`, `0-9` and single spaces, and trims.
 *
 * Apostrophes (straight, typographic, backtick, acute) are turned into a
 * space rather than deleted, so an elided article stays a separate token:
 * "d'agglomération" → "d agglomeration", "l'Isle" → "l isle". The prefix
 * patterns ("communaute d agglomeration", "de l") and the stop-word lists
 * ("l", "d") used later in this file rely on that tokenization.
 * Every other punctuation character is still removed without a space.
 */
const baseNormalize = (s: string) =>
  stripDiacritics(s)
    .toLowerCase()
    .replace(/['\u0060\u00b4\u2018\u2019\u02bc]/g, ' ')
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim()
73

74
// ── Normalisation des noms de structures ──
75

76
/**
 * Interchangeable administrative prefixes, rewritten to a canonical token
 * instead of being removed:
 * - "commune de X", "com de X", "mairie de X", "ville de X" → "ville X"
 * - "conseil departemental de X", "departement de X" → "departement X"
 * - "communaute de communes X" → "cc X", "communaute d agglomeration X" → "cagglo X"
 * - "conseil regional de X", "region X" → "region X"
 *
 * Variants of the same administration therefore match each other, while a
 * name such as "EPN X" is left untouched and does not match "ville X".
 * Patterns are anchored at the start and expect an already normalized
 * (lowercase, accent-free) string.
 */
const NOM_PREFIXES_NORMALIZATIONS: [RegExp, string][] = (() => {
  // de / des / du / de la / de l
  const particle = '(?:de(?:s)?|du|de la|de l)'

  // "<prefix> <particle> " → token
  const withParticle = (prefix: string, token: string): [RegExp, string] => [
    new RegExp(`^${prefix} ${particle}\\s+`),
    token,
  ]
  // "<prefix> " → token
  const bare = (prefix: string, token: string): [RegExp, string] => [
    new RegExp(`^${prefix}\\s+`),
    token,
  ]

  return [
    // Municipalities
    withParticle('commune', 'ville '),
    withParticle('com', 'ville '),
    withParticle('mairie', 'ville '),
    withParticle('ville', 'ville '),
    // Departments
    withParticle('conseil departemental', 'departement '),
    withParticle('departement', 'departement '),
    // Intercommunal bodies
    bare('communaute de communes?', 'cc '),
    bare('communaute d agglomeration', 'cagglo '),
    bare('communaute com', 'cc '),
    // Regions
    withParticle('conseil regional', 'region '),
    bare('region', 'region '),
  ]
})()
102

103
// Keywords naming a specific service of a larger entity (town hall,
// municipality, etc.). When one structure name contains one and the other
// does not, they are NOT the same entity, even with a shared SIRET.
const SERVICE_KEYWORDS = [
  'epn',
  'mediatheque',
  'bibliotheque',
  'ccas',
  'cias',
  'centre social',
  'maison quartier',
  'maison de quartier',
  'france services',
  'mjc',
  'espace numerique',
  'cyber espace',
  'cyberbase',
  'pole emploi',
  'mission locale',
  'point information',
  'point info',
  'fablab',
]

// One whole-word pattern per keyword, built once. A trailing plural "s" is
// accepted ("mediatheques"); spaces inside a keyword match any whitespace run.
const SERVICE_KEYWORD_PATTERNS: [string, RegExp][] = SERVICE_KEYWORDS.map(
  (kw): [string, RegExp] => [
    kw,
    new RegExp(`(?:^|\\s)${kw.replace(/ /g, '\\s+')}s?(?:\\s|$)`),
  ],
)

/**
 * Returns the service keywords present in `s`.
 *
 * Keywords are matched as whole words rather than raw substrings, so an
 * unrelated word that merely contains a keyword ("occasion" contains
 * "ccas") is not reported as a service.
 *
 * @param s A normalized (lowercase, accent-free, space-separated) name.
 * @returns The set of matching entries of `SERVICE_KEYWORDS`.
 */
const detectServiceKeywords = (s: string): Set<string> => {
  const found = new Set<string>()
  for (const [kw, pattern] of SERVICE_KEYWORD_PATTERNS) {
    if (pattern.test(s)) found.add(kw)
  }
  return found
}
134

135
/**
 * Tells whether `a` and `b` mention different service keywords.
 * Used to avoid merging e.g. an EPN with its parent town hall even when
 * they share a SIRET and an address.
 *
 * Returns `false` only when both names yield exactly the same keyword set
 * (two empty sets included). A keyword found on one side only makes the
 * pair asymmetric, even if the other side has other keywords.
 */
const hasAsymmetricServiceKeyword = (a: string, b: string): boolean => {
  const keywordsA = detectServiceKeywords(a)
  const keywordsB = detectServiceKeywords(b)

  // Different sizes cannot be equal sets.
  if (keywordsA.size !== keywordsB.size) return true

  // Same size: the sets are equal unless some keyword of A is missing from B.
  return [...keywordsA].some((keyword) => !keywordsB.has(keyword))
}
149

150
/**
 * Common abbreviations found in structure names, expanded to their full
 * form. Each entry is matched as a whole word, everywhere in the string.
 */
const NOM_ABBREVIATIONS: [RegExp, string][] = (() => {
  // [whole-word pattern source, expansion]
  const expansions: [string, string][] = [
    ['st', 'saint'],
    ['ste', 'sainte'],
    ['asse?', 'association'],
    ['assoc', 'association'],
    ['fed', 'federation'],
    ['feder', 'federation'],
    ['depart', 'departementale'],
    ['amic', 'amicale'],
    ['mdf', 'maison de la famille'],
  ]
  return expansions.map(([word, expansion]): [RegExp, string] => [
    new RegExp(`\\b${word}\\b`, 'g'),
    expansion,
  ])
})()
164

165
/**
 * Normalizes a structure name for comparison:
 * 1. base normalization (`baseNormalize`),
 * 2. administrative prefixes rewritten to their canonical token,
 * 3. abbreviations expanded,
 * 4. stop words removed and whitespace collapsed.
 */
const normalizeNom = (s: string): string => {
  // Prefix rules run before abbreviation rules, each list in its own order.
  const rewritten = [
    ...NOM_PREFIXES_NORMALIZATIONS,
    ...NOM_ABBREVIATIONS,
  ].reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    baseNormalize(s),
  )

  // Stop words are dropped last, once every rewrite has been applied.
  const withoutStopWords = rewritten.replace(
    /\b(de|du|des|le|la|les|l|d|et|en)\b/g,
    '',
  )
  return withoutStopWords.replace(/\s+/g, ' ').trim()
}
184

185
// ── Normalisation des adresses ──
186

187
/**
 * Address rewrites applied in order: street-type abbreviations expanded to
 * their full word, then repetition suffixes glued to a street number
 * ("12bis", "12ter", "12b") stripped so only the number remains.
 */
const ADRESSE_ABBREVIATIONS: [RegExp, string][] = (() => {
  // [whole-word abbreviation, expansion]
  const streetTypes: [string, string][] = [
    ['av', 'avenue'],
    ['bd', 'boulevard'],
    ['blvd', 'boulevard'],
    ['pl', 'place'],
    ['imp', 'impasse'],
    ['che', 'chemin'],
    ['ch', 'chemin'],
    ['sq', 'square'],
    ['rte', 'route'],
    ['rt', 'route'],
    ['res', 'residence'],
    ['vc', 'voie communale'],
    ['zi', 'zone industrielle'],
    ['za', 'zone artisanale'],
  ]
  const numberSuffixes = ['bis', 'ter', 'b']

  return [
    ...streetTypes.map(([abbreviation, full]): [RegExp, string] => [
      new RegExp(`\\b${abbreviation}\\b`, 'g'),
      full,
    ]),
    ...numberSuffixes.map((suffix): [RegExp, string] => [
      new RegExp(`\\b(\\d+)${suffix}\\b`, 'g'),
      '$1',
    ]),
  ]
})()
206

207
/**
 * Normalizes a postal address for comparison: base normalization,
 * abbreviation rewrites (`ADRESSE_ABBREVIATIONS`, in order), then removal
 * of articles/prepositions and whitespace collapsing.
 */
const normalizeAdresse = (s: string): string => {
  const expanded = ADRESSE_ABBREVIATIONS.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    baseNormalize(s),
  )

  // Drop stop words once abbreviations have been expanded.
  const withoutStopWords = expanded.replace(
    /\b(de|du|des|le|la|les|l|d)\b/g,
    '',
  )
  return withoutStopWords.replace(/\s+/g, ' ').trim()
}
222

223
// ── Similarité ──
224

225
/**
 * Counts the overlapping two-character sequences of `s`.
 *
 * @returns A map from bigram to its number of occurrences; empty when `s`
 * has fewer than two characters.
 */
const bigrams = (s: string) => {
  const counts = new Map<string, number>()
  let start = 0
  while (start + 1 < s.length) {
    const pair = s.substring(start, start + 2)
    const seen = counts.get(pair) ?? 0
    counts.set(pair, seen + 1)
    start += 1
  }
  return counts
}
233

234
/**
 * Sørensen–Dice coefficient over character bigrams, in [0, 1].
 *
 * Identical strings score 1 (two empty strings included). Otherwise a
 * string shorter than two characters has no bigram and the score is 0.
 */
const diceSimilarity = (a: string, b: string): number => {
  if (a === b) return 1
  if (Math.min(a.length, b.length) < 2) return 0

  const countsA = bigrams(a)
  const countsB = bigrams(b)

  // Multiset intersection: each shared bigram counts min(occurrences).
  let shared = 0
  countsA.forEach((occurrencesInA, bigram) => {
    shared += Math.min(occurrencesInA, countsB.get(bigram) ?? 0)
  })

  // A string of length n has n - 1 bigrams.
  return (2 * shared) / (a.length + b.length - 2)
}
249

250
/**
 * Great-circle distance between two points given in decimal degrees,
 * computed with the haversine formula on a sphere of radius 6,371,000.
 *
 * @returns The distance in the unit of the radius (metres).
 */
const haversineDistance = (
  lat1: number,
  lon1: number,
  lat2: number,
  lon2: number,
): number => {
  const EARTH_RADIUS = 6_371_000
  const radians = (degrees: number) => (degrees * Math.PI) / 180

  const sinHalfDeltaLat = Math.sin(radians(lat2 - lat1) / 2)
  const sinHalfDeltaLon = Math.sin(radians(lon2 - lon1) / 2)

  const h =
    sinHalfDeltaLat ** 2 +
    Math.cos(radians(lat1)) * Math.cos(radians(lat2)) * sinHalfDeltaLon ** 2

  return EARTH_RADIUS * 2 * Math.atan2(Math.sqrt(h), Math.sqrt(1 - h))
}
265

266
/**
 * Geographic score: 1.0 under 50 m, then a linear decrease down to 0 at
 * 500 m. Returns 0 when either structure lacks a latitude or a longitude.
 */
const scoreGeo = (a: StructureLight, b: StructureLight): number => {
  const { latitude: latA, longitude: lonA } = a
  const { latitude: latB, longitude: lonB } = b

  // Without both pairs of coordinates there is nothing to compare.
  if (latA == null || lonA == null || latB == null || lonB == null) {
    return 0
  }

  const metres = haversineDistance(latA, lonA, latB, lonB)

  if (metres < 50) return 1
  if (metres > 500) return 0
  // Linear ramp over the 450 m between the two thresholds.
  return 1 - (metres - 50) / 450
}
290

291
/**
 * SIRET score: 1 when both structures have a non-empty SIRET and the two
 * values are strictly equal, 0 otherwise.
 */
const scoreSiret = (a: StructureLight, b: StructureLight): number => {
  const bothKnown = Boolean(a.siret) && Boolean(b.siret)
  return bothKnown && a.siret === b.siret ? 1 : 0
}
295

296
/**
 * Normalizes a phone number for equality comparison.
 *
 * Separators (whitespace, `.`, `+`, `-`, parentheses) are removed, then a
 * French international prefix is rewritten to the national form so that
 * "+33 1 23 45 67 89", "0033 1 23 45 67 89", "+33 (0)1 23 45 67 89" and
 * "01 23 45 67 89" all yield "0123456789".
 *
 * @param tel Raw phone number, or `null`.
 * @returns The normalized number, or `''` when `tel` is null or empty.
 */
const normalizeTelephone = (tel: string | null): string => {
  if (!tel) return ''
  const compact = tel.replace(/[\s.+\-()]/g, '')

  // "+" has just been stripped, so "+33…" now starts with a bare "33".
  // Only rewrite when exactly nine digits follow the prefix (and an optional
  // trunk "0"), so short or foreign numbers are left alone.
  const international = /^(?:0033|33)0?(\d{9})$/.exec(compact)
  if (international) return `0${international[1]}`

  // Previous behaviour, kept for "0033" numbers of any other length.
  return compact.replace(/^0033/, '0')
}
300

301
/**
 * Phone score: 1 when both structures have a phone number and the two
 * normalized numbers are identical, 0 otherwise.
 */
const scoreTelephone = (a: StructureLight, b: StructureLight): number => {
  const [phoneA, phoneB] = [a.telephone, b.telephone].map(normalizeTelephone)
  // An empty normalized number means "unknown": never a match.
  const bothKnown = phoneA !== '' && phoneB !== ''
  return bothKnown && phoneA === phoneB ? 1 : 0
}
307

308
// Minimum length for the "contained" address heuristic: avoids matching on
// generic tokens like "rue" or "place" alone.
const MIN_CONTAINED_ADRESSE_LENGTH = 5

/**
 * Address score in [0, 1].
 *
 * Returns 1 when one normalized address is contained in the other
 * (e.g. "lupino" vs "lupino parvis notre dame victoire"), provided both
 * are at least `MIN_CONTAINED_ADRESSE_LENGTH` characters long. Otherwise
 * falls back to the bigram Dice similarity.
 *
 * Containment is checked on whole tokens: "1 rue paix" is NOT considered
 * contained in "11 rue paix", which a raw substring test would accept even
 * though the street numbers differ.
 *
 * @param a Normalized address (single spaces, trimmed).
 * @param b Normalized address (single spaces, trimmed).
 */
const scoreAdresse = (a: string, b: string): number => {
  if (
    a.length >= MIN_CONTAINED_ADRESSE_LENGTH &&
    b.length >= MIN_CONTAINED_ADRESSE_LENGTH
  ) {
    // Padding both sides with a space forces the match to start and end on
    // a token boundary.
    const paddedA = ` ${a} `
    const paddedB = ` ${b} `
    if (paddedA.includes(paddedB) || paddedB.includes(paddedA)) return 1
  }
  return diceSimilarity(a, b)
}
318

NEW
319
/**
 * Scores a pair of structures on every criterion and combines them into a
 * weighted total.
 *
 * - When the two names mention different service keywords (EPN,
 *   médiathèque, CCAS…), they are treated as distinct entities: the
 *   address is scored with plain Dice similarity (no "contained" boost)
 *   and the geographic score always stays in the total.
 * - Otherwise the geographic criterion is left out, and the remaining
 *   weights renormalized, when coordinates are missing on either side or
 *   when the address score is at least 0.85.
 *
 * `scoreGeo` is always returned as computed, even when it was left out of
 * `scoreTotal`.
 */
const computeScore = (
  a: StructureLight,
  b: StructureLight,
): {
  scoreNom: number
  scoreAdresse: number
  scoreGeo: number
  scoreSiret: number
  scoreTelephone: number
  scoreTotal: number
} => {
  const nameScore = diceSimilarity(a.nomNormalise, b.nomNormalise)

  const differentService = hasAsymmetricServiceKeyword(
    a.nomNormalise,
    b.nomNormalise,
  )

  // Distinct services at the same address must not get the containment boost.
  const addressScorer = differentService ? diceSimilarity : scoreAdresse
  const addressScore = addressScorer(a.adresseNormalisee, b.adresseNormalisee)

  const geoScore = scoreGeo(a, b)
  const siretScore = scoreSiret(a, b)
  const phoneScore = scoreTelephone(a, b)

  const geoAvailable = [a.latitude, a.longitude, b.latitude, b.longitude].every(
    (coordinate) => coordinate != null,
  )
  // Missing or wrong coordinates should not penalize a clear address match,
  // but distance remains a valuable signal between distinct services.
  const ignoreGeo = !differentService && (!geoAvailable || addressScore >= 0.85)

  // Weighted sum, accumulated in a fixed criterion order.
  let weighted = nameScore * POIDS_NOM
  weighted += addressScore * POIDS_ADRESSE
  weighted += ignoreGeo ? 0 : geoScore * POIDS_GEO
  weighted += siretScore * POIDS_SIRET
  weighted += phoneScore * POIDS_TELEPHONE

  // Without geo, the remaining weights are rescaled to sum to 1.
  const weightsSum = ignoreGeo
    ? POIDS_NOM + POIDS_ADRESSE + POIDS_SIRET + POIDS_TELEPHONE
    : 1

  return {
    scoreNom: nameScore,
    scoreAdresse: addressScore,
    scoreGeo: geoScore,
    scoreSiret: siretScore,
    scoreTelephone: phoneScore,
    scoreTotal: weighted / weightsSum,
  }
}
378

379
// ── CSV ──
380

UNCOV
381
/**
 * Escapes a value for a `;`-separated CSV file.
 *
 * The value is wrapped in double quotes, with inner quotes doubled, when it
 * contains the separator, a double quote, or a line break. A lone carriage
 * return counts as a line break too: left unquoted, it splits the row in
 * many CSV readers.
 */
const escapeCsvField = (value: string) =>
  /[;"\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value
385

UNCOV
386
/**
 * Header row of the pairs CSV: the per-structure columns for A, the same
 * columns for B, then the shared INSEE code and the scores.
 */
const csvHeader = (() => {
  const perStructure = [
    'id',
    'nom',
    'siret',
    'adresse',
    'commune',
    'activites',
    'emplois',
    'mediateurs',
    'visible_carto',
  ]
  const scores = ['nom', 'adresse', 'geo', 'siret', 'telephone', 'total']

  return [
    ...perStructure.map((column) => `${column}_a`),
    ...perStructure.map((column) => `${column}_b`),
    'code_insee',
    ...scores.map((criterion) => `score_${criterion}`),
  ].join(';')
})()
413

UNCOV
414
/**
 * Serializes a pair as one `;`-separated CSV row, in the column order of
 * `csvHeader`. Free-text fields (names, addresses, communes) are escaped;
 * booleans are written as "oui"/"non" and scores with three decimals.
 */
const paireToCsv = (p: PaireDoublon): string => {
  const ouiNon = (flag: boolean) => (flag ? 'oui' : 'non')

  const colonnesA = [
    p.idA,
    escapeCsvField(p.nomA),
    p.siretA,
    escapeCsvField(p.adresseA),
    escapeCsvField(p.communeA),
    p.activitesA,
    p.emploisA,
    p.mediateursA,
    ouiNon(p.visibleCartoA),
  ]
  const colonnesB = [
    p.idB,
    escapeCsvField(p.nomB),
    p.siretB,
    escapeCsvField(p.adresseB),
    escapeCsvField(p.communeB),
    p.activitesB,
    p.emploisB,
    p.mediateursB,
    ouiNon(p.visibleCartoB),
  ]
  const scores = [
    p.scoreNom,
    p.scoreAdresse,
    p.scoreGeo,
    p.scoreSiret,
    p.scoreTelephone,
    p.scoreTotal,
  ].map((score) => score.toFixed(3))

  return [...colonnesA, ...colonnesB, p.codeInsee, ...scores].join(';')
}
442

443
// ── Job ──
444

UNCOV
445
/**
 * Creates a union-find (disjoint-set) over string ids, with path
 * compression. Ids are registered lazily on first `find`.
 */
const createUnionFind = () => {
  const parent = new Map<string, string>()
  const parentOf = (id: string): string => parent.get(id) ?? id

  /** Returns the representative of the set containing `id`. */
  const find = (id: string): string => {
    if (!parent.has(id)) parent.set(id, id)
    let root = id
    while (parentOf(root) !== root) root = parentOf(root)
    // Path compression: re-attach every visited node directly to the root.
    let current = id
    while (current !== root) {
      const next = parentOf(current)
      parent.set(current, root)
      current = next
    }
    return root
  }

  /** Merges the sets containing `a` and `b`. */
  const union = (a: string, b: string) => {
    const ra = find(a)
    const rb = find(b)
    if (ra !== rb) parent.set(ra, rb)
  }

  return { find, union }
}

// Number of comparisons between two progress log lines.
const PROGRESS_LOG_INTERVAL = 50_000

/**
 * Detects fuzzy duplicate structures.
 *
 * Loads every non-deleted structure that has an INSEE code, compares all
 * pairs within each INSEE code with `computeScore`, and keeps pairs whose
 * total score reaches the threshold. Retained pairs are then grouped into
 * clusters (connected components), and each cluster is classified by how
 * many distinct physical places it spans:
 * - `doublon_certain`: a single place,
 * - `multi_site`: every structure at its own place,
 * - `mixte`: anything in between.
 *
 * Side effects: writes `detect-duplicate-structures.csv` (pairs) and
 * `detect-duplicate-clusters.csv` (clusters) at the paths given by
 * `getAuditOutputPath`, and logs a report through `output.log`.
 *
 * @param job Job whose optional payload provides `seuilScore` (minimum
 * total score, default 0.6) and `limit` (maximum number of INSEE codes to
 * process; no limit when absent or 0).
 * @returns Summary counters, the threshold used and the two export paths.
 */
export const executeDetectDuplicateStructures = async (
  job: DetectDuplicateStructuresJob,
) => {
  const seuilScore = job.payload?.seuilScore ?? 0.6
  const limit = job.payload?.limit

  output.log(
    `detect-duplicate-structures: starting (seuil: ${seuilScore})${limit ? ` (limit: ${limit} codes INSEE)` : ''}...`,
  )

  const structures = await prismaClient.structure.findMany({
    where: {
      suppression: null,
      codeInsee: { not: null },
    },
    select: {
      id: true,
      nom: true,
      siret: true,
      adresse: true,
      commune: true,
      codePostal: true,
      codeInsee: true,
      telephone: true,
      latitude: true,
      longitude: true,
      visiblePourCartographieNationale: true,
      activitesCount: true,
      _count: {
        select: {
          emplois: true,
          mediateursEnActivite: true,
        },
      },
    },
  })

  // Group by INSEE code; names and addresses are normalized once here.
  const parCodeInsee = new Map<string, StructureLight[]>()
  for (const s of structures) {
    // The query filters out null INSEE codes.
    const codeInsee = s.codeInsee as string
    const light: StructureLight = {
      id: s.id,
      nom: s.nom,
      nomNormalise: normalizeNom(s.nom),
      siret: s.siret,
      adresse: s.adresse,
      adresseNormalisee: normalizeAdresse(s.adresse),
      commune: s.commune,
      codePostal: s.codePostal,
      codeInsee: s.codeInsee,
      telephone: s.telephone,
      latitude: s.latitude,
      longitude: s.longitude,
      visiblePourCartographieNationale: s.visiblePourCartographieNationale,
      activitesCount: s.activitesCount,
      emploisCount: s._count.emplois,
      mediateursCount: s._count.mediateursEnActivite,
    }
    const group = parCodeInsee.get(codeInsee)
    if (group) {
      group.push(light)
    } else {
      parCodeInsee.set(codeInsee, [light])
    }
  }

  // Only groups with at least 2 structures can contain a pair.
  const groupesComparables = [...parCodeInsee.entries()].filter(
    ([, group]) => group.length >= 2,
  )

  const groupesATraiter = limit
    ? groupesComparables.slice(0, limit)
    : groupesComparables

  const totalComparaisons = groupesATraiter.reduce(
    (sum, [, group]) => sum + (group.length * (group.length - 1)) / 2,
    0,
  )

  output.log(
    `detect-duplicate-structures: ${structures.length} structures, ${groupesComparables.length} codes INSEE avec ≥2 structures, ${totalComparaisons} comparaisons à effectuer`,
  )

  const paires: PaireDoublon[] = []
  let comparaisonsEffectuees = 0
  let nextProgressLog = PROGRESS_LOG_INTERVAL

  for (const [codeInsee, group] of groupesATraiter) {
    for (let i = 0; i < group.length; i++) {
      for (let j = i + 1; j < group.length; j++) {
        const a = group[i]
        const b = group[j]
        comparaisonsEffectuees++

        const scores = computeScore(a, b)

        if (scores.scoreTotal >= seuilScore) {
          paires.push({
            idA: a.id,
            nomA: a.nom,
            siretA: a.siret ?? '',
            adresseA: a.adresse,
            communeA: a.commune,
            activitesA: a.activitesCount,
            emploisA: a.emploisCount,
            mediateursA: a.mediateursCount,
            visibleCartoA: a.visiblePourCartographieNationale,
            idB: b.id,
            nomB: b.nom,
            siretB: b.siret ?? '',
            adresseB: b.adresse,
            communeB: b.commune,
            activitesB: b.activitesCount,
            emploisB: b.emploisCount,
            mediateursB: b.mediateursCount,
            visibleCartoB: b.visiblePourCartographieNationale,
            codeInsee,
            ...scores,
          })
        }
      }
    }

    // Progress is checked once per group, so the counter rarely lands on an
    // exact multiple of the interval: log as soon as the next threshold has
    // been reached or passed, then move the threshold forward.
    if (comparaisonsEffectuees >= nextProgressLog) {
      output.log(
        `detect-duplicate-structures: progress ${comparaisonsEffectuees}/${totalComparaisons} comparaisons, ${paires.length} paires trouvées`,
      )
      nextProgressLog =
        (Math.floor(comparaisonsEffectuees / PROGRESS_LOG_INTERVAL) + 1) *
        PROGRESS_LOG_INTERVAL
    }
  }

  // Sort by descending score.
  paires.sort((a, b) => b.scoreTotal - a.scoreTotal)

  // ── CSV export ──

  const csvLines = [csvHeader, ...paires.map(paireToCsv)]
  const filePath = getAuditOutputPath('detect-duplicate-structures.csv')
  await writeFile(filePath, csvLines.join('\n'), 'utf-8')

  // ── Console report ──

  output.log(`\n=== DÉTECTION DOUBLONS FLOUS - RÉSULTATS ===`)
  output.log(`Structures analysées: ${structures.length}`)
  output.log(`Codes INSEE avec ≥2 structures: ${groupesComparables.length}`)
  output.log(`Comparaisons effectuées: ${comparaisonsEffectuees}`)
  output.log(`Paires détectées (score ≥ ${seuilScore}): ${paires.length}`)

  // Score distribution (upper bounds are exclusive; 1.01 lets 1.0 in).
  if (paires.length > 0) {
    const ranges = [
      { label: '0.9 - 1.0', min: 0.9, max: 1.01 },
      { label: '0.8 - 0.9', min: 0.8, max: 0.9 },
      { label: '0.7 - 0.8', min: 0.7, max: 0.8 },
      { label: '0.6 - 0.7', min: 0.6, max: 0.7 },
      { label: '< 0.6', min: 0, max: 0.6 },
    ]
    output.log(`\n--- Distribution des scores ---`)
    for (const range of ranges) {
      const count = paires.filter(
        (p) => p.scoreTotal >= range.min && p.scoreTotal < range.max,
      ).length
      if (count > 0) {
        output.log(`  ${range.label}: ${count}`)
      }
    }
  }

  // ── Clustering (connected components of the pair graph) ──

  const { find, union } = createUnionFind()

  for (const p of paires) {
    union(p.idA, p.idB)
  }

  type ClusterType = 'doublon_certain' | 'multi_site' | 'mixte'

  type Cluster = {
    ids: Set<string>
    paires: PaireDoublon[]
    scoreMax: number
    type: ClusterType
    nbLieuxDistincts: number
  }

  const clusterMap = new Map<string, Cluster>()

  for (const p of paires) {
    const root = find(p.idA)
    let cluster = clusterMap.get(root)
    if (!cluster) {
      cluster = {
        ids: new Set(),
        paires: [],
        scoreMax: 0,
        type: 'doublon_certain',
        nbLieuxDistincts: 1,
      }
      clusterMap.set(root, cluster)
    }
    cluster.ids.add(p.idA)
    cluster.ids.add(p.idB)
    cluster.paires.push(p)
    if (p.scoreTotal > cluster.scoreMax) cluster.scoreMax = p.scoreTotal
  }

  // ── Cluster classification by physical place ──
  // Two structures are at the same place when:
  // - scoreGeo ≥ 0.7 (~200 m): geographic proximity
  // - OR scoreAdresse ≥ 0.85: near-identical address (fallback without coordinates)

  const SEUIL_MEME_LIEU_GEO = 0.7
  const SEUIL_MEME_LIEU_ADRESSE = 0.85

  const isSameLieu = (p: PaireDoublon): boolean =>
    p.scoreGeo >= SEUIL_MEME_LIEU_GEO ||
    p.scoreAdresse >= SEUIL_MEME_LIEU_ADRESSE

  for (const cluster of clusterMap.values()) {
    // Local union-find grouping the cluster's structures by physical place.
    const lieuxUnionFind = createUnionFind()

    // Register every id, so isolated structures count as their own place.
    for (const id of cluster.ids) lieuxUnionFind.find(id)

    // Merge structures that are close geographically or by address.
    for (const p of cluster.paires) {
      if (isSameLieu(p)) {
        lieuxUnionFind.union(p.idA, p.idB)
      }
    }

    // Count distinct places.
    const lieux = new Set<string>()
    for (const id of cluster.ids) lieux.add(lieuxUnionFind.find(id))

    cluster.nbLieuxDistincts = lieux.size

    if (lieux.size === 1) {
      cluster.type = 'doublon_certain'
    } else if (lieux.size === cluster.ids.size) {
      cluster.type = 'multi_site'
    } else {
      cluster.type = 'mixte'
    }
  }

  // Largest clusters first.
  const clusters = [...clusterMap.values()].sort(
    (a, b) => b.ids.size - a.ids.size,
  )

  const structuresImpliquees = new Set<string>()
  for (const c of clusters) {
    for (const id of c.ids) structuresImpliquees.add(id)
  }

  // ── Cluster report ──

  output.log(`\n--- Clusters ---`)
  output.log(`Clusters trouvés: ${clusters.length}`)

  const clusterSizeDistrib = new Map<number, number>()
  for (const c of clusters) {
    const size = c.ids.size
    clusterSizeDistrib.set(size, (clusterSizeDistrib.get(size) ?? 0) + 1)
  }
  output.log(`Distribution par taille:`)
  for (const [size, count] of [...clusterSizeDistrib.entries()].sort(
    (a, b) => b[0] - a[0],
  )) {
    output.log(`  ${size} structures: ${count} clusters`)
  }

  // Breakdown by cluster type.
  const typeDistrib: Record<ClusterType, number> = {
    doublon_certain: 0,
    multi_site: 0,
    mixte: 0,
  }
  const structuresParType: Record<ClusterType, number> = {
    doublon_certain: 0,
    multi_site: 0,
    mixte: 0,
  }
  for (const c of clusters) {
    typeDistrib[c.type]++
    structuresParType[c.type] += c.ids.size
  }

  output.log(`\n--- Classification ---`)
  for (const type of ['doublon_certain', 'multi_site', 'mixte'] as const) {
    output.log(
      `  ${type}: ${typeDistrib[type]} clusters (${structuresParType[type]} structures)`,
    )
  }

  // Ten largest clusters.
  output.log(`\n--- Top 10 plus gros clusters ---`)
  for (const cluster of clusters.slice(0, 10)) {
    const exemple = cluster.paires[0]
    const noms = new Set<string>()
    const sirets = new Set<string>()
    for (const p of cluster.paires) {
      noms.add(p.nomA)
      noms.add(p.nomB)
      if (p.siretA) sirets.add(p.siretA)
      if (p.siretB) sirets.add(p.siretB)
    }
    output.log(
      `  [${cluster.type}] [${cluster.ids.size} structures, ${cluster.nbLieuxDistincts} lieux, score_max=${cluster.scoreMax.toFixed(3)}]`,
    )
    output.log(
      `    Noms: ${[...noms]
        .slice(0, 3)
        .map((n) => `"${n}"`)
        .join(', ')}${noms.size > 3 ? ` (+${noms.size - 3})` : ''}`,
    )
    output.log(
      `    Commune: ${exemple.communeA} | SIRETs: ${sirets.size > 0 ? [...sirets].join(', ') : '—'}`,
    )
  }

  // Ten most interesting pairs: strict duplicates (near-identical name and
  // address with the same SIRET) are left out.
  const pairesInteressantes = paires.filter(
    (p) =>
      !(p.scoreNom >= 0.95 && p.scoreAdresse >= 0.95 && p.scoreSiret === 1),
  )

  // Keep one pair per cluster so the examples are varied.
  const clustersVus = new Set<string>()
  const pairesVariees: PaireDoublon[] = []
  for (const p of pairesInteressantes) {
    const root = find(p.idA)
    if (!clustersVus.has(root)) {
      clustersVus.add(root)
      pairesVariees.push(p)
      if (pairesVariees.length >= 10) break
    }
  }

  output.log(`\n--- Top 10 paires (hors doublons stricts, 1 par cluster) ---`)
  for (const p of pairesVariees) {
    output.log(
      `  score=${p.scoreTotal.toFixed(3)} [nom=${p.scoreNom.toFixed(2)} adr=${p.scoreAdresse.toFixed(2)} geo=${p.scoreGeo.toFixed(2)} siret=${p.scoreSiret.toFixed(0)}]`,
    )
    output.log(`    "${p.nomA}" ↔ "${p.nomB}" | ${p.communeA}`)
    output.log(
      `    "${p.adresseA}" ↔ "${p.adresseB}" | SIRET: ${p.siretA || '—'} / ${p.siretB || '—'}`,
    )
  }

  output.log(`\nStructures uniques impliquées: ${structuresImpliquees.size}`)
  output.log(`Export: ${filePath} (${paires.length} paires)`)

  // ── Clusters CSV export ──

  const clustersCsvHeader = [
    'cluster_id',
    'type',
    'taille',
    'nb_lieux_distincts',
    'paires',
    'score_max',
    'noms',
    'sirets',
    'commune',
    'ids',
  ].join(';')

  const clustersCsvLines = [
    clustersCsvHeader,
    ...clusters.map((c, i) => {
      const noms = new Set<string>()
      const sirets = new Set<string>()
      const communes = new Set<string>()
      for (const p of c.paires) {
        noms.add(p.nomA)
        noms.add(p.nomB)
        if (p.siretA) sirets.add(p.siretA)
        if (p.siretB) sirets.add(p.siretB)
        communes.add(p.communeA)
      }
      return [
        i + 1,
        c.type,
        c.ids.size,
        c.nbLieuxDistincts,
        c.paires.length,
        c.scoreMax.toFixed(3),
        escapeCsvField([...noms].join(' | ')),
        [...sirets].join(' | '),
        escapeCsvField([...communes].join(' | ')),
        [...c.ids].join(' | '),
      ].join(';')
    }),
  ]

  const clustersFilePath = getAuditOutputPath('detect-duplicate-clusters.csv')
  await writeFile(clustersFilePath, clustersCsvLines.join('\n'), 'utf-8')

  output.log(
    `Export clusters: ${clustersFilePath} (${clusters.length} clusters)`,
  )

  output.log(`\ndetect-duplicate-structures: terminé`)

  return {
    structuresAnalysees: structures.length,
    codesInseeAvecDoublons: groupesComparables.length,
    comparaisonsEffectuees,
    pairesDetectees: paires.length,
    clusters: {
      total: clusters.length,
      doublon_certain: typeDistrib.doublon_certain,
      multi_site: typeDistrib.multi_site,
      mixte: typeDistrib.mixte,
    },
    structuresImpliquees: structuresImpliquees.size,
    seuilScore,
    exports: {
      paires: filePath,
      clusters: clustersFilePath,
    },
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc