• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

inclusion-numerique / coop-mediation-numerique / 14ba6690-9c33-4ca2-ab73-fa1fd8aa6215

21 May 2026 08:58AM UTC coverage: 9.992% (+3.0%) from 7.009%
14ba6690-9c33-4ca2-ab73-fa1fd8aa6215

Pull #497

circleci

marc-gavanier
feat: improve structure fusion scoring and review export

Significantly reduces the manual review burden by detecting more
true duplicates automatically and avoiding false positives.

Scoring improvements (detect-duplicate-structures, generate-structures-action-plan):
- Treat clusters of type 'mixte' like 'doublon_certain' with per-pair
  scoring (instead of bulk verification_manuelle), uncovering hundreds
  of auto/probable fusions previously hidden in mixed clusters.
- Boost address score to 1.0 when one normalized address is contained
  in the other (e.g. "Lupino" vs "LUPINO PARVIS NOTRE DAME VICTOIRE").
- Add address abbreviations: VC (voie communale), RT (route), ZA, ZI, CH.
- Redistribute geo weight when coords are unavailable, OR when address
  strongly indicates the same place (>=0.85): prevents penalizing
  structures with missing or erroneous coords.
- Normalize "commune de/du", "mairie de/du", "ville de/du" to a single
  "ville" canonical token so variants match.
- Detect "service keywords" (EPN, médiathèque, CCAS, France services,
  MJC, etc.): when one name has such a keyword and the other does not,
  they are distinct entities even with shared SIRET/address. Disables
  the address-contained heuristic and keeps geo in the score.

Sync resilience (findOrCreateStructure):
- After strict siret+codeInsee miss, fall back to siret-only with
  normalized contained-name match. This catches Dataspace structures
  whose codeInsee diverges from the coop's, without merging an EPN
  with its parent town hall (asymmetric-service-keyword guard).

Review output:
- generate-structures-action-plan: structures-fusion-review.csv now
  uses cluster-grouped format (CIBLE + sources + empty line between
  clusters, sorted by ascending score), matching the existing format
  Tim uses for his manual reviews.
- export-duplicate-sirets: cluster-grouped CSV (empty line between
  SIRETs) and exclude empty-string siret. Enrich each row with
  nom_api, adresse_api, corre... (continued)
Pull Request #497: feat: improve structure fusion scoring and review export

688 of 10876 branches covered (6.33%)

Branch coverage included in aggregate %.

26 of 150 new or added lines in 4 files covered. (17.33%)

911 existing lines in 95 files now uncovered.

2111 of 17137 relevant lines covered (12.32%)

1.94 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/apps/web/src/jobs/detect-duplicate-structures/executeDetectDuplicateStructures.ts
1
import { writeFile } from 'node:fs/promises'
2
import { getAuditOutputPath } from '@app/web/jobs/audit-output'
3
import { output } from '@app/web/jobs/output'
4
import { prismaClient } from '@app/web/prismaClient'
5
import type { DetectDuplicateStructuresJob } from './detectDuplicateStructuresJob'
6

7
/**
 * Flattened view of one structure as used by the pairwise comparison:
 * the selected database columns, the pre-computed normalized name and
 * address, and the relation counts reported in the exports.
 */
type StructureLight = {
  // Identity
  id: string
  nom: string
  /** `nom` after `normalizeNom`. */
  nomNormalise: string
  siret: string | null

  // Location
  adresse: string
  /** `adresse` after `normalizeAdresse`. */
  adresseNormalisee: string
  commune: string
  codePostal: string
  codeInsee: string | null
  telephone: string | null
  latitude: number | null
  longitude: number | null

  // Usage indicators copied into the exports
  visiblePourCartographieNationale: boolean
  activitesCount: number
  emploisCount: number
  mediateursCount: number
}
25

26
/**
 * One candidate duplicate pair (structure A / structure B) together with
 * its per-criterion scores. This is the row shape of the pairs CSV export.
 */
type PaireDoublon = {
  // Structure A
  idA: string
  nomA: string
  siretA: string
  adresseA: string
  communeA: string
  activitesA: number
  emploisA: number
  mediateursA: number
  visibleCartoA: boolean

  // Structure B
  idB: string
  nomB: string
  siretB: string
  adresseB: string
  communeB: string
  activitesB: number
  emploisB: number
  mediateursB: number
  visibleCartoB: boolean

  /** INSEE code of the group in which the pair was compared. */
  codeInsee: string

  // Scores, as returned by computeScore
  scoreNom: number
  scoreAdresse: number
  scoreGeo: number
  scoreSiret: number
  scoreTelephone: number
  scoreTotal: number
}
53

54
// ── Pondérations ──
55

56
// Weight of each criterion in the total score. The five weights add up to 1.

/** Name similarity. */
const POIDS_NOM = 0.35
/** Address similarity. */
const POIDS_ADRESSE = 0.25
/** Geographic proximity. */
const POIDS_GEO = 0.2
/** Identical SIRET. */
const POIDS_SIRET = 0.15
/** Identical phone number. */
const POIDS_TELEPHONE = 0.05
×
61

62
// ── Utilitaires ──
63

64
/**
 * Removes accents: the string is decomposed (NFD) and the combining marks
 * in U+0300–U+036F are dropped, leaving the base letters.
 */
const stripDiacritics = (s: string) => {
  const decomposed = s.normalize('NFD')
  return decomposed.replace(/[\u0300-\u036f]/g, '')
}
×
66

67
/**
 * Common normalization for names and addresses: accents stripped,
 * lower-cased, reduced to `[a-z0-9 ]` with single spaces and no
 * leading/trailing whitespace.
 *
 * Punctuation is turned into a space rather than deleted, so that
 * "d'agglomération" yields "d agglomeration" and "Saint-Denis" yields
 * "saint denis". The prefix patterns, abbreviation patterns and stop-word
 * removal applied afterwards all rely on such word boundaries. Dots are the
 * exception: they are removed so dotted acronyms stay in one piece
 * ("C.C.A.S." → "ccas").
 */
const baseNormalize = (s: string) =>
  stripDiacritics(s)
    .toLowerCase()
    // Ligatures are not decomposed by NFD; spell them out instead of losing them.
    .replace(/œ/g, 'oe')
    .replace(/æ/g, 'ae')
    .replace(/\./g, '')
    .replace(/[^a-z0-9\s]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
73

74
// ── Normalisation des noms de structures ──
75

76
/**
 * Interchangeable administrative prefixes, rewritten to one canonical token
 * instead of being removed:
 * - "commune de X", "com de X", "mairie de X", "ville de X" → "ville X"
 * - "conseil departemental de X", "departement de X" → "departement X"
 * - "communaute de communes X", "communaute com X" → "cc X"
 * - "communaute d agglomeration X" → "cagglo X"
 * - "conseil regional de X", "region X" → "region X"
 *
 * Because the token is kept, "EPN X" stays as is and does not match "ville X".
 *
 * Patterns are anchored at the start and expect lower-case, accent-free,
 * single-spaced input. Articles are listed longest first so that "de la" /
 * "de l" are consumed as a whole, and the elided form "d" is accepted
 * ("mairie d orleans" → "ville orleans").
 */
const NOM_PREFIXES_NORMALIZATIONS: [RegExp, string][] = [
  // Municipalities
  [/^commune (?:de la|de l|des|de|du|d)\s+/, 'ville '],
  [/^com (?:de la|de l|des|de|du|d)\s+/, 'ville '],
  [/^mairie (?:de la|de l|des|de|du|d)\s+/, 'ville '],
  [/^ville (?:de la|de l|des|de|du|d)\s+/, 'ville '],
  // Departments
  [/^conseil departemental (?:de la|de l|des|de|du|d)\s+/, 'departement '],
  [/^departement (?:de la|de l|des|de|du|d)\s+/, 'departement '],
  // Inter-municipal bodies
  [/^communaute de communes?\s+/, 'cc '],
  [/^communaute d agglomeration\s+/, 'cagglo '],
  [/^communaute com\s+/, 'cc '],
  // Regions
  [/^conseil regional (?:de la|de l|des|de|du|d)\s+/, 'region '],
  [/^region\s+/, 'region '],
]
102

103
// Keywords naming a specific service run by a larger entity (town hall,
// municipality, ...). When one structure name contains one and the other
// does not, they are NOT the same entity, even with a shared SIRET.
// Keywords are written in normalized form (lower case, no accents).
const SERVICE_KEYWORDS = [
  // Digital access points, libraries
  'epn',
  'mediatheque',
  'bibliotheque',
  // Social services
  'ccas',
  'cias',
  'centre social',
  // Both spellings are listed: normalizeNom removes the stop word "de".
  'maison quartier',
  'maison de quartier',
  'france services',
  'mjc',
  // Digital spaces
  'espace numerique',
  'cyber espace',
  'cyberbase',
  // Employment
  'pole emploi',
  'mission locale',
  // Information points
  'point information',
  'point info',
  'fablab',
]
126

NEW
127
/**
 * Returns the service keywords present in `s`.
 *
 * A keyword only counts when it starts a word: a plain substring test would
 * flag "occasion" as containing "ccas". The end of the keyword is left
 * open so that plurals ("mediatheques") are still recognised.
 *
 * @param s a name, expected to be normalized already (lower case, no
 *   accents, single spaces), as produced by `normalizeNom`
 * @returns the matched entries of `SERVICE_KEYWORDS`, in list order
 */
const detectServiceKeywords = (s: string): Set<string> => {
  // Leading space so that a keyword at the very start is also preceded by one.
  const padded = ` ${s}`
  return new Set(SERVICE_KEYWORDS.filter((kw) => padded.includes(` ${kw}`)))
}
134

135
/**
 * Returns true when `a` and `b` do not carry exactly the same set of
 * service keywords, i.e. when at least one keyword is found in one name
 * and not in the other. Two names without any keyword are not asymmetric.
 *
 * Used to prevent merging e.g. an EPN with its parent town hall even when
 * they share a SIRET and an address.
 */
const hasAsymmetricServiceKeyword = (a: string, b: string): boolean => {
  const ka = detectServiceKeywords(a)
  const kb = detectServiceKeywords(b)
  // Sets of different sizes necessarily differ by at least one keyword.
  if (ka.size !== kb.size) return true
  // Same size: they are equal exactly when every keyword of `a` is in `b`.
  return [...ka].some((k) => !kb.has(k))
}
149

150
/**
 * Common abbreviations in structure names, expanded to their full form.
 * Patterns expect normalized input (lower case, no accents) and only match
 * whole words.
 */
const NOM_ABBREVIATIONS: [RegExp, string][] = [
  [/\bst\b/g, 'saint'],
  [/\bste\b/g, 'sainte'],
  // "ass", "asse" and the very common "asso"
  [/\bass[eo]?\b/g, 'association'],
  [/\bassoc\b/g, 'association'],
  [/\bfed\b/g, 'federation'],
  [/\bfeder\b/g, 'federation'],
  [/\bdepart\b/g, 'departementale'],
  [/\bamic\b/g, 'amicale'],
  [/\bmdf\b/g, 'maison de la famille'],
]
164

165
/**
 * Normalizes a structure name for comparison: base normalization, then the
 * administrative-prefix rewrites, then the abbreviation expansions, and
 * finally removal of stop words.
 */
const normalizeNom = (s: string): string => {
  // Prefix rules first, abbreviations second (same order as the two lists).
  const rewriteRules = [...NOM_PREFIXES_NORMALIZATIONS, ...NOM_ABBREVIATIONS]
  const rewritten = rewriteRules.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    baseNormalize(s),
  )

  // Drop stop words last, then tidy the whitespace they leave behind.
  const withoutStopWords = rewritten.replace(
    /\b(de|du|des|le|la|les|l|d|et|en)\b/g,
    '',
  )
  return withoutStopWords.replace(/\s+/g, ' ').trim()
}
184

185
// ── Normalisation des adresses ──
186

187
/**
 * Common abbreviations in postal addresses, expanded to their full form,
 * plus removal of the "bis" / "ter" / "b" suffix of a street number.
 * Patterns expect normalized input (lower case, no accents) and only match
 * whole words.
 */
const ADRESSE_ABBREVIATIONS: [RegExp, string][] = [
  [/\bav\b/g, 'avenue'],
  [/\bbd\b/g, 'boulevard'],
  [/\bblvd\b/g, 'boulevard'],
  [/\bpl\b/g, 'place'],
  [/\bimp\b/g, 'impasse'],
  [/\bche\b/g, 'chemin'],
  [/\bch\b/g, 'chemin'],
  [/\bsq\b/g, 'square'],
  [/\brte\b/g, 'route'],
  [/\brt\b/g, 'route'],
  [/\bres\b/g, 'residence'],
  [/\bvc\b/g, 'voie communale'],
  [/\bzi\b/g, 'zone industrielle'],
  [/\bza\b/g, 'zone artisanale'],
  // Same expansions as for names, so "rue st jean" matches "rue saint jean".
  [/\bst\b/g, 'saint'],
  [/\bste\b/g, 'sainte'],
  // "12bis" and "12 bis" both reduce to "12" (likewise for "ter").
  [/\b(\d+)\s?(?:bis|ter)\b/g, '$1'],
  [/\b(\d+)b\b/g, '$1'],
]
206

207
/**
 * Normalizes a postal address for comparison: base normalization, then
 * the address abbreviation expansions, then removal of stop words.
 */
const normalizeAdresse = (s: string): string => {
  const expanded = ADRESSE_ABBREVIATIONS.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    baseNormalize(s),
  )

  // Drop stop words, then tidy the whitespace they leave behind.
  const withoutStopWords = expanded.replace(/\b(de|du|des|le|la|les|l|d)\b/g, '')
  return withoutStopWords.replace(/\s+/g, ' ').trim()
}
222

223
// ── Similarité ──
224

225
/**
 * Counts the overlapping two-character substrings of `s`.
 *
 * @returns a map from bigram to its number of occurrences; empty when `s`
 *   has fewer than two characters
 */
const bigrams = (s: string) => {
  const counts = new Map<string, number>()
  let start = 0
  while (start + 1 < s.length) {
    const pair = s.slice(start, start + 2)
    counts.set(pair, (counts.get(pair) ?? 0) + 1)
    start += 1
  }
  return counts
}
233

234
/**
 * Sørensen–Dice coefficient on character bigrams, counted with
 * multiplicity.
 *
 * @returns 1 for identical strings, 0 when either string has fewer than
 *   two characters, otherwise `2 * shared / (bigrams in a + bigrams in b)`
 */
const diceSimilarity = (a: string, b: string): number => {
  if (a === b) return 1
  if (Math.min(a.length, b.length) < 2) return 0

  const countsA = bigrams(a)
  const countsB = bigrams(b)

  // A bigram is shared as many times as it occurs in both strings.
  let shared = 0
  countsA.forEach((countA, bigram) => {
    shared += Math.min(countA, countsB.get(bigram) ?? 0)
  })

  // A string of length n has n - 1 bigrams.
  return (2 * shared) / (a.length + b.length - 2)
}
249

250
/**
 * Great-circle distance between two points given in decimal degrees,
 * computed with the haversine formula on a sphere of radius 6 371 000
 * (the result is in the unit of that radius, i.e. metres).
 */
const haversineDistance = (
  lat1: number,
  lon1: number,
  lat2: number,
  lon2: number,
): number => {
  const EARTH_RADIUS = 6_371_000
  const radians = (degrees: number) => (degrees * Math.PI) / 180

  const deltaLat = radians(lat2 - lat1)
  const deltaLon = radians(lon2 - lon1)
  const cosProduct = Math.cos(radians(lat1)) * Math.cos(radians(lat2))

  const h =
    Math.sin(deltaLat / 2) ** 2 + cosProduct * Math.sin(deltaLon / 2) ** 2
  const centralAngle = Math.atan2(Math.sqrt(h), Math.sqrt(1 - h))

  return EARTH_RADIUS * 2 * centralAngle
}
265

266
/**
 * Geographic score: 1 below 50 m, linear decrease down to 0 at 500 m,
 * 0 beyond. Also 0 when either structure lacks a latitude or longitude.
 */
const scoreGeo = (a: StructureLight, b: StructureLight): number => {
  const { latitude: latA, longitude: lonA } = a
  const { latitude: latB, longitude: lonB } = b
  if (latA == null || lonA == null || latB == null || lonB == null) return 0

  const FULL_SCORE_BELOW = 50
  const ZERO_SCORE_ABOVE = 500

  const distance = haversineDistance(latA, lonA, latB, lonB)
  if (distance < FULL_SCORE_BELOW) return 1
  if (distance > ZERO_SCORE_ABOVE) return 0
  return 1 - (distance - FULL_SCORE_BELOW) / (ZERO_SCORE_ABOVE - FULL_SCORE_BELOW)
}
290

291
/**
 * SIRET score: 1 when both structures have a non-empty SIRET and the two
 * are strictly equal, 0 otherwise.
 */
const scoreSiret = (a: StructureLight, b: StructureLight): number => {
  // A non-empty SIRET on `a` that equals `b`'s implies `b`'s is non-empty too.
  if (a.siret && a.siret === b.siret) return 1
  return 0
}
295

296
/**
 * Normalizes a phone number so that the same number written in different
 * ways compares equal.
 *
 * Whitespace, dots, `+`, dashes and parentheses are removed. A French
 * number in international form is then rewritten to the national form:
 * "+33 1 23 45 67 89", "0033 1 23 45 67 89" and "+33 (0)1 23 45 67 89" all
 * become "0123456789". The `+` is stripped with the other separators, so
 * the international prefix shows up as a bare leading "33".
 *
 * @returns the normalized number, or '' for a null or empty input
 */
const normalizeTelephone = (tel: string | null): string => {
  if (!tel) return ''
  const compact = tel.replace(/[\s.+\-()]/g, '')
  return (
    compact
      // "33" or "0033", an optional trunk "0", then the 9 significant digits.
      .replace(/^(?:00)?330?([1-9]\d{8})$/, '0$1')
      // Fallback kept for any other value starting with 0033.
      .replace(/^0033/, '0')
  )
}
300

301
/**
 * Phone score: 1 when both structures have a phone number and the two
 * normalized numbers are identical, 0 otherwise.
 */
const scoreTelephone = (a: StructureLight, b: StructureLight): number => {
  const numeroA = normalizeTelephone(a.telephone)
  const numeroB = normalizeTelephone(b.telephone)
  // An empty string means "no number": it never counts as a match.
  return numeroA !== '' && numeroA === numeroB ? 1 : 0
}
307

308
// Min length for the "contained" address heuristic: avoids matching on
// generic tokens like "rue" or "place" alone.
const MIN_CONTAINED_ADRESSE_LENGTH = 5

/**
 * Address score between two normalized addresses.
 *
 * Returns 1 when one address is contained in the other (e.g. "lupino" in
 * "lupino parvis notre dame victoire") and both are at least
 * `MIN_CONTAINED_ADRESSE_LENGTH` characters long; otherwise the Dice
 * similarity of the two strings.
 *
 * Containment is checked on whole words: "12 rue paix" is not considered
 * contained in "112 rue paix", which are two different street numbers.
 */
const scoreAdresse = (a: string, b: string): number => {
  if (
    a.length >= MIN_CONTAINED_ADRESSE_LENGTH &&
    b.length >= MIN_CONTAINED_ADRESSE_LENGTH
  ) {
    // Surrounding spaces force the match to start and end on word limits.
    const paddedA = ` ${a} `
    const paddedB = ` ${b} `
    if (paddedA.includes(paddedB) || paddedB.includes(paddedA)) return 1
  }
  return diceSimilarity(a, b)
}
321

UNCOV
322
/**
 * Scores a pair of structures on each criterion (name, address, geography,
 * SIRET, phone) and combines them into a weighted total.
 *
 * Two adjustments apply:
 * - When exactly one of the names refers to a service (EPN, médiathèque,
 *   CCAS...), the structures are treated as distinct entities: the address
 *   is compared by plain similarity (no "contained address" boost) and
 *   geography always stays in the total.
 * - Otherwise, geography is left out of the total, and the remaining
 *   weights renormalized, when coordinates are missing or when the address
 *   score is at least 0.85, so that missing or wrong coordinates do not
 *   penalize a clear address match.
 *
 * `scoreGeo` in the result is always the raw geographic score, whether or
 * not it was used in `scoreTotal`.
 */
const computeScore = (
  a: StructureLight,
  b: StructureLight,
): {
  scoreNom: number
  scoreAdresse: number
  scoreGeo: number
  scoreSiret: number
  scoreTelephone: number
  scoreTotal: number
} => {
  const sNom = diceSimilarity(a.nomNormalise, b.nomNormalise)

  const differentService = hasAsymmetricServiceKeyword(
    a.nomNormalise,
    b.nomNormalise,
  )
  const compareAdresses = differentService ? diceSimilarity : scoreAdresse
  const sAdresse = compareAdresses(a.adresseNormalisee, b.adresseNormalisee)

  const sGeo = scoreGeo(a, b)
  const sSiret = scoreSiret(a, b)
  const sTelephone = scoreTelephone(a, b)

  const geoAvailable = [a.latitude, a.longitude, b.latitude, b.longitude].every(
    (coordinate) => coordinate != null,
  )
  const ignoreGeo = !differentService && (!geoAvailable || sAdresse >= 0.85)

  // With geography in play the weights are used as they are.
  let geoContribution = sGeo * POIDS_GEO
  let weightsSum = 1
  if (ignoreGeo) {
    geoContribution = 0
    weightsSum = POIDS_NOM + POIDS_ADRESSE + POIDS_SIRET + POIDS_TELEPHONE
  }

  const weightedSum =
    sNom * POIDS_NOM +
    sAdresse * POIDS_ADRESSE +
    geoContribution +
    sSiret * POIDS_SIRET +
    sTelephone * POIDS_TELEPHONE

  return {
    scoreNom: sNom,
    scoreAdresse: sAdresse,
    scoreGeo: sGeo,
    scoreSiret: sSiret,
    scoreTelephone: sTelephone,
    scoreTotal: weightedSum / weightsSum,
  }
}
380

381
// ── CSV ──
382

383
/**
 * Escapes a value for a `;`-separated CSV file: a value containing the
 * delimiter, a double quote, or a line break (`\n` or `\r`) is wrapped in
 * double quotes, with inner double quotes doubled. Any other value is
 * returned unchanged.
 */
const escapeCsvField = (value: string) =>
  /[;"\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value
387

388
/**
 * Header line of the pairs CSV: the per-structure columns for side A, the
 * same columns for side B, then the INSEE code and the scores. The column
 * order matches the values produced by `paireToCsv`.
 */
const csvHeader = (() => {
  const structureColumns = [
    'id',
    'nom',
    'siret',
    'adresse',
    'commune',
    'activites',
    'emplois',
    'mediateurs',
    'visible_carto',
  ]
  const forSide = (side: 'a' | 'b') =>
    structureColumns.map((column) => `${column}_${side}`)

  return [
    ...forSide('a'),
    ...forSide('b'),
    'code_insee',
    'score_nom',
    'score_adresse',
    'score_geo',
    'score_siret',
    'score_telephone',
    'score_total',
  ].join(';')
})()
415

416
/**
 * Serializes one pair as a `;`-separated CSV line, in the column order of
 * `csvHeader`. Names, addresses and communes are escaped; booleans are
 * written as "oui" / "non"; scores are written with three decimals.
 */
const paireToCsv = (p: PaireDoublon): string => {
  const ouiNon = (flag: boolean) => (flag ? 'oui' : 'non')

  const structureA = [
    p.idA,
    escapeCsvField(p.nomA),
    p.siretA,
    escapeCsvField(p.adresseA),
    escapeCsvField(p.communeA),
    p.activitesA,
    p.emploisA,
    p.mediateursA,
    ouiNon(p.visibleCartoA),
  ]
  const structureB = [
    p.idB,
    escapeCsvField(p.nomB),
    p.siretB,
    escapeCsvField(p.adresseB),
    escapeCsvField(p.communeB),
    p.activitesB,
    p.emploisB,
    p.mediateursB,
    ouiNon(p.visibleCartoB),
  ]
  const scores = [
    p.scoreNom,
    p.scoreAdresse,
    p.scoreGeo,
    p.scoreSiret,
    p.scoreTelephone,
    p.scoreTotal,
  ].map((score) => score.toFixed(3))

  return [...structureA, ...structureB, p.codeInsee, ...scores].join(';')
}
444

445
// ── Job ──
446

447
/**
 * Creates an empty union-find (disjoint sets) over string ids.
 * `find` registers unknown ids on the fly and compresses paths;
 * `union` attaches the root of `a` under the root of `b`.
 */
const createUnionFind = () => {
  const parent = new Map<string, string>()

  const find = (id: string): string => {
    if (!parent.has(id)) parent.set(id, id)
    let root = id
    while (parent.get(root) !== root) root = parent.get(root) as string
    // Path compression
    let current = id
    while (current !== root) {
      const next = parent.get(current) as string
      parent.set(current, root)
      current = next
    }
    return root
  }

  const union = (a: string, b: string) => {
    const ra = find(a)
    const rb = find(b)
    if (ra !== rb) parent.set(ra, rb)
  }

  return { find, union }
}

/**
 * Detects fuzzy duplicates among non-deleted structures that have an INSEE
 * code.
 *
 * Steps:
 * 1. Load the structures and group them by INSEE code.
 * 2. Score every pair inside each group; keep pairs whose total score
 *    reaches `job.payload.seuilScore` (0.6 by default). `job.payload.limit`
 *    restricts the run to the first N groups.
 * 3. Write the pairs to `detect-duplicate-structures.csv`, highest score
 *    first.
 * 4. Group the pairs into clusters (connected components) and classify
 *    each cluster by the number of distinct physical places it spans:
 *    `doublon_certain` (one place), `multi_site` (every structure in its
 *    own place) or `mixte`.
 * 5. Write the clusters to `detect-duplicate-clusters.csv`, log a report
 *    and return summary counts with the two export paths.
 */
export const executeDetectDuplicateStructures = async (
  job: DetectDuplicateStructuresJob,
) => {
  const seuilScore = job.payload?.seuilScore ?? 0.6
  const limit = job.payload?.limit

  output.log(
    `detect-duplicate-structures: starting (seuil: ${seuilScore})${limit ? ` (limit: ${limit} codes INSEE)` : ''}...`,
  )

  const structures = await prismaClient.structure.findMany({
    where: {
      suppression: null,
      codeInsee: { not: null },
    },
    select: {
      id: true,
      nom: true,
      siret: true,
      adresse: true,
      commune: true,
      codePostal: true,
      codeInsee: true,
      telephone: true,
      latitude: true,
      longitude: true,
      visiblePourCartographieNationale: true,
      activitesCount: true,
      _count: {
        select: {
          emplois: true,
          mediateursEnActivite: true,
        },
      },
    },
  })

  // Group by INSEE code
  const parCodeInsee = new Map<string, StructureLight[]>()
  for (const s of structures) {
    // The query filters out null INSEE codes.
    const codeInsee = s.codeInsee as string
    const light: StructureLight = {
      id: s.id,
      nom: s.nom,
      nomNormalise: normalizeNom(s.nom),
      siret: s.siret,
      adresse: s.adresse,
      adresseNormalisee: normalizeAdresse(s.adresse),
      commune: s.commune,
      codePostal: s.codePostal,
      codeInsee: s.codeInsee,
      telephone: s.telephone,
      latitude: s.latitude,
      longitude: s.longitude,
      visiblePourCartographieNationale: s.visiblePourCartographieNationale,
      activitesCount: s.activitesCount,
      emploisCount: s._count.emplois,
      mediateursCount: s._count.mediateursEnActivite,
    }
    const group = parCodeInsee.get(codeInsee)
    if (group) {
      group.push(light)
    } else {
      parCodeInsee.set(codeInsee, [light])
    }
  }

  // Only groups with at least 2 structures can contain a pair
  const groupesComparables = [...parCodeInsee.entries()].filter(
    ([, group]) => group.length >= 2,
  )

  const groupesATraiter = limit
    ? groupesComparables.slice(0, limit)
    : groupesComparables

  const totalComparaisons = groupesATraiter.reduce(
    (sum, [, group]) => sum + (group.length * (group.length - 1)) / 2,
    0,
  )

  output.log(
    `detect-duplicate-structures: ${structures.length} structures, ${groupesComparables.length} codes INSEE avec ≥2 structures, ${totalComparaisons} comparaisons à effectuer`,
  )

  const paires: PaireDoublon[] = []
  let comparaisonsEffectuees = 0

  for (const [codeInsee, group] of groupesATraiter) {
    for (let i = 0; i < group.length; i++) {
      for (let j = i + 1; j < group.length; j++) {
        const a = group[i]
        const b = group[j]
        comparaisonsEffectuees++

        const scores = computeScore(a, b)

        if (scores.scoreTotal >= seuilScore) {
          paires.push({
            idA: a.id,
            nomA: a.nom,
            siretA: a.siret ?? '',
            adresseA: a.adresse,
            communeA: a.commune,
            activitesA: a.activitesCount,
            emploisA: a.emploisCount,
            mediateursA: a.mediateursCount,
            visibleCartoA: a.visiblePourCartographieNationale,
            idB: b.id,
            nomB: b.nom,
            siretB: b.siret ?? '',
            adresseB: b.adresse,
            communeB: b.commune,
            activitesB: b.activitesCount,
            emploisB: b.emploisCount,
            mediateursB: b.mediateursCount,
            visibleCartoB: b.visiblePourCartographieNationale,
            codeInsee,
            ...scores,
          })
        }

        // Checked after every comparison so that each multiple of 50 000 is
        // reported, wherever it falls inside a group.
        if (comparaisonsEffectuees % 50_000 === 0) {
          output.log(
            `detect-duplicate-structures: progress ${comparaisonsEffectuees}/${totalComparaisons} comparaisons, ${paires.length} paires trouvées`,
          )
        }
      }
    }
  }

  // Highest score first
  paires.sort((a, b) => b.scoreTotal - a.scoreTotal)

  // ── CSV export ──

  const csvLines = [csvHeader, ...paires.map(paireToCsv)]
  const filePath = getAuditOutputPath('detect-duplicate-structures.csv')
  await writeFile(filePath, csvLines.join('\n'), 'utf-8')

  // ── Console report ──

  output.log(`\n=== DÉTECTION DOUBLONS FLOUS - RÉSULTATS ===`)
  output.log(`Structures analysées: ${structures.length}`)
  output.log(`Codes INSEE avec ≥2 structures: ${groupesComparables.length}`)
  output.log(`Comparaisons effectuées: ${comparaisonsEffectuees}`)
  output.log(`Paires détectées (score ≥ ${seuilScore}): ${paires.length}`)

  // Score distribution
  if (paires.length > 0) {
    const ranges = [
      // Upper bound above 1 so that a score of exactly 1 is counted.
      { label: '0.9 - 1.0', min: 0.9, max: 1.01 },
      { label: '0.8 - 0.9', min: 0.8, max: 0.9 },
      { label: '0.7 - 0.8', min: 0.7, max: 0.8 },
      { label: '0.6 - 0.7', min: 0.6, max: 0.7 },
      { label: '< 0.6', min: 0, max: 0.6 },
    ]
    output.log(`\n--- Distribution des scores ---`)
    for (const range of ranges) {
      const count = paires.filter(
        (p) => p.scoreTotal >= range.min && p.scoreTotal < range.max,
      ).length
      if (count > 0) {
        output.log(`  ${range.label}: ${count}`)
      }
    }
  }

  // ── Clusters (connected components of the pair graph) ──

  const { find, union } = createUnionFind()

  for (const p of paires) {
    union(p.idA, p.idB)
  }

  type ClusterType = 'doublon_certain' | 'multi_site' | 'mixte'

  type Cluster = {
    ids: Set<string>
    paires: PaireDoublon[]
    scoreMax: number
    type: ClusterType
    nbLieuxDistincts: number
  }

  const clusterMap = new Map<string, Cluster>()

  for (const p of paires) {
    const root = find(p.idA)
    let cluster = clusterMap.get(root)
    if (!cluster) {
      cluster = {
        ids: new Set(),
        paires: [],
        scoreMax: 0,
        type: 'doublon_certain',
        nbLieuxDistincts: 1,
      }
      clusterMap.set(root, cluster)
    }
    cluster.ids.add(p.idA)
    cluster.ids.add(p.idB)
    cluster.paires.push(p)
    if (p.scoreTotal > cluster.scoreMax) cluster.scoreMax = p.scoreTotal
  }

  // ── Classification of clusters by physical place ──
  // Two structures are at the same place when:
  // - scoreGeo ≥ 0.7 (~200m): geographic proximity
  // - OR scoreAdresse ≥ 0.85: near-identical address (fallback when there
  //   are no coordinates)

  const SEUIL_MEME_LIEU_GEO = 0.7
  const SEUIL_MEME_LIEU_ADRESSE = 0.85

  const isSameLieu = (p: PaireDoublon): boolean =>
    p.scoreGeo >= SEUIL_MEME_LIEU_GEO ||
    p.scoreAdresse >= SEUIL_MEME_LIEU_ADRESSE

  for (const cluster of clusterMap.values()) {
    // Local union-find grouping the cluster's structures by physical place
    const lieuxUnionFind = createUnionFind()

    // Register every id, including those never merged with another one
    for (const id of cluster.ids) lieuxUnionFind.find(id)

    for (const p of cluster.paires) {
      if (isSameLieu(p)) {
        lieuxUnionFind.union(p.idA, p.idB)
      }
    }

    // Count the distinct places
    const lieux = new Set<string>()
    for (const id of cluster.ids) lieux.add(lieuxUnionFind.find(id))

    cluster.nbLieuxDistincts = lieux.size

    if (lieux.size === 1) {
      cluster.type = 'doublon_certain'
    } else if (lieux.size === cluster.ids.size) {
      cluster.type = 'multi_site'
    } else {
      cluster.type = 'mixte'
    }
  }

  // Largest clusters first
  const clusters = [...clusterMap.values()].sort(
    (a, b) => b.ids.size - a.ids.size,
  )

  const structuresImpliquees = new Set<string>()
  for (const c of clusters) {
    for (const id of c.ids) structuresImpliquees.add(id)
  }

  // ── Cluster report ──

  output.log(`\n--- Clusters ---`)
  output.log(`Clusters trouvés: ${clusters.length}`)

  const clusterSizeDistrib = new Map<number, number>()
  for (const c of clusters) {
    const size = c.ids.size
    clusterSizeDistrib.set(size, (clusterSizeDistrib.get(size) ?? 0) + 1)
  }
  output.log(`Distribution par taille:`)
  for (const [size, count] of [...clusterSizeDistrib.entries()].sort(
    (a, b) => b[0] - a[0],
  )) {
    output.log(`  ${size} structures: ${count} clusters`)
  }

  // Counts per cluster type
  const typeDistrib: Record<ClusterType, number> = {
    doublon_certain: 0,
    multi_site: 0,
    mixte: 0,
  }
  const structuresParType: Record<ClusterType, number> = {
    doublon_certain: 0,
    multi_site: 0,
    mixte: 0,
  }
  for (const c of clusters) {
    typeDistrib[c.type]++
    structuresParType[c.type] += c.ids.size
  }

  output.log(`\n--- Classification ---`)
  for (const type of ['doublon_certain', 'multi_site', 'mixte'] as const) {
    output.log(
      `  ${type}: ${typeDistrib[type]} clusters (${structuresParType[type]} structures)`,
    )
  }

  // Top 10 largest clusters
  output.log(`\n--- Top 10 plus gros clusters ---`)
  for (const cluster of clusters.slice(0, 10)) {
    const exemple = cluster.paires[0]
    const noms = new Set<string>()
    const sirets = new Set<string>()
    for (const p of cluster.paires) {
      noms.add(p.nomA)
      noms.add(p.nomB)
      if (p.siretA) sirets.add(p.siretA)
      if (p.siretB) sirets.add(p.siretB)
    }
    output.log(
      `  [${cluster.type}] [${cluster.ids.size} structures, ${cluster.nbLieuxDistincts} lieux, score_max=${cluster.scoreMax.toFixed(3)}]`,
    )
    output.log(
      `    Noms: ${[...noms]
        .slice(0, 3)
        .map((n) => `"${n}"`)
        .join(', ')}${noms.size > 3 ? ` (+${noms.size - 3})` : ''}`,
    )
    output.log(
      `    Commune: ${exemple.communeA} | SIRETs: ${sirets.size > 0 ? [...sirets].join(', ') : '—'}`,
    )
  }

  // Top 10 most interesting pairs (strict duplicates excluded)
  const pairesInteressantes = paires.filter(
    (p) =>
      !(p.scoreNom >= 0.95 && p.scoreAdresse >= 0.95 && p.scoreSiret === 1),
  )

  // Keep one pair per cluster to vary the examples
  const clustersVus = new Set<string>()
  const pairesVariees: PaireDoublon[] = []
  for (const p of pairesInteressantes) {
    const root = find(p.idA)
    if (!clustersVus.has(root)) {
      clustersVus.add(root)
      pairesVariees.push(p)
      if (pairesVariees.length >= 10) break
    }
  }

  output.log(`\n--- Top 10 paires (hors doublons stricts, 1 par cluster) ---`)
  for (const p of pairesVariees) {
    output.log(
      `  score=${p.scoreTotal.toFixed(3)} [nom=${p.scoreNom.toFixed(2)} adr=${p.scoreAdresse.toFixed(2)} geo=${p.scoreGeo.toFixed(2)} siret=${p.scoreSiret.toFixed(0)}]`,
    )
    output.log(`    "${p.nomA}" ↔ "${p.nomB}" | ${p.communeA}`)
    output.log(
      `    "${p.adresseA}" ↔ "${p.adresseB}" | SIRET: ${p.siretA || '—'} / ${p.siretB || '—'}`,
    )
  }

  output.log(`\nStructures uniques impliquées: ${structuresImpliquees.size}`)
  output.log(`Export: ${filePath} (${paires.length} paires)`)

  // ── Clusters CSV export ──

  const clustersCsvHeader = [
    'cluster_id',
    'type',
    'taille',
    'nb_lieux_distincts',
    'paires',
    'score_max',
    'noms',
    'sirets',
    'commune',
    'ids',
  ].join(';')

  const clustersCsvLines = [
    clustersCsvHeader,
    ...clusters.map((c, i) => {
      const noms = new Set<string>()
      const sirets = new Set<string>()
      const communes = new Set<string>()
      for (const p of c.paires) {
        noms.add(p.nomA)
        noms.add(p.nomB)
        if (p.siretA) sirets.add(p.siretA)
        if (p.siretB) sirets.add(p.siretB)
        // Both sides, like names and SIRETs: the commune label of B may be
        // spelled differently from A's.
        communes.add(p.communeA)
        communes.add(p.communeB)
      }
      return [
        i + 1,
        c.type,
        c.ids.size,
        c.nbLieuxDistincts,
        c.paires.length,
        c.scoreMax.toFixed(3),
        escapeCsvField([...noms].join(' | ')),
        [...sirets].join(' | ') || '',
        escapeCsvField([...communes].join(' | ')),
        [...c.ids].join(' | '),
      ].join(';')
    }),
  ]

  const clustersFilePath = getAuditOutputPath('detect-duplicate-clusters.csv')
  await writeFile(clustersFilePath, clustersCsvLines.join('\n'), 'utf-8')

  output.log(
    `Export clusters: ${clustersFilePath} (${clusters.length} clusters)`,
  )

  output.log(`\ndetect-duplicate-structures: terminé`)

  return {
    structuresAnalysees: structures.length,
    codesInseeAvecDoublons: groupesComparables.length,
    comparaisonsEffectuees,
    pairesDetectees: paires.length,
    clusters: {
      total: clusters.length,
      doublon_certain: typeDistrib.doublon_certain,
      multi_site: typeDistrib.multi_site,
      mixte: typeDistrib.mixte,
    },
    structuresImpliquees: structuresImpliquees.size,
    seuilScore,
    exports: {
      paires: filePath,
      clusters: clustersFilePath,
    },
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc