• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

SEED-platform / seed / #6522

pending completion
#6522

push

coveralls-python

web-flow
Update derived column migration to prevent conflicting column names and prevent duplicate column names (#3728)

* update derived column migration to create unique names for derived columns

* prevent derived column names that are duplicated with column names

* disable create/save on error

* update and add test

15680 of 22591 relevant lines covered (69.41%)

0.69 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

19.23
/seed/lib/mappings/mapper.py
1
# !/usr/bin/env python
2
# encoding: utf-8
3
"""
1✔
4
:copyright (c) 2014 - 2022, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Department of Energy) and contributors. All rights reserved.
5
:author Dan Gunter <dkgunter@lbl.gov>
6
"""
7
import json
1✔
8
import logging
1✔
9
import os
1✔
10
import re
1✔
11
from collections import OrderedDict
1✔
12
from os.path import dirname, join, realpath
1✔
13

14
from past.builtins import basestring
1✔
15
from unidecode import unidecode
1✔
16

17
# Units whose names may carry a dimension suffix (e.g. "ft^2", "m_") that
# the sanitizer rewrites into a plain digit ("ft2", "m2").
LINEAR_UNITS = set(['ft', 'm', 'in'])

# Directory holding the bundled mapping JSON files (e.g. pm-mapping.json),
# resolved relative to this module's location.
MAPPING_DATA_DIR = join(dirname(realpath(__file__)), 'data')

_log = logging.getLogger(__name__)
21

22

23
def _sanitize_and_convert_keys_to_regex(key):
    """Replace spaces with spaces OR underscores, as a regular expr.

    Also compresses multiple spaces to a single one, and allows multiple
    spaces or underscores in the resulting expression.

    For example:
       "foo  bar__baz" -> "foo( |_)+bar( |_)+baz"

    :param key: raw column name (str; bytes are transliterated first)
    :return: compiled case-insensitive pattern matching the sanitized key
    """

    # force unicode
    # TODO: python3 check if this to run in python3
    if isinstance(key, basestring):
        key = unidecode(key)

    # fix superscripts - copied from old code
    found = False
    for pfx in LINEAR_UNITS:
        if pfx not in key:
            continue
        for (sfx, repl) in ('_', '2'), ('^2', '2'), ('^3', '3'):
            s = pfx + sfx
            p = key.find(s)
            if p >= 0:  # yes, the unit has a dimension
                key = key[:p + len(pfx)] + repl + key[p + len(s):]
                found = True
                break
        if found:
            break

    # escape special characters before regexing. '\\' is first so the escapes
    # we introduce are not themselves re-escaped. '[', ']' and '|' are
    # included here: previously they leaked through unescaped, so a key such
    # as "area [m2]" compiled into a character class and matched incorrectly,
    # and an unbalanced '[' raised re.error outright.
    for special in ('\\', '(', ')', '?', '*', '+', '.', '{', '}', '^', '$', '[', ']', '|'):
        key = key.replace(special, '\\' + special)

    # convert underscores to white space, then collapse any run of
    # whitespace to a single space (this subsumes the old '  ' -> ' ' pass)
    key = key.replace('_', ' ')
    key = re.sub(r'\s+', ' ', key).strip()

    # convert white space to regex for space or underscore (repeated)
    key = key.replace(' ', '( |_)+')

    return re.compile(key, re.IGNORECASE)
65

66

67
def create_column_regexes(raw_columns):
    """
    Take the columns in the format below and sanitize the keys and add
    in the regex.

    :param raw_columns: list of strings (columns names from imported file)

    :return: list of dict

    .. code:

        Result shall look like:

        [
            {'regex': <_sre.SRE_Pattern object at 0x10f151a50>, 'raw': 'has_underscores'},
            {'regex': <_sre.SRE_Pattern object at 0x10f10e870>, 'raw': 'has  multi spaces'}
        ]
    """
    if not raw_columns:
        _log.debug("No raw_columns provided!")
        return []

    # clean up the comparing columns: pair each raw name with its
    # sanitized/compiled regex
    return [
        {'raw': c, 'regex': _sanitize_and_convert_keys_to_regex(c)}
        for c in raw_columns
    ]
98

99

100
def get_pm_mapping(raw_columns, mapping_data=None, resolve_duplicates=True):
    """
    Create and return Portfolio Manager (PM) mapping for a given version of PM and the given
    list of column names.

    The method will take the raw_columns (from the CSV/XLSX file) and attempt to normalize the
    column names so that they can be mapped to the data in the pm-mapping.json['from_field'].

    :param raw_columns: list of strings (column names from the imported file)
    :param mapping_data: optional pre-loaded mapping list; when falsy the
        bundled pm-mapping.json is loaded from MAPPING_DATA_DIR
    :param resolve_duplicates: when True, rename repeated (table, field)
        targets with a ``_duplicate_N`` suffix so every target is unique
    :return: OrderedDict of raw column name -> (to_table_name, to_field, confidence)

    .. code:
        [
            {
                "display_name": "Address Line 1",
                "to_field": "address_line_1",
                "to_table_name": "PropertyState",
                "from_field": "Address 1",
                "units": "",
                "type": "string",
                "schema": ""
            }
        ]

    .. code:

        # Without duplicates

        {
            'Address 1': ('PropertyState', 'address_line_1', 100),
            'Property ID': ('PropertyState', 'pm_property_id', 100),
            'Portfolio Manager Property ID': ('PropertyState', 'Portfolio Manager Property ID', 100),
            'Address_1': ('PropertyState', 'Address_1', 100)
        }

        # With duplicates

        {
            'Address 1': ('PropertyState', 'address_line_1', 100),
            'Property ID': ('PropertyState', 'pm_property_id', 100),
            'Portfolio Manager Property ID': ('PropertyState', 'pm_property_id', 100),
            'Address_1': ('PropertyState', 'address_line_1', 100)
        }
    """
    from_columns = create_column_regexes(raw_columns)

    if not mapping_data:
        # context manager replaces the previous bare open() which leaked
        # the file handle
        with open(os.path.join(MAPPING_DATA_DIR, "pm-mapping.json")) as f:
            mapping_data = json.load(f)

    # transform the data into the format expected by the mapper. (see mapping_columns.final_mappings)
    final_mappings = OrderedDict()
    for c in from_columns:
        column_found = False
        for d in mapping_data:
            if c['regex'].match(d['from_field']):
                # Assume that the mappings are 100% accurate for now.
                # NOTE(review): 'continue' (not 'break') means a later
                # matching entry overwrites an earlier one — preserved
                # as-is; confirm last-match-wins is intended.
                final_mappings[c['raw']] = (d['to_table_name'], d['to_field'], 100)
                column_found = True
                continue

        if not column_found:
            # if we get here then the columns was never found
            _log.debug("Could not find applicable mappings, resorting to raw field ({}) in PropertyState".format(c['raw']))
            final_mappings[c['raw']] = ('PropertyState', c['raw'], 100)

    # verify that there are no duplicate matchings; repeated targets get a
    # numbered "_duplicate_N" suffix on the to_field
    if resolve_duplicates:
        unique_mappings = set()
        for k, v in final_mappings.items():
            if v not in unique_mappings:
                unique_mappings.add(v)
            else:
                i = 1
                base = v[1]
                # bump the suffix until the tuple is unique
                while v in unique_mappings:
                    new_v = base + "_duplicate_{}".format(i)
                    v = (v[0], new_v, v[2])
                    i += 1

                unique_mappings.add(v)
                final_mappings[k] = v

    return final_mappings
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc