• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

SEED-platform / seed / #6522

pending completion
#6522

push

coveralls-python

web-flow
Update derived column migration to prevent conflicting column names and prevent duplicate column names (#3728)

* update derived column migration to create unique names for derived columns

* prevent derived column names that are duplicated with column names

* disable create/save on error

* update and add test

15680 of 22591 relevant lines covered (69.41%)

0.69 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

19.23
/seed/lib/mappings/mapper.py
1
# !/usr/bin/env python
2
# encoding: utf-8
3
"""
1✔
4
:copyright (c) 2014 - 2022, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Department of Energy) and contributors. All rights reserved.
5
:author Dan Gunter <dkgunter@lbl.gov>
6
"""
7
import json
1✔
8
import logging
1✔
9
import os
1✔
10
import re
1✔
11
from collections import OrderedDict
1✔
12
from os.path import dirname, join, realpath
1✔
13

14
from past.builtins import basestring
1✔
15
from unidecode import unidecode
1✔
16

17
# Units whose names may carry a dimension suffix (e.g. "ft^2", "m_") that
# the sanitizer rewrites into a plain digit ("ft2", "m2").
LINEAR_UNITS = set(['ft', 'm', 'in'])

# Directory holding the bundled mapping JSON files (e.g. pm-mapping.json),
# resolved relative to this module's location.
MAPPING_DATA_DIR = join(dirname(realpath(__file__)), 'data')

_log = logging.getLogger(__name__)
21

22

23
def _sanitize_and_convert_keys_to_regex(key):
    """Replace spaces with spaces OR underscores, as a regular expr.

    Also compresses multiple spaces to a single one, and allows multiple
    spaces or underscores in the resulting expression.

    For example:
       "foo  bar__baz" -> "foo( |_)+bar( |_)+baz"

    :param key: raw column name (str; bytes are transliterated first)
    :return: compiled case-insensitive pattern matching the sanitized key
    """

    # force unicode
    # TODO: python3 check if this to run in python3
    if isinstance(key, basestring):
        key = unidecode(key)

    # fix superscripts - copied from old code
    found = False
    for pfx in LINEAR_UNITS:
        if pfx not in key:
            continue
        for (sfx, repl) in ('_', '2'), ('^2', '2'), ('^3', '3'):
            s = pfx + sfx
            p = key.find(s)
            if p >= 0:  # yes, the unit has a dimension
                key = key[:p + len(pfx)] + repl + key[p + len(s):]
                found = True
                break
        if found:
            break

    # escape special characters before regexing. '\\' is first so the escapes
    # we introduce are not themselves re-escaped. '[', ']' and '|' are
    # included here: previously they leaked through unescaped, so a key such
    # as "area [m2]" compiled into a character class and matched incorrectly,
    # and an unbalanced '[' raised re.error outright.
    for special in ('\\', '(', ')', '?', '*', '+', '.', '{', '}', '^', '$', '[', ']', '|'):
        key = key.replace(special, '\\' + special)

    # convert underscores to white space, then collapse any run of
    # whitespace to a single space (this subsumes the old '  ' -> ' ' pass)
    key = key.replace('_', ' ')
    key = re.sub(r'\s+', ' ', key).strip()

    # convert white space to regex for space or underscore (repeated)
    key = key.replace(' ', '( |_)+')

    return re.compile(key, re.IGNORECASE)
65

66

67
def create_column_regexes(raw_columns):
    """
    Take the columns in the format below and sanitize the keys and add
    in the regex.

    :param raw_columns: list of strings (columns names from imported file)

    :return: list of dict

    .. code:

        Result shall look like:

        [
            {'regex': <_sre.SRE_Pattern object at 0x10f151a50>, 'raw': 'has_underscores'},
            {'regex': <_sre.SRE_Pattern object at 0x10f10e870>, 'raw': 'has  multi spaces'}
        ]
    """
    if not raw_columns:
        _log.debug("No raw_columns provided!")
        return []

    # clean up the comparing columns: pair each raw name with its
    # sanitized/compiled regex
    return [
        {'raw': c, 'regex': _sanitize_and_convert_keys_to_regex(c)}
        for c in raw_columns
    ]
98

99

100
def get_pm_mapping(raw_columns, mapping_data=None, resolve_duplicates=True):
    """
    Create and return Portfolio Manager (PM) mapping for a given version of PM and the given
    list of column names.

    The method will take the raw_columns (from the CSV/XLSX file) and attempt to normalize the
    column names so that they can be mapped to the data in the pm-mapping.json['from_field'].

    :param raw_columns: list of strings (column names from the imported file)
    :param mapping_data: optional pre-loaded mapping list; when falsy the
        bundled pm-mapping.json is loaded from MAPPING_DATA_DIR
    :param resolve_duplicates: when True, rename repeated (table, field)
        targets with a ``_duplicate_N`` suffix so every target is unique
    :return: OrderedDict of raw column name -> (to_table_name, to_field, confidence)

    .. code:
        [
            {
                "display_name": "Address Line 1",
                "to_field": "address_line_1",
                "to_table_name": "PropertyState",
                "from_field": "Address 1",
                "units": "",
                "type": "string",
                "schema": ""
            }
        ]

    .. code:

        # Without duplicates

        {
            'Address 1': ('PropertyState', 'address_line_1', 100),
            'Property ID': ('PropertyState', 'pm_property_id', 100),
            'Portfolio Manager Property ID': ('PropertyState', 'Portfolio Manager Property ID', 100),
            'Address_1': ('PropertyState', 'Address_1', 100)
        }

        # With duplicates

        {
            'Address 1': ('PropertyState', 'address_line_1', 100),
            'Property ID': ('PropertyState', 'pm_property_id', 100),
            'Portfolio Manager Property ID': ('PropertyState', 'pm_property_id', 100),
            'Address_1': ('PropertyState', 'address_line_1', 100)
        }
    """
    from_columns = create_column_regexes(raw_columns)

    if not mapping_data:
        # context manager replaces the previous bare open() which leaked
        # the file handle
        with open(os.path.join(MAPPING_DATA_DIR, "pm-mapping.json")) as f:
            mapping_data = json.load(f)

    # transform the data into the format expected by the mapper. (see mapping_columns.final_mappings)
    final_mappings = OrderedDict()
    for c in from_columns:
        column_found = False
        for d in mapping_data:
            if c['regex'].match(d['from_field']):
                # Assume that the mappings are 100% accurate for now.
                # NOTE(review): 'continue' (not 'break') means a later
                # matching entry overwrites an earlier one — preserved
                # as-is; confirm last-match-wins is intended.
                final_mappings[c['raw']] = (d['to_table_name'], d['to_field'], 100)
                column_found = True
                continue

        if not column_found:
            # if we get here then the columns was never found
            _log.debug("Could not find applicable mappings, resorting to raw field ({}) in PropertyState".format(c['raw']))
            final_mappings[c['raw']] = ('PropertyState', c['raw'], 100)

    # verify that there are no duplicate matchings; repeated targets get a
    # numbered "_duplicate_N" suffix on the to_field
    if resolve_duplicates:
        unique_mappings = set()
        for k, v in final_mappings.items():
            if v not in unique_mappings:
                unique_mappings.add(v)
            else:
                i = 1
                base = v[1]
                # bump the suffix until the tuple is unique
                while v in unique_mappings:
                    new_v = base + "_duplicate_{}".format(i)
                    v = (v[0], new_v, v[2])
                    i += 1

                unique_mappings.add(v)
                final_mappings[k] = v

    return final_mappings
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc