• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OpenRefine / OpenRefine / 13496948800

24 Feb 2025 11:27AM UTC coverage: 70.071% (+20.3%) from 49.798%
13496948800

Pull #7153

github

web-flow
Merge df7f6e02f into 4d08ea811
Pull Request #7153: operations: Renaming support for c.g.r.o.column

3377 of 5501 branches covered (61.39%)

Branch coverage included in aggregate %.

24 of 26 new or added lines in 6 files covered. (92.31%)

9575 of 12983 relevant lines covered (73.75%)

3.37 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.38
/main/src/com/google/refine/operations/column/ColumnAdditionByFetchingURLsOperation.java
1
/*
2

3
Copyright 2010, Google Inc.
4
All rights reserved.
5

6
Redistribution and use in source and binary forms, with or without
7
modification, are permitted provided that the following conditions are
8
met:
9

10
    * Redistributions of source code must retain the above copyright
11
notice, this list of conditions and the following disclaimer.
12
    * Redistributions in binary form must reproduce the above
13
copyright notice, this list of conditions and the following disclaimer
14
in the documentation and/or other materials provided with the
15
distribution.
16
    * Neither the name of Google Inc. nor the names of its
17
contributors may be used to endorse or promote products derived from
18
this software without specific prior written permission.
19

20
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31

32
*/
33

34
package com.google.refine.operations.column;
35

36
import static com.google.common.base.Strings.isNullOrEmpty;
37

38
import java.io.IOException;
39
import java.io.Serializable;
40
import java.util.ArrayList;
41
import java.util.List;
42
import java.util.Map;
43
import java.util.Optional;
44
import java.util.Properties;
45
import java.util.Set;
46
import java.util.concurrent.TimeUnit;
47

48
import com.fasterxml.jackson.annotation.JsonCreator;
49
import com.fasterxml.jackson.annotation.JsonProperty;
50
import com.google.common.cache.CacheBuilder;
51
import com.google.common.cache.CacheLoader;
52
import com.google.common.cache.LoadingCache;
53
import org.apache.commons.lang.Validate;
54
import org.apache.hc.core5.http.Header;
55
import org.apache.hc.core5.http.message.BasicHeader;
56

57
import com.google.refine.browsing.Engine;
58
import com.google.refine.browsing.EngineConfig;
59
import com.google.refine.browsing.FilteredRows;
60
import com.google.refine.browsing.RowVisitor;
61
import com.google.refine.expr.EvalError;
62
import com.google.refine.expr.Evaluable;
63
import com.google.refine.expr.ExpressionUtils;
64
import com.google.refine.expr.MetaParser;
65
import com.google.refine.expr.ParsingException;
66
import com.google.refine.expr.WrappedCell;
67
import com.google.refine.history.HistoryEntry;
68
import com.google.refine.model.Cell;
69
import com.google.refine.model.Column;
70
import com.google.refine.model.ColumnsDiff;
71
import com.google.refine.model.Project;
72
import com.google.refine.model.Row;
73
import com.google.refine.model.changes.CellAtRow;
74
import com.google.refine.model.changes.ColumnAdditionChange;
75
import com.google.refine.operations.EngineDependentOperation;
76
import com.google.refine.operations.OnError;
77
import com.google.refine.operations.OperationDescription;
78
import com.google.refine.process.LongRunningProcess;
79
import com.google.refine.process.Process;
80
import com.google.refine.util.HttpClient;
81

82
public class ColumnAdditionByFetchingURLsOperation extends EngineDependentOperation {
83

84
    public static final class HttpHeader {
85

86
        @JsonProperty("name")
87
        final public String name;
88
        @JsonProperty("value")
89
        final public String value;
90

91
        @JsonCreator
92
        public HttpHeader(
93
                @JsonProperty("name") String name,
94
                @JsonProperty("value") String value) {
2✔
95
            this.name = name;
3✔
96
            this.value = value;
3✔
97
        }
1✔
98
    }
99

100
    final protected String _baseColumnName;
101
    final protected String _urlExpression;
102
    final protected OnError _onError;
103

104
    final protected String _newColumnName;
105
    final protected int _columnInsertIndex;
106
    final protected int _delay;
107
    final protected boolean _cacheResponses;
108
    final protected List<HttpHeader> _httpHeadersJson;
109
    private Header[] httpHeaders = new Header[0];
4✔
110
    private HttpClient _httpClient;
111

112
    @JsonCreator
113
    public ColumnAdditionByFetchingURLsOperation(
114
            @JsonProperty("engineConfig") EngineConfig engineConfig,
115
            @JsonProperty("baseColumnName") String baseColumnName,
116
            @JsonProperty("urlExpression") String urlExpression,
117
            @JsonProperty("onError") OnError onError,
118
            @JsonProperty("newColumnName") String newColumnName,
119
            @JsonProperty("columnInsertIndex") int columnInsertIndex,
120
            @JsonProperty("delay") int delay,
121
            @JsonProperty("cacheResponses") boolean cacheResponses,
122
            @JsonProperty("httpHeadersJson") List<HttpHeader> httpHeadersJson) {
123
        super(engineConfig);
3✔
124

125
        _baseColumnName = baseColumnName;
3✔
126
        _urlExpression = urlExpression;
3✔
127
        _onError = onError;
3✔
128

129
        _newColumnName = newColumnName;
3✔
130
        _columnInsertIndex = columnInsertIndex;
3✔
131

132
        _delay = delay;
3✔
133
        _cacheResponses = cacheResponses;
3✔
134
        _httpHeadersJson = httpHeadersJson;
3✔
135

136
        List<Header> headers = new ArrayList<Header>();
4✔
137
        if (_httpHeadersJson != null) {
3✔
138
            for (HttpHeader header : _httpHeadersJson) {
11✔
139
                if (!isNullOrEmpty(header.name) && !isNullOrEmpty(header.value)) {
8!
140
                    // TODO: Should we be checking headers against a whitelist here?
141
                    headers.add(new BasicHeader(header.name, header.value));
10✔
142
                }
143
            }
1✔
144
        }
145
        httpHeaders = headers.toArray(httpHeaders);
7✔
146
        _httpClient = new HttpClient(_delay);
7✔
147

148
    }
1✔
149

150
    @Override
151
    public void validate() {
152
        super.validate();
2✔
153
        Validate.notNull(_baseColumnName, "Missing base column name");
4✔
154
        Validate.notNull(_urlExpression, "Missing URL expression");
4✔
155
        try {
156
            MetaParser.parse(_urlExpression);
4✔
157
        } catch (ParsingException e) {
×
158
            throw new IllegalArgumentException(String.format("Invalid expression '%s': %s", _urlExpression, e.getMessage()), e);
×
159
        }
1✔
160
        Validate.notNull(_onError, "Missing 'on error' behaviour");
4✔
161
        Validate.notNull(_newColumnName, "Missing new column name");
4✔
162
        Validate.isTrue(_columnInsertIndex >= 0, "Invalid column insert index");
7!
163
    }
1✔
164

165
    @JsonProperty("newColumnName")
166
    public String getNewColumnName() {
167
        return _newColumnName;
3✔
168
    }
169

170
    @JsonProperty("columnInsertIndex")
171
    public int getColumnInsertIndex() {
172
        return _columnInsertIndex;
3✔
173
    }
174

175
    @JsonProperty("baseColumnName")
176
    public String getBaseColumnName() {
177
        return _baseColumnName;
3✔
178
    }
179

180
    @JsonProperty("urlExpression")
181
    public String getUrlExpression() {
182
        return _urlExpression;
3✔
183
    }
184

185
    @JsonProperty("onError")
186
    public OnError getOnError() {
187
        return _onError;
3✔
188
    }
189

190
    @JsonProperty("delay")
191
    public int getDelay() {
192
        return _delay;
3✔
193
    }
194

195
    @JsonProperty("httpHeadersJson")
196
    public List<HttpHeader> getHttpHeadersJson() {
197
        return _httpHeadersJson;
3✔
198
    }
199

200
    @JsonProperty("cacheResponses")
201
    public boolean getCacheResponses() {
202
        return _cacheResponses;
3✔
203
    }
204

205
    @Override
206
    protected String getBriefDescription(Project project) {
207
        return OperationDescription.column_addition_by_fetching_urls_brief(_newColumnName, _columnInsertIndex, _baseColumnName,
11✔
208
                _urlExpression);
209
    }
210

211
    protected String createDescription(Column column, List<CellAtRow> cellsAtRows) {
212
        return OperationDescription.column_addition_by_fetching_urls_desc(_newColumnName, cellsAtRows.size(), column.getName(),
×
213
                _urlExpression);
214
    }
215

216
    @Override
217
    public Optional<Set<String>> getColumnDependenciesWithoutEngine() {
218
        try {
219
            Evaluable evaluable = MetaParser.parse(_urlExpression);
4✔
220
            return evaluable.getColumnDependencies(Optional.of(_baseColumnName));
6✔
221
        } catch (ParsingException e) {
×
222
            return Optional.empty();
×
223
        }
224
    }
225

226
    @Override
227
    public Optional<ColumnsDiff> getColumnsDiff() {
228
        return Optional.of(ColumnsDiff.builder().addColumn(_newColumnName, _baseColumnName).build());
9✔
229
    }
230

231
    @Override
232
    public ColumnAdditionByFetchingURLsOperation renameColumns(Map<String, String> newColumnNames) {
233
        String renamedExpression;
234
        try {
235
            Evaluable evaluable = MetaParser.parse(_urlExpression);
4✔
236
            Evaluable renamedEvaluable = evaluable.renameColumnDependencies(newColumnNames);
4✔
237
            renamedExpression = renamedEvaluable.getFullSource();
3✔
NEW
238
        } catch (ParsingException e) {
×
NEW
239
            return this;
×
240
        }
1✔
241
        return new ColumnAdditionByFetchingURLsOperation(
6✔
242
                _engineConfig.renameColumnDependencies(newColumnNames),
6✔
243
                newColumnNames.getOrDefault(_baseColumnName, _baseColumnName),
10✔
244
                renamedExpression,
245
                _onError,
246
                newColumnNames.getOrDefault(_newColumnName, _newColumnName),
11✔
247
                _columnInsertIndex,
248
                _delay,
249
                _cacheResponses,
250
                _httpHeadersJson);
251
    }
252

253
    @Override
254
    public Process createProcess(Project project, Properties options) throws Exception {
255
        Engine engine = createEngine(project);
4✔
256
        engine.initializeFromConfig(_engineConfig);
4✔
257

258
        Evaluable eval = MetaParser.parse(_urlExpression);
4✔
259

260
        return new ColumnAdditionByFetchingURLsProcess(
9✔
261
                project,
262
                engine,
263
                eval,
264
                getBriefDescription(null),
4✔
265
                _cacheResponses);
266
    }
267

268
    public class ColumnAdditionByFetchingURLsProcess extends LongRunningProcess implements Runnable {
269

270
        final protected Project _project;
271
        final protected Engine _engine;
272
        final protected Evaluable _eval;
273
        final protected long _historyEntryID;
274
        protected int _cellIndex;
275
        protected LoadingCache<String, Serializable> _urlCache;
276

277
        /**
         * Creates the process and allocates the ID of the history entry it will eventually record.
         *
         * @param project
         *            project whose rows are processed
         * @param engine
         *            engine supplying the filtered rows
         * @param eval
         *            parsed URL expression
         * @param description
         *            description passed to the parent process
         * @param cacheResponses
         *            when {@code true}, a cache holding at most 2048 entries, each expiring 10 minutes after being
         *            written, is placed in front of {@code fetch}; when {@code false}, no cache is created
         */
        public ColumnAdditionByFetchingURLsProcess(
                Project project,
                Engine engine,
                Evaluable eval,
                String description,
                boolean cacheResponses) {
            super(description);
            _project = project;
            _engine = engine;
            _eval = eval;
            _historyEntryID = HistoryEntry.allocateID();

            if (cacheResponses) {
                CacheLoader<String, Serializable> loader = new CacheLoader<String, Serializable>() {

                    @Override
                    public Serializable load(String urlString) throws Exception {
                        Serializable fetched = fetch(urlString, httpHeaders);
                        if (fetched != null) {
                            return fetched;
                        }
                        // the load method should not return any null value
                        throw new Exception("null result returned by fetch");
                    }
                };
                _urlCache = CacheBuilder.newBuilder()
                        .maximumSize(2048)
                        .expireAfterWrite(10, TimeUnit.MINUTES)
                        .build(loader);
            } else {
                _urlCache = null;
            }
        }
1✔
307

308
        @Override
309
        protected Runnable getRunnable() {
310
            return this;
2✔
311
        }
312

313
        @Override
314
        public void run() {
315
            Column column = _project.columnModel.getColumnByName(_baseColumnName);
8✔
316
            if (column == null) {
2!
317
                _project.processManager.onFailedProcess(this, new Exception("No column named " + _baseColumnName));
×
318
                return;
×
319
            }
320
            if (_project.columnModel.getColumnByName(_newColumnName) != null) {
8!
321
                _project.processManager.onFailedProcess(this, new Exception("Another column already named " + _newColumnName));
×
322
                return;
×
323
            }
324

325
            List<CellAtRow> urls = new ArrayList<CellAtRow>(_project.rows.size());
8✔
326

327
            FilteredRows filteredRows = _engine.getAllFilteredRows();
4✔
328
            filteredRows.accept(_project, createRowVisitor(urls));
7✔
329

330
            int count = urls.size();
3✔
331
            List<CellAtRow> responseBodies = new ArrayList<CellAtRow>(count);
5✔
332
            int i = 0;
2✔
333
            for (CellAtRow urlData : urls) {
10✔
334
                String urlString = urlData.cell.value.toString();
5✔
335

336
                Serializable response = null;
2✔
337
                if (_urlCache != null) {
3✔
338
                    response = cachedFetch(urlString);
5✔
339
                } else {
340
                    response = fetch(urlString, httpHeaders);
7✔
341
                }
342

343
                if (response != null) {
2!
344
                    CellAtRow cellAtRow = new CellAtRow(
11✔
345
                            urlData.row,
346
                            new Cell(response, null));
347

348
                    responseBodies.add(cellAtRow);
4✔
349
                }
350

351
                _progress = i++ * 100 / count;
8✔
352

353
                if (_canceled) {
3!
354
                    break;
×
355
                }
356
            }
1✔
357

358
            if (!_canceled) {
3!
359
                HistoryEntry historyEntry = new HistoryEntry(
22✔
360
                        _historyEntryID,
361
                        _project,
362
                        _description,
363
                        ColumnAdditionByFetchingURLsOperation.this,
364
                        new ColumnAdditionChange(
365
                                _newColumnName,
366
                                _columnInsertIndex,
367
                                responseBodies));
368

369
                _project.history.addEntry(historyEntry);
5✔
370
                _project.processManager.onDoneProcess(this);
5✔
371
            }
372
        }
1✔
373

374
        Serializable cachedFetch(String urlString) {
375
            try {
376
                return _urlCache.get(urlString);
6✔
377
            } catch (Exception e) {
×
378
                return null;
×
379
            }
380
        }
381

382
        Serializable fetch(String urlString, Header[] headers) {
383
            try { // HttpClients.createDefault()) {
384
                try {
385
                    return _httpClient.getAsString(urlString, headers);
7✔
386
                } catch (IOException e) {
1✔
387
                    return _onError == OnError.StoreError ? new EvalError(e) : null;
11!
388
                }
389
            } catch (Exception e) {
×
390
                return _onError == OnError.StoreError ? new EvalError(e.getMessage()) : null;
×
391
            }
392
        }
393

394
        RowVisitor createRowVisitor(List<CellAtRow> cellsAtRows) {
395
            return new RowVisitor() {
12✔
396

397
                int cellIndex;
398
                Properties bindings;
399
                List<CellAtRow> cellsAtRows;
400

401
                public RowVisitor init(List<CellAtRow> cellsAtRows) {
402
                    Column column = _project.columnModel.getColumnByName(_baseColumnName);
10✔
403

404
                    this.cellIndex = column.getCellIndex();
4✔
405
                    this.bindings = ExpressionUtils.createBindings(_project);
6✔
406
                    this.cellsAtRows = cellsAtRows;
3✔
407
                    return this;
2✔
408
                }
409

410
                @Override
411
                public void start(Project project) {
412
                    // nothing to do
413
                }
1✔
414

415
                @Override
416
                public void end(Project project) {
417
                    // nothing to do
418
                }
1✔
419

420
                @Override
421
                public boolean visit(Project project, int rowIndex, Row row) {
422
                    Cell cell = row.getCell(cellIndex);
5✔
423
                    Cell newCell = null;
2✔
424

425
                    ExpressionUtils.bind(bindings, row, rowIndex, _baseColumnName, cell);
10✔
426

427
                    Object o = _eval.evaluate(bindings);
7✔
428
                    if (o != null) {
2!
429
                        if (o instanceof Cell) {
3!
430
                            newCell = (Cell) o;
×
431
                        } else if (o instanceof WrappedCell) {
3!
432
                            newCell = ((WrappedCell) o).cell;
×
433
                        } else {
434
                            Serializable v = ExpressionUtils.wrapStorable(o);
3✔
435
                            if (ExpressionUtils.isNonBlankData(v)) {
3!
436
                                newCell = new Cell(v.toString(), null);
7✔
437
                            }
438
                        }
439
                    }
440

441
                    if (newCell != null) {
2!
442
                        cellsAtRows.add(new CellAtRow(rowIndex, newCell));
9✔
443
                    }
444

445
                    return false;
2✔
446
                }
447
            }.init(cellsAtRows);
1✔
448
        }
449
    }
450

451
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc