Commit b5fdf1b0 authored by claudiu.cristea's avatar claudiu.cristea

Issue #2819097 by claudiu.cristea: Allow more generic configurations

parent 3ec3016d
......@@ -12,44 +12,62 @@ Usually we use [Composer](https://getcomposer.org/) to install dependencies in D
## Usage
In your migration file:
In your migration file configure the source plugin as follows:
```yaml
id: ...
source:
plugin: spreadsheet
# The source file. The path can be either relative to the Drupal root or an
# absolute reference, such as a stream wrapper.
file: ../resources/source_file.xlsx
# The worksheet to be read.
worksheet: 'Personnel list'
# The first row from where the table starts. Points to the row that contains
# The top-left cell where data area starts (excluding the header, if exists).
# It should use a spreadsheet representation (B2, A2, ...). The data area does
# not include the header. If it's omitted, the assumption is that the first row
# contains the table header and the data origin is the first cell of the
# second row. That is A2. In this example the data area starts from the second
# column of the third row.
origin: B3
# The row where the header is placed, if any. If omitted, there's no table
# header for the data being inserted and the spreadsheet columns (A, B, ...)
# the table header. If the table row is the first row, this should be 1. The
# value of 3 means that the table header is on the third row.
header_row: 3
# Columns to be returned, basically a list of table header cell values.
# will be used as column names. The value of 2 means that the table header is
# on the second row.
header_row: 2
# The list of columns to be returned. It is basically a list of table header cell
# values if a header has been defined (see `header_row`). If there's no table
# header (i.e. `header_row` is missing), it should contain a list/sequence of
# column letters (A, B, C, ...).
columns:
- ID
- Revision
- 'First name'
- 'Sure name'
- Gender
# If this setting is specified, the source will return also a column
# containing the 'zero based' row index under this name. For this example,
# 'Row index' can be used later in `keys:` list to make this column a primary
# key column.
row_index_column: 'Row index'
# This is a list of source columns that are composing the primary key. The
# list is keyed by column name and has the field schema as value. If no keys
# are defined, the current row position will be returned as primary key, but
# in this case, 'row_index_column' should have a value.
# The name of the column with the row index. If this setting is specified, the
# source will return also a column with this name containing the row index.
# In this example 'Row no.' can be used later in `keys:` list to make this
# column a primary key column.
row_index_column: 'Row no.'
# The primary key as list of keys. It is a list of source columns that are
# composing the primary key. The list is keyed by column name and has the
# field schema as value. If the table has a header (i.e. `header_row` is set)
# the keys will be set as the names of header cells acting as primary index.
# Otherwise the column letters (A, B, C, ...) can be used. If no keys are
# defined, the current row position will be returned as primary key, but in
# this case, 'row_index_column' should have a value.
keys:
ID:
type: integer
Revision:
type: string
destination:
...
```
## Author
......
......@@ -45,7 +45,8 @@ class Spreadsheet extends SourcePluginBase implements ConfigurablePluginInterfac
return [
'file' => NULL,
'worksheet' => NULL,
'header_row' => 1,
'origin' => 'A2',
'header_row' => 2,
'columns' => [],
'keys' => [],
'row_index_column' => NULL,
......
......@@ -18,46 +18,25 @@ class SpreadsheetIterator implements SpreadsheetIteratorInterface {
protected $configuration = [];
/**
* Columns list keyed by header cell and having column index as value.
*
* @var array
*/
protected $columns = [];
/**
* Primary keys list keyed by header cell and having column index as value.
*
* @var string[]
*/
protected $keys = [];
/**
* All headers keyed by cell value and having column index as value.
*
* @var string[]
*/
protected $headers;
/**
*The total number of rows in the worksheet.
* The 'zero based' relative index of the current row.
*
* @var integer
* @var int
*/
protected $rowsCount;
protected $relativeRow = 0;
/**
* The total number of columns in the worksheet.
* The absolute row index of the current row.
*
* @var integer
* @var int
*/
protected $columnsCount;
protected $absoluteRow;
/**
* The 'zero based' relative index of the current row.
* Static cache for some of the computed values.
*
* @var int
* @var array
*/
protected $currentRow = 0;
protected $cache = [];
/**
* {@inheritdoc}
......@@ -72,8 +51,13 @@ class SpreadsheetIterator implements SpreadsheetIteratorInterface {
}
return array_values(array_map(
function ($column_delta) {
return $this->getWorksheet()->getCellByColumnAndRow($column_delta, $this->getAbsoluteRowIndex(), FALSE)->getValue();
function ($col_letter) {
$cell_reference = "$col_letter{$this->getAbsoluteRowIndex()}";
if ($cell = $this->getWorksheet()->getCell($cell_reference, FALSE)) {
return $cell->getValue();
}
$key = array_search($col_letter, $this->getKeys());
throw new \RuntimeException("Key column '$key' contains a null value at $cell_reference.");
},
$keys
));
......@@ -83,40 +67,42 @@ class SpreadsheetIterator implements SpreadsheetIteratorInterface {
* {@inheritdoc}
*/
public function valid() {
return ($this->currentRow >= 0) && ($this->getAbsoluteRowIndex() <= $this->getRowsCount());
return ($this->relativeRow >= 0) && ($this->getAbsoluteRowIndex() <= $this->getRowsCount());
}
/**
* {@inheritdoc}
*/
public function rewind() {
$this->currentRow = 0;
unset($this->absoluteRow);
$this->relativeRow = 0;
}
/**
* {@inheritdoc}
*/
public function current() {
$all_columns = $this->getKeys() + $this->getColumns();
$keys = $this->getKeys();
$all_columns = $keys + $this->getColumns();
if ($row_index_column = $this->getRowIndexColumn()) {
$all_columns[$row_index_column] = -1;
// We set '@' here so that when it will be sorted, later, it will be the
// first in the list. Ascii of '@' is lower than ascii of 'A'.
$all_columns[$row_index_column] = '@';
}
else {
if (empty($keys = $this->getKeys())) {
throw new \InvalidArgumentException("Row index should act as key but no name has been provided. Pass a string in \$config['row_index_column'] key when setting the configuration in SpreadsheetIterator::setConfiguration(\$config), to provide a name for this column.");
}
elseif (empty($keys)) {
throw new \InvalidArgumentException("Row index should act as key but no name has been provided. Pass a string in \$config['row_index_column'] key when setting the configuration in SpreadsheetIterator::setConfiguration(\$config), to provide a name for this column.");
}
// Arrange columns in their spreadsheet native order.
asort($all_columns);
return array_map(
function ($column_delta) {
if ($column_delta === -1) {
function ($col_letter) {
if ($col_letter === '@') {
return $this->getAbsoluteRowIndex();
}
elseif ($cell = $this->getWorksheet()->getCellByColumnAndRow($column_delta, $this->getAbsoluteRowIndex(), FALSE)) {
elseif ($cell = $this->getWorksheet()->getCell("$col_letter{$this->getAbsoluteRowIndex()}", FALSE)) {
return $cell->getValue();
}
// Fall back to NULL.
......@@ -130,16 +116,17 @@ class SpreadsheetIterator implements SpreadsheetIteratorInterface {
* {@inheritdoc}
*/
public function next() {
$this->currentRow++;
unset($this->absoluteRow);
$this->relativeRow++;
}
/**
* {@inheritdoc}
*/
public function setConfiguration(array $configuration) {
$this->configuration = $configuration;
// Unset cached values.
unset($this->columns, $this->keys, $this->rowsCount, $this->columnsCount, $this->headers);
$this->clearCache();
$this->configuration = $configuration;
return $this;
}
......@@ -154,61 +141,94 @@ class SpreadsheetIterator implements SpreadsheetIteratorInterface {
* {@inheritdoc}
*/
public function getWorksheet() {
if (empty($this->configuration['worksheet']) || !$this->configuration['worksheet'] instanceof Worksheet) {
throw new \InvalidArgumentException("No valid 'worksheet' configuration.");
if (!isset($this->cache['worksheet'])) {
if (empty($this->configuration['worksheet']) || !$this->configuration['worksheet'] instanceof Worksheet) {
throw new \InvalidArgumentException("No valid 'worksheet' configuration.");
}
$this->cache['worksheet'] = $this->configuration['worksheet'];
}
return $this->configuration['worksheet'];
return $this->cache['worksheet'];
}
/**
* {@inheritdoc}
*/
public function getOrigin() {
  // Resolves, validates and memoizes the top-left cell of the data area.
  //
  // Returns the upper-cased cell reference (e.g. 'B3'), or 'A2' when no origin
  // is configured. Throws \InvalidArgumentException when the configured origin
  // lies beyond the worksheet's highest data row or column.
  if (!isset($this->cache['origin'])) {
    $config = $this->getConfiguration();
    if (empty($config['origin'])) {
      // Defaulting to a table where the first row contains the header and data
      // starts on the second row, column A.
      return 'A2';
    }

    // Normalize the case BEFORE parsing, so that a lower-case reference such
    // as 'b3' is validated in the same form in which it will be cached.
    // NOTE(review): depending on the spreadsheet library version, the
    // coordinate parser may only accept upper-case column letters - confirm.
    $origin = strtoupper($config['origin']);

    if ($coordinates = Cell::coordinateFromString($origin)) {
      $row_count = $this->getRowsCount();
      $column_count = $this->getColumnsCount();
      $row_out_of_bounds = $coordinates[1] > $row_count;
      $column_out_of_bounds = Cell::columnIndexFromString($coordinates[0]) > $column_count;
      if ($row_out_of_bounds || $column_out_of_bounds) {
        // stringFromColumnIndex() is called with a decremented index here,
        // mirroring how the headers are computed elsewhere in this class.
        $max = Cell::stringFromColumnIndex($column_count - 1) . $row_count;
        throw new \InvalidArgumentException("Origin '{$config['origin']}' is out of bounds. Max value is '$max'.");
      }
    }

    $this->cache['origin'] = $origin;
  }
  return $this->cache['origin'];
}
/**
* {@inheritdoc}
*/
public function getColumns() {
if (!isset($this->columns)) {
if (!isset($this->cache['columns'])) {
$headers = $this->getHeaders();
if (empty($this->configuration['columns'])) {
// If no columns were passed, all columns will be used.
$this->columns = $headers;
$this->cache['columns'] = $headers;
}
else {
$this->columns = [];
$this->cache['columns'] = [];
foreach ($this->configuration['columns'] as $column) {
$column = trim($column);
if (!isset($headers[$column])) {
throw new \InvalidArgumentException("Column '$column' doesn't exist in the table header.");
}
$this->columns[$column] = $headers[$column];
$this->cache['columns'][$column] = $headers[$column];
}
}
}
return $this->columns;
return $this->cache['columns'];
}
/**
* {@inheritdoc}
*/
public function getKeys() {
if (!isset($this->keys)) {
$this->keys = [];
if (!isset($this->cache['keys'])) {
$this->cache['keys'] = [];
if (!empty($this->configuration['keys'])) {
$headers = $this->getHeaders();
$this->keys = [];
foreach ($this->configuration['keys'] as $key) {
if ($key != $this->getRowIndexColumn() && !isset($headers[$key])) {
throw new \InvalidArgumentException("Key '$key' doesn't exist in the table header.");
}
$this->keys[$key] = $headers[$key];
$this->cache['keys'][$key] = $headers[$key];
}
}
}
return $this->keys;
return $this->cache['keys'];
}
/**
* {@inheritdoc}
*/
public function getHeaderRow() {
return empty($this->configuration['header_row']) ? 1 : $this->configuration['header_row'];
if (!isset($this->cache['header_row'])) {
$header_row = isset($this->configuration['header_row']) ? $this->configuration['header_row'] : NULL;
if ($header_row !== NULL && (!is_numeric($this->configuration['header_row']) || ($this->configuration['header_row'] < 1))) {
throw new \InvalidArgumentException("Wrong header_row value '{$this->configuration['header_row']}'.");
}
$this->cache['header_row'] = $header_row;
}
return $this->cache['header_row'];
}
/**
......@@ -229,40 +249,56 @@ class SpreadsheetIterator implements SpreadsheetIteratorInterface {
* {@inheritdoc}
*/
public function getHeaders() {
if (!isset($this->headers)) {
for ($col = 0; $col < $this->getColumnsCount(); ++$col) {
if ($cell = $this->getWorksheet()->getCellByColumnAndRow($col, $this->getHeaderRow(), FALSE)) {
$value = trim($cell->getValue());
if (isset($this->headers[$value])) {
throw new \RuntimeException("Table header '{$value}' is duplicated.");
if (!isset($this->cache['headers'])) {
// Get the first column index (zero based).
$first_col_index = Cell::columnIndexFromString(Cell::coordinateFromString($this->getOrigin())[0]) - 1;
for ($col_index = $first_col_index; $col_index < $this->getColumnsCount(); ++$col_index) {
$col_letter = Cell::stringFromColumnIndex($col_index);
if ($header_row = $this->getHeaderRow()) {
if ($cell = $this->getWorksheet()->getCell("$col_letter$header_row", FALSE)) {
$value = trim($cell->getValue());
if (isset($this->cache['headers'][$value])) {
throw new \RuntimeException("Table header '{$value}' is duplicated.");
}
}
}
else {
$value = $col_letter;
}
if (!empty($value)) {
$this->headers[$value] = $col;
// Only non-empty cells can act as header.
$this->cache['headers'][$value] = $col_letter;
}
}
}
return $this->headers;
return $this->cache['headers'];
}
/**
* {@inheritdoc}
*/
public function getRowsCount() {
if (!isset($this->rowsCount)) {
$this->rowsCount = $this->getWorksheet()->getHighestDataRow();
if (!isset($this->cache['rows_count'])) {
$this->cache['rows_count'] = $this->getWorksheet()->getHighestDataRow();
}
return $this->rowsCount;
return $this->cache['rows_count'];
}
/**
* {@inheritdoc}
*/
public function getColumnsCount() {
if (!isset($this->columnsCount)) {
$this->columnsCount = Cell::columnIndexFromString($this->getWorksheet()->getHighestDataColumn());
if (!isset($this->cache['columns_count'])) {
$this->cache['columns_count'] = Cell::columnIndexFromString($this->getWorksheet()->getHighestDataColumn());
}
return $this->columnsCount;
return $this->cache['columns_count'];
}
/**
* {@inheritdoc}
*/
public function clearCache() {
  // Drop every memoized value at once; the getters of this class repopulate
  // their entry in $this->cache the next time they are called.
  $this->cache = [];
}
/**
......@@ -271,8 +307,11 @@ class SpreadsheetIterator implements SpreadsheetIteratorInterface {
* @return int
*/
protected function getAbsoluteRowIndex() {
// Add 1 because the first data row starts immediately after header row.
return $this->getHeaderRow() + $this->currentRow + 1;
if (!isset($this->absoluteRow)) {
$row = Cell::coordinateFromString($this->getOrigin())[1];
$this->absoluteRow = $row + $this->relativeRow;
}
return $this->absoluteRow;
}
}
......@@ -50,6 +50,13 @@ interface SpreadsheetIteratorInterface extends \Iterator{
*/
public function getWorksheet();
/**
 * Retrieves the top-left origin of data area.
 *
 * @return string
 *   The cell reference (such as 'B3') of the first data cell. The
 *   SpreadsheetIterator implementation falls back to 'A2' when no origin has
 *   been configured.
 *
 * @throws \InvalidArgumentException
 *   If the configured origin lies outside the worksheet's data bounds.
 */
public function getOrigin();
/**
* Gets the list of columns.
*
......@@ -92,7 +99,9 @@ interface SpreadsheetIteratorInterface extends \Iterator{
* Retrieves a full list of headers.
*
* @return string[]
* An array having the column index as key and header name as value.
* An associative array having the header name as key and header column
* index as value. If there is no header row defined, the key is the same as
* the value. The column index has a letter representation (A, B, C, ...).
*
* @throws \RuntimeException
* If a header cell is duplicated.
......@@ -113,4 +122,9 @@ interface SpreadsheetIteratorInterface extends \Iterator{
*/
public function getColumnsCount();
/**
* Clears the internal cache.
*/
public function clearCache();
}
......@@ -36,35 +36,83 @@ class SpreadsheetIteratorTest extends UnitTestCase {
$this->iterator = (new SpreadsheetIterator())
->setConfiguration([
'worksheet' => $this->getWorksheet(),
'origin' => 'B3',
'header_row' => 2,
'columns' => ['a', 'c', 'd'],
'columns' => ['column b', 'column d', 'column e'],
]);
}
/**
* Tests iterator rows and columns count.
*
* @covers ::getRowsCount
* @covers ::getColumnsCount
*/
public function testRowsAndColumnsCount() {
$this->assertEquals(5, $this->iterator->getRowsCount());
$this->assertEquals(4, $this->iterator->getColumnsCount());
$this->assertEquals(5, $this->iterator->getColumnsCount());
}
/**
* Tests headers.
* @covers ::getOrigin
* @dataProvider providerTestGetException
*
* @param string $origin
* The origin cell reference to be tested.
* @param bool $expect_exception
* If an exception is expected.
*/
public function testGetOrigin($origin, $expect_exception) {
  // Override only the origin; every other setting is taken from the fixture
  // configuration built in the test set-up.
  $this->iterator->setConfiguration(['origin' => $origin] + $this->iterator->getConfiguration());

  // Out-of-bounds origins are expected to be rejected when resolved.
  if ($expect_exception) {
    $this->setExpectedException('InvalidArgumentException');
  }

  $resolved_origin = $this->iterator->getOrigin();
  $this->assertEquals($origin, $resolved_origin);
}
/**
* Provides test cases for ::testGetOrigin()
*
* @return array[]
*/
public function providerTestGetException() {
  // Each case is a pair: the origin cell reference to configure, and whether
  // resolving it is expected to throw an InvalidArgumentException. The bounds
  // below match a fixture whose highest data cell is E5 (5 columns, 5 rows).
  return [
    // The minimum valid origin.
    ['A1', FALSE],
    // A valid origin.
    ['B3', FALSE],
    // Column out of bounds (F is one past the last data column).
    ['F3', TRUE],
    // Row out of bounds (6 is one past the last data row).
    ['E6', TRUE],
    // Both, row and column, out of bounds.
    ['F6', TRUE],
    // The maximum valid value.
    ['E5', FALSE],
  ];
}
/**
* @covers ::getHeaders
*/
public function testGetHeaders() {
$cols = ['a' => 0, 'b' => 1, 'c' => 2, 'd' => 3];
$cols = ['column b' => 'B', 'column c' => 'C', 'column d' => 'D', 'column e' => 'E'];
$this->assertSame($cols, $this->iterator->getHeaders());
// Check headers when there's no header row.
$config = $this->iterator->getConfiguration();
unset($config['header_row']);
$this->iterator->setConfiguration($config);
$this->assertSame(['B' => 'B', 'C' => 'C', 'D' => 'D', 'E' => 'E'], $this->iterator->getHeaders());
// Check duplicate headers.
$this->getWorksheet()->setCellValue('C2', 'column b');
$config['header_row'] = 2;
$this->iterator->setConfiguration($config);
$this->setExpectedException('RuntimeException');
$this->iterator->getHeaders();
}
/**
* Test iteration.
*
* @covers ::current
*/
public function testIteration() {
......@@ -74,19 +122,19 @@ class SpreadsheetIteratorTest extends UnitTestCase {
$this->assertTrue($this->iterator->valid());
$this->assertSame([3], $this->iterator->key());
$this->assertSame(['row' => 3, 'a' => 'a0', 'c' => 'c0', 'd' => 'd0'], $this->iterator->current());
$this->assertSame(['row' => 3, 'column b' => 'cell b0', 'column d' => 'cell d0', 'column e' => 'cell e0'], $this->iterator->current());
// Move the cursor.
$this->iterator->next();
$this->assertTrue($this->iterator->valid());
$this->assertSame([4], $this->iterator->key());
$this->assertSame(['row' => 4, 'a' => 'a1', 'c' => 'c1', 'd' => 'd1'], $this->iterator->current());
$this->assertSame(['row' => 4, 'column b' => 'cell b1', 'column d' => 'cell d1', 'column e' => 'cell e1'], $this->iterator->current());
// Move the cursor.
$this->iterator->next();
$this->assertTrue($this->iterator->valid());
$this->assertSame([5], $this->iterator->key());
$this->assertSame(['row' => 5, 'a' => 'a2', 'c' => 'c2', 'd' => 'd2'], $this->iterator->current());
$this->assertSame(['row' => 5, 'column b' => 'cell b2', 'column d' => 'cell d2', 'column e' => 'cell e2'], $this->iterator->current());
// Move the cursor. Should run out of set.
$this->iterator->next();
......@@ -96,23 +144,32 @@ class SpreadsheetIteratorTest extends UnitTestCase {
$this->iterator->rewind();
$this->assertTrue($this->iterator->valid());
$this->assertSame([3], $this->iterator->key());
$this->assertSame(['row' => 3, 'a' => 'a0', 'c' => 'c0', 'd' => 'd0'], $this->iterator->current());
$this->assertSame(['row' => 3, 'column b' => 'cell b0', 'column d' => 'cell d0', 'column e' => 'cell e0'], $this->iterator->current());
// Try to return all columns.
$config['columns'] = [];
$this->iterator->setConfiguration($config);
$this->assertTrue($this->iterator->valid());
$this->assertSame([3], $this->iterator->key());
$this->assertSame(['row' => 3, 'a' => 'a0', 'b' => 'b0', 'c' => 'c0', 'd' => 'd0'], $this->iterator->current());
$this->assertSame(['row' => 3, 'column b' => 'cell b0', 'column c' => 'cell c0', 'column d' => 'cell d0', 'column e' => 'cell e0'], $this->iterator->current());
// Use different primary keys.
$config['columns'] = ['a', 'd'];
$config['keys'] = ['b', 'c'];
$config['columns'] = ['column b', 'column e'];
$config['keys'] = ['column c', 'column d'];
unset($config['row_index_column']);
$this->iterator->setConfiguration($config);
$this->assertTrue($this->iterator->valid());
$this->assertSame(['b0', 'c0'], $this->iterator->key());
$this->assertSame(['a' => 'a0', 'b' => 'b0', 'c' => 'c0', 'd' => 'd0'], $this->iterator->current());
$this->assertSame(['cell c0', 'cell d0'], $this->iterator->key());
$this->assertSame(['column b' => 'cell b0', 'column c' => 'cell c0', 'column d' => 'cell d0', 'column e' => 'cell e0'], $this->iterator->current());
// Test with no header_row.
unset($config['header_row']);
$config['columns'] = ['B', 'E'];