Skip to content

Commit efd0fae

Browse files
authored
Merge pull request #43 from keboola/zajca-bom
utf8 bom detection
2 parents eb5a835 + f99d35f commit efd0fae

10 files changed

+102
-4
lines changed

.travis.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,16 @@ php:
77
- 7.3
88
- 7.4
99

10-
before_script:
10+
env:
11+
global:
12+
- XDEBUG_MODE=coverage
13+
14+
before_script:
1115
- composer install
1216
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
1317
- chmod +x ./cc-test-reporter
1418
- export GIT_COMMIT_SHA=$TRAVIS_COMMIT
15-
- export GIT_BRANCH=$TRAVIS_BRANCH
19+
- export GIT_BRANCH=$TRAVIS_BRANCH
1620
- ./cc-test-reporter before-build
1721

1822
script:
@@ -22,6 +26,6 @@ after_success:
2226
- ./cc-test-reporter after-build --exit-code 0 --debug
2327

2428
notifications:
25-
email: false
29+
email: false
2630
slack:
2731
secure: WVnUU0fkZS75md3mm7B08SxhP3HDeHbJ8GTPR1DUVjK3MHAmKeSah/plNNxn9I/TdlXnHzQO5WBN33nUq0ODGGT4WFzFa66YTX2tb+bNSmewBOv82hEoITTI1PI9SLq0WNtcamHWCM3Rt1XtiZb3DQk/OcUfiWrrn74q4PPX+VY=

src/CsvReader.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ public function __construct(
5858
$this->validateLineBreak();
5959

6060
rewind($this->filePointer);
61-
$this->header = $this->readLine();
61+
$this->header = UTF8BOMHelper::detectAndRemoveBOM($this->readLine());
6262
$this->rewind();
6363
}
6464

src/UTF8BOMHelper.php

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
<?php
2+
3+
namespace Keboola\Csv;
4+
5+
class UTF8BOMHelper
{
    /**
     * Strips a leading byte order mark (BOM) from the first field of a parsed CSV header row.
     *
     * The UTF-32 (big/little endian), UTF-16 (big/little endian) and UTF-8 marks are
     * recognised. When one of them is found at the very start of the first field, the
     * mark is cut off and any double quotes at either end of the remaining value are
     * trimmed. Only the mark itself is removed; the rest of the row is not converted
     * to another encoding.
     *
     * @param array|mixed $header Parsed header row. Any non-array value is returned untouched.
     * @return array|mixed The row with the BOM removed from its first field, or the input
     *                     unchanged when there is nothing to strip.
     */
    public static function detectAndRemoveBOM($header)
    {
        // Nothing to inspect unless the first column holds a string. This covers
        // non-array input, an empty row and a null / non-string first cell, and avoids
        // undefined-offset notices and null-to-string deprecations further down.
        if (!is_array($header) || !isset($header[0]) || !is_string($header[0])) {
            return $header;
        }

        // Order matters: the UTF-32 little endian mark starts with the UTF-16 little
        // endian mark, so the 4-byte marks must be tested before the 2-byte ones.
        $bomStrings = [
            "\x00\x00\xFE\xFF", // UTF-32 big endian
            "\xFF\xFE\x00\x00", // UTF-32 little endian
            "\xFE\xFF",         // UTF-16 big endian
            "\xFF\xFE",         // UTF-16 little endian
            "\xEF\xBB\xBF",     // UTF-8
        ];

        foreach ($bomStrings as $bomString) {
            $bomLength = strlen($bomString);
            // Prefix-only comparison: the mark is only meaningful at offset 0.
            if (strncmp($header[0], $bomString, $bomLength) === 0) {
                $header[0] = trim(substr($header[0], $bomLength), '"');
                break;
            }
        }

        return $header;
    }
}

tests/CsvReadTest.php

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,26 @@ public function testParseEscapedBy()
137137
self::assertEquals($expected, iterator_to_array($csvFile));
138138
}
139139

140+
/**
141+
* @dataProvider bomProvider
142+
*/
143+
/**
 * The header reported by the reader for a BOM-prefixed fixture must equal the
 * plain column names.
 */
public function testUtf8BOM($bomFile)
{
    $fixturePath = __DIR__ . '/data/bom/' . $bomFile . '.csv';
    $reader = new CsvReader($fixturePath);
    $expectedHeader = ['id', 'name'];

    self::assertEquals($expectedHeader, $reader->getHeader());
}
148+
149+
/**
 * Supplies one data set per BOM fixture; each value is the fixture's base file
 * name (without the .csv extension).
 */
public function bomProvider()
{
    $fixtureNames = [
        'utf32BigEndianBom',
        'utf32LittleEndianBom',
        'utf16BigEndianBom',
        'utf16LittleEndianBom',
        'utf8Bom',
    ];

    $dataSets = [];
    foreach ($fixtureNames as $fixtureName) {
        $dataSets[] = [$fixtureName];
    }

    return $dataSets;
}
159+
140160
public function testParseMacLineEndsInField()
141161
{
142162
$csvFile = new CsvReader(__DIR__ . '/data/test-input.lineBreaks.csv', ",", '"', '\\');

tests/UTF8BOMHelperTest.php

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
<?php
2+
3+
namespace Keboola\Csv\Tests;
4+
5+
use Keboola\Csv\CsvReader;
6+
use Keboola\Csv\UTF8BOMHelper;
7+
use PHPUnit\Framework\TestCase;
8+
9+
class UTF8BOMHelperTest extends TestCase
{
    /**
     * Asserts that the first row read from a BOM-prefixed fixture differs from the
     * plain header, and that the helper turns it into the plain header.
     *
     * @dataProvider bomProvider
     * @param string $bomFile
     */
    public function testDetectAndRemoveBOM($bomFile)
    {
        $expectedHeader = ['id', 'name'];
        $reader = new CsvReader(__DIR__ . '/data/bom/' . $bomFile . '.csv');
        $rawRow = $reader->current();

        self::assertNotSame($expectedHeader, $rawRow);
        self::assertSame($expectedHeader, UTF8BOMHelper::detectAndRemoveBOM($rawRow));
    }

    /**
     * Supplies one data set per BOM fixture; each value is the fixture's base file
     * name (without the .csv extension).
     */
    public function bomProvider()
    {
        return array_map(
            function ($fixtureName) {
                return [$fixtureName];
            },
            [
                'utf32BigEndianBom',
                'utf32LittleEndianBom',
                'utf16BigEndianBom',
                'utf16LittleEndianBom',
                'utf8Bom',
            ]
        );
    }
}

tests/data/bom/utf16BigEndianBom.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
��"id","name"
tests/data/bom/utf16LittleEndianBom.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
��"id","name"

tests/data/bom/utf32BigEndianBom.csv

15 Bytes
Binary file not shown.
tests/data/bom/utf32LittleEndianBom.csv

15 Bytes
Binary file not shown.

tests/data/bom/utf8Bom.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"id","name"

0 commit comments

Comments
 (0)