Skip to content

Commit a439656

Browse files
Merge pull request #41 from keboola/webrouse-COM-350-fix-line-breaks-detect
Fix line breaks detection in edge-case
2 parents b77f1c7 + 2811a35 commit a439656

11 files changed

+356
-40
lines changed

.codeclimate.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ version: "2"
22
plugins:
33
phpmd:
44
enabled: true
5+
checks:
6+
CleanCode/StaticAccess:
7+
enabled: false
58
phpcodesniffer:
69
enabled: true
710
sonar-php:

.travis.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ language: php
22
php:
33
- 5.6
44
- 7.0
5-
- hhvm
5+
- 7.1
6+
- 7.2
7+
- 7.3
8+
- 7.4
69

710
before_script:
811
- composer install

composer.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
},
3030
"require-dev": {
3131
"phpunit/phpunit": "^5.7",
32-
"squizlabs/php_codesniffer": "^3.2"
32+
"squizlabs/php_codesniffer": "^3.2",
33+
"ext-json": "*"
3334
}
3435
}

src/CsvReader.php

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -111,29 +111,10 @@ protected function openCsvFile($fileName)
111111
*/
112112
protected function detectLineBreak()
113113
{
114-
rewind($this->getFilePointer());
115-
$sample = fread($this->getFilePointer(), 10000);
116-
117-
$possibleLineBreaks = [
118-
"\r\n", // win
119-
"\r", // mac
120-
"\n", // unix
121-
];
122-
123-
$lineBreaksPositions = [];
124-
foreach ($possibleLineBreaks as $lineBreak) {
125-
$position = strpos($sample, $lineBreak);
126-
if ($position === false) {
127-
continue;
128-
}
129-
$lineBreaksPositions[$lineBreak] = $position;
130-
}
131-
132-
133-
asort($lineBreaksPositions);
134-
reset($lineBreaksPositions);
114+
@rewind($this->getFilePointer());
115+
$sample = @fread($this->getFilePointer(), 10000);
135116

136-
return empty($lineBreaksPositions) ? "\n" : key($lineBreaksPositions);
117+
return LineBreaksHelper::detectLineBreaks($sample, $this->getEnclosure(), $this->getEscapedBy());
137118
}
138119

139120
/**
@@ -148,7 +129,7 @@ protected function readLine()
148129
// allow empty enclosure hack
149130
$enclosure = !$this->getEnclosure() ? chr(0) : $this->getEnclosure();
150131
$escapedBy = !$this->getEscapedBy() ? chr(0) : $this->getEscapedBy();
151-
return fgetcsv($this->getFilePointer(), null, $this->getDelimiter(), $enclosure, $escapedBy);
132+
return @fgetcsv($this->getFilePointer(), null, $this->getDelimiter(), $enclosure, $escapedBy);
152133
}
153134

154135
/**

src/CsvWriter.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ public function writeRow(array $row)
9090
"Cannot write to CSV file " . $this->fileName .
9191
($ret === false && error_get_last() ? 'Error: ' . error_get_last()['message'] : '') .
9292
' Return: ' . json_encode($ret) .
93-
' To write: ' . strlen($str) . ' Written: ' . $ret,
93+
' To write: ' . strlen($str) . ' Written: ' . (int) $ret,
9494
Exception::WRITE_ERROR
9595
);
9696
}

src/LineBreaksHelper.php

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
<?php
2+
3+
namespace Keboola\Csv;
4+
5+
class LineBreaksHelper
6+
{
7+
const REGEXP_DELIMITER = '~';
8+
9+
/**
10+
* Detect line-breaks style in CSV file
11+
* @param string $sample
12+
* @param string $enclosure
13+
* @param string $escapedBy
14+
* @return string
15+
*/
16+
public static function detectLineBreaks($sample, $enclosure, $escapedBy)
17+
{
18+
$cleared = self::clearCsvValues($sample, $enclosure, $escapedBy);
19+
20+
$possibleLineBreaks = [
21+
"\r\n", // win
22+
"\r", // mac
23+
"\n", // unix
24+
];
25+
26+
$lineBreaksPositions = [];
27+
foreach ($possibleLineBreaks as $lineBreak) {
28+
$position = strpos($cleared, $lineBreak);
29+
if ($position === false) {
30+
continue;
31+
}
32+
$lineBreaksPositions[$lineBreak] = $position;
33+
}
34+
35+
36+
asort($lineBreaksPositions);
37+
reset($lineBreaksPositions);
38+
39+
return empty($lineBreaksPositions) ? "\n" : key($lineBreaksPositions);
40+
}
41+
42+
/**
43+
* Clear enclosured values in CSV eg. "abc" to "",
44+
* because these values can contain line breaks eg, "abc\n\r\n\r\r\r\r",
45+
* and this makes it difficult to detect line breaks style in CSV,
46+
* if are another line breaks present in first line.
47+
* @internal Should be used only in detectLineBreaks, public for easier testing.
48+
* @param string $sample
49+
* @param string $enclosure
50+
* @param string $escapedBy eg. empty string or \
51+
* @return string
52+
*/
53+
public static function clearCsvValues($sample, $enclosure, $escapedBy)
54+
{
55+
// Usually an enclosure character is escaped by doubling it, however, the escapeBy can be used
56+
$doubleEnclosure = $enclosure . $enclosure;
57+
$escapedEnclosure = empty($escapedBy) ? $doubleEnclosure : $escapedBy . $enclosure;
58+
$escapedEscape = empty($escapedBy) ? null : $escapedBy . $escapedBy;
59+
60+
/*
61+
* Regexp examples:
62+
* enclosure: |"|, escapedBy: none, regexp: ~"(?>(?>"")|[^"])*"~
63+
* enclosure: |"|, escapedBy: |\|, regexp: ~"(?>(?>\\"|\\\\)|[^"])*"~
64+
*/
65+
// @formatter:off
66+
$regexp =
67+
// regexp start
68+
self::REGEXP_DELIMITER .
69+
// enclosure start
70+
preg_quote($enclosure, self::REGEXP_DELIMITER) .
71+
/*
72+
* Once-only group => if there is a match, do not try other alternatives
73+
* See: https://www.php.net/manual/en/regexp.reference.onlyonce.php
74+
* Without once-only group will be |"abc\"| false positive,
75+
* because |\| is matched by group and |"| at the end of regexp.
76+
*/
77+
// repeated once-only group start
78+
'(?>' .
79+
// once-only group start
80+
'(?>' .
81+
// escaped enclosure
82+
preg_quote($escapedEnclosure, self::REGEXP_DELIMITER) .
83+
// OR escaped escape char
84+
($escapedEscape ? '|' . preg_quote($escapedEscape, self::REGEXP_DELIMITER) : '') .
85+
// group end
86+
')' .
87+
// OR not enclosure
88+
'|[^' . preg_quote($enclosure, self::REGEXP_DELIMITER) . ']' .
89+
// group end
90+
')*' .
91+
// enclosure end
92+
preg_quote($enclosure, self::REGEXP_DELIMITER) .
93+
// regexp end
94+
self::REGEXP_DELIMITER;
95+
// @formatter:on
96+
97+
return preg_replace($regexp, $doubleEnclosure, $sample);
98+
}
99+
}

tests/CsvReadTest.php

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,24 @@ public function testParseEscapedBy()
137137
self::assertEquals($expected, iterator_to_array($csvFile));
138138
}
139139

140+
public function testParseMacLineEndsInField()
141+
{
142+
$csvFile = new CsvReader(__DIR__ . '/data/test-input.lineBreaks.csv', ",", '"', '\\');
143+
144+
$expected = [
145+
[
146+
'test',
147+
"some text\rwith\r\\r line breaks\rinside\rbut\rrows\rare\rusing \\n \\\"line\\\" break\r",
148+
],
149+
[
150+
'name', 'data'
151+
]
152+
];
153+
154+
self::assertEquals($expected, iterator_to_array($csvFile));
155+
}
156+
157+
140158
public function testEmptyHeader()
141159
{
142160
$csvFile = new CsvReader(__DIR__ . '/data/test-input.empty.csv', ',', '"');
@@ -364,14 +382,6 @@ public function invalidSkipLinesProvider()
364382
];
365383
}
366384

367-
public function testInvalidNewLines()
368-
{
369-
self::expectException(Exception::class);
370-
self::expectExceptionMessage('Invalid line break. Please use unix \n or win \r\n line breaks.');
371-
new CsvReader(__DIR__ . DIRECTORY_SEPARATOR . 'data/binary');
372-
}
373-
374-
375385
public function testValidWithoutRewind()
376386
{
377387
$fileName = __DIR__ . '/data/simple.csv';
@@ -479,4 +489,54 @@ public function testInvalidFile()
479489
self::expectExceptionMessage('Invalid file: array');
480490
new CsvReader(['bad']);
481491
}
492+
493+
/**
494+
* @dataProvider getPerformanceTestInputs
495+
* @param string $fileContent
496+
* @param int $expectedRows
497+
* @param float $maxDuration
498+
*/
499+
public function testPerformance($fileContent, $expectedRows, $maxDuration)
500+
{
501+
self::markTestSkipped(
502+
'Run this test only manually. Because the duration is very different in local CI environment.'
503+
);
504+
505+
try {
506+
$fileName = sys_get_temp_dir() . DIRECTORY_SEPARATOR . uniqid('perf-test');
507+
file_put_contents($fileName, $fileContent);
508+
$startTime = microtime(true);
509+
$reader = new CsvReader($fileName);
510+
$rows = 0;
511+
foreach ($reader as $line){
512+
$rows++;
513+
}
514+
$duration = microtime(true) - $startTime;
515+
self::assertSame($expectedRows, $rows);
516+
self::assertLessThanOrEqual($maxDuration, $duration);
517+
} finally {
518+
@unlink($fileName);
519+
}
520+
}
521+
522+
public function getPerformanceTestInputs()
523+
{
524+
yield '1M-simple-rows' => [
525+
str_repeat("abc,def,\"xyz\"\n", 1000000),
526+
1000000,
527+
8.0
528+
];
529+
530+
yield '1M-empty-lines-n' => [
531+
str_repeat("\n", 1000000),
532+
1000000,
533+
8.0
534+
];
535+
536+
yield '1M-no-separators' => [
537+
str_repeat(md5('abc') . "\n", 1000000),
538+
1000000,
539+
8.0
540+
];
541+
}
482542
}

tests/CsvWriteTest.php

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
use Keboola\Csv\CsvWriter;
77
use Keboola\Csv\Exception;
88
use PHPUnit\Framework\TestCase;
9+
use PHPUnit_Framework_Constraint_Or;
10+
use PHPUnit_Framework_Constraint_StringContains;
911

1012
class CsvWriteTest extends TestCase
1113
{
@@ -87,9 +89,19 @@ public function testWriteInvalidObject()
8789
];
8890

8991
$csvFile->writeRow($rows[0]);
90-
self::expectException(Exception::class);
91-
self::expectExceptionMessage("Cannot write data into column: stdClass::");
92-
$csvFile->writeRow($rows[1]);
92+
93+
try {
94+
$csvFile->writeRow($rows[1]);
95+
self::fail('Expected exception was not thrown.');
96+
} catch (Exception $e) {
97+
// Exception message differs between PHP versions.
98+
$or = new PHPUnit_Framework_Constraint_Or();
99+
$or->setConstraints([
100+
new PHPUnit_Framework_Constraint_StringContains("Cannot write data into column: stdClass::"),
101+
new PHPUnit_Framework_Constraint_StringContains("Cannot write data into column: (object) array(\n)")
102+
]);
103+
self::assertThat($e->getMessage(), $or);
104+
}
93105
}
94106

95107
public function testWriteValidObject()
@@ -182,9 +194,24 @@ public function testInvalidPointer()
182194
$pointer = fopen($fileName, 'r');
183195
$csvFile = new CsvWriter($pointer);
184196
$rows = [['col1', 'col2']];
185-
self::expectException(Exception::class);
186-
self::expectExceptionMessage('Cannot write to CSV file Return: 0 To write: 14 Written: 0');
187-
$csvFile->writeRow($rows[0]);
197+
198+
try {
199+
$csvFile->writeRow($rows[0]);
200+
self::fail('Expected exception was not thrown.');
201+
} catch (Exception $e) {
202+
// Exception message differs between PHP versions.
203+
$or = new PHPUnit_Framework_Constraint_Or();
204+
$or->setConstraints([
205+
new PHPUnit_Framework_Constraint_StringContains(
206+
'Cannot write to CSV file Return: 0 To write: 14 Written: 0'
207+
),
208+
new PHPUnit_Framework_Constraint_StringContains(
209+
'Cannot write to CSV file Error: fwrite(): ' .
210+
'write of 14 bytes failed with errno=9 Bad file descriptor Return: false To write: 14 Written: 0'
211+
)
212+
]);
213+
self::assertThat($e->getMessage(), $or);
214+
}
188215
}
189216

190217
public function testInvalidPointer2()

0 commit comments

Comments
 (0)