Csv.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481
  1. <?php
  2. namespace PhpOffice\PhpSpreadsheet\Reader;
  3. use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
  4. use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
  5. use PhpOffice\PhpSpreadsheet\Spreadsheet;
  6. class Csv extends BaseReader
  7. {
  8. /**
  9. * Input encoding.
  10. *
  11. * @var string
  12. */
  13. private $inputEncoding = 'UTF-8';
  14. /**
  15. * Delimiter.
  16. *
  17. * @var string
  18. */
  19. private $delimiter;
  20. /**
  21. * Enclosure.
  22. *
  23. * @var string
  24. */
  25. private $enclosure = '"';
  26. /**
  27. * Sheet index to read.
  28. *
  29. * @var int
  30. */
  31. private $sheetIndex = 0;
  32. /**
  33. * Load rows contiguously.
  34. *
  35. * @var bool
  36. */
  37. private $contiguous = false;
  38. /**
  39. * Row counter for loading rows contiguously.
  40. *
  41. * @var int
  42. */
  43. private $contiguousRow = -1;
  44. /**
  45. * Create a new CSV Reader instance.
  46. */
  47. public function __construct()
  48. {
  49. $this->readFilter = new DefaultReadFilter();
  50. }
  51. /**
  52. * Set input encoding.
  53. *
  54. * @param string $pValue Input encoding, eg: 'UTF-8'
  55. *
  56. * @return Csv
  57. */
  58. public function setInputEncoding($pValue)
  59. {
  60. $this->inputEncoding = $pValue;
  61. return $this;
  62. }
  63. /**
  64. * Get input encoding.
  65. *
  66. * @return string
  67. */
  68. public function getInputEncoding()
  69. {
  70. return $this->inputEncoding;
  71. }
  72. /**
  73. * Move filepointer past any BOM marker.
  74. */
  75. protected function skipBOM()
  76. {
  77. rewind($this->fileHandle);
  78. switch ($this->inputEncoding) {
  79. case 'UTF-8':
  80. fgets($this->fileHandle, 4) == "\xEF\xBB\xBF" ?
  81. fseek($this->fileHandle, 3) : fseek($this->fileHandle, 0);
  82. break;
  83. case 'UTF-16LE':
  84. fgets($this->fileHandle, 3) == "\xFF\xFE" ?
  85. fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0);
  86. break;
  87. case 'UTF-16BE':
  88. fgets($this->fileHandle, 3) == "\xFE\xFF" ?
  89. fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0);
  90. break;
  91. case 'UTF-32LE':
  92. fgets($this->fileHandle, 5) == "\xFF\xFE\x00\x00" ?
  93. fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0);
  94. break;
  95. case 'UTF-32BE':
  96. fgets($this->fileHandle, 5) == "\x00\x00\xFE\xFF" ?
  97. fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0);
  98. break;
  99. default:
  100. break;
  101. }
  102. }
  103. /**
  104. * Identify any separator that is explicitly set in the file.
  105. */
  106. protected function checkSeparator()
  107. {
  108. $line = fgets($this->fileHandle);
  109. if ($line === false) {
  110. return;
  111. }
  112. if ((strlen(trim($line, "\r\n")) == 5) && (stripos($line, 'sep=') === 0)) {
  113. $this->delimiter = substr($line, 4, 1);
  114. return;
  115. }
  116. return $this->skipBOM();
  117. }
  118. /**
  119. * Infer the separator if it isn't explicitly set in the file or specified by the user.
  120. */
  121. protected function inferSeparator()
  122. {
  123. if ($this->delimiter !== null) {
  124. return;
  125. }
  126. $potentialDelimiters = [',', ';', "\t", '|', ':', ' '];
  127. $counts = [];
  128. foreach ($potentialDelimiters as $delimiter) {
  129. $counts[$delimiter] = [];
  130. }
  131. // Count how many times each of the potential delimiters appears in each line
  132. $numberLines = 0;
  133. while (($line = fgets($this->fileHandle)) !== false && (++$numberLines < 1000)) {
  134. // Drop everything that is enclosed to avoid counting false positives in enclosures
  135. $enclosure = preg_quote($this->enclosure, '/');
  136. $line = preg_replace('/(' . $enclosure . '.*' . $enclosure . ')/U', '', $line);
  137. $countLine = [];
  138. for ($i = strlen($line) - 1; $i >= 0; --$i) {
  139. $char = $line[$i];
  140. if (isset($counts[$char])) {
  141. if (!isset($countLine[$char])) {
  142. $countLine[$char] = 0;
  143. }
  144. ++$countLine[$char];
  145. }
  146. }
  147. foreach ($potentialDelimiters as $delimiter) {
  148. $counts[$delimiter][] = isset($countLine[$delimiter])
  149. ? $countLine[$delimiter]
  150. : 0;
  151. }
  152. }
  153. // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently)
  154. $meanSquareDeviations = [];
  155. $middleIdx = floor(($numberLines - 1) / 2);
  156. foreach ($potentialDelimiters as $delimiter) {
  157. $series = $counts[$delimiter];
  158. sort($series);
  159. $median = ($numberLines % 2)
  160. ? $series[$middleIdx]
  161. : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;
  162. if ($median === 0) {
  163. continue;
  164. }
  165. $meanSquareDeviations[$delimiter] = array_reduce(
  166. $series,
  167. function ($sum, $value) use ($median) {
  168. return $sum + pow($value - $median, 2);
  169. }
  170. ) / count($series);
  171. }
  172. // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
  173. $min = INF;
  174. foreach ($potentialDelimiters as $delimiter) {
  175. if (!isset($meanSquareDeviations[$delimiter])) {
  176. continue;
  177. }
  178. if ($meanSquareDeviations[$delimiter] < $min) {
  179. $min = $meanSquareDeviations[$delimiter];
  180. $this->delimiter = $delimiter;
  181. }
  182. }
  183. // If no delimiter could be detected, fall back to the default
  184. if ($this->delimiter === null) {
  185. $this->delimiter = reset($potentialDelimiters);
  186. }
  187. return $this->skipBOM();
  188. }
  189. /**
  190. * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
  191. *
  192. * @param string $pFilename
  193. *
  194. * @throws Exception
  195. *
  196. * @return array
  197. */
  198. public function listWorksheetInfo($pFilename)
  199. {
  200. // Open file
  201. if (!$this->canRead($pFilename)) {
  202. throw new Exception($pFilename . ' is an Invalid Spreadsheet file.');
  203. }
  204. $this->openFile($pFilename);
  205. $fileHandle = $this->fileHandle;
  206. // Skip BOM, if any
  207. $this->skipBOM();
  208. $this->checkSeparator();
  209. $this->inferSeparator();
  210. $worksheetInfo = [];
  211. $worksheetInfo[0]['worksheetName'] = 'Worksheet';
  212. $worksheetInfo[0]['lastColumnLetter'] = 'A';
  213. $worksheetInfo[0]['lastColumnIndex'] = 0;
  214. $worksheetInfo[0]['totalRows'] = 0;
  215. $worksheetInfo[0]['totalColumns'] = 0;
  216. // Loop through each line of the file in turn
  217. while (($rowData = fgetcsv($fileHandle, 0, $this->delimiter, $this->enclosure)) !== false) {
  218. ++$worksheetInfo[0]['totalRows'];
  219. $worksheetInfo[0]['lastColumnIndex'] = max($worksheetInfo[0]['lastColumnIndex'], count($rowData) - 1);
  220. }
  221. $worksheetInfo[0]['lastColumnLetter'] = Coordinate::stringFromColumnIndex($worksheetInfo[0]['lastColumnIndex'] + 1);
  222. $worksheetInfo[0]['totalColumns'] = $worksheetInfo[0]['lastColumnIndex'] + 1;
  223. // Close file
  224. fclose($fileHandle);
  225. return $worksheetInfo;
  226. }
  227. /**
  228. * Loads Spreadsheet from file.
  229. *
  230. * @param string $pFilename
  231. *
  232. * @throws Exception
  233. *
  234. * @return Spreadsheet
  235. */
  236. public function load($pFilename)
  237. {
  238. // Create new Spreadsheet
  239. $spreadsheet = new Spreadsheet();
  240. // Load into this instance
  241. return $this->loadIntoExisting($pFilename, $spreadsheet);
  242. }
  243. /**
  244. * Loads PhpSpreadsheet from file into PhpSpreadsheet instance.
  245. *
  246. * @param string $pFilename
  247. * @param Spreadsheet $spreadsheet
  248. *
  249. * @throws Exception
  250. *
  251. * @return Spreadsheet
  252. */
  253. public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
  254. {
  255. $lineEnding = ini_get('auto_detect_line_endings');
  256. ini_set('auto_detect_line_endings', true);
  257. // Open file
  258. if (!$this->canRead($pFilename)) {
  259. throw new Exception($pFilename . ' is an Invalid Spreadsheet file.');
  260. }
  261. $this->openFile($pFilename);
  262. $fileHandle = $this->fileHandle;
  263. // Skip BOM, if any
  264. $this->skipBOM();
  265. $this->checkSeparator();
  266. $this->inferSeparator();
  267. // Create new PhpSpreadsheet object
  268. while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
  269. $spreadsheet->createSheet();
  270. }
  271. $sheet = $spreadsheet->setActiveSheetIndex($this->sheetIndex);
  272. // Set our starting row based on whether we're in contiguous mode or not
  273. $currentRow = 1;
  274. if ($this->contiguous) {
  275. $currentRow = ($this->contiguousRow == -1) ? $sheet->getHighestRow() : $this->contiguousRow;
  276. }
  277. // Loop through each line of the file in turn
  278. while (($rowData = fgetcsv($fileHandle, 0, $this->delimiter, $this->enclosure)) !== false) {
  279. $columnLetter = 'A';
  280. foreach ($rowData as $rowDatum) {
  281. if ($rowDatum != '' && $this->readFilter->readCell($columnLetter, $currentRow)) {
  282. // Convert encoding if necessary
  283. if ($this->inputEncoding !== 'UTF-8') {
  284. $rowDatum = StringHelper::convertEncoding($rowDatum, 'UTF-8', $this->inputEncoding);
  285. }
  286. // Set cell value
  287. $sheet->getCell($columnLetter . $currentRow)->setValue($rowDatum);
  288. }
  289. ++$columnLetter;
  290. }
  291. ++$currentRow;
  292. }
  293. // Close file
  294. fclose($fileHandle);
  295. if ($this->contiguous) {
  296. $this->contiguousRow = $currentRow;
  297. }
  298. ini_set('auto_detect_line_endings', $lineEnding);
  299. // Return
  300. return $spreadsheet;
  301. }
  302. /**
  303. * Get delimiter.
  304. *
  305. * @return string
  306. */
  307. public function getDelimiter()
  308. {
  309. return $this->delimiter;
  310. }
  311. /**
  312. * Set delimiter.
  313. *
  314. * @param string $delimiter Delimiter, eg: ','
  315. *
  316. * @return CSV
  317. */
  318. public function setDelimiter($delimiter)
  319. {
  320. $this->delimiter = $delimiter;
  321. return $this;
  322. }
  323. /**
  324. * Get enclosure.
  325. *
  326. * @return string
  327. */
  328. public function getEnclosure()
  329. {
  330. return $this->enclosure;
  331. }
  332. /**
  333. * Set enclosure.
  334. *
  335. * @param string $enclosure Enclosure, defaults to "
  336. *
  337. * @return CSV
  338. */
  339. public function setEnclosure($enclosure)
  340. {
  341. if ($enclosure == '') {
  342. $enclosure = '"';
  343. }
  344. $this->enclosure = $enclosure;
  345. return $this;
  346. }
  347. /**
  348. * Get sheet index.
  349. *
  350. * @return int
  351. */
  352. public function getSheetIndex()
  353. {
  354. return $this->sheetIndex;
  355. }
  356. /**
  357. * Set sheet index.
  358. *
  359. * @param int $pValue Sheet index
  360. *
  361. * @return CSV
  362. */
  363. public function setSheetIndex($pValue)
  364. {
  365. $this->sheetIndex = $pValue;
  366. return $this;
  367. }
  368. /**
  369. * Set Contiguous.
  370. *
  371. * @param bool $contiguous
  372. *
  373. * @return Csv
  374. */
  375. public function setContiguous($contiguous)
  376. {
  377. $this->contiguous = (bool) $contiguous;
  378. if (!$contiguous) {
  379. $this->contiguousRow = -1;
  380. }
  381. return $this;
  382. }
  383. /**
  384. * Get Contiguous.
  385. *
  386. * @return bool
  387. */
  388. public function getContiguous()
  389. {
  390. return $this->contiguous;
  391. }
  392. /**
  393. * Can the current IReader read the file?
  394. *
  395. * @param string $pFilename
  396. *
  397. * @return bool
  398. */
  399. public function canRead($pFilename)
  400. {
  401. // Check if file exists
  402. try {
  403. $this->openFile($pFilename);
  404. } catch (Exception $e) {
  405. return false;
  406. }
  407. fclose($this->fileHandle);
  408. return true;
  409. }
  410. }