Html.php 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657
  1. <?php
  2. namespace PhpOffice\PhpSpreadsheet\Reader;
  3. use DOMDocument;
  4. use DOMElement;
  5. use DOMNode;
  6. use DOMText;
  7. use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
  8. use PhpOffice\PhpSpreadsheet\Spreadsheet;
  9. use PhpOffice\PhpSpreadsheet\Style\Border;
  10. use PhpOffice\PhpSpreadsheet\Style\Color;
  11. use PhpOffice\PhpSpreadsheet\Style\Fill;
  12. use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet;
  /** Reader that loads an HTML document into a worksheet of a Spreadsheet. */
  14. class Html extends BaseReader
  15. {
  /**
   * Sample size to read to determine if it's HTML or not.
   */
  const TEST_SAMPLE_SIZE = 2048;

  /**
   * Input encoding.
   *
   * @var string
   */
  protected $inputEncoding = 'ANSI';

  /**
   * Sheet index to read.
   *
   * @var int
   */
  protected $sheetIndex = 0;

  /**
   * Style arrays applied to cells generated from specific HTML tags,
   * keyed by tag name.
   *
   * @var array
   */
  protected $formats = [
      'h1' => [
          'font' => [
              'bold' => true,
              'size' => 24,
          ],
      ], // Bold, 24pt
      'h2' => [
          'font' => [
              'bold' => true,
              'size' => 18,
          ],
      ], // Bold, 18pt
      'h3' => [
          'font' => [
              'bold' => true,
              'size' => 13.5,
          ],
      ], // Bold, 13.5pt
      'h4' => [
          'font' => [
              'bold' => true,
              'size' => 12,
          ],
      ], // Bold, 12pt
      'h5' => [
          'font' => [
              'bold' => true,
              'size' => 10,
          ],
      ], // Bold, 10pt
      'h6' => [
          'font' => [
              'bold' => true,
              'size' => 7.5,
          ],
      ], // Bold, 7.5pt
      'a' => [
          'font' => [
              'underline' => true,
              'color' => [
                  'argb' => Color::COLOR_BLUE,
              ],
          ],
      ], // Blue underlined
      'hr' => [
          'borders' => [
              'bottom' => [
                  'borderStyle' => Border::BORDER_THIN,
                  'color' => [
                      // Keyed like the 'a' format above; a bare list entry
                      // carries no colour key for the style array.
                      'argb' => Color::COLOR_BLACK,
                  ],
              ],
          ],
      ], // Bottom border
  ];

  /**
   * Cell coordinates already covered by a row-spanning table cell
   * (coordinate => true); such cells are skipped when placing td/th content.
   *
   * @var array
   */
  protected $rowspan = [];
  /**
   * Build an HTML reader that starts out with the default read filter.
   */
  public function __construct()
  {
      $defaultFilter = new DefaultReadFilter();
      $this->readFilter = $defaultFilter;
  }
  /**
   * Check whether the given file looks like an HTML document.
   *
   * The file qualifies when its first sample starts with a tag and contains
   * at least one tag, and its last sample ends with a tag.
   *
   * @param string $pFilename
   *
   * @return bool
   */
  public function canRead($pFilename)
  {
      // A file that cannot be opened can never be read
      try {
          $this->openFile($pFilename);
      } catch (Exception $e) {
          return false;
      }

      $head = $this->readBeginning();
      $opensWithTag = self::startsWithTag($head);
      $hasMarkup = self::containsTags($head);

      $tail = $this->readEnding();
      $closesWithTag = self::endsWithTag($tail);

      fclose($this->fileHandle);

      return $opensWithTag && $hasMarkup && $closesWithTag;
  }
  /**
   * Read the first TEST_SAMPLE_SIZE bytes of the currently opened file.
   *
   * @return false|string
   */
  private function readBeginning()
  {
      $handle = $this->fileHandle;
      fseek($handle, 0);

      return fread($handle, self::TEST_SAMPLE_SIZE);
  }
  /**
   * Read the last TEST_SAMPLE_SIZE bytes (or the whole file when it is
   * smaller) of the currently opened file.
   *
   * @return false|string
   */
  private function readEnding()
  {
      // The stream meta data carries the path the handle was opened with
      $streamInfo = stream_get_meta_data($this->fileHandle);
      $size = filesize($streamInfo['uri']);
      if ($size === 0) {
          return '';
      }

      // Never try to read more than the file actually holds
      $blockSize = ($size < self::TEST_SAMPLE_SIZE) ? $size : self::TEST_SAMPLE_SIZE;

      fseek($this->fileHandle, $size - $blockSize);

      return fread($this->fileHandle, $blockSize);
  }
  /**
   * Tell whether the data, ignoring surrounding whitespace, begins with '<'.
   *
   * @param string $data
   *
   * @return bool
   */
  private static function startsWithTag($data)
  {
      $trimmed = trim($data);

      return isset($trimmed[0]) && $trimmed[0] === '<';
  }
  /**
   * Tell whether the data, ignoring surrounding whitespace, finishes with '>'.
   *
   * @param string $data
   *
   * @return bool
   */
  private static function endsWithTag($data)
  {
      $trimmed = trim($data);
      $length = strlen($trimmed);

      return $length > 0 && $trimmed[$length - 1] === '>';
  }
  /**
   * Tell whether stripping tags from the data makes it shorter, i.e. whether
   * the data holds at least one tag.
   *
   * @param string $data
   *
   * @return bool
   */
  private static function containsTags($data)
  {
      $withoutTags = strip_tags($data);

      return strlen($withoutTags) !== strlen($data);
  }
  /**
   * Read the file into a freshly created Spreadsheet.
   *
   * @param string $pFilename
   *
   * @throws Exception
   *
   * @return Spreadsheet
   */
  public function load($pFilename)
  {
      // Delegate the actual parsing, handing over a brand new workbook
      return $this->loadIntoExisting($pFilename, new Spreadsheet());
  }
  /**
   * Store the input encoding and return this reader for chaining.
   *
   * @param string $pValue Input encoding, eg: 'ANSI'
   *
   * @return Html
   */
  public function setInputEncoding($pValue)
  {
      $encoding = $pValue;
      $this->inputEncoding = $encoding;

      return $this;
  }
  /**
   * Return the input encoding currently stored on this reader.
   *
   * @return string
   */
  public function getInputEncoding()
  {
      $encoding = $this->inputEncoding;

      return $encoding;
  }
  // Data Array used for testing only, should write to Spreadsheet object on completion of tests
  // Filled by flushCell() as $dataArray[$row][$column] = content.
  protected $dataArray = [];
  // Current table nesting depth; 0 means the cursor is outside any table.
  protected $tableLevel = 0;
  // Start column of each open table, indexed by nesting level.
  protected $nestedColumn = ['A'];
  /**
   * Enter a table: increase the nesting level and remember its start column.
   *
   * A table opened while no other table is open always starts in column 'A'.
   *
   * @param string $column Column the cursor is on when the table opens
   *
   * @return string Start column recorded for the new table
   */
  protected function setTableStartColumn($column)
  {
      $startColumn = ($this->tableLevel == 0) ? 'A' : $column;

      ++$this->tableLevel;
      $this->nestedColumn[$this->tableLevel] = $startColumn;

      return $startColumn;
  }
  /**
   * Return the start column recorded for the current table nesting level.
   *
   * @return string
   */
  protected function getTableStartColumn()
  {
      $level = $this->tableLevel;

      return $this->nestedColumn[$level];
  }
  /**
   * Leave a table: decrease the nesting level and drop the last recorded
   * start column.
   *
   * @return string The start column that was removed
   */
  protected function releaseTableStartColumn()
  {
      $this->tableLevel -= 1;
      $released = array_pop($this->nestedColumn);

      return $released;
  }
  /**
   * Write the pending cell content to the worksheet and reset it.
   *
   * String content is written only when it is not blank; anything else is
   * treated as a rich text run and merely recorded in the test data array.
   *
   * @param Worksheet $sheet
   * @param string $column
   * @param int $row
   * @param mixed $cellContent Pending content; reset to '' on return
   */
  protected function flushCell(Worksheet $sheet, $column, $row, &$cellContent)
  {
      if (!is_string($cellContent)) {
          // We have a Rich Text run
          // TODO
          $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
      } elseif (trim($cellContent) > '') {
          // Simple String content: only write it if there's content in the string
          $sheet->setCellValue($column . $row, $cellContent);
          $this->dataArray[$row][$column] = $cellContent;
      }

      // Whatever happened, the next cell starts from an empty string
      $cellContent = '';
  }
  /**
   * Walk the children of a DOM node and transfer their content to the worksheet.
   *
   * Text nodes are appended to the pending cell content. Element nodes are
   * dispatched on their tag name: depending on the tag they flush the pending
   * content, move the row/column cursor, apply styles, merge cells and/or
   * recurse into their own children.
   *
   * @param DOMNode $element Node whose children are processed
   * @param Worksheet $sheet Worksheet receiving values, styles and merges
   * @param int $row Row cursor, updated in place
   * @param string $column Column cursor, updated in place
   * @param string $cellContent Text collected for the current cell, updated in place
   */
  protected function processDomElement(DOMNode $element, Worksheet $sheet, &$row, &$column, &$cellContent)
  {
      foreach ($element->childNodes as $child) {
          if ($child instanceof DOMText) {
              // Collapse every run of whitespace into a single space
              $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
              if (is_string($cellContent)) {
                  // simply append the text if the cell content is a plain text string
                  $cellContent .= $domText;
              }
              // but if we have a rich text run instead, we need to append it correctly
              // TODO
          } elseif ($child instanceof DOMElement) {
              // Flatten the element's attributes into a name => value map
              $attributeArray = [];
              foreach ($child->attributes as $attribute) {
                  $attributeArray[$attribute->name] = $attribute->value;
              }
              switch ($child->nodeName) {
                  case 'meta':
                      foreach ($attributeArray as $attributeName => $attributeValue) {
                          switch ($attributeName) {
                              case 'content':
                                  // TODO
                                  // Extract character set, so we can convert to UTF-8 if required
                                  break;
                          }
                      }
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                      break;
                  case 'title':
                      // The document title becomes the worksheet title
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      $sheet->setTitle($cellContent, true, false);
                      $cellContent = '';

                      break;
                  case 'span':
                  case 'div':
                  case 'font':
                  case 'i':
                  case 'em':
                  case 'strong':
                  case 'b':
                      if (isset($attributeArray['class']) && $attributeArray['class'] === 'comment') {
                          // Elements marked as comments become a cell comment, not cell text
                          $sheet->getComment($column . $row)
                              ->getText()
                              ->createTextRun($child->textContent);

                          break;
                      }
                      // Separate inline content from its neighbours with single spaces
                      if ($cellContent > '') {
                          $cellContent .= ' ';
                      }
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      if ($cellContent > '') {
                          $cellContent .= ' ';
                      }

                      break;
                  case 'hr':
                      $this->flushCell($sheet, $column, $row, $cellContent);
                      ++$row;
                      if (isset($this->formats[$child->nodeName])) {
                          $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                      } else {
                          $cellContent = '----------';
                          $this->flushCell($sheet, $column, $row, $cellContent);
                      }
                      ++$row;
                      // Add a break after a horizontal rule, simply by allowing the code to dropthru
                      // no break
                  case 'br':
                      if ($this->tableLevel > 0) {
                          // If we're inside a table, replace with a \n
                          $cellContent .= "\n";
                      } else {
                          // Otherwise flush our existing content and move the row cursor on
                          $this->flushCell($sheet, $column, $row, $cellContent);
                          ++$row;
                      }

                      break;
                  case 'a':
                      foreach ($attributeArray as $attributeName => $attributeValue) {
                          switch ($attributeName) {
                              case 'href':
                                  $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
                                  if (isset($this->formats[$child->nodeName])) {
                                      $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                                  }

                                  break;
                              case 'class':
                                  if ($attributeValue === 'comment-indicator') {
                                      break; // Ignore - it's just a red square.
                                  }
                          }
                      }
                      $cellContent .= ' ';
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                      break;
                  case 'h1':
                  case 'h2':
                  case 'h3':
                  case 'h4':
                  case 'h5':
                  case 'h6':
                  case 'ol':
                  case 'ul':
                  case 'p':
                      if ($this->tableLevel > 0) {
                          // If we're inside a table, replace with a \n
                          $cellContent .= "\n";
                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      } else {
                          // Block elements outside a table get a row of their own
                          if ($cellContent > '') {
                              $this->flushCell($sheet, $column, $row, $cellContent);
                              ++$row;
                          }
                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                          $this->flushCell($sheet, $column, $row, $cellContent);
                          if (isset($this->formats[$child->nodeName])) {
                              $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                          }
                          ++$row;
                          $column = 'A';
                      }

                      break;
                  case 'li':
                      if ($this->tableLevel > 0) {
                          // If we're inside a table, replace with a \n
                          $cellContent .= "\n";
                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      } else {
                          if ($cellContent > '') {
                              $this->flushCell($sheet, $column, $row, $cellContent);
                          }
                          ++$row;
                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                          $this->flushCell($sheet, $column, $row, $cellContent);
                          $column = 'A';
                      }

                      break;
                  case 'table':
                      $this->flushCell($sheet, $column, $row, $cellContent);
                      $column = $this->setTableStartColumn($column);
                      if ($this->tableLevel > 1) {
                          --$row;
                      }
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      $column = $this->releaseTableStartColumn();
                      if ($this->tableLevel > 1) {
                          ++$column;
                      } else {
                          ++$row;
                      }

                      break;
                  case 'thead':
                  case 'tbody':
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                      break;
                  case 'tr':
                      // Every table row restarts at the table's start column
                      $column = $this->getTableStartColumn();
                      $cellContent = '';
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      ++$row;

                      break;
                  case 'th':
                  case 'td':
                      // Step over cells claimed by a rowspan from an earlier row BEFORE
                      // touching the cell, so that hyperlinks, inline styles and the
                      // value itself all end up on the same coordinate.
                      while (isset($this->rowspan[$column . $row])) {
                          ++$column;
                      }
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                      // Children (e.g. a nested table) may have moved the cursor, so
                      // make sure it still does not sit on an occupied cell.
                      while (isset($this->rowspan[$column . $row])) {
                          ++$column;
                      }
                      // apply inline style
                      $this->applyInlineStyle($sheet, $row, $column, $attributeArray);
                      $this->flushCell($sheet, $column, $row, $cellContent);

                      if (isset($attributeArray['bgcolor'])) {
                          // Colour the anchor cell, whether or not it is merged; a
                          // leading '#' is not part of the RGB value.
                          $sheet->getStyle($column . $row)->applyFromArray(
                              [
                                  'fill' => [
                                      'fillType' => Fill::FILL_SOLID,
                                      'color' => ['rgb' => ltrim($attributeArray['bgcolor'], '#')],
                                  ],
                              ]
                          );
                      }

                      $hasRowspan = isset($attributeArray['rowspan']);
                      $hasColspan = isset($attributeArray['colspan']);
                      if ($hasRowspan || $hasColspan) {
                          // Non-numeric or non-positive span values are treated as 1
                          $rowspan = $hasRowspan ? max(1, (int) $attributeArray['rowspan']) : 1;
                          $colspan = $hasColspan ? max(1, (int) $attributeArray['colspan']) : 1;

                          // Last column covered by the cell
                          $columnTo = $column;
                          for ($i = 1; $i < $colspan; ++$i) {
                              ++$columnTo;
                          }
                          $range = $column . $row . ':' . $columnTo . ($row + $rowspan - 1);
                          if ($hasRowspan) {
                              // Remember every covered cell so later rows skip them
                              foreach (Coordinate::extractAllCellReferencesInRange($range) as $value) {
                                  $this->rowspan[$value] = true;
                              }
                          }
                          $sheet->mergeCells($range);
                          $column = $columnTo;
                      }
                      ++$column;

                      break;
                  case 'body':
                      // The body restarts the cursor at the top-left cell
                      $row = 1;
                      $column = 'A';
                      $cellContent = '';
                      $this->tableLevel = 0;
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);

                      break;
                  default:
                      $this->processDomElement($child, $sheet, $row, $column, $cellContent);
              }
          }
      }
  }
  /**
   * Loads PhpSpreadsheet from file into PhpSpreadsheet instance.
   *
   * The HTML is written to the sheet selected by the sheet index; missing
   * sheets are created first. The file content is converted from UTF-8 to
   * HTML entities before parsing; the input encoding stored on this reader
   * is not consulted here.
   *
   * @param string $pFilename
   * @param Spreadsheet $spreadsheet
   *
   * @throws Exception When the file is not HTML or cannot be parsed
   *
   * @return Spreadsheet
   */
  public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
  {
      // Validate
      if (!$this->canRead($pFilename)) {
          throw new Exception($pFilename . ' is an Invalid HTML file.');
      }

      // Occupied-cell markers are keyed by coordinate only, so markers left
      // over from a previous load would shift cells of this document.
      $this->rowspan = [];

      // Create new sheet
      while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
          $spreadsheet->createSheet();
      }
      $spreadsheet->setActiveSheetIndex($this->sheetIndex);

      // Create a new DOM object
      $dom = new DOMDocument();
      // Reload the HTML file into the DOM object
      $loaded = $dom->loadHTML(mb_convert_encoding($this->securityScanFile($pFilename), 'HTML-ENTITIES', 'UTF-8'));
      if ($loaded === false) {
          throw new Exception('Failed to load ' . $pFilename . ' as a DOM Document');
      }

      // Discard white space
      $dom->preserveWhiteSpace = false;

      // Cursor and pending content are passed by reference through the walk
      $row = 0;
      $column = 'A';
      $content = '';
      $this->processDomElement($dom, $spreadsheet->getActiveSheet(), $row, $column, $content);

      // Return
      return $spreadsheet;
  }
  /**
   * Return the index of the sheet this reader writes to.
   *
   * @return int
   */
  public function getSheetIndex()
  {
      $index = $this->sheetIndex;

      return $index;
  }
  /**
   * Choose the sheet this reader writes to and return the reader for chaining.
   *
   * @param int $pValue Sheet index
   *
   * @return Html
   */
  public function setSheetIndex($pValue)
  {
      $index = $pValue;
      $this->sheetIndex = $index;

      return $this;
  }
  /**
   * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
   *
   * @param string $xml
   *
   * @throws Exception When an entity declaration is found
   *
   * @return string The unchanged input when no entity declaration is found
   */
  public function securityScan($xml)
  {
      // An optional NUL byte is allowed before, between and after the
      // characters of the marker (presumably to catch multi-byte encoded input).
      $markerCharacters = str_split('<!ENTITY');
      $entityPattern = '/\\0?' . implode('\\0?', $markerCharacters) . '\\0?/';

      if (!preg_match($entityPattern, $xml)) {
          return $xml;
      }

      throw new Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
  }
  /**
   * Apply the colours of an inline CSS style attribute to a cell.
   *
   * NOTES :
   * Currently only intended for td & th element,
   * and only takes 'background-color' and 'color' properties with a HEX
   * colour written with a leading '#'. Any other declaration is ignored.
   *
   * TODO :
   * - Implement other properties, such as border
   *
   * @param Worksheet $sheet
   * @param int $row
   * @param string $column
   * @param array $attributeArray Attributes of the element; only 'style' is read
   */
  private function applyInlineStyle(&$sheet, $row, $column, $attributeArray)
  {
      if (!isset($attributeArray['style'])) {
          return;
      }

      $supported_styles = ['background-color', 'color'];

      // add color styles (background & text) from dom element,currently support : td & th, using ONLY inline css style with RGB color
      foreach (explode(';', $attributeArray['style']) as $declaration) {
          // Split on the first colon only; a declaration without one has no value
          $parts = explode(':', $declaration, 2);
          if (count($parts) !== 2) {
              continue;
          }

          $property = trim($parts[0]);
          if (!in_array($property, $supported_styles, true)) {
              continue;
          }

          // The colour is determined per declaration, so a value that is not
          // a '#' colour can never reuse the colour of an earlier declaration.
          $rawValue = trim($parts[1]);
          if (substr($rawValue, 0, 1) !== '#') {
              continue;
          }
          $style_color = substr($rawValue, 1);
          if (empty($style_color)) {
              continue;
          }

          switch ($property) {
              case 'background-color':
                  $sheet->getStyle($column . $row)->applyFromArray(
                      ['fill' => ['fillType' => Fill::FILL_SOLID, 'color' => ['rgb' => $style_color]]]
                  );

                  break;
              case 'color':
                  $sheet->getStyle($column . $row)->applyFromArray(
                      ['font' => ['color' => ['rgb' => $style_color]]]
                  );

                  break;
          }
      }
  }
  578. }