diff --git a/document-converter.php b/document-converter.php deleted file mode 100644 index 87681b7..0000000 --- a/document-converter.php +++ /dev/null @@ -1,80 +0,0 @@ -filename = $filePath; - } - - private function read_doc() { - $fileHandle = fopen($this->filename, 'r'); - $line = @fread($fileHandle, filesize($this->filename)); - $lines = explode(chr(0x0D), $line); - $outtext = ''; - - foreach ($lines as $thisline) { - $pos = strpos($thisline, chr(0x00)); - - if ($pos !== false || strlen($thisline) == 0) { - } - else { - $outtext .= $thisline . ' '; - } - } - - return preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/", '', $outtext); - } - - private function read_docx() { - $striped_content = ''; - $content = ''; - - $zip = zip_open($this->filename); - - if (!$zip || is_numeric($zip)) return false; - - while ($zip_entry = zip_read($zip)) { - if (zip_entry_open($zip, $zip_entry) == false) continue; - if (zip_entry_name($zip_entry) != "word/document.xml") continue; - - $content .= zip_entry_read($zip_entry, zip_entry_filesize($zip_entry)); - - zip_entry_close($zip_entry); - } - - zip_close($zip); - - $content = str_replace('', ' ', $content); - $content = str_replace('', '\r\n', $content); - $striped_content = strip_tags($content); - - return $striped_content; - } - - public function convert_to_text() { - $fileArray = pathinfo($this->filename); - $file_ext = $fileArray['extension']; - - switch ($file_ext) { - case 'doc': - return $this->read_doc(); - break; - - case 'docx': - return $this->read_docx(); - break; - - default: - return 'Invalid file type'; - break; - } - } -} diff --git a/inc/document-converter.php b/inc/document-converter.php new file mode 100644 index 0000000..de85fb6 --- /dev/null +++ b/inc/document-converter.php @@ -0,0 +1,57 @@ +open($name, 16); + + if ($unzip !== true) { + throw new Exception("Couldn't open file: $name", 1); + } + + $content = $zip->getFromName('word/document.xml'); + $zip->close(); + + return $content; + } + catch (Exception $exception) { + return $exception->getMessage(); + } + } + + public static function formatText(string $text): string + { + $formattedText = $text; + $formattedText = str_replace('', ' ', $formattedText); + $formattedText = str_replace('', '\r\n', $formattedText); + $formattedText = strip_tags($formattedText); + $formattedText = str_replace('\r\n', '
', $formattedText); + + return $formattedText; + } +} diff --git a/index.php b/index.php index 421a628..c932126 100644 --- a/index.php +++ b/index.php @@ -5,14 +5,14 @@ * plain text file. */ -include_once 'document-converter.php'; +include_once 'inc/document-converter.php'; dir_digger(__DIR__ . '\documents\\', scandir(__DIR__ . '\documents\\')); // Windows path /** - * Goes through all the nested directries + * Goes through all the nested directories * * @param string $dir_url * @param array $dir_cont @@ -29,12 +29,12 @@ function dir_digger($dir_url, $dir_cont) { dir_digger($nested_dir_url, $nested_dir_cont); } else { - $file_url = $dir_url . $dir_cont[$i]; - $file_obj = new Docx_Conversion($file_url); - $file_cont = $file_obj->convert_to_text(); + // TODO: add the file check around here: + $fileName = $dir_url . $dir_cont[$i]; + $fileContent = DocxToHTML::getText($fileName); - if ($file_cont !== 'Invalid file type') { - echo '
' . $file_cont . '
'; + if ($fileContent !== 'Invalid file type') { + echo '
' . $fileContent . '
'; } } } diff --git a/readme.md b/readme.md index 6a572e1..770a5d3 100644 --- a/readme.md +++ b/readme.md @@ -3,17 +3,12 @@ The script in this repository crawls through directories, looks for MS Word docu Remember to change the Windows `\` with `/` in the paths if you're running the script on Linux. ## Requirements -- folder named `/documetns` that will contain the documents in the root dir. +- folder named `/documents` that will contain the documents in the root dir. -## Known issues -- in Windows, the script can't output `.doc` files properly, outputs a string of random characters (`Y, B8L 1(IzZYrH9pd4n(KgVB,lDAeX)Ly5ot ebW3gp j/gQjZTae9i5j5 fE514g7vnO( ,jV9kvvadVoTAn7jahy@ARhW.GMuO /e5sZWfPtfkA0zUw@tAm4T2j 6Q`). - -## Resoruces -- base on a [stackoverflow answer](https://stackoverflow.com/questions/19503653/how-to-extract-text-from-word-file-doc-docx-xlsx-pptx-php) +## Resources +- base on a [StackOverflow answer](https://stackoverflow.com/questions/19503653/how-to-extract-text-from-word-file-doc-docx-xlsx-pptx-php) ## TODO: -- craete interface that allows the upload of multiple forms; -- extract the recursive serach into it's own function; -- refactor the main class to allow scaling; -- add markup parser; +- extract the recursive search into it's own function; +- add markup parser; and - add more supported files.