diff --git a/document-converter.php b/document-converter.php
deleted file mode 100644
index 87681b7..0000000
--- a/document-converter.php
+++ /dev/null
@@ -1,80 +0,0 @@
-filename = $filePath;
- }
-
- private function read_doc() {
- $fileHandle = fopen($this->filename, 'r');
- $line = @fread($fileHandle, filesize($this->filename));
- $lines = explode(chr(0x0D), $line);
- $outtext = '';
-
- foreach ($lines as $thisline) {
- $pos = strpos($thisline, chr(0x00));
-
- if ($pos !== false || strlen($thisline) == 0) {
- }
- else {
- $outtext .= $thisline . ' ';
- }
- }
-
- return preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/", '', $outtext);
- }
-
- private function read_docx() {
- $striped_content = '';
- $content = '';
-
- $zip = zip_open($this->filename);
-
- if (!$zip || is_numeric($zip)) return false;
-
- while ($zip_entry = zip_read($zip)) {
- if (zip_entry_open($zip, $zip_entry) == false) continue;
- if (zip_entry_name($zip_entry) != "word/document.xml") continue;
-
- $content .= zip_entry_read($zip_entry, zip_entry_filesize($zip_entry));
-
- zip_entry_close($zip_entry);
- }
-
- zip_close($zip);
-
- $content = str_replace('', ' ', $content);
- $content = str_replace('', '\r\n', $content);
- $striped_content = strip_tags($content);
-
- return $striped_content;
- }
-
- public function convert_to_text() {
- $fileArray = pathinfo($this->filename);
- $file_ext = $fileArray['extension'];
-
- switch ($file_ext) {
- case 'doc':
- return $this->read_doc();
- break;
-
- case 'docx':
- return $this->read_docx();
- break;
-
- default:
- return 'Invalid file type';
- break;
- }
- }
-}
diff --git a/inc/document-converter.php b/inc/document-converter.php
new file mode 100644
index 0000000..de85fb6
--- /dev/null
+++ b/inc/document-converter.php
@@ -0,0 +1,57 @@
+open($name, 16);
+
+ if ($unzip !== true) {
+ throw new Exception("Couldn't open file: $name", 1);
+ }
+
+ $content = $zip->getFromName('word/document.xml');
+ $zip->close();
+
+ return $content;
+ }
+ catch (Exception $exception) {
+ return $exception->getMessage();
+ }
+ }
+
+ public static function formatText(string $text): string
+ {
+ $formattedText = $text;
+ $formattedText = str_replace('', ' ', $formattedText);
+ $formattedText = str_replace('', '\r\n', $formattedText);
+ $formattedText = strip_tags($formattedText);
+ $formattedText = str_replace('\r\n', '
', $formattedText);
+
+ return $formattedText;
+ }
+}
diff --git a/index.php b/index.php
index 421a628..c932126 100644
--- a/index.php
+++ b/index.php
@@ -5,14 +5,14 @@
* plain text file.
*/
-include_once 'document-converter.php';
+include_once 'inc/document-converter.php';
dir_digger(__DIR__ . '\documents\\', scandir(__DIR__ . '\documents\\')); // Windows path
/**
- * Goes through all the nested directries
+ * Goes through all the nested directories
*
* @param string $dir_url
* @param array $dir_cont
@@ -29,12 +29,12 @@ function dir_digger($dir_url, $dir_cont) {
dir_digger($nested_dir_url, $nested_dir_cont);
}
else {
- $file_url = $dir_url . $dir_cont[$i];
- $file_obj = new Docx_Conversion($file_url);
- $file_cont = $file_obj->convert_to_text();
+ // TODO: add the file check around here:
+ $fileName = $dir_url . $dir_cont[$i];
+ $fileContent = DocxToHTML::getText($fileName);
- if ($file_cont !== 'Invalid file type') {
- echo '' . $file_cont . '
';
+ if ($fileContent !== 'Invalid file type') {
+ echo '' . $fileContent . '
';
}
}
}
diff --git a/readme.md b/readme.md
index 6a572e1..770a5d3 100644
--- a/readme.md
+++ b/readme.md
@@ -3,17 +3,12 @@ The script in this repository crawls through directories, looks for MS Word docu
Remember to change the Windows `\` with `/` in the paths if you're running the script on Linux.
## Requirements
-- folder named `/documetns` that will contain the documents in the root dir.
+- folder named `/documents` that will contain the documents in the root dir.
-## Known issues
-- in Windows, the script can't output `.doc` files properly, outputs a string of random characters (`Y, B8L 1(IzZYrH9pd4n(KgVB,lDAeX)Ly5otebW3gpj/gQjZTae9i5j5fE514g7vnO( ,jV9kvvadVoTAn7jahy@ARhW.GMuO /e5sZWfPtfkA0zUw@tAm4T2j 6Q`).
-
-## Resoruces
-- base on a [stackoverflow answer](https://stackoverflow.com/questions/19503653/how-to-extract-text-from-word-file-doc-docx-xlsx-pptx-php)
+## Resources
+- based on a [StackOverflow answer](https://stackoverflow.com/questions/19503653/how-to-extract-text-from-word-file-doc-docx-xlsx-pptx-php)
## TODO:
-- craete interface that allows the upload of multiple forms;
-- extract the recursive serach into it's own function;
-- refactor the main class to allow scaling;
-- add markup parser;
+- extract the recursive search into its own function;
+- add markup parser; and
- add more supported files.