The following script is used to import 600 Word documents into a database as cleaned html.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 | #!c:\oracle\ora81/Apache/perl/5.00503/bin/MSWin32-x86/perl.exe ################################################################################# # Requirements: # ------------ # # - Perl 5.00503 or later installed and working on your machine # - change the first line in this script to point to the perl executable # # # # Modules: # -------- # # - you will need (HTML::Parser, HTML::Treebuilder ) # - install instrucitons # win32: (you need Visual Studio to install manually. If you are using ActiveState # Perl then use the Perl Package Manager they provide else see below) # 1. go to www.cpan.org browse the modules and find the above mentioned # modules. # 2. download # 3. Unzip/Untar # 4. Open a command prompt in the HTML::Parser unziped directory # 5. type> perl makefile.pl # 6. type> nmake # 7. type> nmake test # 8. type> nmake install # 9. do the same for the HTML::Treebuilder directory # # # # # Description: # ------------ # # This script was created to import 600 word documents as html into a Oracle database. # # This script is step 2 in an migration process including an vbs (Visual Basic Script) # # Step 1: Convert all documents from MS Word to HTML # # Option Explicit # # Dim wordObj, fsObj # # Set wordObj = WScript.CreateObject("Word.Application") # Set fsObj = WScript.CreateObject("Scripting.FileSystemObject") # # ' SubRoutine call convert all files in folder to HTML # WordToHTML fsObj.GetFolder("..") # # wordObj.Quit # # Set fsObj = Nothing # Set wordObj = Nothing # # # # ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' # ' Subroutines # ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' # Sub WordToHTML(Folder) # On Error Resume Next # Dim File # For Each File In Folder.Files # Set opgaveDoc = wordObj.Documents.Open (File.Path) # wordobj.ActiveDocument.SaveAs replace(replace(replace(Replace(file.path, " ", "_"),"å","aa"),"æ","ae"),"ø","oe") & ".htm", 8 # wordobj.ActiveDocument.close # Next # End Sub # # # # Step 2: this script # - clean the MS HTML using HMTLTidy # - locate the data in the cleaned html document # - clean data of unneccesary formating # - insert the found data in the wanted format (HTML template) # - generate SQL statement to insert task into database # - execute SQL statement # - log the process # ################################################################################# ################################################################################# # INCLUDE MODULES ################################################################################# #use Carp; #use diagnostics; use DBI; use HTML::TreeBuilder; ################################################################################# # VARABLE DECLARATION ################################################################################# my $user = "USERNAME" ; # database username my $passwd = "PASSWORD" ; # database password my $driver = "Oracle" ; # DBD (DataBase Driver) my $db_name = "DATABASENAME; # database server instance name (tnsname) my $pathHtmlBackup = ".\\temp\\" ; # where to put the original backup file my $pathLogFile = ".\\" ; # where to put the processing log file my $pathDocuments = ".\\doc\\" ; # path to where the word documents are located my $doDatabaseUpdate = 1; # used during development 0=off, 1=on my $DBITraceEnabled = 0; # If you want DBI tracing: 1=on, 0=off my $DBITraceLevel = 2; # DBI tracing level (1-9). 1 or 2 should be sufficient my $DBITraceFilePath = "c:\\trace.log" ; # Path to where you want the trace file. On win32 remember to escape the \ with # another \ so "c:\trace.log" would become "c:\\trace.log" ################################################################################# # SUB ROUTINE DECLARATION ################################################################################# sub decodeHTMLTask; ################################################################################# # MAIN SCRIPT FLOW ################################################################################# print "Connecting to database. Please wait ...\n" ; my $dataSource = "dbi:$driver:${db_name}" ; my $dbh = DBI-> connect ( $dataSource , $user , $passwd ) || die "Error connecting to database: $DBI::errstr\n" ; if ( $DBITraceEnabled ) { $dbh ->trace( $DBITraceLevel , $DBITraceFilePath );} # Make sure buffer is big enough for CLOB, BLOB $dbh ->{LongReadLen}=500 * 1024; # 500KB # Getting all the html docs from $pathDocuments @htmlFileList = `dir /b *.htm`; chop @htmlFileList ; open (LOGFILE, ">migration.log.txt" ) || die ( "Could not open log file" ); # Process each an every file with a html extension found in the current directory foreach $file ( @htmlFileList ) { # Backup the converted word document $cmd = `copy $file ${pathHtmlBackup}${file}.org.html`; print LOGFILE ( "Backup file : copy $file ${pathHtmlBackup}${file}.org.html\n" ); # Run the html file through HTMLTidy which cleans out the html $cmd = `TidyDbg.exe -config clean.cfg $file `; print LOGFILE ( "Clean file : TidyDbg.exe -config clean.cfg $file\n" ); # Move the cleaned html file $cmd = `move $file ${pathHtmlBackup}${file}.clean.html`; print LOGFILE ( "Move file : move $file ${pathHtmlBackup}${file}.clean.html\n" ); open (CLEANHTML, "${pathHtmlBackup}${file}.clean.html" ) || die ( "Cannot open file ${pathHtmlBackup}${file}.clean.html\n" ); open (CLEANNOBRHTML, ">${pathHtmlBackup}${file}.clean.nobr.html" ) || die ( "Cannot open file ${pathHtmlBackup}${file}.clean.nobr.html\n" ); # Removing all newline characters (editlive does not like them) # Read the file assocsiated with filehandle CLEANHTML into array @lines @lines = <CLEANHTML>; # Chop of the last character in every item in array #chop (@lines); # Join the array to a single string $line = join ( "" , @lines ); $line =~ s/\n/ /g; print CLEANNOBRHTML $line ; close (CLEANNOBRHTML); close (CLEANHTML); # Build insert statement based on information found in the html file &decodeHTMLTask( $file ); } # end foreach close (LOGFILE); ################################################################################# # SUB ROUTINES ################################################################################# ################################################################################# # COMMENTS: "decodeHTMLTask" #--------------------------- # # INPUT : FILENAME (TEXT STRING) EX. something.html # FUNCTION: PARSE THE FILES DOCUMENT TREE STRUCTURE AND GRAB THE INFORMATION # WHICH IS NEEDED. BUILD THE SQL INSERT STATEMENT BASED ON THIS # INFORMATION. # OUTPUT : SQL INSERT STATEMENT ################################################################################# sub decodeHTMLTask{ my ( $file ) = @_ ; my $tree = HTML::TreeBuilder->new; $tree ->parse_file( "${pathHtmlBackup}${file}.clean.nobr.html" ); #$tree->dump; # Uncomment this line for document tree dump print LOGFILE ( "Processing : ${pathHtmlBackup}${file}.clean.nobr.html...\n" ); print ( "Processing : ${pathHtmlBackup}${file}...\n" ); # Get title from html document. my $tref = $tree ->content->[1]->content->[0]->content->[2]->content->[1]->content->[1]; my $taskTitle = $tref ->as_text; # Get stage (På scenen) from html document my $tref = $tree ->content->[1]->content->[0]->content->[2]->content->[2]->content->[0]; my $onStage = $tref ->as_text; # Since we know that stage has to be either onStage or offStage we already know # which one it is. # Get priority (Need) from html document my $tref = $tree ->content->[1]->content->[0]->content->[2]->content->[2]->content->[2]; my $need = $tref ->as_text; # Since we know that need has to be either need or nice we already know # which one it is. # Determinig what priority id we need if ( $onStage eq "x" ) { if ( $need eq "x" ) { $priorityId = 1; } else { $priorityId = 2; } } else { if ( $need eq "x" ) { $priorityId = 3; } else { $priorityId = 4; } } # Get (Hvorfor) from html document my $tref = $tree ->content->[1]->content->[0]->content->[2]->content->[4]->content->[1]; my $hvorfor = $tref ->as_text; # Get (Hvordan) from html document my $tref = $tree ->content->[1]->content->[0]->content->[2]->content->[6]->content->[0]; my $hvordan = $tref ->as_HTML; # We need to remove a start/tags td (tabledata) fomr the hvordan data # to be able to use the data in to input template $hvordan =~ s/<td colspan=11 valign= "top" >//ig; $hvordan =~ s/<\/td>//ig; # Get (Links) from html document my $tref = $tree ->content->[1]->content->[0]->content->[2]->content->[8]->content->[0]->content->[0]; my $links = $tref ->as_HTML; # Clear memory $tree -> delete ; $sql = "select task_id_seq.nextval from dual" ; $sth = $dbh ->prepare( $sql ) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n" ;; $sth ->execute() || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n" ; $sth ->bind_col( 1, \ $task_id ); $sth ->fetch(); # Building the task description text my $HTMLTask = " <TABLE cellSpacing=0 cellPadding=0 border=1>< TR ><TD vAlign=top width=571><H5><FONT face=Verdana>Hvorfor</FONT></H5><P><FONT face=Verdana> $hvorfor </FONT></P><P> </P></TD></ TR >< TR ><TD vAlign=top width=571><H5><FONT face=Verdana>Hvordan</FONT></H5><P><FONT face=Verdana> $hvordan </FONT></P><P> </P></TD></ TR >< TR ><TD vAlign=top width=571><H5><FONT face=Verdana>Links</FONT></H5><P><FONT face=Verdana> $links </FONT></P><P> </P></TD></ TR ></TABLE> "; $HTMLTask =~ s/\n/ /g; # SQL statement needed to inser task into database my $sqlInsertTask = " INSERT INTO TASK ( task_id, gro_id, name, task_flag, text, priority_id, user_created, date_created, user_changed, date_changed) VALUES ( :task_id, task_admin_pack.getPrimgroid(), :taskTitle, 'T' , :HTMLTask, :priorityId, 'Task massload script' , sysdate, 'Task massload script' , sysdate) "; $sth = $dbh ->prepare( $sqlInsertTask ) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n" ;; $sth ->bind_param( ":task_id" , $task_id ); $sth ->bind_param( ":taskTitle" , $taskTitle ); $sth ->bind_param( ":HTMLTask" , $HTMLTask ); $sth ->bind_param( ":priorityId" , $priorityId ); $sth ->execute() || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n" ; print LOGFILE ( "Execute SQL : INSERT INTO TASK (task_id, gro_id, name, task_flag, text, priority_id, user_created, date_created, user_changed, date_changed) VALUES ($task_id, TASK_admin_pack.getPrimgroid(), '$taskTitle', 'T', '$HTMLTask', '$priorityId', 'Task massload script', sysdate, 'Task massload script', sysdate)\n;" ); # Generate insert statement needed to insert into all search my $sqlInsertAllSearch = " INSERT INTO all_search ( as_id, table_id, all_searchfield, com_id, gro_id, pro_id, table_name, id_field_name, owner_field_name, access_table_name) values ( as_seq.nextval, :task_id, :v_temptext, :v_componentID, NULL, NULL, NULL, NULL, NULL, NULL) "; $sth = $dbh ->prepare( $sqlInsertAllSearch ) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n" ;; $sth ->bind_param( ":task_id" , $task_id ); $sth ->bind_param( ":v_temptext" , $taskTitle . " " . $HTMLTask ); $sth ->bind_param( ":v_componentID" , 42); $sth ->execute() || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n" ; print LOGFILE ( "Execute SQL : INSERT INTO all_search (as_id, table_id, all_searchfield, com_id, gro_id, pro_id, table_name, id_field_name, owner_field_name, access_table_name) VALUES (as_seq.nextval, $p_taskid, '$taskTitle $HTMLTask', $v_componentID, $v_groid, NULL, NULL, NULL, NULL, NULL);\n" ); } |
This is the configuration settings use together with Html Tidy for windows
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | word-2000: yes show-body-only: yes clean: yes char-encoding: raw quiet: yes show-errors: 0 show-warnings: no drop-empty-paras: no indent-spaces: 3 new-inline-tags: o:p doctype: omit tidy-mark: no quote-ampersand: no write -back: yes force-output: yes |