The following script is used to import 600 Word documents into a database as cleaned html.
#!c:\oracle\ora81/Apache/perl/5.00503/bin/MSWin32-x86/perl.exe ################################################################################# # Requirements: # ------------ # # - Perl 5.00503 or later installed and working on your machine # - change the first line in this script to point to the perl executable # # # # Modules: # -------- # # - you will need (HTML::Parser, HTML::Treebuilder ) # - install instrucitons # win32: (you need Visual Studio to install manually. If you are using ActiveState # Perl then use the Perl Package Manager they provide else see below) # 1. go to www.cpan.org browse the modules and find the above mentioned # modules. # 2. download # 3. Unzip/Untar # 4. Open a command prompt in the HTML::Parser unziped directory # 5. type> perl makefile.pl # 6. type> nmake # 7. type> nmake test # 8. type> nmake install # 9. do the same for the HTML::Treebuilder directory # # # # # Description: # ------------ # # This script was created to import 600 word documents as html into a Oracle database. # # This script is step 2 in an migration process including an vbs (Visual Basic Script) # # Step 1: Convert all documents from MS Word to HTML # # Option Explicit # # Dim wordObj, fsObj # # Set wordObj = WScript.CreateObject("Word.Application") # Set fsObj = WScript.CreateObject("Scripting.FileSystemObject") # # ' SubRoutine call convert all files in folder to HTML # WordToHTML fsObj.GetFolder("..") # # wordObj.Quit # # Set fsObj = Nothing # Set wordObj = Nothing # # # # ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' # ' Subroutines # ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' # Sub WordToHTML(Folder) # On Error Resume Next # Dim File # For Each File In Folder.Files # Set opgaveDoc = wordObj.Documents.Open (File.Path) # wordobj.ActiveDocument.SaveAs replace(replace(replace(Replace(file.path, " ", "_"),"å","aa"),"æ","ae"),"ø","oe") & ".htm", 8 # wordobj.ActiveDocument.close # Next # End Sub # # # # Step 2: this script # - clean the MS HTML using HMTLTidy # - locate the data in the cleaned html document # - clean data of unneccesary formating # - insert the found data in the wanted format (HTML template) # - generate SQL statement to insert task into database # - execute SQL statement # - log the process # ################################################################################# ################################################################################# # INCLUDE MODULES ################################################################################# #use Carp; #use diagnostics; use DBI; use HTML::TreeBuilder; ################################################################################# # VARABLE DECLARATION ################################################################################# my $user = "USERNAME"; # database username my $passwd = "PASSWORD"; # database password my $driver = "Oracle"; # DBD (DataBase Driver) my $db_name = "DATABASENAME; # database server instance name (tnsname) my $pathHtmlBackup = ".\\temp\\"; # where to put the original backup file my $pathLogFile = ".\\"; # where to put the processing log file my $pathDocuments = ".\\doc\\"; # path to where the word documents are located my $doDatabaseUpdate = 1; # used during development 0=off, 1=on my $DBITraceEnabled = 0; # If you want DBI tracing: 1=on, 0=off my $DBITraceLevel = 2; # DBI tracing level (1-9). 1 or 2 should be sufficient my $DBITraceFilePath = "c:\\trace.log"; # Path to where you want the trace file. On win32 remember to escape the \ with # another \ so "c:\trace.log" would become "c:\\trace.log" ################################################################################# # SUB ROUTINE DECLARATION ################################################################################# sub decodeHTMLTask; ################################################################################# # MAIN SCRIPT FLOW ################################################################################# print "Connecting to database. Please wait ...\n"; my $dataSource = "dbi:$driver:${db_name}"; my $dbh = DBI->connect($dataSource, $user, $passwd) || die "Error connecting to database: $DBI::errstr\n"; if ($DBITraceEnabled) {$dbh->trace($DBITraceLevel, $DBITraceFilePath);} # Make sure buffer is big enough for CLOB, BLOB $dbh->{LongReadLen}=500 * 1024; # 500KB # Getting all the html docs from $pathDocuments @htmlFileList = `dir /b *.htm`; chop @htmlFileList; open (LOGFILE, ">migration.log.txt") || die ("Could not open log file"); # Process each an every file with a html extension found in the current directory foreach $file (@htmlFileList) { # Backup the converted word document $cmd = `copy $file ${pathHtmlBackup}${file}.org.html`; print LOGFILE ("Backup file : copy $file ${pathHtmlBackup}${file}.org.html\n"); # Run the html file through HTMLTidy which cleans out the html $cmd = `TidyDbg.exe -config clean.cfg $file`; print LOGFILE ("Clean file : TidyDbg.exe -config clean.cfg $file\n"); # Move the cleaned html file $cmd = `move $file ${pathHtmlBackup}${file}.clean.html`; print LOGFILE ("Move file : move $file ${pathHtmlBackup}${file}.clean.html\n"); open (CLEANHTML, "${pathHtmlBackup}${file}.clean.html") || die ("Cannot open file ${pathHtmlBackup}${file}.clean.html\n"); open (CLEANNOBRHTML, ">${pathHtmlBackup}${file}.clean.nobr.html") || die ("Cannot open file ${pathHtmlBackup}${file}.clean.nobr.html\n"); # Removing all newline characters (editlive does not like them) # Read the file assocsiated with filehandle CLEANHTML into array @lines @lines = <CLEANHTML>; # Chop of the last character in every item in array #chop (@lines); # Join the array to a single string $line = join ("",@lines); $line =~ s/\n/ /g; print CLEANNOBRHTML $line; close (CLEANNOBRHTML); close (CLEANHTML); # Build insert statement based on information found in the html file &decodeHTMLTask($file); } # end foreach close (LOGFILE); ################################################################################# # SUB ROUTINES ################################################################################# ################################################################################# # COMMENTS: "decodeHTMLTask" #--------------------------- # # INPUT : FILENAME (TEXT STRING) EX. something.html # FUNCTION: PARSE THE FILES DOCUMENT TREE STRUCTURE AND GRAB THE INFORMATION # WHICH IS NEEDED. BUILD THE SQL INSERT STATEMENT BASED ON THIS # INFORMATION. # OUTPUT : SQL INSERT STATEMENT ################################################################################# sub decodeHTMLTask{ my ($file) = @_; my $tree = HTML::TreeBuilder->new; $tree->parse_file("${pathHtmlBackup}${file}.clean.nobr.html"); #$tree->dump; # Uncomment this line for document tree dump print LOGFILE ("Processing : ${pathHtmlBackup}${file}.clean.nobr.html...\n"); print ("Processing : ${pathHtmlBackup}${file}...\n"); # Get title from html document. my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[1]->content->[1]; my $taskTitle = $tref->as_text; # Get stage (På scenen) from html document my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[2]->content->[0]; my $onStage = $tref->as_text; # Since we know that stage has to be either onStage or offStage we already know # which one it is. # Get priority (Need) from html document my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[2]->content->[2]; my $need = $tref->as_text; # Since we know that need has to be either need or nice we already know # which one it is. # Determinig what priority id we need if ($onStage eq "x") { if ($need eq "x") { $priorityId = 1; } else { $priorityId = 2; } } else { if ($need eq "x") { $priorityId = 3; } else { $priorityId = 4; } } # Get (Hvorfor) from html document my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[4]->content->[1]; my $hvorfor = $tref->as_text; # Get (Hvordan) from html document my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[6]->content->[0]; my $hvordan = $tref->as_HTML; # We need to remove a start/tags td (tabledata) fomr the hvordan data # to be able to use the data in to input template $hvordan =~ s/<td colspan=11 valign="top">//ig; $hvordan =~ s/<\/td>//ig; # Get (Links) from html document my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[8]->content->[0]->content->[0]; my $links = $tref->as_HTML; # Clear memory $tree->delete; $sql = "select task_id_seq.nextval from dual"; $sth = $dbh->prepare($sql) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";; $sth->execute() || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n"; $sth->bind_col( 1, \$task_id ); $sth->fetch(); # Building the task description text my $HTMLTask = "<br><TABLE cellSpacing=0 cellPadding=0 border=1><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Hvorfor</FONT></H5><P><FONT face=Verdana>$hvorfor</FONT></P><P> </P></TD></TR><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Hvordan</FONT></H5><P><FONT face=Verdana>$hvordan</FONT></P><P> </P></TD></TR><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Links</FONT></H5><P><FONT face=Verdana>$links</FONT></P><P> </P></TD></TR></TABLE><br>"; $HTMLTask =~ s/\n/ /g; # SQL statement needed to inser task into database my $sqlInsertTask = " INSERT INTO TASK ( task_id, gro_id, name, task_flag, text, priority_id, user_created, date_created, user_changed, date_changed) VALUES ( :task_id, task_admin_pack.getPrimgroid(), :taskTitle, 'T', :HTMLTask, :priorityId, 'Task massload script', sysdate, 'Task massload script', sysdate) "; $sth = $dbh->prepare($sqlInsertTask) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";; $sth->bind_param(":task_id", $task_id); $sth->bind_param(":taskTitle", $taskTitle); $sth->bind_param(":HTMLTask", $HTMLTask); $sth->bind_param(":priorityId", $priorityId); $sth->execute() || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n"; print LOGFILE ("Execute SQL : INSERT INTO TASK (task_id, gro_id, name, task_flag, text, priority_id, user_created, date_created, user_changed, date_changed) VALUES ($task_id, TASK_admin_pack.getPrimgroid(), '$taskTitle', 'T', '$HTMLTask', '$priorityId', 'Task massload script', sysdate, 'Task massload script', sysdate)\n;"); # Generate insert statement needed to insert into all search my $sqlInsertAllSearch = " INSERT INTO all_search ( as_id, table_id, all_searchfield, com_id, gro_id, pro_id, table_name, id_field_name, owner_field_name, access_table_name) values ( as_seq.nextval, :task_id, :v_temptext, :v_componentID, NULL, NULL, NULL, NULL, NULL, NULL) "; $sth = $dbh->prepare($sqlInsertAllSearch) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";; $sth->bind_param(":task_id", $task_id); $sth->bind_param(":v_temptext", $taskTitle . " " . $HTMLTask); $sth->bind_param(":v_componentID", 42); $sth->execute() || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n"; print LOGFILE ("Execute SQL : INSERT INTO all_search (as_id, table_id, all_searchfield, com_id, gro_id, pro_id, table_name, id_field_name, owner_field_name, access_table_name) VALUES (as_seq.nextval, $p_taskid, '$taskTitle $HTMLTask', $v_componentID, $v_groid, NULL, NULL, NULL, NULL, NULL);\n"); }
This is the configuration settings use together with Html Tidy for windows
word-2000: yes show-body-only: yes clean: yes char-encoding: raw quiet: yes show-errors: 0 show-warnings: no drop-empty-paras: no indent-spaces: 3 new-inline-tags: o:p doctype: omit tidy-mark: no quote-ampersand: no write-back: yes force-output: yes