Tuesday, August 12, 2003

Importing and converting 600 word documents into a database as clean html

The following script is used to import 600 Word documents into a database as cleaned html.

#!c:\oracle\ora81/Apache/perl/5.00503/bin/MSWin32-x86/perl.exe

#################################################################################
# Requirements:
# ------------
#
# - Perl 5.00503 or later installed and working on your machine
# - change the first line in this script to point to the perl executable
#
#
#
# Modules:
# --------
#
# - you will need (HTML::Parser, HTML::Treebuilder )
# - install instrucitons
#      win32: (you need Visual Studio to install manually. If you are using ActiveState
#             Perl then use the Perl Package Manager they provide else see below)
#          1. go to www.cpan.org browse the modules and find the above mentioned
#             modules.
#          2. download
#          3. Unzip/Untar
#          4. Open a command prompt in the HTML::Parser unziped directory
#          5. type> perl makefile.pl
#          6. type> nmake
#          7. type> nmake test
#          8. type> nmake install
#          9. do the same for the HTML::Treebuilder directory
#
#
#
#
# Description:
# ------------
#
# This script was created to import 600 word documents as html into a Oracle database.
#
# This script is step 2 in an migration process including an vbs (Visual Basic Script)
#
# Step 1: Convert all documents from MS Word to HTML
#
#           Option Explicit
#
#           Dim wordObj, fsObj
#
#           Set wordObj = WScript.CreateObject("Word.Application")
#           Set fsObj = WScript.CreateObject("Scripting.FileSystemObject")
#
#           ' SubRoutine call convert all files in folder to HTML
#           WordToHTML fsObj.GetFolder("..")
#
#           wordObj.Quit
#
#           Set fsObj = Nothing
#           Set wordObj = Nothing
#
#
#
#           '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
#           ' Subroutines
#           '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
#           Sub WordToHTML(Folder)
#              On Error Resume Next
#              Dim File
#              For Each File In Folder.Files
#                 Set opgaveDoc = wordObj.Documents.Open (File.Path)
#                 wordobj.ActiveDocument.SaveAs replace(replace(replace(Replace(file.path, " ", "_"),"å","aa"),"æ","ae"),"ø","oe") & ".htm", 8
#                 wordobj.ActiveDocument.close
#              Next
#           End Sub
#
#
#
# Step 2: this script
#         - clean the MS HTML using HMTLTidy
#         - locate the data in the cleaned html document
#         - clean data of unneccesary formating
#         - insert the found data in the wanted format (HTML template)
#         - generate SQL statement to insert task into database
#         - execute SQL statement
#         - log the process
#
#################################################################################


#################################################################################
# INCLUDE MODULES
#################################################################################

#use Carp;
#use diagnostics;
use DBI;
use HTML::TreeBuilder;



#################################################################################
# VARABLE DECLARATION
#################################################################################

my $user    = "USERNAME";                    # database username
my $passwd  = "PASSWORD";                    # database password
my $driver  = "Oracle";                   # DBD (DataBase Driver)
my $db_name = "DATABASENAME;                    # database server instance name (tnsname)

my $pathHtmlBackup = ".\\temp\\";         # where to put the original backup file
my $pathLogFile = ".\\";                  # where to put the processing log file
my $pathDocuments = ".\\doc\\";           # path to where the word documents are located

my $doDatabaseUpdate = 1;                 # used during development 0=off, 1=on

my $DBITraceEnabled = 0;                  # If you want DBI tracing: 1=on, 0=off
my $DBITraceLevel = 2;                    # DBI tracing level (1-9). 1 or 2 should be sufficient
my $DBITraceFilePath = "c:\\trace.log";   # Path to where you want the trace file. On win32 remember to escape the \ with
                                          # another \ so "c:\trace.log" would become "c:\\trace.log"



#################################################################################
# SUB ROUTINE DECLARATION
#################################################################################

sub decodeHTMLTask;



#################################################################################
# MAIN SCRIPT FLOW
#################################################################################

print "Connecting to database. Please wait ...\n";

my $dataSource = "dbi:$driver:${db_name}";
my $dbh = DBI->connect($dataSource, $user, $passwd) || die "Error connecting to database: $DBI::errstr\n";
if ($DBITraceEnabled) {$dbh->trace($DBITraceLevel, $DBITraceFilePath);}

# Make sure buffer is big enough for CLOB, BLOB
$dbh->{LongReadLen}=500 * 1024;  # 500KB

# Getting all the html docs from $pathDocuments
@htmlFileList = `dir /b *.htm`;
chop @htmlFileList;

open (LOGFILE, ">migration.log.txt") || die ("Could not open log file");

# Process each an every file with a html extension found in the current directory
foreach $file (@htmlFileList) {

   # Backup the converted word document
   $cmd = `copy $file ${pathHtmlBackup}${file}.org.html`;
   print LOGFILE ("Backup file : copy $file ${pathHtmlBackup}${file}.org.html\n");

   # Run the html file through HTMLTidy which cleans out the html
   $cmd = `TidyDbg.exe -config clean.cfg $file`;
   print LOGFILE ("Clean file  : TidyDbg.exe -config clean.cfg $file\n");

   # Move the cleaned html file
   $cmd = `move $file ${pathHtmlBackup}${file}.clean.html`;
   print LOGFILE ("Move file   : move $file ${pathHtmlBackup}${file}.clean.html\n");

   open (CLEANHTML, "${pathHtmlBackup}${file}.clean.html") || die ("Cannot open file ${pathHtmlBackup}${file}.clean.html\n");
   open (CLEANNOBRHTML, ">${pathHtmlBackup}${file}.clean.nobr.html") || die ("Cannot open file ${pathHtmlBackup}${file}.clean.nobr.html\n");

   # Removing all newline characters (editlive does not like them)

   # Read the file assocsiated with filehandle CLEANHTML into array @lines
   @lines = <CLEANHTML>;

   # Chop of the last character in every item in array
   #chop (@lines);

   # Join the array to a single string
   $line = join ("",@lines);
   $line =~ s/\n/ /g;

   print CLEANNOBRHTML $line;

   close (CLEANNOBRHTML);
   close (CLEANHTML);

   # Build insert statement based on information found in the html file
   &decodeHTMLTask($file);
} # end foreach

close (LOGFILE);


#################################################################################
# SUB ROUTINES
#################################################################################



#################################################################################
# COMMENTS: "decodeHTMLTask"
#---------------------------
#
# INPUT  : FILENAME (TEXT STRING) EX. something.html
# FUNCTION: PARSE THE FILES DOCUMENT TREE STRUCTURE AND GRAB THE INFORMATION
#           WHICH IS NEEDED. BUILD THE SQL INSERT STATEMENT BASED ON THIS
#           INFORMATION.
# OUTPUT  : SQL INSERT STATEMENT
#################################################################################
sub decodeHTMLTask{
   my ($file) = @_;

   my $tree = HTML::TreeBuilder->new;
   $tree->parse_file("${pathHtmlBackup}${file}.clean.nobr.html");

   #$tree->dump;           # Uncomment this line for document tree dump
   print LOGFILE ("Processing  : ${pathHtmlBackup}${file}.clean.nobr.html...\n");
   print ("Processing  : ${pathHtmlBackup}${file}...\n");

   # Get title from html document.
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[1]->content->[1];
   my $taskTitle = $tref->as_text;

   # Get stage (PÃ¥ scenen) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[2]->content->[0];
   my $onStage = $tref->as_text;
   # Since we know that stage has to be either onStage or offStage we already know
   # which one it is.

   # Get priority (Need) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[2]->content->[2];
   my $need = $tref->as_text;
   # Since we know that need has to be either need or nice we already know
   # which one it is.

   # Determinig what priority id we need
   if ($onStage eq "x") {
      if ($need eq "x") {
         $priorityId = 1;
      }
      else {
         $priorityId = 2;
      }
   }
   else {
      if ($need eq "x") {
         $priorityId = 3;
      }
      else {
         $priorityId = 4;
      }
   }

   # Get (Hvorfor) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[4]->content->[1];
   my $hvorfor = $tref->as_text;

   # Get (Hvordan) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[6]->content->[0];
   my $hvordan = $tref->as_HTML;

   # We need to remove a start/tags td (tabledata) fomr the hvordan data
   # to be able to use the data in to input template
   $hvordan =~ s/<td colspan=11 valign="top">//ig;
   $hvordan =~ s/<\/td>//ig;

   # Get (Links) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[8]->content->[0]->content->[0];
   my $links = $tref->as_HTML;

   # Clear memory
   $tree->delete;

   $sql = "select task_id_seq.nextval from dual";
   $sth = $dbh->prepare($sql) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";;
   $sth->execute()  || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n";
   $sth->bind_col( 1, \$task_id );
   $sth->fetch();

   # Building the task description text
   my $HTMLTask = "<br><TABLE cellSpacing=0 cellPadding=0 border=1><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Hvorfor</FONT></H5><P><FONT face=Verdana>$hvorfor</FONT></P><P>&nbsp;</P></TD></TR><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Hvordan</FONT></H5><P><FONT face=Verdana>$hvordan</FONT></P><P>&nbsp;</P></TD></TR><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Links</FONT></H5><P><FONT face=Verdana>$links</FONT></P><P>&nbsp;</P></TD></TR></TABLE><br>";
   $HTMLTask =~ s/\n/ /g;

   # SQL statement needed to inser task into database
   my $sqlInsertTask = "
      INSERT INTO TASK (
         task_id,
         gro_id,
         name,
         task_flag,
         text,
         priority_id,
         user_created,
         date_created,
         user_changed,
         date_changed)
      VALUES (
         :task_id,
         task_admin_pack.getPrimgroid(),
         :taskTitle,
         'T',
         :HTMLTask,
         :priorityId,
         'Task massload script',
         sysdate,
         'Task massload script',
         sysdate)
   ";

   $sth = $dbh->prepare($sqlInsertTask) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";;
   $sth->bind_param(":task_id", $task_id);
   $sth->bind_param(":taskTitle", $taskTitle);
   $sth->bind_param(":HTMLTask", $HTMLTask);
   $sth->bind_param(":priorityId", $priorityId);
   $sth->execute()  || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n";

   print LOGFILE ("Execute SQL : INSERT INTO TASK (task_id, gro_id, name, task_flag, text, priority_id, user_created, date_created, user_changed, date_changed) VALUES ($task_id, TASK_admin_pack.getPrimgroid(), '$taskTitle', 'T', '$HTMLTask', '$priorityId', 'Task massload script', sysdate, 'Task massload script', sysdate)\n;");

   # Generate insert statement needed to insert into all search
   my $sqlInsertAllSearch = "
      INSERT INTO all_search (
         as_id,
         table_id,
         all_searchfield,
         com_id,
         gro_id,
         pro_id,
         table_name,
         id_field_name,
         owner_field_name,
         access_table_name)
      values (
         as_seq.nextval,
         :task_id,
         :v_temptext,
         :v_componentID,
         NULL,
         NULL,
         NULL,
         NULL,
         NULL,
         NULL)
   ";

   $sth = $dbh->prepare($sqlInsertAllSearch) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";;
 $sth->bind_param(":task_id", $task_id);
 $sth->bind_param(":v_temptext", $taskTitle . " " . $HTMLTask);
 $sth->bind_param(":v_componentID", 42);
   $sth->execute()  || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n";

   print LOGFILE ("Execute SQL : INSERT INTO all_search (as_id, table_id, all_searchfield, com_id, gro_id, pro_id, table_name, id_field_name, owner_field_name, access_table_name) VALUES (as_seq.nextval, $p_taskid,  '$taskTitle $HTMLTask', $v_componentID, $v_groid, NULL, NULL, NULL, NULL, NULL);\n");

}

This is the configuration settings use together with Html Tidy for windows

word-2000: yes
show-body-only: yes
clean: yes
char-encoding: raw
quiet: yes
show-errors: 0
show-warnings: no
drop-empty-paras: no
indent-spaces: 3
new-inline-tags: o:p
doctype: omit
tidy-mark: no
quote-ampersand: no
write-back: yes
force-output: yes