The following script is used to import 600 Word documents into a database as cleaned html.
#!c:\oracle\ora81/Apache/perl/5.00503/bin/MSWin32-x86/perl.exe
#################################################################################
# Requirements:
# ------------
#
# - Perl 5.00503 or later installed and working on your machine
# - change the first line in this script to point to the perl executable
#
#
#
# Modules:
# --------
#
# - you will need (HTML::Parser, HTML::Treebuilder )
# - install instrucitons
# win32: (you need Visual Studio to install manually. If you are using ActiveState
# Perl then use the Perl Package Manager they provide else see below)
# 1. go to www.cpan.org browse the modules and find the above mentioned
# modules.
# 2. download
# 3. Unzip/Untar
# 4. Open a command prompt in the HTML::Parser unziped directory
# 5. type> perl makefile.pl
# 6. type> nmake
# 7. type> nmake test
# 8. type> nmake install
# 9. do the same for the HTML::Treebuilder directory
#
#
#
#
# Description:
# ------------
#
# This script was created to import 600 word documents as html into a Oracle database.
#
# This script is step 2 in an migration process including an vbs (Visual Basic Script)
#
# Step 1: Convert all documents from MS Word to HTML
#
# Option Explicit
#
# Dim wordObj, fsObj
#
# Set wordObj = WScript.CreateObject("Word.Application")
# Set fsObj = WScript.CreateObject("Scripting.FileSystemObject")
#
# ' SubRoutine call convert all files in folder to HTML
# WordToHTML fsObj.GetFolder("..")
#
# wordObj.Quit
#
# Set fsObj = Nothing
# Set wordObj = Nothing
#
#
#
# '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# ' Subroutines
# '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Sub WordToHTML(Folder)
# On Error Resume Next
# Dim File
# For Each File In Folder.Files
# Set opgaveDoc = wordObj.Documents.Open (File.Path)
# wordobj.ActiveDocument.SaveAs replace(replace(replace(Replace(file.path, " ", "_"),"å","aa"),"æ","ae"),"ø","oe") & ".htm", 8
# wordobj.ActiveDocument.close
# Next
# End Sub
#
#
#
# Step 2: this script
# - clean the MS HTML using HMTLTidy
# - locate the data in the cleaned html document
# - clean data of unneccesary formating
# - insert the found data in the wanted format (HTML template)
# - generate SQL statement to insert task into database
# - execute SQL statement
# - log the process
#
#################################################################################
#################################################################################
# INCLUDE MODULES
#################################################################################
#use Carp;
#use diagnostics;
use DBI;
use HTML::TreeBuilder;
#################################################################################
# VARABLE DECLARATION
#################################################################################
my $user = "USERNAME"; # database username
my $passwd = "PASSWORD"; # database password
my $driver = "Oracle"; # DBD (DataBase Driver)
my $db_name = "DATABASENAME; # database server instance name (tnsname)
my $pathHtmlBackup = ".\\temp\\"; # where to put the original backup file
my $pathLogFile = ".\\"; # where to put the processing log file
my $pathDocuments = ".\\doc\\"; # path to where the word documents are located
my $doDatabaseUpdate = 1; # used during development 0=off, 1=on
my $DBITraceEnabled = 0; # If you want DBI tracing: 1=on, 0=off
my $DBITraceLevel = 2; # DBI tracing level (1-9). 1 or 2 should be sufficient
my $DBITraceFilePath = "c:\\trace.log"; # Path to where you want the trace file. On win32 remember to escape the \ with
# another \ so "c:\trace.log" would become "c:\\trace.log"
#################################################################################
# SUB ROUTINE DECLARATION
#################################################################################
sub decodeHTMLTask;
#################################################################################
# MAIN SCRIPT FLOW
#################################################################################
print "Connecting to database. Please wait ...\n";
my $dataSource = "dbi:$driver:${db_name}";
my $dbh = DBI->connect($dataSource, $user, $passwd) || die "Error connecting to database: $DBI::errstr\n";
if ($DBITraceEnabled) {$dbh->trace($DBITraceLevel, $DBITraceFilePath);}
# Make sure buffer is big enough for CLOB, BLOB
$dbh->{LongReadLen}=500 * 1024; # 500KB
# Getting all the html docs from $pathDocuments
@htmlFileList = `dir /b *.htm`;
chop @htmlFileList;
open (LOGFILE, ">migration.log.txt") || die ("Could not open log file");
# Process each an every file with a html extension found in the current directory
foreach $file (@htmlFileList) {
# Backup the converted word document
$cmd = `copy $file ${pathHtmlBackup}${file}.org.html`;
print LOGFILE ("Backup file : copy $file ${pathHtmlBackup}${file}.org.html\n");
# Run the html file through HTMLTidy which cleans out the html
$cmd = `TidyDbg.exe -config clean.cfg $file`;
print LOGFILE ("Clean file : TidyDbg.exe -config clean.cfg $file\n");
# Move the cleaned html file
$cmd = `move $file ${pathHtmlBackup}${file}.clean.html`;
print LOGFILE ("Move file : move $file ${pathHtmlBackup}${file}.clean.html\n");
open (CLEANHTML, "${pathHtmlBackup}${file}.clean.html") || die ("Cannot open file ${pathHtmlBackup}${file}.clean.html\n");
open (CLEANNOBRHTML, ">${pathHtmlBackup}${file}.clean.nobr.html") || die ("Cannot open file ${pathHtmlBackup}${file}.clean.nobr.html\n");
# Removing all newline characters (editlive does not like them)
# Read the file assocsiated with filehandle CLEANHTML into array @lines
@lines = <CLEANHTML>;
# Chop of the last character in every item in array
#chop (@lines);
# Join the array to a single string
$line = join ("",@lines);
$line =~ s/\n/ /g;
print CLEANNOBRHTML $line;
close (CLEANNOBRHTML);
close (CLEANHTML);
# Build insert statement based on information found in the html file
&decodeHTMLTask($file);
} # end foreach
close (LOGFILE);
#################################################################################
# SUB ROUTINES
#################################################################################
#################################################################################
# COMMENTS: "decodeHTMLTask"
#---------------------------
#
# INPUT : FILENAME (TEXT STRING) EX. something.html
# FUNCTION: PARSE THE FILES DOCUMENT TREE STRUCTURE AND GRAB THE INFORMATION
# WHICH IS NEEDED. BUILD THE SQL INSERT STATEMENT BASED ON THIS
# INFORMATION.
# OUTPUT : SQL INSERT STATEMENT
#################################################################################
sub decodeHTMLTask{
my ($file) = @_;
my $tree = HTML::TreeBuilder->new;
$tree->parse_file("${pathHtmlBackup}${file}.clean.nobr.html");
#$tree->dump; # Uncomment this line for document tree dump
print LOGFILE ("Processing : ${pathHtmlBackup}${file}.clean.nobr.html...\n");
print ("Processing : ${pathHtmlBackup}${file}...\n");
# Get title from html document.
my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[1]->content->[1];
my $taskTitle = $tref->as_text;
# Get stage (PÃ¥ scenen) from html document
my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[2]->content->[0];
my $onStage = $tref->as_text;
# Since we know that stage has to be either onStage or offStage we already know
# which one it is.
# Get priority (Need) from html document
my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[2]->content->[2];
my $need = $tref->as_text;
# Since we know that need has to be either need or nice we already know
# which one it is.
# Determinig what priority id we need
if ($onStage eq "x") {
if ($need eq "x") {
$priorityId = 1;
}
else {
$priorityId = 2;
}
}
else {
if ($need eq "x") {
$priorityId = 3;
}
else {
$priorityId = 4;
}
}
# Get (Hvorfor) from html document
my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[4]->content->[1];
my $hvorfor = $tref->as_text;
# Get (Hvordan) from html document
my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[6]->content->[0];
my $hvordan = $tref->as_HTML;
# We need to remove a start/tags td (tabledata) fomr the hvordan data
# to be able to use the data in to input template
$hvordan =~ s/<td colspan=11 valign="top">//ig;
$hvordan =~ s/<\/td>//ig;
# Get (Links) from html document
my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[8]->content->[0]->content->[0];
my $links = $tref->as_HTML;
# Clear memory
$tree->delete;
$sql = "select task_id_seq.nextval from dual";
$sth = $dbh->prepare($sql) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";;
$sth->execute() || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n";
$sth->bind_col( 1, \$task_id );
$sth->fetch();
# Building the task description text
my $HTMLTask = "<br><TABLE cellSpacing=0 cellPadding=0 border=1><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Hvorfor</FONT></H5><P><FONT face=Verdana>$hvorfor</FONT></P><P> </P></TD></TR><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Hvordan</FONT></H5><P><FONT face=Verdana>$hvordan</FONT></P><P> </P></TD></TR><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Links</FONT></H5><P><FONT face=Verdana>$links</FONT></P><P> </P></TD></TR></TABLE><br>";
$HTMLTask =~ s/\n/ /g;
# SQL statement needed to inser task into database
my $sqlInsertTask = "
INSERT INTO TASK (
task_id,
gro_id,
name,
task_flag,
text,
priority_id,
user_created,
date_created,
user_changed,
date_changed)
VALUES (
:task_id,
task_admin_pack.getPrimgroid(),
:taskTitle,
'T',
:HTMLTask,
:priorityId,
'Task massload script',
sysdate,
'Task massload script',
sysdate)
";
$sth = $dbh->prepare($sqlInsertTask) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";;
$sth->bind_param(":task_id", $task_id);
$sth->bind_param(":taskTitle", $taskTitle);
$sth->bind_param(":HTMLTask", $HTMLTask);
$sth->bind_param(":priorityId", $priorityId);
$sth->execute() || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n";
print LOGFILE ("Execute SQL : INSERT INTO TASK (task_id, gro_id, name, task_flag, text, priority_id, user_created, date_created, user_changed, date_changed) VALUES ($task_id, TASK_admin_pack.getPrimgroid(), '$taskTitle', 'T', '$HTMLTask', '$priorityId', 'Task massload script', sysdate, 'Task massload script', sysdate)\n;");
# Generate insert statement needed to insert into all search
my $sqlInsertAllSearch = "
INSERT INTO all_search (
as_id,
table_id,
all_searchfield,
com_id,
gro_id,
pro_id,
table_name,
id_field_name,
owner_field_name,
access_table_name)
values (
as_seq.nextval,
:task_id,
:v_temptext,
:v_componentID,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL)
";
$sth = $dbh->prepare($sqlInsertAllSearch) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";;
$sth->bind_param(":task_id", $task_id);
$sth->bind_param(":v_temptext", $taskTitle . " " . $HTMLTask);
$sth->bind_param(":v_componentID", 42);
$sth->execute() || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n";
print LOGFILE ("Execute SQL : INSERT INTO all_search (as_id, table_id, all_searchfield, com_id, gro_id, pro_id, table_name, id_field_name, owner_field_name, access_table_name) VALUES (as_seq.nextval, $p_taskid, '$taskTitle $HTMLTask', $v_componentID, $v_groid, NULL, NULL, NULL, NULL, NULL);\n");
}
This is the configuration settings use together with Html Tidy for windows
word-2000: yes show-body-only: yes clean: yes char-encoding: raw quiet: yes show-errors: 0 show-warnings: no drop-empty-paras: no indent-spaces: 3 new-inline-tags: o:p doctype: omit tidy-mark: no quote-ampersand: no write-back: yes force-output: yes
No comments:
Post a Comment