Tuesday, August 12, 2003

Importing and converting 600 word documents into a database as clean html

The following script is used to import 600 Word documents into a database as cleaned html.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
#!c:\oracle\ora81/Apache/perl/5.00503/bin/MSWin32-x86/perl.exe
 
#################################################################################
# Requirements:
# ------------
#
# - Perl 5.00503 or later installed and working on your machine
# - change the first line in this script to point to the perl executable
#
#
#
# Modules:
# --------
#
# - you will need (HTML::Parser, HTML::Treebuilder )
# - install instrucitons
#      win32: (you need Visual Studio to install manually. If you are using ActiveState
#             Perl then use the Perl Package Manager they provide else see below)
#          1. go to www.cpan.org browse the modules and find the above mentioned
#             modules.
#          2. download
#          3. Unzip/Untar
#          4. Open a command prompt in the HTML::Parser unziped directory
#          5. type> perl makefile.pl
#          6. type> nmake
#          7. type> nmake test
#          8. type> nmake install
#          9. do the same for the HTML::Treebuilder directory
#
#
#
#
# Description:
# ------------
#
# This script was created to import 600 word documents as html into a Oracle database.
#
# This script is step 2 in an migration process including an vbs (Visual Basic Script)
#
# Step 1: Convert all documents from MS Word to HTML
#
#           Option Explicit
#
#           Dim wordObj, fsObj
#
#           Set wordObj = WScript.CreateObject("Word.Application")
#           Set fsObj = WScript.CreateObject("Scripting.FileSystemObject")
#
#           ' SubRoutine call convert all files in folder to HTML
#           WordToHTML fsObj.GetFolder("..")
#
#           wordObj.Quit
#
#           Set fsObj = Nothing
#           Set wordObj = Nothing
#
#
#
#           '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
#           ' Subroutines
#           '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
#           Sub WordToHTML(Folder)
#              On Error Resume Next
#              Dim File
#              For Each File In Folder.Files
#                 Set opgaveDoc = wordObj.Documents.Open (File.Path)
#                 wordobj.ActiveDocument.SaveAs replace(replace(replace(Replace(file.path, " ", "_"),"Ã¥","aa"),"æ","ae"),"ø","oe") & ".htm", 8
#                 wordobj.ActiveDocument.close
#              Next
#           End Sub
#
#
#
# Step 2: this script
#         - clean the MS HTML using HMTLTidy
#         - locate the data in the cleaned html document
#         - clean data of unneccesary formating
#         - insert the found data in the wanted format (HTML template)
#         - generate SQL statement to insert task into database
#         - execute SQL statement
#         - log the process
#
#################################################################################
 
 
#################################################################################
# INCLUDE MODULES
#################################################################################
 
#use Carp;
#use diagnostics;
use DBI;
use HTML::TreeBuilder;
 
 
 
#################################################################################
# VARABLE DECLARATION
#################################################################################
 
my $user    = "USERNAME";                    # database username
my $passwd  = "PASSWORD";                    # database password
my $driver  = "Oracle";                   # DBD (DataBase Driver)
my $db_name = "DATABASENAME;                    # database server instance name (tnsname)
 
my $pathHtmlBackup = ".\\temp\\";         # where to put the original backup file
my $pathLogFile = ".\\";                  # where to put the processing log file
my $pathDocuments = ".\\doc\\";           # path to where the word documents are located
 
my $doDatabaseUpdate = 1;                 # used during development 0=off, 1=on
 
my $DBITraceEnabled = 0;                  # If you want DBI tracing: 1=on, 0=off
my $DBITraceLevel = 2;                    # DBI tracing level (1-9). 1 or 2 should be sufficient
my $DBITraceFilePath = "c:\\trace.log";   # Path to where you want the trace file. On win32 remember to escape the \ with
                                          # another \ so "c:\trace.log" would become "c:\\trace.log"
 
 
 
#################################################################################
# SUB ROUTINE DECLARATION
#################################################################################
 
sub decodeHTMLTask;
 
 
 
#################################################################################
# MAIN SCRIPT FLOW
#################################################################################
 
print "Connecting to database. Please wait ...\n";
 
my $dataSource = "dbi:$driver:${db_name}";
my $dbh = DBI->connect($dataSource, $user, $passwd) || die "Error connecting to database: $DBI::errstr\n";
if ($DBITraceEnabled) {$dbh->trace($DBITraceLevel, $DBITraceFilePath);}
 
# Make sure buffer is big enough for CLOB, BLOB
$dbh->{LongReadLen}=500 * 1024;  # 500KB
 
# Getting all the html docs from $pathDocuments
@htmlFileList = `dir /b *.htm`;
chop @htmlFileList;
 
open (LOGFILE, ">migration.log.txt") || die ("Could not open log file");
 
# Process each an every file with a html extension found in the current directory
foreach $file (@htmlFileList) {
 
   # Backup the converted word document
   $cmd = `copy $file ${pathHtmlBackup}${file}.org.html`;
   print LOGFILE ("Backup file : copy $file ${pathHtmlBackup}${file}.org.html\n");
 
   # Run the html file through HTMLTidy which cleans out the html
   $cmd = `TidyDbg.exe -config clean.cfg $file`;
   print LOGFILE ("Clean file  : TidyDbg.exe -config clean.cfg $file\n");
 
   # Move the cleaned html file
   $cmd = `move $file ${pathHtmlBackup}${file}.clean.html`;
   print LOGFILE ("Move file   : move $file ${pathHtmlBackup}${file}.clean.html\n");
 
   open (CLEANHTML, "${pathHtmlBackup}${file}.clean.html") || die ("Cannot open file ${pathHtmlBackup}${file}.clean.html\n");
   open (CLEANNOBRHTML, ">${pathHtmlBackup}${file}.clean.nobr.html") || die ("Cannot open file ${pathHtmlBackup}${file}.clean.nobr.html\n");
 
   # Removing all newline characters (editlive does not like them)
 
   # Read the file assocsiated with filehandle CLEANHTML into array @lines
   @lines = <CLEANHTML>;
 
   # Chop of the last character in every item in array
   #chop (@lines);
 
   # Join the array to a single string
   $line = join ("",@lines);
   $line =~ s/\n/ /g;
 
   print CLEANNOBRHTML $line;
 
   close (CLEANNOBRHTML);
   close (CLEANHTML);
 
   # Build insert statement based on information found in the html file
   &decodeHTMLTask($file);
} # end foreach
 
close (LOGFILE);
 
 
#################################################################################
# SUB ROUTINES
#################################################################################
 
 
 
#################################################################################
# COMMENTS: "decodeHTMLTask"
#---------------------------
#
# INPUT  : FILENAME (TEXT STRING) EX. something.html
# FUNCTION: PARSE THE FILES DOCUMENT TREE STRUCTURE AND GRAB THE INFORMATION
#           WHICH IS NEEDED. BUILD THE SQL INSERT STATEMENT BASED ON THIS
#           INFORMATION.
# OUTPUT  : SQL INSERT STATEMENT
#################################################################################
sub decodeHTMLTask{
   my ($file) = @_;
 
   my $tree = HTML::TreeBuilder->new;
   $tree->parse_file("${pathHtmlBackup}${file}.clean.nobr.html");
 
   #$tree->dump;           # Uncomment this line for document tree dump
   print LOGFILE ("Processing  : ${pathHtmlBackup}${file}.clean.nobr.html...\n");
   print ("Processing  : ${pathHtmlBackup}${file}...\n");
 
   # Get title from html document.
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[1]->content->[1];
   my $taskTitle = $tref->as_text;
 
   # Get stage (PÃ¥ scenen) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[2]->content->[0];
   my $onStage = $tref->as_text;
   # Since we know that stage has to be either onStage or offStage we already know
   # which one it is.
 
   # Get priority (Need) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[2]->content->[2];
   my $need = $tref->as_text;
   # Since we know that need has to be either need or nice we already know
   # which one it is.
 
   # Determinig what priority id we need
   if ($onStage eq "x") {
      if ($need eq "x") {
         $priorityId = 1;
      }
      else {
         $priorityId = 2;
      }
   }
   else {
      if ($need eq "x") {
         $priorityId = 3;
      }
      else {
         $priorityId = 4;
      }
   }
 
   # Get (Hvorfor) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[4]->content->[1];
   my $hvorfor = $tref->as_text;
 
   # Get (Hvordan) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[6]->content->[0];
   my $hvordan = $tref->as_HTML;
 
   # We need to remove a start/tags td (tabledata) fomr the hvordan data
   # to be able to use the data in to input template
   $hvordan =~ s/<td colspan=11 valign="top">//ig;
   $hvordan =~ s/<\/td>//ig;
 
   # Get (Links) from html document
   my $tref = $tree->content->[1]->content->[0]->content->[2]->content->[8]->content->[0]->content->[0];
   my $links = $tref->as_HTML;
 
   # Clear memory
   $tree->delete;
 
   $sql = "select task_id_seq.nextval from dual";
   $sth = $dbh->prepare($sql) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";;
   $sth->execute()  || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n";
   $sth->bind_col( 1, \$task_id );
   $sth->fetch();
 
   # Building the task description text
   my $HTMLTask = "
<TABLE cellSpacing=0 cellPadding=0 border=1><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Hvorfor</FONT></H5><P><FONT face=Verdana>$hvorfor</FONT></P><P>&nbsp;</P></TD></TR><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Hvordan</FONT></H5><P><FONT face=Verdana>$hvordan</FONT></P><P>&nbsp;</P></TD></TR><TR><TD vAlign=top width=571><H5><FONT face=Verdana>Links</FONT></H5><P><FONT face=Verdana>$links</FONT></P><P>&nbsp;</P></TD></TR></TABLE>
";
   $HTMLTask =~ s/\n/ /g;
 
   # SQL statement needed to inser task into database
   my $sqlInsertTask = "
      INSERT INTO TASK (
         task_id,
         gro_id,
         name,
         task_flag,
         text,
         priority_id,
         user_created,
         date_created,
         user_changed,
         date_changed)
      VALUES (
         :task_id,
         task_admin_pack.getPrimgroid(),
         :taskTitle,
         'T',
         :HTMLTask,
         :priorityId,
         'Task massload script',
         sysdate,
         'Task massload script',
         sysdate)
   ";
 
   $sth = $dbh->prepare($sqlInsertTask) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";;
   $sth->bind_param(":task_id", $task_id);
   $sth->bind_param(":taskTitle", $taskTitle);
   $sth->bind_param(":HTMLTask", $HTMLTask);
   $sth->bind_param(":priorityId", $priorityId);
   $sth->execute()  || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n";
 
   print LOGFILE ("Execute SQL : INSERT INTO TASK (task_id, gro_id, name, task_flag, text, priority_id, user_created, date_created, user_changed, date_changed) VALUES ($task_id, TASK_admin_pack.getPrimgroid(), '$taskTitle', 'T', '$HTMLTask', '$priorityId', 'Task massload script', sysdate, 'Task massload script', sysdate)\n;");
 
   # Generate insert statement needed to insert into all search
   my $sqlInsertAllSearch = "
      INSERT INTO all_search (
         as_id,
         table_id,
         all_searchfield,
         com_id,
         gro_id,
         pro_id,
         table_name,
         id_field_name,
         owner_field_name,
         access_table_name)
      values (
         as_seq.nextval,
         :task_id,
         :v_temptext,
         :v_componentID,
         NULL,
         NULL,
         NULL,
         NULL,
         NULL,
         NULL)
   ";
 
   $sth = $dbh->prepare($sqlInsertAllSearch) || die "\nPrepare error: $dbhI::err .... $dbhI::errstr\n";;
 $sth->bind_param(":task_id", $task_id);
 $sth->bind_param(":v_temptext", $taskTitle . " " . $HTMLTask);
 $sth->bind_param(":v_componentID", 42);
   $sth->execute()  || die "\nExecute error: $dbhI::err .... $dbhI::errstr\n";
 
   print LOGFILE ("Execute SQL : INSERT INTO all_search (as_id, table_id, all_searchfield, com_id, gro_id, pro_id, table_name, id_field_name, owner_field_name, access_table_name) VALUES (as_seq.nextval, $p_taskid,  '$taskTitle $HTMLTask', $v_componentID, $v_groid, NULL, NULL, NULL, NULL, NULL);\n");
 
}

This is the configuration settings use together with Html Tidy for windows

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
word-2000: yes
show-body-only: yes
clean: yes
char-encoding: raw
quiet: yes
show-errors: 0
show-warnings: no
drop-empty-paras: no
indent-spaces: 3
new-inline-tags: o:p
doctype: omit
tidy-mark: no
quote-ampersand: no
write-back: yes
force-output: yes