#!/usr/bin/env perl

# sxw2txt -- Converts OpenOffice.org Writer files to plain text.
# Copyright (C) 2004 Liam Morland
# Copyright (C) 2006 Vincent Lefevre
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Liam Morland
# 86A McDougall Road, Waterloo, Ontario, N2L 5C5, CANADA
#
# Changes by Vincent Lefevre.
# 2006-03-21: Better #! line, typos, error messages output to stderr,
#             better whitespace removal, clean execution of unzip (for
#             filenames with special characters).
# 2006-03-22: Use Archive::Zip, otherwise fallback to the unzip command.
# 2006-03-26: Conversions from Symbol aka "Standard Symbols L" characters
#             (in the private Unicode area U+F020..U+F0FF) into standard
#             Unicode characters. I don't know whether this private area
#             is used in a consistent way by OpenOffice, but IMHO, this
#             conversion is better than nothing.

use strict;
use warnings;

# Program name used as the prefix of every diagnostic; it is taken from the
# second field of the version-control Id string.
my ($proc) = '$Id: sxw2txt 39225 2010-09-07 21:05:10Z vinc17/prunille $'
  =~ /^.Id: (\S+) / or die;

# First argument is taken to be the input file. All other args are ignored.
my $input_file = shift;

# If we have a filename, try to get the content.xml from it,
# otherwise print usage information.
unless (defined $input_file) {
    # die() uses a non-zero $! as the exit status, so this exits with 1.
    $! = 1;
    # NOTE(review): usage wording reconstructed -- confirm against upstream.
    die <<"EOF";
Usage: $proc input-file
Converts an OpenOffice.org Writer file to plain text on standard output.
EOF
}

# Archive::Zip is optional: $zip stays undefined when the module cannot be
# loaded, in which case the external unzip command is used instead.
my $zip = eval {
    require Archive::Zip;
    Archive::Zip->new();
};

# Either way, the raw bytes of content.xml end up in $_.
if (defined $zip) {
    # read() returns a non-zero status on failure.
    $zip->read($input_file)
      and die "$proc: $input_file is unreadable or has an incorrect format\n";
    $_ = $zip->contents('content.xml')
      or die "$proc: can't extract content.xml from $input_file\n";
}
else {
    # Fork with the child's stdout connected to $unzip, then exec unzip
    # with an argument list so that no shell ever sees the filename.
    my $pid = open my $unzip, '-|';
    defined $pid or die "$proc: can't fork: $!\n";
    unless ($pid) {    # child
        # It's probably better to get the possible error message from unzip,
        # to know the reason of the failure (e.g. "Permission denied").
        #open STDERR, '/dev/null';
        exec 'unzip', '-p', $input_file, 'content.xml';
        die "$proc: exec unzip failed: $!\n";
    }
    $_ = do { local $/; <$unzip> };    # slurp the whole stream
    # close() on a pipe also reports a non-zero exit status of the child.
    close $unzip
      or die "$proc: can't extract content.xml from $input_file\n";
}

# Convert the OOo XML to text with a series of regex substitutions.
# NOTE(review): the element names below follow the OpenOffice.org XML file
# format (table:*, text:*) -- confirm against upstream.
s{\n+}{ }g;

# Tables are wrapped with [begin-table] and [end-table].
# Rows and cells begin with [table row] and [table cell] respectively.
s{<table:table( [^>]*)?>}{\n\n[begin-table]}g;
s{</table:table>}{\n[end-table]}g;
# The cell pattern also swallows any tags between the cell start and its
# first paragraph, so that the paragraph rule below does not add blank lines.
s{<table:table-cell( [^>]*)?>(<[^>]+>)*<text:p[^>]*>}{\n[table cell]}g;
s{<table:table-row( [^>]*)?>}{\n\n[table row]}g;

# OOo tabs are made into tab characters.
s{<text:tab-stop/>}{\t}g;

# Each list item is given a '*' as a bullet.
# Sorry, no fancy support for nested lists yet.
s{<text:list-item><text:p[^>]*>}{\n\n* }g;

# Skip two lines before each new paragraph.
s{<text:p[^>]*>}{\n\n}g;

# Get rid of any remaining tags. Want to add support for tags not
# handled above? Do it above this line.
s{<[^>]*>}{}g;

# Convert common entities into the appropriate character.
# &amp; must be decoded last so that e.g. "&amp;lt;" yields "&lt;", not "<".
s{&lt;}{<}g;
s{&gt;}{>}g;
s{&apos;}{'}g;
s{&quot;}{"}g;
s{&amp;}{&}g;

# Convert "Standard Symbols L" characters into standard Unicode
# characters. Note: I could have written
#   binmode UNZIP, ":utf8";
# just before reading UNZIP to use Unicode code points directly,
# but this makes the substitutions very slow.
# Lookup table for characters of the Symbol font stored in the private use
# area. $_ holds raw UTF-8 bytes here, so each such character appears as the
# lead byte 0xEF followed by two continuation bytes. Keys are those two
# continuation bytes; values are the UTF-8 bytes of the replacement.
my %symbol = (
    "\x{80}\x{A1}" => "!",
    "\x{80}\x{A2}" => "\x{E2}\x{88}\x{80}",
    "\x{80}\x{A3}" => "#",
    "\x{80}\x{A4}" => "\x{E2}\x{88}\x{83}",
    "\x{80}\x{A5}" => "%",
    "\x{80}\x{A6}" => "&",
    "\x{80}\x{A7}" => "\x{C9}\x{9C}",
    "\x{80}\x{A8}" => "(",
    "\x{80}\x{A9}" => ")",
    "\x{80}\x{AA}" => "*",
    "\x{80}\x{AB}" => "+",
    "\x{80}\x{AC}" => ",",
    "\x{80}\x{AD}" => "-",
    "\x{80}\x{AE}" => ".",
    "\x{80}\x{AF}" => "/",
    "\x{80}\x{B0}" => "0",
    "\x{80}\x{B1}" => "1",
    "\x{80}\x{B2}" => "2",
    "\x{80}\x{B3}" => "3",
    "\x{80}\x{B4}" => "4",
    "\x{80}\x{B5}" => "5",
    "\x{80}\x{B6}" => "6",
    "\x{80}\x{B7}" => "7",
    "\x{80}\x{B8}" => "8",
    "\x{80}\x{B9}" => "9",
    "\x{80}\x{BA}" => ":",
    "\x{80}\x{BB}" => ";",
    "\x{80}\x{BC}" => "<",
    "\x{80}\x{BD}" => "=",
    "\x{80}\x{BE}" => ">",
    "\x{80}\x{BF}" => "?",
    "\x{81}\x{80}" => "\x{E2}\x{89}\x{85}",
    "\x{81}\x{81}" => "\x{CE}\x{91}",
    "\x{81}\x{82}" => "\x{CE}\x{92}",
    "\x{81}\x{83}" => "\x{CE}\x{A7}",
    "\x{81}\x{84}" => "\x{CE}\x{94}",
    "\x{81}\x{85}" => "\x{CE}\x{95}",
    "\x{81}\x{86}" => "\x{CE}\x{A6}",
    "\x{81}\x{87}" => "\x{CE}\x{93}",
    "\x{81}\x{88}" => "\x{CE}\x{97}",
    "\x{81}\x{89}" => "\x{CE}\x{99}",
    "\x{81}\x{8A}" => "\x{CF}\x{91}",
    "\x{81}\x{8B}" => "\x{CE}\x{9A}",
    "\x{81}\x{8C}" => "\x{CE}\x{9B}",
    "\x{81}\x{8D}" => "\x{CE}\x{9C}",
    "\x{81}\x{8E}" => "\x{CE}\x{9D}",
    "\x{81}\x{8F}" => "\x{CE}\x{9F}",
    "\x{81}\x{90}" => "\x{CE}\x{A0}",
    "\x{81}\x{91}" => "\x{CE}\x{98}",
    "\x{81}\x{92}" => "\x{CE}\x{A1}",
    "\x{81}\x{93}" => "\x{CE}\x{A3}",
    "\x{81}\x{94}" => "\x{CE}\x{A4}",
    "\x{81}\x{95}" => "\x{CE}\x{A5}",
    "\x{81}\x{96}" => "\x{CF}\x{82}",
    "\x{81}\x{97}" => "\x{CE}\x{A9}",
    "\x{81}\x{98}" => "\x{CE}\x{9E}",
    "\x{81}\x{99}" => "\x{CE}\x{A8}",
    "\x{81}\x{9A}" => "\x{CE}\x{96}",
    "\x{81}\x{9B}" => "[",
    "\x{81}\x{9C}" => "\x{E2}\x{88}\x{B4}",
    "\x{81}\x{9D}" => "]",
    "\x{81}\x{9E}" => "\x{E2}\x{8A}\x{A5}",
    "\x{81}\x{9F}" => "_",
    "\x{81}\x{A0}" => "\x{C2}\x{AF}",
    "\x{81}\x{A1}" => "\x{CE}\x{B1}",
    "\x{81}\x{A2}" => "\x{CE}\x{B2}",
    "\x{81}\x{A3}" => "\x{CF}\x{87}",
    "\x{81}\x{A4}" => "\x{CE}\x{B4}",
    "\x{81}\x{A5}" => "\x{CE}\x{B5}",
    "\x{81}\x{A6}" => "\x{CF}\x{95}",
    "\x{81}\x{A7}" => "\x{CE}\x{B3}",
    "\x{81}\x{A8}" => "\x{CE}\x{B7}",
    "\x{81}\x{A9}" => "\x{CE}\x{B9}",
    "\x{81}\x{AA}" => "\x{CF}\x{86}",
    "\x{81}\x{AB}" => "\x{CE}\x{BA}",
    "\x{81}\x{AC}" => "\x{CE}\x{BB}",
    "\x{81}\x{AD}" => "\x{CE}\x{BC}",
    "\x{81}\x{AE}" => "\x{CE}\x{BD}",
    "\x{81}\x{AF}" => "\x{CE}\x{BF}",
    "\x{81}\x{B0}" => "\x{CF}\x{80}",
    "\x{81}\x{B1}" => "\x{CE}\x{B8}",
    "\x{81}\x{B2}" => "\x{CF}\x{81}",
    "\x{81}\x{B3}" => "\x{CF}\x{83}",
    "\x{81}\x{B4}" => "\x{CF}\x{84}",
    "\x{81}\x{B5}" => "\x{CF}\x{85}",
    "\x{81}\x{B6}" => "\x{CF}\x{96}",
    "\x{81}\x{B7}" => "\x{CF}\x{89}",
    "\x{81}\x{B8}" => "\x{CE}\x{BE}",
    "\x{81}\x{B9}" => "\x{CF}\x{88}",
    "\x{81}\x{BA}" => "\x{CE}\x{B6}",
    "\x{81}\x{BB}" => "{",
    "\x{81}\x{BC}" => "|",
    "\x{81}\x{BD}" => "}",
    "\x{81}\x{BE}" => "~",
    "\x{82}\x{A1}" => "\x{CF}\x{92}",
    "\x{82}\x{A2}" => "\x{E2}\x{80}\x{B2}",
    "\x{82}\x{A3}" => "\x{E2}\x{89}\x{A4}",
    "\x{82}\x{A4}" => "/",
    "\x{82}\x{A5}" => "\x{E2}\x{88}\x{9E}",
    "\x{82}\x{A6}" => "f",
    "\x{82}\x{A7}" => "\x{E2}\x{99}\x{A3}",
    "\x{82}\x{A8}" => "\x{E2}\x{99}\x{A6}",
    "\x{82}\x{A9}" => "\x{E2}\x{99}\x{A5}",
    "\x{82}\x{AA}" => "\x{E2}\x{99}\x{A0}",
    "\x{82}\x{AB}" => "\x{E2}\x{86}\x{94}",
    "\x{82}\x{AC}" => "\x{E2}\x{86}\x{90}",
    "\x{82}\x{AD}" => "\x{E2}\x{86}\x{91}",
    "\x{82}\x{AE}" => "\x{E2}\x{86}\x{92}",
    "\x{82}\x{AF}" => "\x{E2}\x{86}\x{93}",
    "\x{82}\x{B0}" => "\x{C2}\x{B0}",
    "\x{82}\x{B1}" => "\x{C2}\x{B1}",
    "\x{82}\x{B2}" => "\x{E2}\x{80}\x{B3}",
    "\x{82}\x{B3}" => "\x{E2}\x{89}\x{A5}",
    "\x{82}\x{B4}" => "\x{C3}\x{97}",
    "\x{82}\x{B5}" => "\x{E2}\x{88}\x{9D}",
    "\x{82}\x{B6}" => "\x{E2}\x{88}\x{82}",
    "\x{82}\x{B7}" => "\x{E2}\x{80}\x{A2}",
    "\x{82}\x{B8}" => "\x{C3}\x{B7}",
    "\x{82}\x{B9}" => "\x{E2}\x{89}\x{A0}",
    "\x{82}\x{BA}" => "\x{E2}\x{89}\x{A1}",
    "\x{82}\x{BB}" => "\x{E2}\x{89}\x{88}",
    "\x{82}\x{BC}" => "\x{E2}\x{80}\x{A6}",
    "\x{82}\x{BD}" => "|",
    "\x{82}\x{BE}" => "\x{E2}\x{80}\x{94}",
    "\x{82}\x{BF}" => "\x{E2}\x{86}\x{B5}",
    "\x{83}\x{80}" => "\x{E2}\x{84}\x{B5}",
    "\x{83}\x{81}" => "\x{E2}\x{84}\x{91}",
    "\x{83}\x{82}" => "\x{E2}\x{84}\x{9C}",
    "\x{83}\x{83}" => "\x{E2}\x{84}\x{98}",
    "\x{83}\x{84}" => "\x{E2}\x{8A}\x{97}",
    "\x{83}\x{85}" => "\x{E2}\x{8A}\x{95}",
    "\x{83}\x{86}" => "\x{E2}\x{88}\x{85}",
    "\x{83}\x{87}" => "\x{E2}\x{88}\x{A9}",
    "\x{83}\x{88}" => "\x{E2}\x{88}\x{AA}",
    "\x{83}\x{89}" => "\x{E2}\x{8A}\x{83}",
    "\x{83}\x{8A}" => "\x{E2}\x{8A}\x{87}",
    "\x{83}\x{8B}" => "\x{E2}\x{8A}\x{84}",
    "\x{83}\x{8C}" => "\x{E2}\x{8A}\x{82}",
    "\x{83}\x{8D}" => "\x{E2}\x{8A}\x{86}",
    "\x{83}\x{8E}" => "\x{E2}\x{88}\x{88}",
    "\x{83}\x{8F}" => "\x{E2}\x{88}\x{89}",
    "\x{83}\x{90}" => "\x{E2}\x{88}\x{A0}",
    "\x{83}\x{91}" => "\x{E2}\x{88}\x{87}",
    "\x{83}\x{92}" => "\x{C2}\x{AE}",
    "\x{83}\x{93}" => "\x{C2}\x{A9}",
    "\x{83}\x{94}" => "\x{E2}\x{84}\x{A2}",
    "\x{83}\x{95}" => "\x{E2}\x{88}\x{8F}",
    "\x{83}\x{96}" => "\x{E2}\x{88}\x{9A}",
    "\x{83}\x{97}" => "\x{E2}\x{88}\x{99}",
    "\x{83}\x{98}" => "\x{C2}\x{AC}",
    "\x{83}\x{99}" => "\x{E2}\x{88}\x{A7}",
    "\x{83}\x{9A}" => "\x{E2}\x{88}\x{A8}",
    "\x{83}\x{9B}" => "\x{E2}\x{87}\x{94}",
    "\x{83}\x{9C}" => "\x{E2}\x{87}\x{90}",
    "\x{83}\x{9D}" => "\x{E2}\x{87}\x{91}",
    "\x{83}\x{9E}" => "\x{E2}\x{87}\x{92}",
    "\x{83}\x{9F}" => "\x{E2}\x{87}\x{93}",
    "\x{83}\x{A0}" => "\x{E2}\x{8B}\x{84}",
    "\x{83}\x{A1}" => "\x{E3}\x{80}\x{88}",
    "\x{83}\x{A2}" => "\x{C2}\x{AE}",
    "\x{83}\x{A3}" => "\x{C2}\x{A9}",
    "\x{83}\x{A4}" => "\x{E2}\x{84}\x{A2}",
    "\x{83}\x{A5}" => "\x{E2}\x{88}\x{91}",
    "\x{83}\x{A6}" => "\x{E2}\x{8E}\x{9B}",
    "\x{83}\x{A7}" => "\x{E2}\x{8E}\x{9C}",
    "\x{83}\x{A8}" => "\x{E2}\x{8E}\x{9D}",
    "\x{83}\x{A9}" => "\x{E2}\x{8E}\x{A1}",
    "\x{83}\x{AA}" => "\x{E2}\x{8E}\x{A2}",
    "\x{83}\x{AB}" => "\x{E2}\x{8E}\x{A3}",
    "\x{83}\x{AC}" => "\x{E2}\x{8E}\x{A7}",
    "\x{83}\x{AD}" => "\x{E2}\x{8E}\x{A8}",
    "\x{83}\x{AE}" => "\x{E2}\x{8E}\x{A9}",
    "\x{83}\x{AF}" => "\x{E2}\x{8E}\x{AA}",
    "\x{83}\x{B1}" => "\x{E3}\x{80}\x{89}",
    "\x{83}\x{B2}" => "\x{E2}\x{88}\x{AB}",
    "\x{83}\x{B3}" => "\x{E2}\x{8C}\x{A0}",
    "\x{83}\x{B4}" => "\x{E2}\x{8E}\x{AE}",
    "\x{83}\x{B5}" => "\x{E2}\x{8C}\x{A1}",
    "\x{83}\x{B6}" => "\x{E2}\x{8E}\x{9E}",
    "\x{83}\x{B7}" => "\x{E2}\x{8E}\x{9F}",
    "\x{83}\x{B8}" => "\x{E2}\x{8E}\x{A0}",
    "\x{83}\x{B9}" => "\x{E2}\x{8E}\x{A4}",
    "\x{83}\x{BA}" => "\x{E2}\x{8E}\x{A5}",
    "\x{83}\x{BB}" => "\x{E2}\x{8E}\x{A6}",
    "\x{83}\x{BC}" => "\x{E2}\x{8E}\x{AB}",
    "\x{83}\x{BD}" => "\x{E2}\x{8E}\x{AC}",
    "\x{83}\x{BE}" => "\x{E2}\x{8E}\x{AD}",
);

# Replace every 3-byte sequence "0xEF + two continuation bytes" that has an
# entry in %symbol by its replacement, and return the resulting byte string.
# Sequences without an entry are returned unchanged: the 0xEF lead byte also
# starts every other character in U+F000..U+FFFF, and looking those up
# blindly would substitute an undefined value, i.e. delete the character.
sub symbol_to_unicode {
    my ($bytes) = @_;
    $bytes =~ s{(\x{EF}([\x{80}-\x{BF}]{2}))}
               {exists $symbol{$2} ? $symbol{$2} : $1}eg;
    return $bytes;
}

# Normalize whitespace and return the result: no blanks before a newline,
# at most one empty line in a row, nothing but text at either end.
sub tidy_whitespace {
    my ($text) = @_;
    # Trailing blanks go first; removing them can join separate groups of
    # newlines into a longer run, which the next substitution then collapses.
    $text =~ s/[^\S\n]*\n/\n/g;
    $text =~ s/\n{3,}/\n\n/g;
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;
    return $text;
}

$_ = symbol_to_unicode($_);

# Remove extra whitespace and print the result, always ending with \n.
$_ = tidy_whitespace($_);
print "$_\n";