aboutsummaryrefslogtreecommitdiffstats
path: root/_mutt/word2text.sh
blob: fe9682d1980864cd39681d8aaf1d7eeb810d1950 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/bin/sh
#
# word2text - convert MS Word files to ASCII text
#
# SYNOPSIS
#	word2text file
#
# DESCRIPTION
#	Word2text uses wvHtml, w3m and some perl glue to convert the MS
#	Word file specified by the argument to ASCII text on stdout.
#
#	wvHtml converts MS Word files to HTML, but is intended to be
#	used with a graphical browser such as Netscape Navigator, so it
#	converts certain graphical elements to image files and
#	corresponding <img> tags that the browser can render.  Since
#	this script uses a text based browser, it uses perl to eliminate
#	these <img> tags.
#
#	Compared to Quick View Plus (qvpview), the rendering of MS Word
#	documents done by word2text is usually more accurate.  Qvpview
#	doesn't render unrecognized characters well, if at all.  It also
#	renders numbered lists as bullet lists.
#
#	Compared to the plain text translations that some people include
#	in their e-mail along with the original MS Word attachments, the
#	rendering done by word2text is usually more readable:  vertical
#	spacing between paragraphs and list items is better and the
#	adjustment of text within paragraphs is better.
#
# BUGS
#	wvHtml occasionally dumps core.
#	With the wv-0.7.4 release, some unnumbered lists are rendered as
#	numbered.
#
# AUTHOR
#	Gary A. Johnson
#	<garyjohn@spk.agilent.com>
#
# REVISION HISTORY
#	2003-05-31
#		Between wv-0.5.42 and wv-0.7.4, wvHtml changed to insert
#		a <p> tag between each <li> tag and the following text,
#		causing the list-item text to start on the line
#		following the list-item bullet or number.  A perl
#		expression was added to this script to fix the problem.
#	2003-02-19
#		The command-line arguments to wvHtml changed, requiring
#		the output file name as well as the input file name.
#		Therefore, '-' (stdout) was added to the command as the
#		output file name.

wvHtml "$1" - 2> /dev/null |
perl -0777 -p -e '
	s|<img .*?>||gs;		# Delete img tags.
	s|(<li.*?>)\s*<p>|\1|gs;	# Remove <p> tags immediately
					# following <li> tags.  (This
					# problem appeared somewhere
					# between wv-0.5.42 and
					# wv-0.7.4.)
' |
w3m -dump -T text/html |
perl -p -e '
	s/\n\s*\n/\n\n/gs;		# Delete extra whitespace
					# between lines.
	s/\xa0/ /gs;			# Change A0 spaces to ASCII
					# spaces.
'