#!/usr/bin/python

######################################################
## etext2palmdoc:
##
## This program is released under the GNU GPL, version 2 or later.
## See http://www.gnu.org/licenses/gpl.html
##
## What it does: 
## this script will "massage" a Project Gutenberg etext
## (http://www.gutenberg.org/)
## so it is ready for converting into a palm doc file (*.prc / *.pdb)
##
## The script joins lines back together and replaces some special chars
## The result is printed on stdout
##
## The Palm character set is equivalent to Windows codepage 1252
## (http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT)
##
## After using this utility, use a TXT to DOC converter
## such as MakeDocW (http://www.pierce.de/makedocw.html)
## or http://homepage.mac.com/pauljlucas/software/txt2pdbdoc
## to generate the palm doc file.
##
## A private note: I greatly respect what Project Gutenberg does,
## but at the same time I think its formatting standards are completely
## outdated (which is no surprise knowing it started in 1971) and a stupid
## design decision. In particular: truncating the lines to a fixed length
## and restricting the encoding to ASCII (which rules out all non-English
## texts or cripples them severely).

import string, sys, os, re, getopt

## Line separator written after every output line;
## a bare newline is enough for pdb/prc eBooks (saves 1 byte)
#NEWLINE=os.linesep
NEWLINE='\n'

## Characters that are considered to end a paragraph: an input line whose
## last character is one of these is written out immediately and is NOT
## joined with the line that follows it
ENDNOJOIN='".;!?:*-+0123456789'

## The bookmark character to use, * usually works.
## This character must not appear anywhere else in the text!!!
## Setting it to an empty string disables both the automatic bookmarks and
## the trailer line.
bookmarkchar = '*'
## Trailer line written as the very last line of the output
## (presumably the marker the TXT-to-DOC converter scans for -- confirm
## against the converter's documentation)
bookmarktrailer = '<'+bookmarkchar+' >'

## Regular expression used to detect chapter headings; an automatic bookmark
## is set on every line where it matches.
## It matches a line that starts with at least 22 whitespace characters
## followed by an upper-case letter A-Z (i.e. a centred heading).
bookmarkPattern=re.compile(r'^\s{22,}[A-Z]',re.U)

## Pattern used to detect the beginning of a paragraph; an empty line is
## written before such a line. Usually paragraphs start with 3 spaces.
## Note: nothing anchors the end of the whitespace run, so ANY line that
## starts with at least one whitespace character matches.
paragraphPattern=re.compile(r'^\s{1,3}',re.U)

##
## Matches the line that closes the Gutenberg legal header, e.g.
## *END*THE SMALL PRINT! FOR PUBLIC DOMAIN ETEXTS* *END*
## Everything up to and including this line is removed
## to keep portable ebook files smaller
##
gutenbergHeaderPattern=re.compile(r'^\*END\*.*\*END\*',re.U)

## Globals
## Non-zero: input is discarded up to and including the first line that
## matches gutenbergHeaderPattern. Zero: the input is processed from line one.
skipGutenbergHeader = 1


def FixStream(readfrom=sys.stdin, writeto=sys.stdout):
	"""Re-flow a Project Gutenberg etext from ``readfrom`` onto ``writeto``.

	Hard-wrapped lines are joined back into one output line per paragraph.

	* If ``skipGutenbergHeader`` is non-zero, everything up to and including
	  the first line matching ``gutenbergHeaderPattern`` is discarded. If no
	  line matches, the whole input is discarded.
	* An empty (or whitespace-only) line flushes the pending paragraph and
	  is reproduced as an empty output line.
	* A line matching ``bookmarkPattern`` is prefixed with ``bookmarkchar``
	  and a space (only when ``bookmarkchar`` is non-empty).
	* A line matching ``paragraphPattern`` flushes the pending paragraph and
	  is preceded by an empty output line.
	* A line whose last character is in ``ENDNOJOIN`` is joined to the
	  pending text and flushed immediately.
	* Text still pending at end of input is flushed, then
	  ``bookmarktrailer`` is written (only when ``bookmarkchar`` is
	  non-empty).

	Parameters:
	  readfrom -- object with a ``readline()`` method returning text lines
	  writeto  -- object with a ``write()`` method
	"""
	# Work on a per-call copy of the flag. Clearing the module global here
	# would leave it cleared for every later call, so only the first file
	# of a batch would get its legal header removed.
	skippingHeader = skipGutenbergHeader
	outline = ''
	while True:
		# readline() instead of file iteration: no read-ahead buffering,
		# so interactive input on stdin is handled line by line.
		line = readfrom.readline()
		if not line:
			break
		if skippingHeader:
			if gutenbergHeaderPattern.search(line):
				skippingHeader = 0
			continue
		# str methods instead of the deprecated string.rstrip()/strip()
		# module functions, which no longer exist in Python 3.
		line = line.rstrip()
		if not line:
			# empty line/whitespace only, flush old content and add newline
			if outline:
				writeto.write(outline+NEWLINE)
			writeto.write(NEWLINE)
			outline = ''
			continue
		# we have some content
		if bookmarkchar and bookmarkPattern.search(line):
			# centred heading: turn it into a bookmark line
			line = bookmarkchar+' '+line.strip()
		if paragraphPattern.search(line):
			# indented line starts a new paragraph
			if outline:
				writeto.write(outline+NEWLINE)
			writeto.write(NEWLINE)
			outline = ''
		# join with a single space unless the pending text already ends in
		# a space or a hyphen
		if outline and outline[-1] not in ' -':
			outline = outline+' '
		outline = outline+line.strip()
		if line[-1] in ENDNOJOIN:
			# the line ends a sentence/paragraph: flush instead of joining on
			writeto.write(outline+NEWLINE)
			outline = ''
	# flush text that was still waiting to be joined when the input ended;
	# without this the last paragraph of an unterminated file is lost
	if outline:
		writeto.write(outline+NEWLINE)
	# add bookmark identifier to end
	if bookmarkchar:
		writeto.write(bookmarktrailer+NEWLINE)


def FixFile(filename=''):
	"""Clean one etext file in place, keeping the original as a backup.

	The file is renamed to ``<filename>.bak``, a new ``<filename>`` is
	created, and the backup is run through FixStream into the new file.
	The ``.bak`` file is left on disk afterwards.

	Parameters:
	  filename -- path of the file to clean; must be a regular file and
	              not a symbolic link

	Exits the process with status 1 (after a message on stderr) if
	``filename`` is not a regular file, if it cannot be renamed, or if the
	new file cannot be created. In the last case the rename is undone first.
	"""
	if not os.path.isfile(filename) or os.path.islink(filename):
		sys.stderr.write('ERROR: \"'+filename+'\" is not a regular file'+os.linesep)
		sys.exit(1)
	oldName = filename+'.bak'
	# Catch only I/O failures; a bare except would also swallow
	# KeyboardInterrupt and SystemExit.
	try:
		os.rename(filename, oldName)
	except OSError:
		sys.stderr.write('ERROR: cannot rename TMP file in directory \"'+oldName+'\"'+os.linesep)
		sys.exit(1)
	try:
		outfile = open(filename,'wb')
	except (IOError, OSError):
		sys.stderr.write('ERROR: cannot write new file in directory \"'+filename+'\"'+os.linesep)
		# put the original back so the user is not left with only the backup
		os.rename(oldName,filename)
		sys.exit(1)
	# try/finally so both handles are closed even if FixStream raises
	try:
		infile = open(oldName,'rb')
		try:
			FixStream(infile,outfile)
		finally:
			infile.close()
	finally:
		outfile.close()
	# The backup is intentionally kept; os.remove(oldName) would delete it.

def usage():
	"""Write the command-line help text to stderr."""
	helpText = (
		'Usage: '+sys.argv[0]+' [options] [ <directory> | <file1> <file2> ...]\n',
		'  This script will clean a project gutenberg etext\n',
		'  before conversion to pdb/prc.\n',
		'  If there is no file, reads from stdin.\n',
		'  Options:\n',
		'    [ -h | --help ]  print this message\n',
		'    [ -r N | --replace=N ]  replace N spaces with a tab everywhere\n',
		'\n',
	)
	# each entry already carries its own line terminator
	sys.stderr.writelines(helpText)

def main():
	"""Parse the command line and clean the requested etexts.

	* no arguments          -- filter stdin to stdout with FixStream
	* one directory         -- run FixFile on every regular file directly
	                           inside it (no recursion)
	* one or more file names -- run FixFile on each of them

	Options: -h/--help prints the usage text and exits with status 0.
	-r N/--replace=N is accepted and N must be an integer.
	Any option error prints the usage text and exits with status 2.
	"""
	try:
		# 'replace=' (with the trailing '='): the long option takes a value,
		# exactly like its short form 'r:'
		opts, args = getopt.getopt(sys.argv[1:], "hr:", ['help', 'replace='])
	except getopt.GetoptError:
		# print help information and exit:
		usage()
		sys.exit(2)
	for o, a in opts:
		if o in ("-h", "--help"):
			usage()
			sys.exit(0)
		if o in ("-r", "--replace"):
			# NOTE: the value is only validated; nothing in this script
			# applies the space replacement yet.
			try:
				int(a)
			except ValueError:
				usage()
				sys.exit(2)
	if not args:
		# assume stdin and stdout
		FixStream()
	elif len(args)==1 and os.path.isdir(args[0]):
		# a single directory: do all files in it
		for item in os.listdir(args[0]):
			path = os.path.join(args[0],item)
			# Skip sub-directories and symlinks: FixFile exits the whole
			# program on anything that is not a regular file, which would
			# abort the batch half-way through.
			if os.path.isfile(path) and not os.path.islink(path):
				FixFile(path)
	else:
		# one or more file names, treat each individually
		for item in args:
			FixFile(item)


# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__=="__main__":
	main()

