#!/usr/bin/python

import filecmp
import os
import re
import string
import sys
import time
import urllib

# Fetches the comics listed in comic_dict from their web sites and stores them
# in per-comic directories below target_dir.
# Run it once a day to collect every strip.
# Released under the GPL, http://www.gnu.org/licenses/gpl.html

# Where to put the downloaded comics; change this to your liking, then run the script.
# Raw string: the backslashes of a Windows path must never be read as escape
# sequences (a folder named e.g. "\temp" would otherwise turn into a TAB).
target_dir=r"D:\share\comic"

# Comics to fetch; adjust as necessary.
# Each entry maps a comic name to a (web site URL, compiled regexp) pair.  The
# regexp captures the image path in group 1 and the image extension in group 2.
comic_dict = {
	'dilbert': (
		'http://www.dilbert.com/',
		re.compile(
			r'<img src=\"/(comics/dilbert/archive/images/dilbert[0-9]+)(\.gif)\"',
			re.IGNORECASE
		),
	),
}


# make index.html files in dirs
def makeindex(starting_dir):
	"""Write an index.html into starting_dir that displays every GIF stored there.

	starting_dir -- existing directory that holds the comic images.

	Images are listed in sorted file-name order.  A file counts as an image
	when its extension is ".gif" in any letter case, because the download
	pattern is case-insensitive and may store e.g. "3.GIF".

	If index.html cannot be opened for writing, an error message is written
	to stderr and the script exits with status 1.
	"""
	idxfilename=os.path.join(starting_dir,"index.html")
	try:
		indexfile=open(idxfilename, 'w')
	except (IOError, OSError):
		sys.stderr.write('Error: Could not open index file \"'+idxfilename+'\" for writing.')
		sys.exit(1)

	# try/finally guarantees the handle is released even if a write fails.
	try:
		indexfile.write('<html><head><title>Comix</title></head><body bgcolor="#FEFEF0">\n<P><H1>Comix</H1><P>\n')
		for name in sorted(os.listdir(starting_dir)):
			# splitext looks at the real extension only, so "x.gif.bak" is skipped.
			if os.path.splitext(name)[1].lower()=='.gif':
				indexfile.write('<IMG SRC="'+name+'"><P>\n')
		indexfile.write('</body></html>\n')
	finally:
		indexfile.close()


def getcomix():
	"""Download today's strip for every comic in comic_dict and file it by date.

	For each entry of the module-level comic_dict the comic's web page is
	fetched, the first line matching the entry's regular expression yields
	the image path, and the image is stored as
	<target_dir>/<comic>/<year>/<week>/<weekday><extension>.  The index.html
	of that week directory is then rebuilt with makeindex().

	A comic whose page or image cannot be downloaded is reported and skipped.
	The script exits with status 1 if target_dir cannot be created, and with
	status 0 as soon as a page contains no matching image.
	"""
	# urlopen/urlretrieve live in urllib.request on Python 3 and in urllib on Python 2.
	try:
		from urllib.request import urlopen, urlretrieve
	except ImportError:
		from urllib import urlopen, urlretrieve

	# check if our path exists
	try:
		if not os.path.isdir(target_dir):
			os.makedirs(target_dir)
	except OSError:
		sys.stderr.write('Error: Could not create target directory \"'+target_dir+'\".')
		sys.exit(1)

	for comic, (comic_url, comic_rpat) in comic_dict.items():
		print("###\nRetrieving: "+comic)
		comic_dir=os.path.join(target_dir,comic)
		if not os.path.isdir(comic_dir):
			os.makedirs(comic_dir)

		# get the html; on failure move on instead of parsing missing/stale content
		try:
			print(" getting "+comic_url)
			url_handle=urlopen(comic_url)
			try:
				html_content=url_handle.readlines()
			finally:
				url_handle.close()
		except (IOError, OSError):
			print("Error retrieving html.")
			continue

		# find the gif path: the first matching line wins
		match=None
		for line in html_content:
			# Python 3 delivers bytes; the pattern is a text pattern.
			if not isinstance(line, str):
				line=line.decode('latin-1')
			match=comic_rpat.search(line)
			if match:
				break
		if not match:
			print("No matching image found. Quit.")
			sys.exit(0)
		extension=match.group(2)
		comic_path=match.group(1)+extension

		# get the gif; without it there is nothing to file for this comic
		print(" image: "+comic_path)
		download_name=os.path.join(comic_dir,'today'+extension)
		try:
			urlretrieve(comic_url+comic_path,download_name)
		except (IOError, OSError):
			print("Error retrieving image.")
			continue

		# Take one time snapshot so year, week and weekday always belong together.
		now=time.localtime()
		year=time.strftime('%Y',now)
		week=time.strftime('%U',now)
		dow=time.strftime('%w',now)
		local_pathname=os.path.join(comic_dir,year,week)
		local_filename=os.path.join(local_pathname,dow+extension)
		if not os.path.isdir(local_pathname):
			os.makedirs(local_pathname)
		# os.rename refuses to overwrite on Windows, so drop a strip that was
		# already filed earlier today before moving the fresh download in.
		if os.path.exists(local_filename):
			os.remove(local_filename)
		os.rename(download_name,local_filename)
		makeindex(local_pathname)


def removeduplicates(dirpath):
	"""Delete files in dirpath whose content duplicates another file there.

	dirpath -- directory to clean; nothing happens if it is not a directory.

	Regular files are visited in sorted name order, so of every group of
	identical files the alphabetically first one is kept and the others are
	removed.  Sub-directories are ignored.  A file that cannot be removed is
	reported and left in place.
	"""
	if not os.path.isdir(dirpath):
		return

	candidates=[]
	for name in sorted(os.listdir(dirpath)):
		path=os.path.join(dirpath,name)
		if os.path.isfile(path):
			candidates.append(path)

	kept=[]
	for path in candidates:
		original=None
		for keeper in kept:
			# shallow=False forces a byte-for-byte comparison.  The default only
			# compares size and modification time when they match, which is not
			# a safe basis for deleting a file.
			if filecmp.cmp(keeper,path,shallow=False):
				original=keeper
				break
		if original is None:
			kept.append(path)
			continue
		print(" "+original+": is equal to:  "+path)
		try:
			os.remove(path)
		except OSError as err:
			print(" could not remove "+path+": "+str(err))


# Script entry point: fetch the comics only when executed directly, not on import.
if __name__ == '__main__':
	getcomix()

	