#!/usr/bin/python

import string,urllib,re,os

# IMPORTANT: adjust the main function below before you run this!
#
# gets the free mp3s from www.amazon.de, puts them into a directory
# checks for duplicates to avoid unnecessary downloads
# run it once in a while to get them all
#
# Only works with amazon.de - does not work with amazon.com yet (feel free to add that)
# amazon.fr and amazon.co.uk don't have a download section (yet?)
#
# Is this script legal? ethically correct? I think yes, because
# amazon does not have a robots.txt file excluding automatic scripts
# from these directories, check yourself.
#
# released under the GPL, http://www.gnu.org/licenses/gpl.html

# fetch the starting url, search it for the regex and return the captured part (group 1) of each match, in a list
def geturls(start_url,regpat):
	"""Fetch start_url and collect group 1 of the regpat matches found in it.

	The page is read line by line.  Each line is searched together with
	the line before it, so a tag that is split over 2 lines is still
	found.  Only the first match inside each two-line window is used.

	start_url -- url that is opened with urllib.urlopen
	regpat    -- compiled regular expression; group 1 is the part returned

	Returns a list with group 1 of every match, in page order.  Because
	consecutive windows overlap, a match that sits completely on one line
	can show up twice in the list; callers that need unique entries have
	to remove the duplicates themselves.

	Errors raised by urllib.urlopen or while reading are not caught here.
	"""
	url_handle = urllib.urlopen(start_url)
	try:
		html_content = url_handle.readlines()
	finally:
		# always release the connection, even if reading fails
		url_handle.close()
	resulturls=[]
	# do a multiline search in case tag is split over 2 lines
	prevline=""
	for line in html_content:
		# search once and reuse the match object
		found = regpat.search(prevline+line)
		if found:
			try:
				resulturls.append(found.group(1))
			except IndexError:
				# regpat has no group 1, so there is nothing to collect
				pass
		prevline=line
	return resulturls

def uniq(mylist):
	"return a new, sorted list that holds every distinct item of mylist exactly once"
	# a set drops the duplicates, sorted() builds the fresh ordered list
	return sorted(set(mylist))

def getamazonmp3s(root_url,target_dir,musicpat,downloadpat,mp3pat):
	"""Gets the free mp3 files linked from an amazon site into target_dir.

	Starting at root_url the function follows the last link that matches
	musicpat, then on that page the last link that matches downloadpat,
	and scans the resulting page for links that match mp3pat.  Every page
	found that way is searched for links to http://...mp3 files.  Each
	file is downloaded unless a file of the same name already exists in
	target_dir.  Progress is printed to stdout.

	root_url    -- site root, e.g. "http://www.amazon.de"; the hrefs that
	               are found get appended to it
	target_dir  -- directory for the mp3 files, created if it is missing
	musicpat    -- compiled regex, group 1 = href of the music page
	downloadpat -- compiled regex, group 1 = href of the downloads page
	mp3pat      -- compiled regex, group 1 = href of a page offering mp3s

	Raises IndexError if the music or the downloads link is not found.
	Errors raised by urllib or os are not caught.
	"""
	# get amazons music page
	musiclinks=geturls(root_url,musicpat)
	if not musiclinks:
		raise IndexError("no link matching musicpat found on "+root_url)
	musicpage=musiclinks[-1]
	print('- "Musik" page: '+musicpage)
	# get amazons free music downloads page
	downloadlinks=geturls(root_url+musicpage,downloadpat)
	if not downloadlinks:
		raise IndexError("no link matching downloadpat found on "+root_url+musicpage)
	downloadpage=downloadlinks[-1]
	print('- "Downloads" page: '+downloadpage)
	# get all pages that offer mp3 downloads
	mp3pages=geturls(root_url+downloadpage,mp3pat)
	print("###\nFound "+str(len(mp3pages))+" pages that seem to have mp3s:")
	# now check each page for mp3 files to download
	pat=re.compile(r'<a href="?.*?=(http://[^">]*\.mp3)"?>.*?</a>',re.I|re.M|re.S)
	completelist=[]
	for page in mp3pages:
		mp3urllist=geturls(root_url+page,pat)
		print(" - page: "+page+" has "+str(len(mp3urllist))+" mp3s.")
		completelist=completelist+mp3urllist
	# sort complete list and remove double entries
	completelist=uniq(completelist)
	print("###\nFound "+str(len(completelist))+" mp3 files to download:")
	# make sure the download directory is there; an empty target_dir
	# means the current directory and needs no creation
	if target_dir and not os.path.isdir(target_dir):
		os.makedirs(target_dir)
	# now get the mp3 files
	filepat=re.compile(r'.*/([^/]*\.mp3).*',re.I)
	for mp3url in completelist:
		found=filepat.search(mp3url)
		if not found:
			# no usable file name in this url, skip it
			continue
		filename=found.group(1)
		target=os.path.join(target_dir,filename)
		if os.path.isfile(target):
			print(" - File: "+filename+" already exists.")
			continue
		print(" - Retrieving: "+target+" from "+mp3url)
		# download under a temporary name first: an interrupted transfer
		# must not leave a truncated file that the duplicate check above
		# would accept as complete on the next run
		partfile=target+".part"
		try:
			urllib.urlretrieve(mp3url,partfile)
			os.rename(partfile,target)
		finally:
			# only still present if the download or the rename failed
			if os.path.isfile(partfile):
				os.remove(partfile)


#
# to make this work for other amazon regionals, change the root_url
# and change the regular expressions accordingly, e.g.:
#		from "Musik" to "Music"
#		from "Downloads" to "Free Downloads"
#		from "<sup>MP3</sup>" to "<img src="?[^"]*mp3-tiny.gif.*?alt=MP3.*?>"
#
if __name__ == "__main__":

	# directory that receives the mp3 files; change this to your liking
	# before you run the script
	#target_dir="D:\share\music\amazon"
	target_dir="amazon"

	# site whose pages are searched
	root_url = "http://www.amazon.de"

	# shared by all link patterns: ignore case, '.' also matches newlines
	flags=re.I|re.M|re.S

	# amazon.de link patterns; group 1 is the href that gets followed.
	# They may need adjusting.
	musicpat=re.compile(r'<a href="?([^">]*)"?>Musik</a>',flags)
	dlpat=re.compile(r'<a href="?([^">]*)"?>Downloads</a>',flags)
	mp3pat=re.compile(r'<a href="?([^">]*)"?>.*?<sup>MP3</sup>.*?</a>',flags)

	print("##########\nGetting mp3s from: "+root_url)
	getamazonmp3s(root_url,target_dir,musicpat,dlpat,mp3pat)

	# url to start with
	# commented out, doesn't work
#	root_url = "http://www.amazon.com"
	
	# amazon.com: regexps to match the proper stuff, may need adjusting
#	musicpat=re.compile(r'<a href="?([^">]*)"?>Music</a>',re.I|re.M|re.S)
#	dlpat=re.compile(r'<a href="?([^">]*)"?>Free Downloads</a>',re.I|re.M|re.S)
#	mp3pat=re.compile(r'<a href="?([^">]*)"?>.*?<a href=[^>]*>.*?<img src="?[^"]*mp3-tiny.gif.*?alt=MP3.*?>.*?</a>',re.I|re.M|re.S)
#	print "##########\nGetting mp3s from: "+root_url
#	getamazonmp3s(root_url,target_dir,musicpat,dlpat,mp3pat)



