#!/usr/bin/env python

# Function/Purpose:
# -----------------
#  This script recurses into a hierarchy and finds a certain
#   number of jpeg images at random.
#  Next, it steganographically embeds a nonsense text document
#   into each image with a nonsense passphrase.
#  By design, no file is altered more than once.  This is
#   achieved with a cache, which also makes sure that there is
#   a backup of the original file.
#
#  The purpose is to make it easy to randomly produce
#   steganographically altered images (totally automated and
#   run from cron (unix) is the idea.)  The reason for wishing
#   to do this is that it should bog down any machines that might
#   be seeking and trying to break stego'd images on the web, and
#   so that one can provide 'plausible deniability' should one
#   ever actually _need_ to use steganography.
#
#  Also, I wanted to do a little more Python to jog my memory on
#   this and that.
#
#  Also, some of the ideas and functions may be of use to others
#   seeking to do similar things, but I'm certainly not a master
#   programmer!
#			
# Requires:
# ---------
#  'steghide' (http://steghide.sourceforge.net) program must be
#   installed.
#
#
# To Run: Just execute (after checking base_dir and so forth on new
#          structural changes to the source location.)
#
#
# TODO: -Add md5 checksumming to cache so that in case the user
#         replaces a web pic with a new one and doesn't change
#         the name, an old copy pulled from the cache won't
#         overwrite it.  Should be really easy. [DONE!]
#
#	-The methods of excluding certain areas are not very
# 	  developed or flexible.
#
#	-I got in a real hurry near the end to get it working
#	  before heading out to Alaska.  Much clean-up needs to
#	  happen, especially in certain of the functions.
#
#	-There is noise emitted from the execution of external
#	  commands.  Fix this (w/ popen3() probably.)
#
#	-More sanity checking and elegant exceptions!


# Define Important Info
# ---------------------

# Base directory; the walk root and the cache directory are both
#  derived from it.
base_dir = "/usr/home/thuppi/pair_rsync"
top_walk_dir = "%s/public_html/t" % base_dir
cache_dir = "%s/steg_cache" % base_dir

# Directory names that pick_jpegs() prunes from its walk.  A name
#  matches at any depth, and because the walk is top-down nothing
#  beneath a pruned directory is visited either.
ignore_dirs = [
    '.thumbnails',
    'MandM',
]

# Ignore jpegs that are not larger than this size (in bytes.)
MIN_JPEG_SIZE = 30000

# Pick this many jpegs to process on each run.
NUM_JPEG_TO_PROC = 4

# Leave it at one...there is no quiet mode yet anyway.
verbo = 1


import os
import sys


def main():
    """Embed a random nonsense document into a few randomly picked jpegs.

    For every image returned by pick_jpegs():

      * If a cached original exists and the public file still carries the
        md5 recorded after the last embedding, the original is restored
        from the cache so the image is never embedded twice.
      * If a cached original exists but the public file's md5 differs, the
        author has replaced the image: the stale cache entry is dropped
        and the new file is cached as the original.
      * If nothing is cached, the public file is copied into the cache.

    Then a random document sized to 70% of the capacity reported by
    steghide is embedded with a random passphrase, and the md5 of the
    resulting file is stored next to the cached original as
    '<cache file>.md5'.

    Reads the module-level settings cache_dir and verbo.  Returns None.
    """
    import shutil
    from os.path import join
    try:
        from shlex import quote         # Python 3.3+
    except ImportError:
        from pipes import quote         # Python 2

    # Build the work list.  pick_jpegs() yields [path, path_hash, name];
    #  the cache entry for an image is named {name}.{path_hash}.
    # Entries are of the form:
    #  IN cache: ['1', orig_fp, cache_fp, orig_md5]
    #  NO cache: ['0', orig_fp, cache_fp, 'NA']
    to_steg_list = []
    for pth, phash, jname in pick_jpegs():
        orig_fp = join(pth, jname)
        cache_fp = join(cache_dir, jname + "." + str(phash))
        if os.path.isfile(cache_fp):
            to_steg_list.append(['1', orig_fp, cache_fp, md5file(orig_fp)])
        else:
            to_steg_list.append(['0', orig_fp, cache_fp, 'NA'])

    # Scratch file holding the document handed to steghide.
    doc_file = join(cache_dir, 'temp_doc')

    for cflag, phtml_fp, cache_fp, phtml_md5 in to_steg_list:
        md5_fp = cache_fp + '.md5'

        if cflag == '1':        # CACHED
            if verbo > 0:
                print("Re-stego'd from cache:  %s \n" % phtml_fp)

            try:
                md5_fh = open(md5_fp, 'r')
            except IOError:
                # Caches written before md5 support existed have no
                #  .md5 companion; create one from the cached file.
                print('!!Warning: No md5 file found in cache (unusual)!')
                print('            Writing one using existing cached file')
                cache_md5 = md5file(cache_fp)
                md5_fh = open(md5_fp, 'w')
                try:
                    md5_fh.write(cache_md5)
                finally:
                    md5_fh.close()
                md5_fh = open(md5_fp, 'r')

            try:
                cached_md5 = md5_fh.readline()
            finally:
                md5_fh.close()

            if cached_md5 != phtml_md5:         # Author changed file!
                print("!!NOTE: Author has changed the file contents")
                print("         of an image.  Cached copy will be destroyed.")
                os.remove(cache_fp)
                os.remove(md5_fp)
                # The replacement is the new original: back it up before
                #  it is altered, otherwise the next run would cache the
                #  already-embedded file as if it were the original.
                shutil.copy(phtml_fp, cache_fp)
            else:
                os.remove(phtml_fp)                 # remove from pub_html
                shutil.copy(cache_fp, phtml_fp)     # cached back into pub_html

        else:                   # NOT CACHED
            if verbo > 0:
                print("Caching original     :  %s \n" % phtml_fp)
            shutil.copy(phtml_fp, cache_fp)     # Put copy into cache

        # Size the bogus document to 70% of the stego capacity.
        capacity = get_capacity(phtml_fp)
        doc_to_embed = random_doc(int(capacity * .7))

        # Write bogus doc into a temp file
        if os.path.isfile(doc_file):
            os.remove(doc_file)
        doc_fh = open(doc_file, 'w')
        try:
            doc_fh.write(doc_to_embed)
            doc_fh.write('\n')
        finally:
            doc_fh.close()

        # Steg embed the thing.  Every argument is shell-quoted so that
        #  paths containing spaces or shell metacharacters stay intact.
        passphrase = random_passphrase()
        stegcmd = ('steghide embed -cf ' + quote(phtml_fp) +
                   ' -ef ' + quote(doc_file) +
                   ' -p ' + quote(passphrase))
        docmd(stegcmd)

        # Record the md5 of the steged file so a later run can tell
        #  whether the author has replaced it in the meantime.
        new_md5 = md5file(phtml_fp)
        md5_fh = open(md5_fp, 'w')
        try:
            md5_fh.write(new_md5)
        finally:
            md5_fh.close()

        # Remove last doc temp file.
        if os.path.isfile(doc_file):
            os.remove(doc_file)


    #
    # This function picks a globally defined number of jpeg files
    #  out of a hierarchy and returns them in the form:
    # [[path,path_hash,name],[..],..]
    #

def pick_jpegs():
    """Pick NUM_JPEG_TO_PROC jpeg files at random from beneath top_walk_dir.

    The tree is walked top-down.  Directories whose name appears in
    ignore_dirs are pruned, so nothing beneath them is visited.  A file
    qualifies when its whole name is whitespace-free and ends in .jpg or
    .jpeg (case-insensitive) and its size exceeds MIN_JPEG_SIZE bytes.
    Files that cannot be stat'ed (e.g. dangling symlinks) are skipped.

    Returns:
        A list of [path, path_hash, name] lists, where path is the
        directory containing the file and path_hash is abs(hash(path)).

    Prints an error and exits with status 1 when fewer than
    NUM_JPEG_TO_PROC qualifying files are found.
    """
    from os.path import join, getsize
    import random
    import re

    # Anchored by match() at the start and by '$' at the end, so the whole
    #  name must be non-whitespace characters followed by .jpg/.jpeg.
    jpeg_name_re = re.compile(r'\S+\.jpe?g$', re.IGNORECASE)

    jpeg_file_list = []
    seen = set()

    # Walk top-down
    for root, dirs, files in os.walk(top_walk_dir):

        # Prune in place: os.walk only descends into what is left in dirs.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]

        for fnm in files:
            if not jpeg_name_re.match(fnm):
                continue
            try:
                big_enough = getsize(join(root, fnm)) > MIN_JPEG_SIZE
            except OSError:
                # Dangling symlink, or the file vanished since the
                #  directory was listed; one bad entry should not
                #  abort the whole run.
                continue
            # Guard against duplicates in case the walk ever overlaps.
            if big_enough and (root, fnm) not in seen:
                seen.add((root, fnm))
                jpeg_file_list.append([root, fnm])

    # Feedback if desired...
    if verbo > 0:
        print("\nTotal Number jpegs found:  %d \n" % len(jpeg_file_list))

    # Pick out the correct number of jpegs from
    #  all those found at random.
    if len(jpeg_file_list) < NUM_JPEG_TO_PROC:
        print("Error: Found few .jpg files.  Check paths and knock-off list.")
        print(" Bailing...\n")
        # TODO: do exception correctly!
        sys.exit(1)
    chosen = random.sample(jpeg_file_list, NUM_JPEG_TO_PROC)

    # Add hash for cache functionality...
    # NOTE(review): hash() of a string is not guaranteed to be stable
    #  across interpreter builds, nor across runs when hash randomization
    #  is enabled.  The key is left unchanged so that existing cache
    #  entries keep matching.
    return [[path, abs(hash(path)), name] for path, name in chosen]


    #
    # Get capacity using steghide --info
    #  output is 'int'
    #
    # TODO: totally hackish...redo!
    #
def get_capacity(image):
    """Return the embedding capacity steghide reports for *image*, as an int.

    Runs 'steghide --info' on the image with an empty passphrase and
    parses the fourth space-separated field of the third output line.

    Args:
        image: path of the image file to inspect.

    Returns:
        The reported figure multiplied by 1000, truncated to an int.

    Raises:
        ValueError: if the steghide output does not have the expected
            layout (for instance because steghide failed).
    """
    try:
        from shlex import quote         # Python 3.3+
    except ImportError:
        from pipes import quote         # Python 2

    # Quote the path so names with spaces or shell metacharacters
    #  reach steghide as a single, unmodified argument.
    out = docmd('steghide --info ' + quote(image) + ' -p ""')

    try:
        figure = out.split('\n')[2].split(' ')[3]
        # NOTE(review): the figure is scaled by 1000 whatever unit
        #  steghide prints after it -- presumably KB; confirm against
        #  steghide's actual output for very small or very large images.
        return int(float(figure) * 1000)
    except (IndexError, ValueError):
        raise ValueError('could not parse steghide capacity for %r '
                         'from output %r' % (image, out))


    #
    # Passphrase Generator
    #
def random_passphrase():
    """Return a random nonsense passphrase.

    The passphrase consists of 3 to 6 words joined by single spaces.
    Each word is drawn as 4 to 8 characters taken uniformly from
    chr(40) through chr(126); backslashes and backticks are then
    stripped, so a word can end up shorter than it was drawn.  The two
    characters are removed because the passphrase ends up on a shell
    command line.
    """
    import random

    words = []
    for _word in range(random.randint(3, 6)):
        length = random.randint(4, 8)
        chars = [chr(random.randint(40, 126)) for _char in range(length)]
        words.append(''.join(chars))

    phrase = ' '.join(words)
    return phrase.replace('\\', '').replace('`', '')
    

    #
    # This function produces a nonsense document.  It takes
    #  a value which is the rough size of the document in bytes.
    #  The return document (as a string) will usually be slightly
    #  under the size requested, but could be slightly over.  In
    #  theory, the smaller the doc, the higher the potential to be
    #  substantially oversize, but it doesn't seem too common for
    #  docs over 5K in size at any rate.
    #
    # TODO: lots.  Add numbers, specifically.
    #
def random_doc(ksize):
    """Generate a nonsense document that statistically resembles text.

    The document is a run of ksize // 140 "sentences".  Each sentence is
    one upper-cased character from the character table, immediately
    followed by space-separated random "words", and terminated by a
    period and two spaces.  Word counts, word lengths and characters are
    all drawn from weighted frequency tables.

    Args:
        ksize: rough target size of the document in bytes; must be at
            least 150.

    Returns:
        The document as a single string.  Its length only approximates
        ksize.

    Prints an error and exits with status 1 when ksize is below 150.
    """
    import sys
    import random

    if ksize < 150:     # TODO: throw exception correctly.
        print("Error: can't generate a document so small.")
        sys.exit(1)

    # This adjusts the rough size of the document produced.
    # The denominator was chosen empirically and should be
    #  adjusted if the various frequency tables change.
    num_sentences = int(ksize // 140)

    # Each table is a list of (value, weight) pairs; a value is drawn
    #  with probability proportional to its weight.
    word_len_weights = [
        (3, 196), (4, 139), (2, 131), (1, 110),
        (8, 78), (7, 73), (6, 65), (5, 61),
        (9, 49), (10, 37), (11, 20), (12, 12),
        (14, 4), (17, 4),
    ]

    char_weights = [
        ('a', 80), ('b', 16), ('c', 30), ('d', 44),
        ('e', 120), ('f', 25), ('g', 17), ('h', 64),
        ('i', 80), ('j', 4), ('k', 8), ('l', 40),
        ('m', 30), ('n', 80), ('o', 80), ('p', 17),
        ('q', 5), ('r', 62), ('s', 80), ('t', 90),
        ('u', 34), ('v', 12), ('w', 20), ('x', 4),
        ('y', 20), ('z', 2),
        ('\n', 20), ('\n\n', 5),
        ('(', 3), (')', 3), ('$', 3),
    ]

    num_words_weights = (
        [(1, 10)]
        + [(n, 2) for n in range(2, 8)]
        + [(n, 3) for n in range(8, 13)]
        + [(n, 4) for n in range(13, 18)]
        + [(n, 5) for n in range(18, 24)]
        + [(n, 4) for n in range(24, 31)]
        + [(n, 3) for n in range(31, 37)]
        + [(n, 2) for n in range(37, 45)]
        + [(n, 1) for n in range(45, 53)]
    )

    # Expand the weights so a uniform draw from each list follows them.
    word_len_list = [v for v, w in word_len_weights for _i in range(w)]
    char_freq_list = [v for v, w in char_weights for _i in range(w)]
    num_words_list = [v for v, w in num_words_weights for _i in range(w)]

    sentences = []
    for _sentence in range(num_sentences):
        num_words = random.choice(num_words_list)
        first = random.choice(char_freq_list).upper()

        words = []
        for _word in range(num_words):
            num_char = random.choice(word_len_list)
            # sample() draws without replacement from the expanded table;
            #  the '\n\n' entry can make a word longer than num_char.
            words.append(''.join(random.sample(char_freq_list, num_char)))

        # The capital is glued directly onto the first word.
        sentences.append(first + ' '.join(words) + '.  ')

    return ''.join(sentences)
	  

    #
    # A list-TO-string function
    #
def l2s(lst):
    """Concatenate the strings in *lst* and return the result.

    Args:
        lst: an iterable of strings.

    Returns:
        All elements joined together with no separator; '' when *lst*
        is empty.
    """
    return ''.join(lst)


    #
    # Generic list uniquer. --- Tim Peters - usenet ---
    #
def unique5(seq):
    """Return the distinct elements of *seq* as a list, in first-seen order.

    A dictionary is used for membership tests while every element is
    hashable.  As soon as an unhashable element turns up, the work is
    redone with a slower list scan that only needs equality.

    Args:
        seq: any iterable, including a one-shot iterator.

    Returns:
        A new list holding each distinct element once.
    """
    # Materialize first so the fallback can start over from the
    #  beginning even when seq is a one-shot iterator.
    items = list(seq)

    try:    # attempt fast algorithm
        seen = {}
        result = []
        for item in items:
            if item not in seen:
                seen[item] = 0
                result.append(item)
        return result
    except TypeError:   # have an unhashable object, use slow algorithm
        result = []
        for item in items:
            if item not in result:
                result.append(item)
        return result


    #
    # MD5 a file
    #
def md5file(filename, chunk=16 * 1024):
    """Return the md5 hex digest of the contents of *filename*.

    Args:
        filename: path of the file to hash; it is opened in binary mode.
        chunk: number of bytes to read per iteration.

    Returns:
        The digest as a string of 32 hexadecimal characters.

    A failure to open the file propagates to the caller.  A failure
    while reading prints 'Exception computing md5.' and exits with
    status 1.
    """
    try:
        from hashlib import md5 as new_md5      # Python 2.5+
    except ImportError:
        from md5 import new as new_md5          # older Python 2

    fh = open(filename, 'rb')
    try:
        try:
            check = new_md5()
            data = fh.read(chunk)
            while data:
                check.update(data)
                data = fh.read(chunk)
            return check.hexdigest()
        except (IOError, OSError):
            print('Exception computing md5.')
            sys.exit(1)
    finally:
        # Always release the handle, whichever way the block is left.
        fh.close()

	
    #
    # Execute a command
    #
    # TODO: find my old better one which uses popen3()!
    #
def docmd(cmd_string):
    """Run *cmd_string* through os.popen and return its standard output.

    Args:
        cmd_string: the complete command line, as a single string.

    Returns:
        Everything the command wrote to standard output, as one string
        ('' when it wrote nothing).  Only standard output is read from
        the pipe, and the command's exit status is ignored.
    """
    import os

    pipe = os.popen(cmd_string)
    try:
        return pipe.read()
    finally:
        # Close the pipe explicitly instead of leaving it to the
        #  garbage collector.
        pipe.close()


    #
    # The old name/main deal...
    #
	    
# Run the whole pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
# Ad-hoc manual checks of individual helpers, left disabled:
#    x = random_doc(1000)
#    x = random_passphrase()
#    x = get_capacity('/usr/home/thuppi/temp/new.jpg')
#    print x