#!/usr/bin/env python
#
# Function/Purpose:
# -----------------
# This script recurses into a hierarchy and finds a certain
# number of jpeg images at random.
# Next, it steganographically embeds a nonsense text document
# into each image with a nonsense passphrase.
# By design, no file will be altered more than once.  This
# feature is achieved with a cache, which also makes sure
# that there is a backup of the original file.
#
# The purpose is to make it easy to randomly produce
# steganographically altered images (totally automated and
# run from cron (unix) is the idea.)  The reason for wishing
# to do this is that it should bog down any machines that might
# be seeking and trying to break stego'd images on the web, and
# so that one can provide 'plausible deniability' should one
# ever actually _need_ to use steganography.
#
# Also, I wanted to do a little more Python to jog my memory on
# this and that.
#
# Also, some of the ideas and functions may be of use to others
# seeking to do similar things, but I'm certainly not a master
# programmer!
#
# Requires:
# ---------
# 'steghide' (http://steghide.sourceforge.net) program must be
# installed.
#
#
# To Run: Just execute (after checking base_dir and so forth on new
#         structural changes to the source location.)
#
#
# TODO: -Add md5 checksumming to cache so that in case the user
#        replaces a web pic with a new one and doesn't change
#        the name, an old copy pulled from the cache won't
#        overwrite it.  Should be really easy. [DONE!]
#
#       -The methods of excluding certain areas are not very
#        developed or flexible.
#
#       -I got in a real hurry near the end to get it working
#        before heading out to Alaska.  Much clean-up needs to
#        happen, especially in certain of the functions.
#
#       -There is noise emitted from the execution of external
#        commands.  Fix this (w/ popen3() probably.)
#
#       -More sanity checking and elegant exceptions!

# Define Important Info
# ---------------------
base_dir = "/usr/home/thuppi/pair_rsync"
top_walk_dir = base_dir + "/public_html/t"
cache_dir = base_dir + "/steg_cache"

# Dirs named here won't be scanned for jpegs.  Because they are pruned
# from the top-down walk, nothing underneath them is scanned either.
ignore_dirs = ['.thumbnails',
               'MandM']

# Ignore jpegs less than this size (in bytes.)
MIN_JPEG_SIZE = 30000

# Pick this many jpegs to process
NUM_JPEG_TO_PROC = 4

# Leave it at one...there is no quiet mode yet anyway.
verbo = 1

import os
import sys


def main():
    """Pick random jpegs and embed a nonsense document into each of them.

    For every picked image the pristine original is kept in cache_dir
    under the name {orig_name}.{path_hash}; next to it, a '.md5' file
    records the checksum of the stego'd copy that was published.  On a
    later run that checksum tells us whether the author replaced the
    published image in the meantime.
    """
    import shutil
    from os.path import join

    # Work list entries are [cached_flag, orig_fp, cache_fp, orig_md5]:
    #   IN cache: ['1', orig_fp, cache_fp, md5 of the published file]
    #   NO cache: ['0', orig_fp, cache_fp, 'NA']
    to_steg_list = []
    for pth, phash, jname in pick_jpegs():
        orig_fp = join(pth, jname)
        cache_fp = join(cache_dir, jname + "." + str(phash))
        if os.path.isfile(cache_fp):
            to_steg_list.append(['1', orig_fp, cache_fp, md5file(orig_fp)])
        else:
            to_steg_list.append(['0', orig_fp, cache_fp, 'NA'])

    doc_file = join(cache_dir, 'temp_doc')
    try:
        for cflag, phtml_fp, cache_fp, phtml_md5 in to_steg_list:
            if cflag == '1':  # CACHED
                _refresh_from_cache(phtml_fp, cache_fp, phtml_md5)
            else:  # NOT CACHED
                if verbo > 0:
                    print("Caching original :  %s \n" % phtml_fp)
                shutil.copy(phtml_fp, cache_fp)  # Put copy into cache
            _embed_nonsense_doc(phtml_fp, cache_fp, doc_file)
    finally:
        # Remove last doc temp file, even if something above failed.
        if os.path.isfile(doc_file):
            os.remove(doc_file)


def _read_cached_md5(cache_fp):
    """Return the checksum stored in cache_fp + '.md5', creating it if absent."""
    md5_fp = cache_fp + '.md5'
    try:
        with open(md5_fp, 'r') as md5_fh:
            return md5_fh.readline().strip()
    except IOError:
        # Note that this exists because for a while this code ran
        # before it had the md5 capability, leaving caches with no
        # md5's in them.
        # NOTE(review): the checksum written here is that of the cached
        # original, so it will normally differ from the published
        # (stego'd) file and the caller will treat the image as changed.
        print('!!Warning: No md5 file found in cache (unusual)!')
        print(' Writing one using existing cached file')
        cache_md5 = md5file(cache_fp)
        with open(md5_fp, 'w') as md5_fh:
            md5_fh.write(cache_md5)
        return cache_md5


def _refresh_from_cache(phtml_fp, cache_fp, phtml_md5):
    """Bring a previously processed image back to a pristine state.

    If the published file still matches the recorded checksum, it is
    replaced by the cached original.  Otherwise the author has replaced
    the image: the stale cache entry is dropped and the new file is
    cached as the original, so that a backup exists before it is altered.
    """
    import shutil

    if verbo > 0:
        print("Re-stego'd from cache:  %s \n" % phtml_fp)

    if _read_cached_md5(cache_fp) != phtml_md5:  # Author changed file!
        print("!!NOTE: Author has changed the file contents")
        print(" of an image. Cached copy will be destroyed.")
        os.remove(cache_fp)
        if os.path.isfile(cache_fp + '.md5'):
            os.remove(cache_fp + '.md5')
        # The replaced image is the new original; back it up now.
        shutil.copy(phtml_fp, cache_fp)
    else:
        os.remove(phtml_fp)               # remove from pub_html
        shutil.copy(cache_fp, phtml_fp)   # cached back into pub_html


def _embed_nonsense_doc(phtml_fp, cache_fp, doc_file):
    """Embed a random document into phtml_fp and record the new checksum."""
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    # Fill roughly 70% of what steghide says the image can hold.
    capacity = get_capacity(phtml_fp)
    doc_to_embed = random_doc(int(capacity * .7))

    # Write bogus doc into a temp file (mode 'w' truncates any old one).
    with open(doc_file, 'w') as doc_fh:
        doc_fh.write(doc_to_embed)
        doc_fh.write('\n')

    # Steg embed the thing.  Every argument is shell-quoted so that
    # unusual characters in paths cannot break or alter the command.
    stegcmd = ('steghide embed -cf ' + quote(phtml_fp) +
               ' -ef ' + quote(doc_file) +
               ' -p ' + quote(random_passphrase()))
    docmd(stegcmd)

    # Write the steged md5 into a file
    with open(cache_fp + '.md5', 'w') as md5_fh:
        md5_fh.write(md5file(phtml_fp))


#
# This function picks a globally defined number of jpeg files
# out of a hierarchy and returns them in the form:
#   [[path,path_hash,name],[..],..]
#
def pick_jpegs():
    """Return NUM_JPEG_TO_PROC random jpegs found below top_walk_dir.

    Each entry is [path, path_hash, name].  Only files larger than
    MIN_JPEG_SIZE whose name contains no whitespace and ends in
    .jpg/.jpeg (any case) are considered.  Exits the program with
    status 1 when fewer than NUM_JPEG_TO_PROC candidates exist.
    """
    from os.path import join, getsize
    import random
    import re

    jpeg_name = re.compile(r'\S+\.jpe?g$', re.IGNORECASE)
    jpeg_file_list = []

    # Walk top-down
    for root, dirs, files in os.walk(top_walk_dir):
        # Prune ignored dirs in place so os.walk does not descend into them.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]

        for fnm in files:
            if not jpeg_name.match(fnm):
                continue
            try:
                size = getsize(join(root, fnm))
            except OSError:
                # Vanished file or broken symlink: not a usable image.
                continue
            if size > MIN_JPEG_SIZE:
                jpeg_file_list.append((root, fnm))

    # Uniq list here in case a later algorithm visits a dir twice.
    jpeg_file_list = list(unique5(jpeg_file_list))

    # Feedback if desired...
    if verbo > 0:
        print("\nTotal Number jpegs found:  %d \n" % len(jpeg_file_list))

    if len(jpeg_file_list) < NUM_JPEG_TO_PROC:
        print("Error: Found few .jpg files. Check paths and knock-off list.")
        print(" Bailing...\n")
        # TODO: do exception correctly!
        sys.exit(1)

    chosen = random.sample(jpeg_file_list, NUM_JPEG_TO_PROC)

    # Add hash for cache functionality...
    # NOTE(review): cache file names depend on hash() of the path being
    # identical from run to run; that does not hold when string hash
    # randomization is active - confirm before changing interpreters.
    return [[path, abs(hash(path)), name] for path, name in chosen]


#
# Get capacity using steghide --info
#   output is 'int'
#
# TODO: totally hackish...redo!
#
def get_capacity(image):
    """Return the embedding capacity of image, in bytes, as an int.

    Runs 'steghide --info' on the image through docmd() and parses the
    capacity figure out of its output.

    Raises ValueError when no capacity figure can be found.
    """
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    # Quote the path so unusual characters cannot break the command.
    out = docmd('steghide --info ' + quote(image) + ' -p ""')
    return _parse_capacity(out)


def _parse_capacity(info_text):
    """Extract the capacity in bytes from the text printed by steghide."""
    import re

    # Decimal multipliers are kept on purpose: the historic behaviour
    # was "number * 1000", which slightly under-estimates the capacity.
    # NOTE(review): the unit names are an assumption about steghide's
    # output format - confirm against the installed version.
    multipliers = {'byte': 1, 'bytes': 1,
                   'kb': 1000, 'mb': 1000 ** 2, 'gb': 1000 ** 3}

    found = re.search(r'capacity:\s*([0-9]+(?:[.,][0-9]+)?)\s*([A-Za-z]*)',
                      info_text)
    if found:
        number = found.group(1)
        unit = found.group(2).lower()
    else:
        # Fall back to the historic positional parse: 4th space-separated
        # field of the 3rd output line, taken to be kilobytes.
        try:
            number = info_text.split('\n')[2].split(' ')[3]
        except IndexError:
            raise ValueError('cannot find capacity in steghide output: %r'
                             % info_text)
        unit = 'kb'

    return int(float(number.replace(',', '.')) * multipliers.get(unit, 1000))


#
# Passphrase Generator
#
def random_passphrase():
    """Return a throw-away passphrase of 3-6 space separated 'words'.

    Words are 4-8 printable characters (codes 40..126) long before
    backslashes and backticks are stripped out, so the result can sit
    inside a double-quoted shell argument.
    """
    import random

    # The passphrase protects the embedded payload, so draw it from the
    # operating system's entropy source rather than the default PRNG.
    rng = random.SystemRandom()

    words = []
    for _ in range(rng.randint(3, 6)):
        length = rng.randint(4, 8)
        words.append(''.join(chr(rng.randint(40, 126))
                             for _ in range(length)))

    phrase = ' '.join(words)
    return phrase.replace('\\', '').replace('`', '')


#
# This function produces a nonsense document.  It takes
# a value which is the rough size of the document in bytes.
# The return document (as a string) will usually be slightly
# under the size requested, but could be slightly over.  In
# theory, the smaller the doc, the higher the potential to be
# substantially oversize, but it doesn't seem too common for
# docs over 5K in size at any rate.
#
# TODO: lots.  Add numbers, specifically.
#

# Frequency tables: a value appears in a table once per unit of weight,
# so a uniform pick from the table follows the wanted distribution.
_WORD_LEN_TABLE = ([3] * 196 + [4] * 139 + [2] * 131 + [1] * 110 +
                   [8] * 78 + [7] * 73 + [6] * 65 + [5] * 61 +
                   [9] * 49 + [10] * 37 + [11] * 20 + [12] * 12 +
                   [14] * 4 + [17] * 4)

_CHAR_FREQ_TABLE = (['a'] * 80 + ['b'] * 16 + ['c'] * 30 + ['d'] * 44 +
                    ['e'] * 120 + ['f'] * 25 + ['g'] * 17 + ['h'] * 64 +
                    ['i'] * 80 + ['j'] * 4 + ['k'] * 8 + ['l'] * 40 +
                    ['m'] * 30 + ['n'] * 80 + ['o'] * 80 + ['p'] * 17 +
                    ['q'] * 5 + ['r'] * 62 + ['s'] * 80 + ['t'] * 90 +
                    ['u'] * 34 + ['v'] * 12 + ['w'] * 20 + ['x'] * 4 +
                    ['y'] * 20 + ['z'] * 2 +
                    ['\n'] * 20 + ['\n\n'] * 5 +
                    ['('] * 3 + [')'] * 3 + ['$'] * 3)

# (first, last, weight): every word count in first..last gets that weight.
_NUM_WORDS_TABLE = []
for _first, _last, _weight in [(1, 1, 10), (2, 7, 2), (8, 12, 3),
                               (13, 17, 4), (18, 23, 5), (24, 30, 4),
                               (31, 36, 3), (37, 44, 2), (45, 52, 1)]:
    for _count in range(_first, _last + 1):
        _NUM_WORDS_TABLE.extend([_count] * _weight)


def random_doc(ksize):
    """Generate random strings of characters which look, statistically,
    something like a normal text document.

    ksize is the rough size wanted, in bytes.  The document is returned
    as one string of sentences, each ending in '. '.  Exits the program
    with status 1 when ksize is below 150.
    """
    import sys
    import random

    if ksize < 150:
        # TODO: throw exception correctly.
        print("Error: can't generate a document so small.")
        sys.exit(1)

    # This adjusts the rough size of the document produced.
    # The denominator was chosen empirically and should be
    # adjusted if the various frequency tables change.
    num_sentences = int(ksize // 140)

    sentences = []
    for _ in range(num_sentences):
        # One leading character, upper-cased, is glued onto the first word.
        first = random.choice(_CHAR_FREQ_TABLE).upper()
        words = []
        for _ in range(random.choice(_NUM_WORDS_TABLE)):
            num_char = random.choice(_WORD_LEN_TABLE)
            # Sampling without replacement, as the tables were tuned for.
            words.append(''.join(random.sample(_CHAR_FREQ_TABLE, num_char)))
        sentences.append(first + ' '.join(words) + '. ')

    return ''.join(sentences)


#
# A list-TO-string function
#
def l2s(lst):
    """Concatenate the strings in lst into a single string."""
    return ''.join(lst)


#
# Generic list uniquer.
#   --- Tim Peters - usenet ---
#
def unique5(seq):
    """Return a list of the distinct elements of seq.

    Hashable elements are de-duplicated through a dict; if any element
    is unhashable a slower pairwise comparison is used instead, which
    keeps the first occurrence of each element in its original order.
    """
    try:
        # attempt fast algorithm
        return list(dict.fromkeys(seq))
    except TypeError:
        # have an unhashable object, use slow algorithm
        ret = []
        for x in seq:
            if x not in ret:
                ret.append(x)
        return ret


#
# MD5 a file
#
def md5file(filename, chunk=16*1024):
    """Return the hex md5 digest of the file at filename.

    The file is read chunk bytes at a time.  A failure to open the file
    propagates to the caller; a failure while reading prints a message
    and exits the program with status 1.
    """
    import hashlib
    import sys

    check = hashlib.md5()
    with open(filename, 'rb') as fh:
        try:
            data = fh.read(chunk)
            while data:
                check.update(data)
                data = fh.read(chunk)
        except (IOError, OSError):
            print('Exception computing md5.')
            sys.exit(1)
    return check.hexdigest()


#
# Execute a command
#
# TODO: find my old better one which uses popen3()!
#
def docmd(cmd_string):
    """Run cmd_string through the shell and return its standard output."""
    import os

    pipe = os.popen(cmd_string)
    try:
        return pipe.read()
    finally:
        # Close the pipe so the child process is reaped.
        pipe.close()


#
# The old name/main deal...
#
if __name__ == "__main__":
    main()
    # x = random_doc(1000)
    # x = random_passphrase()
    # x = get_capacity('/usr/home/thuppi/temp/new.jpg')
    # print x