##### Example 1: word count ------------------

# Load the text file as an RDD with one element per line.
# `sc` is the SparkContext supplied by the surrounding session (e.g. the pyspark shell).
lines = sc.textFile("/user/pascepet/data/bible/bible.txt")

# Split every line into words.
# split() without an argument splits on any run of whitespace and never yields
# empty strings, so repeated, leading or trailing spaces (and tabs) do not
# produce a bogus '' "word" in the counts, as split(" ") would.
words = lines.flatMap(lambda line: line.split())

# Turn each word into a (key, value) pair with an initial count of 1.
pairs = words.map(lambda word: (word, 1))

# Add up the 1s for every distinct word.
counts = pairs.reduceByKey(lambda a, b: a + b)

# Look at a few results (displayed when run interactively).
counts.take(5)


##### Example 2: metadata extraction from an image ------------

# packages
from PIL import Image
from PIL.ExifTags import TAGS
import StringIO

# function for EXIF extraction from binary data 
def get_exif(str):
    """Extract EXIF metadata from the raw binary contents of an image file.

    Parameters:
        str: the complete binary contents of one image file. The name shadows
            the builtin ``str``; it is kept unchanged so existing callers
            continue to work.

    Returns:
        A dict mapping EXIF tag names to their values. Tags that have no
        entry in ``PIL.ExifTags.TAGS`` keep their numeric id as the key.
        The dict is empty when the image exposes no EXIF data.

    Errors raised by ``Image.open`` for data it cannot read are not caught
    here and propagate to the caller.
    """
    # Imported locally so this function carries its own dependency.
    # io.BytesIO is available on both Python 2.6+ and Python 3, whereas the
    # StringIO module exists only on Python 2.
    import io

    img = Image.open(io.BytesIO(str))

    # Not every image class provides _getexif; treat a missing method the
    # same way as "no EXIF data" instead of failing with AttributeError.
    read_exif = getattr(img, '_getexif', None)
    info = read_exif() if read_exif is not None else None
    if not info:
        return {}

    # Translate numeric tag ids to readable names where a name is known.
    return {TAGS.get(tag, tag): value for tag, value in info.items()}

# function for EXIF data transformation
def exif_data_transform(exif_data):
    """Reduce a dict of EXIF tags to a small record with fixed keys.

    Parameters:
        exif_data: dict mapping EXIF tag names to values (for example the
            result of get_exif).

    Returns:
        A dict with the keys:
            'datetime'    -- value of 'DateTime', or '' when absent
            'exptime'     -- exposure time as a float, or 0.0 when absent
            'width'       -- value of 'ExifImageWidth', or 0 when absent
            'height'      -- value of 'ExifImageHeight', or 0 when absent
            'orientation' -- value of 'Orientation', or 1 when absent
    """
    exptime = 0.0
    if 'ExposureTime' in exif_data:
        exposure = exif_data['ExposureTime']
        try:
            numerator, denominator = exposure[0], exposure[1]
        except TypeError:
            # Not an indexable pair: the value is a single number or a
            # rational-like object (presumably what some Pillow releases
            # return -- verify against the installed version).
            exptime = float(exposure)
        else:
            # Multiply by 1.0 to force float division on Python 2 as well.
            # A zero denominator is malformed data; fall back to the default
            # instead of raising ZeroDivisionError.
            exptime = numerator * 1.0 / denominator if denominator else 0.0

    # dict.get / `in` replace dict.has_key, which no longer exists on Python 3.
    return {
        'datetime': exif_data.get('DateTime', ''),
        'exptime': exptime,
        'width': exif_data.get('ExifImageWidth', 0),
        'height': exif_data.get('ExifImageHeight', 0),
        'orientation': exif_data.get('Orientation', 1),
    }

# Read every file in the image directory on HDFS; the result is an RDD of
# (filename, binary file contents) pairs.
imgs = sc.binaryFiles('/user/pascepet/data/images/')

# Keep the filename as the key and replace the binary contents with the
# EXIF data as a tag -> value dict.
imgs2 = imgs.mapValues(get_exif)

# ... further operations ...
# e.g. reduce the EXIF tags to the selected, normalized fields.
imgs3 = imgs2.mapValues(exif_data_transform)

# Write the resulting RDD back to HDFS as text.
imgs3.saveAsTextFile('/user/pascepet/data/images_stat')
 