Tuesday, March 24, 2015

PySpark Recipes

Counting Lines

This program will load the Google Unigrams dataset and count the total number of lines:
from pyspark import SparkContext, SparkConf

# Configure the application: name it, use the "local[16]" master, and set
# spark.executor.memory to "4g".
conf = (
    SparkConf()
    .setAppName("Unigrammer3")
    .setMaster("local[16]")
    .set("spark.executor.memory", "4g")
)
sc = SparkContext(conf=conf)

# Load the dataset from HDFS as an RDD of text lines and count them.
lines = sc.textFile("hdfs://master:9000/dat/GG/eng/01")
count = lines.count()
# Call form of print: valid on both Python 2 and Python 3 (the bare
# `print "..."` statement is a syntax error on Python 3).
print("total lines %i" % (count))

Create an empty DataFrame with a given schema:

from pyspark.sql.types import *
# Column definitions for the empty DataFrame. The third StructField argument
# (True) marks every column as nullable.
field = [
    StructField("userid", IntegerType(), True),
    StructField("posted_time", DateType(), True),
    StructField("category", StringType(), True),
    StructField("parent", StringType(), True),
    StructField("entity", StringType(), True),
    # NOTE(review): IntegerType is a 32-bit integer; if tweetid holds Twitter
    # status IDs they may not fit -- consider LongType. Confirm against the data.
    StructField("tweetid", IntegerType(), True),
    StructField("content", StringType(), True),
    StructField("lang", StringType(), True),
    StructField("search_object", StringType(), True),
    StructField("sony", BooleanType(), True),
    StructField("spam", BooleanType(), True),
    StructField("spam_match", StringType(), True),
    StructField("thandle", StringType(), True),
    StructField("turner", BooleanType(), True),
]  # this closing bracket was missing, which made the snippet a syntax error
schema = StructType(field)

# An empty RDD supplies no rows, so the DataFrame carries only the schema.
# `sc` and `sqlContext` are not defined in this snippet; presumably they come
# from an existing session such as the pyspark shell.
df = sqlContext.createDataFrame(sc.emptyRDD(), schema)

Merge multiple data frames:

from functools import reduce  # For Python 3.x
from pyspark.sql import DataFrame

def unionAll(*dfs):
    """Combine the given DataFrames into one via ``DataFrame.unionAll``.

    The frames are folded left to right: the first two are unioned, the
    result is unioned with the third, and so on. A single argument is
    returned as-is; calling with no arguments raises ``TypeError`` from
    ``reduce`` because there is no initial value.
    """
    return reduce(
        lambda merged, following: DataFrame.unionAll(merged, following),
        dfs,
    )
# Filter df_fb once per title, then stack the three subsets in this order.
titles = ("despicable me 1", "elysium", "evil dead")
subsets = [df_fb[df_fb.entity == title] for title in titles]
df = unionAll(*subsets)

