Counting Lines
This program loads the Google Unigrams dataset from HDFS and counts its total number of lines:
# Count the total number of lines in the unigram data file stored on HDFS.
from pyspark import SparkConf, SparkContext

# Local master with 16 worker threads; spark.executor.memory is set to 4g.
conf = (
    SparkConf()
    .setAppName("Unigrammer3")
    .setMaster("local[16]")
    .set("spark.executor.memory", "4g")
)
sc = SparkContext(conf=conf)

# textFile only describes the dataset; the read happens when count() runs.
lines = sc.textFile("hdfs://master:9000/dat/GG/eng/01")
count = lines.count()

# Call form of print so the script runs on both Python 2 and Python 3.
print("total lines %i" % count)
Create an empty DataFrame with a given schema:
%pyspark from pyspark.sql.types import * field = [ StructField("userid", IntegerType(), True), StructField("posted_time", DateType(), True), StructField("category", StringType(), True), StructField("parent", StringType(), True), StructField("entity", StringType(), True), StructField("tweetid", IntegerType(), True), StructField("content", StringType(), True), StructField("lang", StringType(), True), StructField("search_object", StringType(), True), StructField("sony", BooleanType(), True), StructField("spam", BooleanType(), True), StructField("spam_match", StringType(), True), StructField("thandle", StringType(), True), StructField("turner", BooleanType(), True), ] schema = StructType(field) df = sqlContext.createDataFrame(sc.emptyRDD(), schema)
Merge multiple data frames:
%pyspark from functools import reduce # For Python 3.x from pyspark.sql import DataFrame def unionAll(*dfs): return reduce(DataFrame.unionAll, dfs) df = unionAll( df_fb[df_fb.entity == "despicable me 1"], df_fb[df_fb.entity == "elysium"], df_fb[df_fb.entity == "evil dead"]) df.limit(2).show()
No comments:
Post a Comment