minTemperatures Flashcards
basic Spark import
from pyspark import SparkConf, SparkContext
create Spark conf (for my machine)
conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
create Spark context
sc = SparkContext(conf=conf)
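A common variant, not on the card above but standard Spark: "local" runs everything in a single thread, while "local[*]" starts one worker thread per CPU core.
conf = SparkConf().setMaster("local[*]").setAppName("MinTemperatures")  # use all local cores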
split delimited string into fields
fields = line.split(',')
function syntax
def myFunctionName(myInput): ... return (...myOutputTuple...)
convert string to float
temperatureC = float(fields[3])
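Putting the three cards above together, the parse function for this exercise might look like the sketch below; the exact field positions (station ID in fields[0], entry type in fields[2]) are assumptions inferred from the filter and map cards further down.
def parseLine(line):
    # assumed CSV layout: stationID, date, entryType, temperature
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperatureC = float(fields[3])
    return (stationID, entryType, temperatureC)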
read text file into RDD
lines = sc.textFile("file:///path/filename.csv")
map an RDD
parsedLines = lines.map(parseLine)  # parseLine is my map function
filter an RDD
minTemps = parsedLines.filter(lambda x: "TMIN" in x[1])
map subset of fields
stationTemps = minTemps.map(lambda x: (x[0], x[2]))
reduce by key with min lambda
minTemps = stationTemps.reduceByKey(lambda x, y: min(x, y))
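A quick sanity check of that reduction on a tiny hand-built RDD (station IDs made up); note that passing min directly is equivalent to the lambda form.
pairs = sc.parallelize([("ST01", 5.0), ("ST02", -3.0), ("ST01", 1.5)])
print(pairs.reduceByKey(min).collect())
# e.g. [('ST01', 1.5), ('ST02', -3.0)] -- output order is not guaranteed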
move RDD to collection
results = minTemps.collect()
loop through a collection "results"
for result in results:
    ...  # some action on each result
print value result[0]
print(result[0])
print tab escape sequence
\t
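Putting every card together, a complete script might look like the sketch below; the file path is the placeholder from the card above, and the field layout is the assumption from the parseLine sketch.
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
sc = SparkContext(conf=conf)

def parseLine(line):
    # assumed CSV layout: stationID, date, entryType, temperature
    fields = line.split(',')
    return (fields[0], fields[2], float(fields[3]))

lines = sc.textFile("file:///path/filename.csv")
parsedLines = lines.map(parseLine)
minTemps = parsedLines.filter(lambda x: "TMIN" in x[1])  # keep minimum-temperature entries
stationTemps = minTemps.map(lambda x: (x[0], x[2]))      # (stationID, temperature) pairs
minTemps = stationTemps.reduceByKey(lambda x, y: min(x, y))
results = minTemps.collect()

for result in results:
    print(result[0] + "\t" + str(result[1]))  # stationID, tab, minimum temperature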