# Load Jeju tourism CSVs for 2010-2012, union them, and cast numeric columns.
from pyspark.sql import Row
def _load_year(year):
    """Load jeju/jeju_<year>.csv as a DataFrame of string columns.

    Side effect: registers the raw rows (header row included) as temp table
    'jeju<year>', exactly as the original script did.  Returns the DataFrame
    with the header row filtered out.

    :param year: int, e.g. 2010 -- selects both the file and the table name.
    :return: DataFrame with columns YEAR, IN, OUT, INCOME (all but YEAR are
        still strings at this point).
    """
    # NOTE(review): a naive split(',') breaks on quoted fields containing
    # commas -- assumed safe for these files; confirm against the data.
    lines = sc.textFile('jeju/jeju_%d.csv' % year)
    parts = lines.map(lambda l: l.split(','))
    # `year` is a per-call local, so the lambda closure is safe even though
    # this helper is invoked once per year.
    rows = parts.map(lambda p: Row(YEAR=year, IN=p[0], OUT=p[1], INCOME=p[2]))
    raw = sqlContext.createDataFrame(rows)
    raw.registerTempTable('jeju%d' % year)
    # The CSV header was parsed as a data row; on that row the INCOME column
    # holds the literal string 'INCOME', so filter it out here.
    return sqlContext.sql("select * from jeju%d where INCOME != 'INCOME'" % year)

jeju2010 = _load_year(2010)
jeju2011 = _load_year(2011)
jeju2012 = _load_year(2012)

# Stack the three years into a single DataFrame (unionAll keeps duplicates,
# matching the original behavior).
jeju = jeju2010.unionAll(jeju2011).unionAll(jeju2012)

# Cast the string columns to int.  Chaining each cast off the current jeju2
# (rather than mixing jeju/jeju2 column references, as the original did on
# its last two lines) keeps the lineage consistent; the result is the same.
jeju2 = jeju
for _col in ('IN', 'OUT', 'INCOME'):
    jeju2 = jeju2.withColumn(_col, jeju2[_col].cast('int'))