from pyspark import SparkContext

sc = SparkContext("local", "My App")
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])


def get_row(rdd, row):
    """Return an RDD containing only the `row`-th element (1-based) of `rdd`.

    The original version filtered with a mutable `global counter`, which is
    unreliable in Spark: the filter closure is serialized and shipped to each
    executor, so every task mutates its *own* copy of the counter, and because
    `filter` is evaluated lazily, all pending filters observe whatever value
    the global `row` holds at action time (here, both would see 4).
    `zipWithIndex()` assigns a stable per-element index on the cluster itself,
    which is the correct way to select a row by position.
    """
    # zipWithIndex() is 0-based; the original counter was 1-based, so shift.
    return (rdd.zipWithIndex()
               .filter(lambda pair: pair[1] == row - 1)
               .map(lambda pair: pair[0]))


# cache() keeps each single-row RDD in memory for reuse.
x1 = get_row(rows, 3).cache()
x2 = get_row(rows, 4).cache()

# union() produces an RDD containing all elements of both RDDs.
xx = x1.union(x2)
# print() call form works under both Python 2 and 3 (bare statement is 2-only).
print(xx.collect())