@twein89
2016-06-22T14:19:04.000000Z
字数 2849
阅读 639
spark python
# Check that Spark is working.
#
# Builds a three-row DataFrame, keeps the rows whose age is greater than 3,
# and verifies that exactly one row ('Bill', 4) comes back.
# NOTE(review): `sqlContext` is not defined in this cell; presumably it is
# provided by the notebook runtime -- confirm.
from pyspark.sql import Row

data = [('Alice', 1), ('Bob', 2), ('Bill', 4)]
df = sqlContext.createDataFrame(data, ['name', 'age'])
fil = df.filter(df.age > 3).collect()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(fil)

# If the Spark job doesn't work properly this will raise an AssertionError
assert fil == [Row(u'Bill', 4)]
[Row(name=u'Bill', age=4)]
Let's load a text file.
# Check loading data with sqlContext.read.text.
#
# Reads the Shakespeare text file into a DataFrame and verifies its row count.
# NOTE(review): `sqlContext` is not defined in this cell; presumably it is
# provided by the notebook runtime -- confirm.
import os.path

baseDir = os.path.join('databricks-datasets', 'cs100')
inputPath = os.path.join('lab1', 'data-001', 'shakespeare.txt')
fileName = os.path.join(baseDir, inputPath)

dataDF = sqlContext.read.text(fileName)
shakespeareCount = dataDF.count()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(shakespeareCount)

# If the text file didn't load properly an AssertionError will be raised
assert shakespeareCount == 122395
122395
# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from databricks_test_helper import Test

twelve = 12
# Direct value comparison.
Test.assertEquals(twelve, 12, 'twelve should equal 12')
# Comparison against a precomputed hash string of the expected value.
Test.assertEqualsHashed(
    twelve,
    '7b52009b64fd0a2a49e6d8a939753077792b0554',
    'twelve, once hashed, should equal the hashed value of 12',
)
# TEST Compare lists (2b)
# This should print '1 test passed.'
# Relies on `Test`, which the "Compare with hash (2a)" cell imports from
# databricks_test_helper.
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
# Tuples sort by first element, then by second, so (5, 'a') precedes (5, 'b').
Test.assertEquals(
    sorted(unsortedList),
    [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
    'unsortedList does not sort properly',
)
# Check matplotlib plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from math import log


# function for generating plot layout
def preparePlot(xticks, yticks, figsize=(10.5, 6), hideLabels=False,
                gridColor='#999999', gridWidth=1.0):
    """Create a figure and axes with a grid, custom ticks and no spines.

    Args:
        xticks: Tick positions for the x axis.
        yticks: Tick positions for the y axis.
        figsize: Figure size passed to ``plt.subplots``.
        hideLabels: When true, tick labels are removed from both axes.
        gridColor: Color of the grid lines.
        gridWidth: Line width of the grid lines.

    Returns:
        The ``(fig, ax)`` pair returned by ``plt.subplots``.
    """
    # Close the current figure before creating a new one.
    plt.close()
    fig, ax = plt.subplots(figsize=figsize, facecolor='white', edgecolor='white')
    ax.axes.tick_params(labelcolor='#999999', labelsize='10')
    for axis, ticks in [(ax.get_xaxis(), xticks), (ax.get_yaxis(), yticks)]:
        axis.set_ticks_position('none')
        axis.set_ticks(ticks)
        axis.label.set_color('#999999')
        if hideLabels:
            axis.set_ticklabels([])
    plt.grid(color=gridColor, linewidth=gridWidth, linestyle='-')
    # Explicit loop instead of map(): map() is lazy on Python 3, so using it
    # purely for side effects would leave the spines visible.
    for position in ['bottom', 'top', 'left', 'right']:
        ax.spines[position].set_visible(False)
    return fig, ax


# generate layout and plot data
x = range(1, 50)
y = [log(x1 ** 2) for x1 in x]
fig, ax = preparePlot(range(5, 60, 10), range(0, 12, 1))
plt.scatter(x, y, s=14**2, c='#d6ebf2', edgecolors='#8cbfd0', alpha=0.75)
ax.set_xlabel(r'$range(1, 50)$')
ax.set_ylabel(r'$\log_e(x^2)$')
# NOTE(review): `display` is not defined in this cell; presumably it is a
# notebook-provided helper for rendering the figure -- confirm.
display(fig)
pass
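You should see a formula on the line below this one:
$$ \mathbf{w}_{i+1} = \mathbf{w}_i - \alpha_i \sum_j (\mathbf{w}_i^\top \mathbf{x}_j - y_j) \mathbf{x}_j $$
This formula is included inline with the text and is $(\mathbf{w}^\top \mathbf{x} - y) \mathbf{x}$.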
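This formula shows log loss for a single point. Log loss is defined as:
$$ loss(p, y) = \begin{cases} -\log(p) & \text{if } y = 1 \\ -\log(1-p) & \text{if } y = 0 \end{cases} $$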