@semprelibera
2017-06-05T09:20:48.000000Z
字数 30121
阅读 785
Thesis
在PEER钢筋混凝土柱试验数据库中抓取试验信息
文件名为:scra.py
import reimport requestsimport sslfrom pandas import DataFrame# 由于目标网页用了https协议,需要证书验证防止第三方入侵,这里直接关闭验证,否则会读取失败ssl._create_default_https_context = ssl._create_unverified_contextfrom requests.packages.urllib3.exceptions import InsecureRequestWarningrequests.packages.urllib3.disable_warnings(InsecureRequestWarning)############################# Step1, 定义函数getlinkfile## 获取链接页的全部超级链接 ############################def getlinkfile(url):# 这是用来保存结果的listres = []# 读取网页源代码,存储在r中r = requests.get(url,verify = False)# 检验读取是否成功try:r.raise_for_status()except Exception as exc:print('There was a problem: %s' % (exc))# 把网页的内容以纯文本的形式存储到变量data中,data的类型为stringdata = r.text# 定义正则表达式,用来匹配网页上的超级链接hrefRegEx = re.compile(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')")# 匹配所有的超级链接,结果存储在link_list中link_list = hrefRegEx.findall(data)#mylinkfile = open('linkfile.txt','w')# 筛选掉我们不需要的超级链接for url in link_list:if 'xml&id' not in url:res.append('https://nisee.berkeley.edu' + url)# 返回结果return res####################################### Step2,定义函数getthedata,用来匹配每 ## 一个矩形截面柱信息页上我们需要信息 ######################################def getthedata(url):# 读取网页源代码r = requests.get(url,verify = False)# 检验读取是否成功try:r.raise_for_status()except Exception as exc:print('There was a problem: %s' % (exc))# 获取网页源代码,以纯文本形式保存在data中data = r.text# 用正则表达式分别匹配我们需要的矩形截面柱子的构件信息NameReExpr = re.compile("(Name:).*?\">(.*?)<\/td>")TypeReExpr = re.compile("(Type:).*?\">(.*?)<\/td>")f_cReExpr = re.compile("(Concrete Strength:).*?\">(.*?)\(")Tr_steelgradReExpr = re.compile("Transverse Steel:.*?\">(Grade):(.*?)<br>")Tr_yStreReExpr = re.compile("Transverse Steel:.*?>(Yield Stress:)(.*?)\(MPa\)")Tr_uStreReExpr = re.compile("Transverse Steel:.*?(Strength:)(.*?)\(MPa\)")Lg_steelgradReExpr = re.compile("Longitudinal Steel.*?\">(Grade):(.*?)<br>")Lg_yStreReExpr = re.compile("Longitudinal Steel.*?(Yield Stress:)(.*?)\(MPa\)")Lg_uStreReExpr = re.compile("Longitudinal Steel.*?(Strength:)(.*?)\(MPa\)")Geo_WidthReExpr = re.compile("(Width:)(.*?)\(mm\)")Geo_DepthReExpr = re.compile("(Depth:)(.*?)\(mm\)")Geo_LmeasureExpre = re.compile("(L-Measured:)(.*?)\(mm\)")Axi_loadReExpre = re.compile("(Axial Load:).*>(.*?)\(kN\)")Lg_DiaReExpre = re.compile("(Diameter:)<.*?\">(.*?)\(mm\)")Lg_NumbExpre = re.compile("(Number of Bars:)<.*?\">(.*?)<\/td>")Lg_ClearoverExpre = re.compile("(Clear cover:)(.*?)\(mm\)")Tr_ShearLagsExpre = re.compile("(Number of Shear Legs:).*?\">(\d)")Tr_DiaReExpre = re.compile("(Bar Diameter:)(.*?)\(mm\)")Tr_BarSpaceExpre = re.compile("(Hoop Spacing:)(.*?)\(mm\)")# 进行匹配,相应的结果以tuple的形式返回name = NameReExpr.search(data)typeof = TypeReExpr.search(data)f_c = f_cReExpr.search(data)Tr_steelgrad = Tr_steelgradReExpr.search(data)Tr_yStre = Tr_yStreReExpr.search(data)Tr_uStre = Tr_uStreReExpr.search(data)Lg_steelgrad = Lg_steelgradReExpr.search(data)Lg_yStre = Lg_yStreReExpr.search(data)Lg_uStre = Lg_uStreReExpr.search(data)Width = Geo_WidthReExpr.search(data)Depth = Geo_DepthReExpr.search(data)Length = Geo_LmeasureExpre.search(data)Axi_load = Axi_loadReExpre.search(data)Lg_Dia = Lg_DiaReExpre.search(data)Lg_Numb = Lg_NumbExpre.search(data)Lg_Clear =Lg_ClearoverExpre.search(data)Tr_ShearLags = Tr_ShearLagsExpre.search(data)Tr_Dia = Tr_DiaReExpre.search(data)Tr_BarSpace = Tr_BarSpaceExpre.search(data)# 返回结果return (name,typeof,f_c,Tr_steelgrad,Tr_yStre,Tr_uStre,Lg_steelgrad,Lg_yStre,Lg_uStre,Width,Depth,Length,Axi_load,Lg_Dia,Lg_Numb,Lg_Clear,Tr_ShearLags,Tr_Dia,Tr_BarSpace)######################################Step3、运用之前两个函数进行数据的抓取工作####################################### 定义字典,mydata,{feature1:[item1,item2,...],feature2:[item1,item2,...],...},# 用来存储二维数据表,它可以直接转化为DataFramemydata = {}# 用来计数的变量,由于前254条数据是矩形截面的,这里的代码写的不是很规范,矩形和圆形# 截面的两种情况,其实可以用条件控制语句实现。count = 1# 调用获取超级链接的函数,获取超链接页面中我们需要的全部超级链接,结果保存为listsourceurl = 'https://nisee.berkeley.edu/spd/servlet/search'myurllist = getlinkfile(sourceurl)# 这个list存放读取过的矩形截面链接myrectlist = []# 这些是mydata字典里面的keysmykeys = ('Test_Name','Type_of_Column','Concrete_Strength','Tr_steelgrad','Tr_ySre','Tr_uStre','Lg_steelgrad','Lg_yStre','Lg_uStre','Width','Depth','Length','Axi_load','Lg_Dia','Lg_Num','Lg_Clearover','Tr_ShearLags','Tr_Dia','Tr_BarSpace')# 初始化mydata字典for mykey in mykeys:mydata[mykey] = []# 用for循环读取所有的矩形截面构件信息for url in myurllist:print('-----------------------------------------------')print('Try getting ' + str(count) + ' of all the data')# 调用getthedata函数获取信息tableattrib = getthedata(url)print('Successfully logging in the site!')# 这里的if语句作用不大if tableattrib[1].group(2) == 'Rectangular':# 记录矩形截面柱的链接myrectlist.append(url)# 读取所有信息for (mykey,el) in zip(mykeys,tableattrib):mydata[mykey].append(el.group(2))count += 1# 读到254个的时候就进入圆形截面柱了,跳出循环(用条件控制可以解决此问题)if count == 254:break# 把结果存储为DataFrame格式mydataFrame = DataFrame(mydata)# 写入csv文件mydataFrame.to_csv("peerdatabase_rect.csv")# 用集合差运算获得圆形截面柱的超级链接spirurllist = list(set(myurllist)-set(myrectlist))# 把超级链接分为两部分存储,1、矩形截面柱。2、圆形截面柱rectfile = open("linkfile_rect.txt",'w')spirfile = open("linkfile_spir.txt",'w')# 存储矩形截面柱链接for url1 in myrectlist:rectfile.write('https://nisee.berkeley.edu' + url1 + '\n')# 存储圆形截面柱链接for url2 in spirurllist:spirfile.write('https://nisee.berkeley.edu' + url2 + '\n')rectfile.close()spirfile.close()
文件名为:scra_spir.py
import reimport sslimport requests#import pandas as pdfrom pandas import DataFramessl._create_default_https_context = ssl._create_unverified_contextfrom requests.packages.urllib3.exceptions import InsecureRequestWarningrequests.packages.urllib3.disable_warnings(InsecureRequestWarning)####################Step1, 定义相关函数##################### 定义函数getthedataofspir,用来抓取圆形截面柱的信息def getthedataofspir(url):# 流程与抓取矩形截面柱的过程类似r = requests.get(url,verify = False)# 检验是否读取成功try:r.raise_for_status()except Exception as exc:print('There was a problem: %s' % (exc))data = r.text# 用正则表达式匹配信息NameReExpr = re.compile("(Name:).*?\">(.*?)<\/td>")TypeReExpr = re.compile("(Type:).*?\">(.*?)<\/td>")f_cReExpr = re.compile("(Concrete Strength:).*?\">(.*?)\(")Tr_yStreReExpr = re.compile("Transverse Steel:.*?>(Yield Stress:)(.*?)\(MPa\)")Lg_yStreReExpr = re.compile("Longitudinal Steel.*?.*?\">(Yield Stress:)(.*?)\(")Lg_uStreReExpr = re.compile("Longitudinal Steel.*?(Strength:)(.*?)\(MPa\)")Geo_DiameterExpr = re.compile("(Diameter:).*?\">(.*?)\(mm\)")Geo_LmeasureExpre = re.compile("(L-Measured:)(.*?)\(mm\)")Axi_loadReExpre = re.compile("(Axial Load:).*>(.*?)\(kN\)")Lg_DiaReExpre = re.compile("(Diameter:)<.*?\">(.*?)\(mm\)")Lg_NumbExpre = re.compile("(Number of Bars:)<.*?\">(.*?)<\/td>")Lg_ClearoverExpre = re.compile("(Cover to Center of Hoop Bar).*?\">(.*?)\(mm\)")Tr_DiaReExpre = re.compile("(Diameter Spiral).*?\">(.*?)\(mm\)")Tr_BarSpaceExpre = re.compile("(Hoop Spacing, Sv).*?\">(.*?)\(mm\)")# 搜索需要的信息name = NameReExpr.search(data)typeof = TypeReExpr.search(data)f_c = f_cReExpr.search(data)Tr_yStre = Tr_yStreReExpr.search(data)Lg_yStre = Lg_yStreReExpr.search(data)Lg_uStre = Lg_uStreReExpr.search(data)Colum_Dia = Geo_DiameterExpr.search(data)Length = Geo_LmeasureExpre.search(data)Axi_load = Axi_loadReExpre.search(data)Lg_Dia = Lg_DiaReExpre.search(data)Lg_Numb = Lg_NumbExpre.search(data)Lg_Clear =Lg_ClearoverExpre.search(data)Tr_Dia = Tr_DiaReExpre.search(data)Tr_BarSpace = Tr_BarSpaceExpre.search(data)# 返回信息return (name,typeof,f_c,Tr_yStre,Lg_yStre,Lg_uStre,Colum_Dia,Length,Axi_load,Lg_Dia,Lg_Numb,Lg_Clear,Tr_Dia, Tr_BarSpace)##################Step2, 数据抓取 ################### 定义字典mydata,{feature1:[item1,item2,...],feature2:[item1,item2,...],...},# 用来存放二维数据表,可以直接转化为DataFramemydata = {}count = 1# 定义mydata里的keysmykeys = ('Test_Name','Type_of_Column','Concrete_Strength','Tr_ySre','Lg_yStre','Lg_uStre','Colum-Dia','Length','Axi_load','Lg_Dia','Lg_Num','Lg_Clearover','Tr_Dia','Tr_BarSpace')# 初始化mydata字典for mykey in mykeys:mydata[mykey] = []# 读取存放圆形截面柱超级链接的文件mysprifile = open("linkfile_spir.txt",'r')while True:try:print('-----------------------------------------------')print('Try getting ' + str(count) + ' of all the data')url = mysprifile.readline().strip()tableattrib = getthedataofspir(url)print('Successfully logging in the site!')for (mykey,el) in zip(mykeys,tableattrib):mydata[mykey].append(el.group(2))count += 1except:breakmysprifile.close()# 转化为DataFramemydataFrame = DataFrame(mydata)# 把数据存储到本地硬盘mydataFrame.to_csv("peerdatabase_spir_new.csv")
- Pan/Li的数据用,在线OCR软件读取即可
- 对于PEER数据的文件,手工在csv文件中加入一列,作为承载力
- 最后把三个文件改名为"01Result_Rect.csv","02Result_Spir.csv",和"03dataofLiBing.csv"
主要目的是把三个文件合并为一个数据表,anaofData
文件名为:datadigging.py
#!/usr/bin/env python3# -*- coding: utf-8 -*-"""Created on Fri Apr 14 16:03:11 2017@author: Bruce"""import pandas as pdimport Concrete_Strength_anaimport numpy as npimport pylab# 定义函数cleanofNumber,把带逗号','的字符串数字,转化为浮点数def cleanofNumber(Acolumn):temp = []for el in Acolumn:temp.append(np.float(el.replace(',','')))return temp# Reading data from raw csv files# 在原始csv文件中读取数据# Original data consists of three DataFrames with different data structure# 原始数据由三个不同数据结构的DataFrame构成data_rect = pd.read_csv("01Result_Rect.csv",index_col='ind')data_spir = pd.read_csv("02Result_Spir.csv",index_col= 'ind')data_libing = pd.read_csv("03dataofLiBing.csv",index_col= 'ind')############################## Step1,处理Pan/Li文献中的数据############################### 丢掉不需要的列ana_libing = data_libing.drop(['Reference','Configuration','ms'],axis=1)# 把以百分号为单位的配筋率转化为无量纲的配筋率ana_libing['Ratio_Lg'] = ana_libing['Ratio_Lg'] / 100ana_libing['Ratio_Tr'] = ana_libing['Ratio_Tr'] / 100######################## Step2,处理矩形截面数据######################### 计算配筋率data_rect['Ratio_Lg'] = (0.25 * np.pi * data_rect['Lg_Dia']**2) * data_rect['Lg_Num'] / (data_rect['Depth'] * data_rect['Width'])data_rect['Ratio_Tr'] = (0.25 * np.pi * data_rect['Tr_Dia']**2) * data_rect['Tr_ShearLags'] / (data_rect['Tr_BarSpace'] * data_rect['Width'])# 丢掉不需要的列ana_rect = data_rect.drop(['Lg_Clearover','Lg_Dia','Lg_Num','Lg_steelgrad','Lg_uStre','Tr_BarSpace','Tr_Dia','Tr_ShearLags','Tr_steelgrad','Tr_uStre','Type_of_Column'],axis = 1)########################## Step3,把数据中带有字符串## 的列全部转化为纯数字 #########################ana_rect['Axi_load'] = cleanofNumber(ana_rect['Axi_load'])ana_rect['Length'] = cleanofNumber(ana_rect['Length'])ana_rect['Tr_ySre']=cleanofNumber(ana_rect['Tr_ySre'])data_spir['Axi_load'] = cleanofNumber(data_spir['Axi_load'])data_spir['Length'] = cleanofNumber(data_spir['Length'])data_spir['Colum-Dia'] = cleanofNumber(data_spir['Colum-Dia'])data_spir['Tr_ySre']=cleanofNumber(data_spir['Tr_ySre'])########################## Step3,处理圆形截面的数据########################## 计算配筋率data_spir['Ratio_Lg'] = (data_spir['Lg_Dia']**2) * data_spir['Lg_Num'] / (data_spir['Colum-Dia']**2)data_spir['Ratio_Tr'] = (data_spir['Tr_Dia']**2) * np.pi * 0.5/ ((data_spir['Colum-Dia'] - data_spir['Lg_Clearover'] * 2) * data_spir['Tr_BarSpace'])# 计算轴压比data_spir['Ratio_AxisLoad'] = data_spir['Axi_load'] * 10**3 / (data_spir['Concrete_Strength'] * data_spir['Colum-Dia']**2 * np.pi * 0.25 )data_spir['Ratio_Aspect'] = data_spir['Length'] / data_spir['Colum-Dia']# 计算界面剂data_spir['Area'] = data_spir['Colum-Dia']**2 * np.pi *0.25# 丢掉不需要的列ana_spir = data_spir.drop(['Lg_Clearover','Lg_Dia','Lg_Num','Lg_uStre','Tr_BarSpace','Tr_Dia','Type_of_Column'],axis = 1)######################## Step4,三个数据表的合并######################### 先合并矩形截面数据和Pan/Li的数据,因为这些都是矩形截面数据anaOfData = pd.concat([ana_rect,ana_libing]) # just include rectangular section typle# 计算轴压比anaOfData['Ratio_AxisLoad'] = anaOfData['Axi_load'] * 10**3 / (anaOfData['Concrete_Strength'] * anaOfData['Depth'] * anaOfData['Width'])# 计算高深比anaOfData['Ratio_Aspect'] = anaOfData['Length'] / anaOfData['Depth']# 计算截面积anaOfData['Area'] = anaOfData['Depth'] * anaOfData['Width']# 把圆形截面数据也合并进来anaOfData = pd.concat([anaOfData,ana_spir])# 筛选出高深比小于4的短柱anaOfData = anaOfData[anaOfData['Ratio_Aspect'] <= 4 ] # give me all the short columns###################################### Step5,计算几列数据处理时候需要用的性质###################################### 平均等效剪应力的计算anaOfData['v_max'] = anaOfData['V_max'] * 10**3 / anaOfData['Area']# 归一化剪切承载力anaOfData['v_max/f_c'] = anaOfData['v_max'] / anaOfData['Concrete_Strength']# 丢掉无效数据anaOfData = anaOfData[anaOfData['v_max/f_c'] != 0 ]# 考虑钢筋屈服强度,对钢筋“配筋率”进行重新计算anaOfData['Ratio_Tr'] = anaOfData['Ratio_Tr'] * anaOfData['Tr_ySre']/anaOfData['Concrete_Strength']anaOfData['Ratio_Lg'] = anaOfData['Ratio_Lg'] * anaOfData['Lg_yStre']/anaOfData['Concrete_Strength']# 丢掉不需要的列anaOfData = anaOfData.drop(['Colum-Dia','Depth','Width'],axis=1)# 丢掉有na数据的列anaOfData = anaOfData.dropna()# 筛选出轴向受压力的数据,得到最终的数据表anaofDataanaOfData = anaOfData[anaOfData['Ratio_AxisLoad']>=0]################################## Step6,debug代码。总览一下数据分布##################################if __name__ == "__main__":# 不加分组,直接绘制散点图pylab.figure()pylab.plot(anaOfData['Concrete_Strength'],anaOfData['v_max'],'m.',label='experiment data')pylab.title('Effect of Concrete Strength')pylab.xlabel("$f_c^{\'}$/Mpa")pylab.ylabel('$v_{max}$/Mpa')pylab.legend()pylab.show()z1,p1,x1 = Concrete_Strength_ana.linearfit(1/anaOfData['Ratio_Aspect'],anaOfData['v_max/f_c'],0.25,1)r2 = Concrete_Strength_ana.getR2(1/anaOfData['Ratio_Aspect'],anaOfData['v_max/f_c'],p1)r2=round(r2,2)pylab.figure()pylab.plot(1/anaOfData['Ratio_Aspect'],anaOfData['v_max/f_c'],'m.',label='experiment data')pylab.plot(x1,p1(x1),'k-',label='linear fit with $R^2$= ' +str(r2))pylab.title('Effect of Aspect Ratio')pylab.xlabel(r"$1/\phi_{asp}$")pylab.ylabel('$v$')pylab.legend()pylab.show()anaOfaxis = anaOfData[anaOfData['Ratio_AxisLoad']>=0]pylab.figure()pylab.plot(anaOfaxis['Ratio_AxisLoad'],anaOfaxis['v_max/f_c'],'b.',label='experiment data')pylab.title('Effect of ratio of axis load')pylab.xlabel('$\psi_{axis}$')pylab.ylabel('$v$')pylab.legend()pylab.show()z2,p2,x2 = Concrete_Strength_ana.linearfit(anaOfData['Ratio_Lg'],anaOfData['v_max/f_c'],0,0.8)r2 = Concrete_Strength_ana.getR2(anaOfData['Ratio_Lg'],anaOfData['v_max/f_c'],p2)r2 = round(r2,2)pylab.figure()pylab.plot(anaOfData['Ratio_Lg'],anaOfData['v_max/f_c'],'b.',label = 'experiment data')pylab.plot(x2,p2(x2),'k-',label='linear fit with $R^2$= ' + str(r2))pylab.title(r'Effect of $\rho_l$')pylab.xlabel(r'$\rho_l$')pylab.ylabel(r'$v$')pylab.legend()pylab.show()baddata = anaOfData[(anaOfData['Ratio_Lg']>0.7) & (anaOfData['v_max/f_c']<0.075)]z3,p3,x3 = Concrete_Strength_ana.linearfit(anaOfData['Ratio_Tr'],anaOfData['v_max/f_c'],0,0.20)r2 = Concrete_Strength_ana.getR2(anaOfData['Ratio_Lg'],anaOfData['v_max/f_c'],p3)r2 = round(r2,2)pylab.figure()pylab.plot(anaOfData['Ratio_Tr'],anaOfData['v_max/f_c'],'b.',label = 'experiment data')# pylab.plot(x3,p3(x3),'k-',label='linear fit with $R^2$= ' + str(r2))pylab.title(r'Effect of $\rho_v$')pylab.xlabel(r'$\rho_v$')pylab.ylabel(r'$v$')pylab.legend()pylab.show()baddata = anaOfData[anaOfData['v_max/f_c']>0.175]
文件名为:Concrete_Strength_ana.py
#!/usr/bin/env python3# -*- coding: utf-8 -*-"""Created on Sat Apr 22 20:07:44 2017Mission: Find the influence of concrete strength@author: Bruce"""import datadiggingimport pylabimport pandas as pdimport numpy as np#########################Step1, 定义需要用到的函数########################## 根据数据表和列名,绘制数据分布柱状图def drawHist(data,column):"""To draw a histogram of a column in data.data is a DataFrame, and column is an attribute in it."""pylab.figure()pylab.hist(data[column])pylab.title('This is the distribution of ' + column)pylab.xlabel(column)pylab.ylabel("number")pylab.show()# 根据数据表和两个列名绘制二维分布图def drawHist2D(data,column1,column2):"""To draw a 2D histogram of two columns in data.data is a DataFrame, and column1, column2 are two attributes in it."""pylab.figure()pylab.plot(data[column1],data[column2],'b.')pylab.title('This is the distribution of ' + column1 + ' and ' + column2)pylab.xlabel(column1)pylab.ylabel(column2)pylab.legend()pylab.show()# 计算模型的回归判定系数def getR2(x,y,ypre):"""x is an array-like listy is an array-like listypre is the prediction model--------ReturnR^2 of the prediciton model--------"""x=np.array(x)y=np.array(y)preValue = [ypre(el) for el in x]preValue = np.array(preValue)res = 1 - np.sum((y-preValue)**2) / np.sum((y-np.average(y))**2)return res# 对数据做线性回归def linearfit(x,y,minimum,maximum):"""Return polynomial, a fucntion, and xrange"""z1 = np.polyfit(x,y,1)p1 = np.poly1d(z1)xp1 = np.linspace(minimum,maximum,100)return z1,p1,xp1if __name__ == "__main__":############# 绘图的过程############# 此过程可以预想定义函数之后在进行绘图,# 可以减少代码的重复量# 先根据高深比分类anaOfAxisLoad_3 = datadigging.anaOfData[datadigging.anaOfData['Ratio_Aspect'] == 3]anaOfAxisLoad_2 = datadigging.anaOfData[datadigging.anaOfData['Ratio_Aspect'] == 2]anaOfAxisLoad_4 = datadigging.anaOfData[datadigging.anaOfData['Ratio_Aspect'] == 4]anaOfAxisLoad_1 = datadigging.anaOfData[datadigging.anaOfData['Ratio_Aspect']<1.2]anaOfAxisLoad_1_5 = datadigging.anaOfData[(datadigging.anaOfData['Ratio_Aspect']<1.7) &(datadigging.anaOfData['Ratio_Aspect']>1.2)]anaOfAxisLoad_2_5 = datadigging.anaOfData[datadigging.anaOfData['Ratio_Aspect']==2.5]# 绘制各个分组的关系图print('Concrete Strength(aspectratio=3 and axisload = 0.08-0.13)')#anaOfAxis_more3 = anaOfAxisLoad_3[# (anaOfAxisLoad_3['Ratio_AxisLoad']>0.08) * (anaOfAxisLoad_3['Ratio_AxisLoad']<0.13)]pylab.figure()pylab.plot(anaOfAxisLoad_3['Concrete_Strength'],anaOfAxisLoad_3['v_max'],'k^')pylab.title('Concrete Strength with aspectratio 3')pylab.xlabel('Concrete Strength/MPa')pylab.ylabel('$v_{max}/Mpa$')pylab.show()print('-----------------------------------------------------------------------')print('Concrete Strength(aspectratio=2 and axisload = 0.08-0.13)')#anaOfAxis_more2 = anaOfAxisLoad_2[# (anaOfAxisLoad_2['Ratio_AxisLoad']>0.08) * (anaOfAxisLoad_3['Ratio_AxisLoad']<0.13)]pylab.figure()pylab.plot(anaOfAxisLoad_2['Concrete_Strength'],anaOfAxisLoad_2['v_max'],'g^')pylab.title('Concrete Strength with aspectratio 2')pylab.xlabel('Concrete Strength/MPa')pylab.ylabel('$v_{max}/Mpa$')pylab.show()print('-----------------------------------------------------------------------')pylab.figure()pylab.plot(anaOfAxisLoad_1['Concrete_Strength'],anaOfAxisLoad_1['v_max'],'g^')pylab.title('Concrete Strength with aspectratio 1')pylab.xlabel('Concrete Strength/MPa')pylab.ylabel('$v_{max}/Mpa$')pylab.show()print('-----------------------------------------------------------------------')anaOfAxis_more2_1 = anaOfAxisLoad_2[(anaOfAxisLoad_2['Ratio_AxisLoad']>0) & (anaOfAxisLoad_2['Ratio_AxisLoad']<0.2)]z1,p1,xp1 = linearfit(anaOfAxis_more2_1['Concrete_Strength'],anaOfAxis_more2_1['v_max'],0,90)pylab.figure()pylab.plot(anaOfAxis_more2_1['Concrete_Strength'],anaOfAxis_more2_1['v_max'],'g^',xp1,p1(xp1),'k-')pylab.title('Concrete Strength with aspectratio 2 and axisload = 0-0.2')pylab.xlabel('Concrete Strength/MPa')pylab.ylabel('$v_{max}/Mpa$')pylab.show()print('The slope k= ' + str(z1[0]))drawHist2D(anaOfAxis_more2_1,'Ratio_Tr','Ratio_Lg')print('-----------------------------------------------------------------------')anaOfAxis_more2_2 = anaOfAxisLoad_2[(anaOfAxisLoad_2['Ratio_AxisLoad']>0.2) & (anaOfAxisLoad_2['Ratio_AxisLoad']<0.5)]z2 = np.polyfit(anaOfAxis_more2_2['Concrete_Strength'],anaOfAxis_more2_2['v_max'],1)p2 = np.poly1d(z2)xp2 = np.linspace(0,125,125)pylab.figure()pylab.plot(anaOfAxis_more2_2['Concrete_Strength'],anaOfAxis_more2_2['v_max'],'m.',label = 'experiment data')r2 = getR2(anaOfAxis_more2_2['Concrete_Strength'],anaOfAxis_more2_2['v_max'],p2)r2 = round(r2,2)pylab.plot(xp2,p2(xp2),'k-',label = 'linear fit with $R^2$ = ' + str(r2))pylab.title('Effect of $f_c^{\'}$ with $\phi_{asp}$ = 2 and $\psi_{axis}$ = 0.2-0.5')pylab.xlabel('$f_c^{\'}$/MPa')pylab.ylabel('$v_{max}$/MPa')pylab.legend()pylab.show()print('The slope k= ' + str(z2[0]))print('-----------------------------------------------------------------------')pylab.figure()pylab.plot(anaOfAxis_more2_2['Concrete_Strength'],anaOfAxis_more2_2['v_max/f_c'],'m.',label = 'experiment data')pylab.title('Effect of $f_c^{\'}$ with $\phi_{asp}$ = 2 and $\psi_{axis}$ = 0.2-0.5')pylab.xlabel('$f_c^{\'}$/MPa')pylab.ylabel('$v_{max}$/MPa')pylab.ylim(0,0.200)pylab.legend()pylab.show()print('The slope k= ' + str(z2[0]))print('-----------------------------------------------------------------------')
文件名为:Ratio_Aspect_ana.py
#!/usr/bin/env python3# -*- coding: utf-8 -*-"""Created on Sun Apr 23 22:53:12 2017@author: Bruce"""import datadiggingimport Concrete_Strength_anaimport pylabimport pandas as pdimport numpy as np# 从datadigging文件导入数据anaofdata = datadigging.anaOfData.copy()# 定义字典,用于绘图的坐标显示dicOfKeys = {'Ratio_Aspect': r'$\phi_{asp}$','Ratio_AxisLoad':r'$\psi_{axis}$','Ratio_Lg':r'$\rho_l$','Ratio_Tr':r'$\rho_v$','Concrete_Strength':'$f_c^{\'}$','v_max/f_c':'$v$'}# 定义函数,绘制实验数据的分布图def drawHist(data,column,binst=None):"""To draw a histogram of a column in data.data is a DataFrame, and column is an attribute in it."""pylab.figure()if binst == None:pylab.hist(data[column],color='k')else:pylab.hist(data[column],bins=binst,color='k')pylab.title('The distribution of ' + dicOfKeys[column])pylab.xlabel(dicOfKeys[column])pylab.ylabel("# of sample")pylab.show()# 定义函数,绘制二维的数据分布图,或者二维散点图def drawHist2D(data,column1,column2,isHist = True):"""To draw a 2D histogram of two columns in data.data is a DataFrame, and column1, column2 are two attributes in it.isHist controls whether to draw a histogram or a scatter diagram"""data = data[data[column1]!=np.nan]data = data[data[column2]!=np.nan]pylab.figure()if isHist:pylab.hist2d(data[column1],data[column2],bins = 5)pylab.colorbar()else:pylab.plot(data[column1],data[column2],'ko')pylab.title('The distribution of ' + column1 + ' and ' + column2)pylab.xlabel(column1)pylab.ylabel(column2)pylab.show()# 定义函数,用于控制变量,同时绘制图像def drawRelationship(data,column1,column2,control1,control2,val1_1,val1_2,val2_1,val2_2,style,mytitle,xlabel,ylabel,legend1,xmin,xmax,myvarified,varified = False):"""data is a DataFrameTo filter the data with the condtion val1_1 < control1 < val1_2 ANDval2_1 < control2 < val2_2The x axis is column1 and y axis is column2style is used to set the style of the diagram----------ReturnThe a DataFrame after filtering, dataofDraw.----------"""dataOfDraw = data[(data[control1]>val1_1) & (data[control1]<val1_2) &(data[control2]>val2_1) & (data[control2]<val2_2)]if column1 == 'Ratio_Aspect':z1,p1,x1 = Concrete_Strength_ana.linearfit(1/dataOfDraw[column1],dataOfDraw[column2],xmin,xmax)r2 = Concrete_Strength_ana.getR2(1/dataOfDraw[column1],dataOfDraw[column2],p1)r2=round(r2,2)pylab.figure()pylab.plot(1/dataOfDraw[column1],dataOfDraw[column2],style,label = legend1)pylab.plot(x1,p1(x1),'k-',label="linear fit with $R^2$= " + str(r2))else:z1,p1,x1 = Concrete_Strength_ana.linearfit(dataOfDraw[column1],dataOfDraw[column2],xmin,xmax)r2 = Concrete_Strength_ana.getR2(dataOfDraw[column1],dataOfDraw[column2],p1)r2 = round(r2,2)pylab.figure()pylab.plot(dataOfDraw[column1],dataOfDraw[column2],style,label = legend1)pylab.plot(x1,p1(x1),'k-',label="linear fit with $R^2$= " + str(r2))if varified:pylab.plot(1/myvarified[column1],myvarified[column2],'rx',label='varified data')pylab.title(mytitle)pylab.xlabel(xlabel)pylab.ylabel(ylabel)pylab.xlim(xmin-0.005,xmax+0.005)pylab.ylim(0,0.200)pylab.legend()pylab.show()print("k = " + str(z1[0]))print("b = " + str(z1[1]))return dataOfDrawif __name__ == "__main__":################### 控制变量,进行绘图###################anaOfAxisLoad_2 = datadigging.anaOfData[datadigging.anaOfData['Ratio_Aspect'] == 2]drawHist2D(anaofdata,'Ratio_AxisLoad','Ratio_Aspect',False)drawHist2D(anaofdata,'Ratio_Lg','Ratio_Tr')drawHist(anaofdata,'Ratio_Lg')anaofAspect1 = anaofdata[(anaofdata['Ratio_AxisLoad']<0.9) & (anaofdata['Ratio_AxisLoad']>0.5)]myres1 = drawRelationship(anaofdata,'Ratio_Aspect','v_max/f_c','Ratio_AxisLoad','Ratio_Lg',0.1,0.2,0.30,0.40,'b.',r'Effect of $\phi_{asp}$ with $\psi_{axis}$=0.1-0.2 and $\rho_l$=0.3-0.4',r'1/$\phi_{asp}$','$v$','experiment data')myres2 = drawRelationship(anaofdata,'Ratio_Aspect','v_max/f_c','Ratio_AxisLoad','Ratio_Lg',0.3,1.0,0.30,0.40,'b.',r'Effect of $\phi_{asp}$ with $\psi_{axis}$=0.3-1.0 and $\rho_l$=0.30-0.40',r'1/$\phi_{asp}$','$v$','experiment data')myres3 = drawRelationship(anaofAspect1,'Ratio_Aspect','v_max/f_c','Ratio_Tr','Ratio_Lg',0.00,1.0,0.0,1.5,'b.',r'Effect of $\phi_{asp}$ with $\psi_{axis}$=0.5-0.8 and $\rho_v$=0.10-0.20',r'1/$\phi_{asp}$','$v$','experiment data',xmin=0.22,xmax=1.00)drawHist(myres3,'Ratio_AxisLoad')myvarified = drawRelationship(myres3,'Ratio_Aspect','v_max/f_c','Ratio_Tr','Ratio_AxisLoad',0.06,0.08,0.5,1.0,'b.',r'Effect of $\phi_{asp}$ with $\rho_v$=0.06-0.08 and $\rho_l$=0.3-0.4 and $\psi_{axis}=0-0.2$',r'1/$\phi_{asp}$','$v$','experiment data',xmin=0.25,xmax=1.00)myres4 = drawRelationship(myres3,'Ratio_Aspect','v_max/f_c','Ratio_Tr','Ratio_AxisLoad',0.06,0.08,0.0,0.3,'b.',r'Effect of $\phi_{asp}$ with $\rho_v$=0.06-0.08 and $\rho_l$=0.3-0.4 and $\psi_{axis}=0-0.2$',r'1/$\phi_{asp}$','$v$','experiment data',xmin=0.25,xmax=1.00,varified=True)
运用先前在Ratio_Aspect_ana.py文件中定义的几个函数drawHist,drawHist2D,drawRelationship,就可以进行数据分组、绘图工作。只需要简单改变输入参数即可。
文件名为:machine_learning.py
#!/usr/bin/env python3# -*- coding: utf-8 -*-"""Created on Tue Apr 25 10:22:01 2017@author: Bruce""""""1.Some basic idealsa, Hold-out Methon. Divide my data set(D) into two seperate subsets, e.g. 200(S)+100(T)Conventionly, we use 2/3 - 4/5 to train the LEARNING MACHINEb, how to apply stratified sampling with respect to my data set? It is continousdistributed.maybe simply the random.sample(population, k) method to do it(without replacement)?maybe it is to arbitrary considering the random distribution of originial data set.c, We need to repeat the random sampling process for many times, and then takethe average value of the output as well as its deviation.d, consider signal-to-noise ratio(SNR)--feature extraction?"""#from sklearn import linear_modelfrom sklearn import neural_networkfrom sklearn.preprocessing import StandardScalerfrom sklearn.externals import joblibimport datadiggingimport numpy as npimport pandas as pdimport pylab# 从anaOfData中丢去不需要的列,得到机器学习算法需要的数据表data_machinedicOfKeys = {'Ratio_Aspect': r'$\phi_{asp}$','Ratio_AxisLoad':r'$\psi_{axis}$','Ratio_Lg':r'$\rho_l$','Ratio_Tr':r'$\rho_v$'}data_machine = datadigging.anaOfData.drop(['Area','Axi_load','Concrete_Strength','Length','Lg_yStre','Test_Name','Tr_ySre','V_max','v_max'],axis=1)data_machine = data_machine.dropna()# 定义函数,用来绘制一张图,试验数据和模型预测的对比。def plotOne(totalRes,Model):"""totalRes is the total result of the experimentsModel is the mathematic model to be evalutated"""myX,myy = getXy(totalRes,'v_max/f_c')for col in ["Ratio_Aspect","Ratio_AxisLoad","Ratio_Tr","Ratio_Lg"]:# plot the actual and predicted test grouppylab.figure()# myX = scaler.inverse_transform(myX)pylab.plot(myX[col],Model.predict(myX)/myy,'k.',label="experiment data")# pylab.plot(myX[col],Model.predict(myX),'b^',# label="model prediction")pylab.title("comparison between model prediction and experiment data ")pylab.xlabel(dicOfKeys[col])pylab.ylabel("v")pylab.legend()pylab.show()# plot the hist for difference between actual and predicted valueprint("This is the difference distribution ")pylab.figure()pylab.hist(100 * (Model.predict(myX)-myy) / myy,15,color='k')pylab.title("$(v_{predicted} - v_{experiment})/v_{expermiment}$ distribution")pylab.xlabel("differences/%")pylab.ylabel("number")pylab.show()# 另一个绘图函数,临时定义。def plotOneMy(totalRes,Model,col,lengendlist):pylab.figure()for mydata,leg in zip(totalRes,lengendlist):pylab.plot(mydata[col],Model.predict(mydata),label=leg)pylab.title("Effect of " + col)pylab.xlabel("$\phi_{asp}$")pylab.xlim((0.8,4.2))pylab.ylim(0,0.20)pylab.ylabel("v_mac/f_c")pylab.legend()pylab.show()# 数据采样函数def sampleOfData(rawdata,num):"""rawdata is the total results of the experiment, a DataFramenum is an integer, samping number.Break the rawdata into two parts of size, (num) and (total - num).--------Return a tuple--------"""temp = rawdata.reset_index()temp = temp.drop('ind',axis = 1)myTrain = temp.sample(num)myTest = temp.drop(myTrain.index)return (myTrain,myTest)# 把数据表分解为X和y,一个是feature一个是labeldef getXy(data,label):feature = data.drop(label, axis = 1)label = data[label]return (feature,label)# 一次拟合试验def OneTrial(rawdata,numOfSample,predModel):"""Run one trialrawdata is the total result, numOfSample is the sample numberand preModel is the machine learning model we use, say, MLPResgressor.------Return the R2 of the model on testdata, the resultant regression modeland the corresponding test dataset.------"""mytrain,mytest = sampleOfData(rawdata,numOfSample)mytrainX,mytrainy = getXy(mytrain,'v_max/f_c')mytestX,mytesty = getXy(mytest,'v_max/f_c')model = predModel(activation='logistic',solver='lbfgs',hidden_layer_sizes=(2,))# scaler = StandardScaler()# scaler.fit(mytrainX)# mytrainX = scaler.transform(mytrainX)model.fit(mytrainX,mytrainy)# mytestX = scaler.transform(mytestX)mypre = model.predict(mytestX)R2= model.score(mytestX,mytesty)return (R2,mytestX,mytesty,mypre,model)# 一次模拟过程def oneSimulation(rawdata,numOfSample,predModel,numOfTrial,res=[],totalRes=[]):"""Run on simulationrawdata is the total result, numOfSample is the sample numberand preModel is the machine learning model we use, say, MLPResgressor.numOfTrial is the trial number, res and totalRes are lists to save resultResult is a tuple generating in OneTrial------Return the average R^2------"""print("Running total " + str(numOfTrial) + " trials using " +str(predModel.__name__) + " algorithm")for i in range(numOfTrial):resOfTrial = OneTrial(rawdata,numOfSample,predModel)res.append(resOfTrial[0])totalRes.append(resOfTrial)return sum(res)/len(res)# Run the simulation and then draw the figure.def plotFigure():"""--------Return the best model in all trials--------"""# 定义函数,用来绘图def plotOne(totalRes,Model):myX,myy = getXy(totalRes,'v_max/f_c')for col in ["Ratio_Aspect","Ratio_AxisLoad","Ratio_Tr","Ratio_Lg"]:# plot the actual and predicted test grouppylab.figure()pylab.plot(myX[col],myy,'k.',label="experiment data")pylab.plot(myX[col],Model.predict(myX),'b^',label="model prediction")pylab.title("Effect of " + col)pylab.xlabel(col)pylab.ylabel("v_mac/f_c")pylab.legend()pylab.show()# plot the hist for difference between actual and predicted valueprint("This is the difference distribution ")pylab.figure()pylab.hist(100 * (Model.predict(myX)-myy) / myy,15)pylab.title("predicted value - actual value distribution")pylab.xlabel("differences/%")pylab.ylabel("number")pylab.show()# 此函数用来返回偏差最小的模型的indexdef smallestIndex(mylist):temp = mylist.copy()res = -100for el in mylist:if el > res:res = elreturn temp.index(res)# 开始进行回归模拟过程res2 = []totalRes2 = []aveR2_2 = oneSimulation(data_machine,280,neural_network.MLPRegressor,800,res2,totalRes2)# 得到最佳模型的indexplotindex2 = smallestIndex(res2)resModel = totalRes2[plotindex2][4]# 打印信息print("average $R^2$ of neural_network.MLPRegressor is " +str(aveR2_2))print("smallest $R^2$ of neural_network.MLPRegressor is "+str(totalRes2[plotindex2][0]))# 绘图plotOne(data_machine,resModel)print("------------------------------------------------------------" + '\n')# 返回结果return resModel## 多次拟合选最佳的模型,并保存到本地#myresmodel = plotFigure()# 读取本地保存的最佳模型myplotModel = joblib.load('model86_1l2u.model')# 绘图plotOne(data_machine,myplotModel)if __name__ == "__main__":# 这一段是用来debug的代码mymodel = joblib.load('model85.model')def anaofRatioAspect():axisload = np.average(data_machine['Ratio_AxisLoad'])ratiolg = np.average(data_machine['Ratio_Lg'])ratiotr = np.average(data_machine['Ratio_Tr'])datalist = []for axisl in [axisload-0.2,axisload,axisload+0.2,axisload+0.4,axisload+0.6]:data_generate = []asp = 1while asp <=10:data_generate.append([asp,axisl,ratiolg,ratiotr])asp += 0.01data_generate=pd.DataFrame(data_generate)data_generate.columns=['Ratio_Aspect','Ratio_AxisLoad','Ratio_Lg','Ratio_Tr']datalist.append(data_generate)return datalistmyplot1 = anaofRatioAspect()plotOneMy(myplot1,mymodel,'Ratio_Aspect',['$\psi_{axis}$ = 0','$\psi_{axis}$ = 0.2','$\psi_{axis}$ = 0.4','$\psi_{axis}$ = 0.6','$\psi_{axis}$ = 0.8'])