[关闭]
@dragonfive 2015-08-11T02:53:19.000000Z 字数 6964 阅读 839

用c#编写爬虫在marinetraffic下载船只图片

c#编程


最近在做船只识别方面的事情,需要大量的正样本来训练adaboost分类器。于是到marinetraffic这个网站上下载船只图片。写个爬虫来自动下载显然很方便。

网站特点

在介绍爬虫之前首先了解一下marinetraffic这个网站的一些特点:
1. 会定期检测爬虫行为,如果认为有爬虫大量下载图片,会把该连接加入黑名单,后几天都没办法下载。
2. 船只图片资源差异大,有的船只有1000多张图,有的船只没有一张图,我们需要的是很多船只的很多张图,所以需要对下载的船只按优先级排序。
3. 用来训练分类器的正样本要求检测对象的分辨率一样,而marinetraffic网站下载的图片可以设置下载的图片的宽度,网站根据长宽比,生成相应的高度。所以,不同图片高度不一样,需要自己后期处理。

解决方案

  1. 针对爬虫检测,设置一个随机等待时间,10s左右。可以绕过网站爬虫行为检测。
  2. 对船只按照图片数量排序,先下载图片数量多的,并且每个船只不用下载太多,保证图片的差异性。例如每只船最多下载100张。
  3. 在下载的时候使用统一的宽度。后期处理从图片中抠出分辨率一样的船只

爬虫源码

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Globalization;
  4. using System.IO;
  5. using System.Linq;
  6. using System.Net;
  7. using System.Runtime.Serialization.Formatters.Binary;
  8. using System.Text;
  9. using System.Text.RegularExpressions;
  10. using System.Threading;
  11. using System.Threading.Tasks;
  12. namespace 船只图像爬虫
  13. {
  14. class Program
  15. {
  16. static void download_all_shipid(List<string> shipid_list)
  17. {
  18. try
  19. {
  20. WebClient MyWebClient = new WebClient();
  21. MyWebClient.Headers["User-Agent"] = "blah";
  22. MyWebClient.Credentials = CredentialCache.DefaultCredentials;//获取或设置用于向Internet资源的请求进行身份验证的网络凭据;
  23. //Console.WriteLine("here1");
  24. //http://www.marinetraffic.com/en/photos/of/ships/shipid:281519/
  25. //http://www.marinetraffic.com/en/ais/index/ships/all
  26. //http://www.marinetraffic.com/ais/index/ships/all/page:2/sort:COUNT_PHOTOS/direction:desc;
  27. for (int pageNum = 1; pageNum < 100; pageNum++)
  28. {
  29. Console.WriteLine("开始分析第" + pageNum + "张网页");
  30. MyWebClient.Credentials = CredentialCache.DefaultCredentials;//获取或设置用于向Internet资源的请求进行身份验证的网络凭据;
  31. MyWebClient.Headers["User-Agent"] = "blah";
  32. try
  33. {
  34. //Console.WriteLine("here0");
  35. Byte[] pageData = MyWebClient.DownloadData(@"http://www.marinetraffic.com/en/ais/index/ships/all/page:" + pageNum + "/sort:COUNT_PHOTOS/direction:desc/per_page:50"); //从指定网站下载数据
  36. //pageHtml = Encoding.Default.GetString(pageData); //如果获取网站页面采用的是GB2312,则使用这句;
  37. string pageHtml = Encoding.UTF8.GetString(pageData); //如果获取网站页面采用的是UTF-8,则使用这句;
  38. //Console.WriteLine(pageHtml);//在控制台输入获取的内容;
  39. //Console.WriteLine("here1");
  40. int urlindex = -1;
  41. string org_label = "shipid:";
  42. urlindex = pageHtml.IndexOf(org_label, urlindex + 1);
  43. while (urlindex != -1)
  44. {
  45. int endOfUrl = pageHtml.IndexOf("/", urlindex + org_label.Length);
  46. //Console.WriteLine("here2");
  47. string shipid = pageHtml.Substring(urlindex + org_label.Length, endOfUrl - urlindex - org_label.Length);
  48. if (!shipid_list.Contains(shipid))
  49. {
  50. Console.WriteLine("新增id:" + shipid);
  51. shipid_list.Add(shipid);
  52. }
  53. //Console.WriteLine("已有id:" + shipid);
  54. urlindex = pageHtml.IndexOf(org_label, urlindex + 1);
  55. }
  56. ///保存网页
  57. //using (StreamWriter sw = new StreamWriter("ouput.html"))//将获取的内容写入文本
  58. //{
  59. // sw.Write(pageHtml);
  60. //}
  61. Console.WriteLine("完成第" + pageNum + "页分析");
  62. }
  63. catch (WebException webEx)
  64. {
  65. Console.WriteLine(webEx.Message.ToString());
  66. }
  67. //下面是一个随机数的方法保证10秒后再下载,以绕过违规检测。
  68. Console.Write("绕开网站爬虫行为检测中......");
  69. Random rd = new Random();
  70. int time_sleep = rd.Next() % 10 + 10;
  71. Thread.Sleep(time_sleep * 1000);
  72. Console.WriteLine();
  73. }
  74. Console.WriteLine("分析结束");
  75. //下面把list内容保存进文件,使用序列化的方法;
  76. string file = @"C:\Users\dragonfive\Desktop\爬虫获得船只图片\第三批\0_100page_shipid.txt";
  77. using (FileStream fsWriter = new FileStream(file, FileMode.OpenOrCreate, FileAccess.Write))
  78. {
  79. //下面对stu进行序列化;
  80. BinaryFormatter bf = new BinaryFormatter();
  81. bf.Serialize(fsWriter, shipid_list);
  82. }
  83. }
  84. catch (WebException webEx)
  85. {
  86. Console.WriteLine(webEx.Message.ToString());
  87. }
  88. }
  89. /// <summary>
  90. /// 根据得到的ship_id获得该ship_id的所有图片;
  91. /// </summary>
  92. /// <param name="ship_id"></param>
  93. static void download_jpg(string ship_id)
  94. {
  95. try
  96. {
  97. Console.WriteLine("开始下载shipid为:"+ship_id+"的图片");
  98. WebClient MyWebClient = new WebClient();
  99. MyWebClient.Credentials = CredentialCache.DefaultCredentials;//获取或设置用于向Internet资源的请求进行身份验证的网络凭据
  100. MyWebClient.Headers["User-Agent"] = "blah";
  101. //http://www.marinetraffic.com/en/photos/of/ships/shipid:281519/
  102. //http://www.marinetraffic.com/en/photos/of/ships/shipid:371668/per_page:1000/page:1
  103. Byte[] pageData = MyWebClient.DownloadData(@"http://www.marinetraffic.com/en/photos/of/ships/shipid:" + ship_id + @"/per_page:100/page:1"); //从指定网站下载数据
  104. //string pageHtml = Encoding.Default.GetString(pageData); //如果获取网站页面采用的是GB2312,则使用这句
  105. string pageHtml = Encoding.UTF8.GetString(pageData); //如果获取网站页面采用的是UTF-8,则使用这句
  106. //Console.WriteLine(pageHtml);//在控制台输入获取的内容
  107. Console.WriteLine("元网页已下载");
  108. //using (StreamWriter sw = new StreamWriter("ouput.html"))//将获取的内容写入文本
  109. //{
  110. // sw.Write(pageHtml);
  111. //}
  112. int urlindex = -1;
  113. string org_label = "data-original='";
  114. urlindex = pageHtml.IndexOf(org_label, urlindex + 1);
  115. int i = 0;
  116. //Directory.CreateDirectory(@"./" );
  117. while (urlindex != -1)
  118. {
  119. int endOfUrl = pageHtml.IndexOf("'", urlindex + org_label.Length);
  120. string url = pageHtml.Substring(urlindex + org_label.Length, endOfUrl - urlindex - org_label.Length);
  121. ////下面是unicode编码转换为string的方式;
  122. //MatchCollection mc = Regex.Matches(strName, @"\\u([\w]{2})([\w]{2})", RegexOptions.Compiled | RegexOptions.IgnoreCase);
  123. //byte[] bts = new byte[2];
  124. //foreach (Match m in mc)
  125. //{
  126. // bts[0] = (byte)int.Parse(m.Groups[2].Value, NumberStyles.HexNumber);
  127. // bts[1] = (byte)int.Parse(m.Groups[1].Value, NumberStyles.HexNumber);
  128. // musicName += Encoding.Unicode.GetString(bts);
  129. //}
  130. //Console.WriteLine("接下来下载的是:" + musicName);
  131. //下面是一个随机数的方法保证10秒后再下载,以绕过违规检测。
  132. Console.Write("绕过网站爬虫行为检测中......");
  133. Random rd = new Random();
  134. int time_sleep = rd.Next() % 10 + 10;
  135. Thread.Sleep(time_sleep * 1000);
  136. Console.WriteLine();
  137. try
  138. {
  139. //这是下载的命令;
  140. Console.WriteLine(url);
  141. MyWebClient.Credentials = CredentialCache.DefaultCredentials;//获取或设置用于向Internet资源的请求进行身份验证的网络凭据
  142. MyWebClient.Headers["User-Agent"] = "blah";
  143. Byte[] jpgdata = MyWebClient.DownloadData(url); //从指定网页下载数据;
  144. //把下载的内容保存在一个地方;
  145. using (FileStream fs = new FileStream(@"C:\Users\dragonfive\Desktop\爬虫获得船只图片\第三批\" + ship_id + "_" + i + ".jpg", FileMode.OpenOrCreate, FileAccess.Write))
  146. {
  147. fs.Write(jpgdata, 0, jpgdata.Length);
  148. }
  149. }
  150. catch (WebException webEx)
  151. {
  152. Console.WriteLine("被捕获了吗?");
  153. Console.WriteLine(webEx.Message.ToString());
  154. }
  155. Console.WriteLine("成功下载第" + (i ++) + "张图片");
  156. urlindex = pageHtml.IndexOf(org_label, urlindex + 1);
  157. }
  158. ///保存网页
  159. //using (StreamWriter sw = new StreamWriter("ouput.html"))//将获取的内容写入文本
  160. //{
  161. // sw.Write(pageHtml);
  162. //}
  163. Console.WriteLine("*****************************************");
  164. Console.WriteLine("下载"+i+"ship_id"+ship_id+"的图片");
  165. Console.WriteLine("*****************************************");
  166. //Console.ReadLine(); //让控制台暂停,否则一闪而过了
  167. }
  168. catch (WebException webEx)
  169. {
  170. Console.WriteLine(webEx.Message.ToString());
  171. }
  172. }
  173. static void Main(string[] args)
  174. {
  175. List<string> shipid_list = new List<string>();
  176. //shipid_list.Add("371681");//暂时快速产生图片用这个;
  177. download_all_shipid(shipid_list);
  178. //string file = @"C:\Users\dragonfive\Desktop\爬虫获得船只图片\第三批\0_100page_shipid.txt";
  179. //using (FileStream fsReader = new FileStream(file, FileMode.Open, FileAccess.Read))
  180. //{
  181. // //下面进行反序列话;
  182. // BinaryFormatter bf = new BinaryFormatter();
  183. // shipid_list = (List<string>)bf.Deserialize(fsReader);
  184. // Console.WriteLine("成功载入" + shipid_list.Count + "shipid");
  185. //}
  186. ////371652 371668 371681 1252401
  187. //shipid_list.Remove("371652");
  188. //shipid_list.Remove("371668");
  189. //shipid_list.Remove("371681");
  190. //shipid_list.Remove("1252401");
  191. ////132264
  192. //shipid_list.Remove("371077");
  193. //shipid_list.Remove("132264");
  194. //shipid_list.Remove("224871");
  195. //shipid_list.Remove("279923");
  196. //shipid_list.Remove("369163");
  197. //shipid_list.Remove("266342");
  198. //shipid_list.Remove("371216");
  199. //shipid_list.Remove("368174");
  200. //shipid_list.Remove("369163");
  201. foreach (var ship_id in shipid_list)
  202. {
  203. download_jpg(ship_id);
  204. }
  205. Console.ReadLine(); //让控制台暂停,否则一闪而过了
  206. }
  207. }
  208. }
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注