[关闭]
@bergus 2015-11-08T14:24:31.000000Z 字数 3825 阅读 2659

将一个网络链接保存为mht格式的文件

python mht 网络链接 mime


  1. import base64
  2. import email
  3. import email.message
  4. import mimetypes
  5. import os
  6. import quopri
  7. import sys
  8. import urllib2
  9. from HTMLParser import HTMLParser
  10. from urlparse import urlparse
  11. class MHTHTMLParser(HTMLParser):
  12. def __init__(self):
  13. HTMLParser.__init__(self)
  14. self.urls = []
  15. def handle_starttag(self, tag, attrs):
  16. if not tag in ['link','script']:
  17. return
  18. attrs = dict(attrs)
  19. if 'src' in attrs.keys():
  20. a = attrs.get('src')
  21. if a and a.find('google') == -1:
  22. self.urls.append(a)
  23. elif 'stylesheet' in attrs.values():
  24. self.urls.append(attrs.get('href'))
  25. class URL2MHT(object):
  26. def __init__(self,url):
  27. self.domain = url.split(urlparse(url).path)[0]
  28. self.url = url
  29. def _head(self):
  30. a = email.message.Message()
  31. a["MIME-Version"] = "1.0"
  32. a["X-UnMHT-Save-State"] = "Current-State"
  33. a.add_header("Content-Type",
  34. "multipart/related",
  35. type="text/html",
  36. boundary="----=_Part_7C84B8F2_5B84C39F.150DBE9AC97")
  37. return a
  38. def mht(self):
  39. content = urllib2.urlopen(self.url).read()
  40. pmht = MHTHTMLParser()
  41. pmht.feed(content)
  42. pmht.close()
  43. head = self._head()
  44. head.attach(self._add(self.url))
  45. for url in pmht.urls:
  46. head.attach(self._add(url))
  47. return head
  48. def _add(self, url):
  49. m = email.message.Message()
  50. content = None
  51. local_url = None
  52. try:
  53. content = urllib2.urlopen(url)
  54. local_url = url
  55. except:
  56. local_url = self.domain+url
  57. content = urllib2.urlopen(local_url)
  58. content_type = content.headers.dict.get('content-type')
  59. content = content.read()
  60. if content_type and content_type.startswith("text/"):
  61. m["Content-Transfer-Encoding"] = "quoted-printable"
  62. m.set_payload(quopri.encodestring(content).decode("ascii"))
  63. else:
  64. m["Content-Transfer-Encoding"] = "base64"
  65. m.set_payload(base64.b64encode(content).decode("ascii"))
  66. m["Content-Location"] = local_url
  67. m["Content-Type"] = content_type
  68. return m
  69. url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
  70. print URL2MHT(url).mht()
  71. # encoding=utf-8
import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urljoin
from urlparse import urlparse

import chardet
  83. reload(sys)  # Python 2 idiom: reload sys so that setdefaultencoding is callable again
  84. sys.setdefaultencoding('utf-8')  # implicit str<->unicode conversions then use UTF-8 rather than ASCII
  85. class MHTHTMLParser(HTMLParser):
  86. def __init__(self):
  87. HTMLParser.__init__(self)
  88. self.urls = []
  89. def handle_starttag(self, tag, attrs):
  90. if not tag in ['link']: # , 'script'
  91. return
  92. attrs = dict(attrs)
  93. a = attrs.get('src')
  94. if a and a.find('google') == -1:
  95. self.urls.append((a, attrs.get('type', 'text/javascript')))
  96. elif attrs.get('rel') == 'stylesheet':
  97. self.urls.append(
  98. (attrs.get('href'), attrs.get('type', 'text/css')))
  99. class URL2MHT(object):
  100. def __init__(self, url):
  101. uparse = urlparse(url)
  102. self.domain = uparse.scheme + "://" + uparse.netloc
  103. self.url = url
  104. self.header = {
  105. 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}
  106. def _head(self):
  107. a = email.message.Message()
  108. a["MIME-Version"] = "1.0"
  109. a["X-UnMHT-Save-State"] = "Current-State"
  110. a.add_header("Content-Type",
  111. "multipart/related",
  112. type="text/html")
  113. return a
  114. def mht(self):
  115. content = urllib2.urlopen(
  116. urllib2.Request(self.url, None, self.header)).read()
  117. pmht = MHTHTMLParser()
  118. pmht.feed(content)
  119. pmht.close()
  120. head = self._head()
  121. head.attach(self._add(self.url, utype='text/html'))
  122. for url, utype in pmht.urls:
  123. head.attach(self._add(url, utype))
  124. return head
  125. def _add(self, url, utype=None):
  126. m = email.message.Message()
  127. content = None
  128. local_url = None
  129. if not urlparse(url).netloc:
  130. local_url = self.domain + url
  131. else:
  132. local_url = url
  133. ctn = None
  134. ecd = None
  135. content = urllib2.urlopen(
  136. urllib2.Request(local_url, None, self.header)).read()
  137. if utype and utype.startswith("text/"):
  138. ecd = "quoted-printable"
  139. ctn = quopri.encodestring(content)
  140. else:
  141. ecd = "base64"
  142. ctn = base64.b64encode(content)
  143. m["Content-Transfer-Encoding"] = ecd
  144. m["Content-Location"] = local_url
  145. m["Content-Type"] = utype
  146. m.set_payload(ctn)
  147. return m
  148. # url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
  149. url = 'http://blog.csdn.net/zhaoyl03/article/details/8631645'
  150. # a = URL2MHT(url).mht().as_string(unixfrom=False)
  151. # print a
  152. # import codecs
  153. # fh = codecs.open("hello.mht", mode="wb", encoding="utf-8")
  154. # fh.write(a)
  155. # fh.close()
  156. x = open('hello.mht').read()
  157. print type(x)
  158. print chardet.detect(x)
  159. x = x.decode('utf-8')
  160. print type(x)
  161. print chardet.detect(x)
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注