@bergus 2015-11-08T14:24:31.000000Z 字数 3825 阅读 2823
将一个网络链接保存为mht格式的文件

python mht 网络链接 mime
import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urljoin
from urlparse import urlparse
class MHTHTMLParser(HTMLParser):
    """Collect the resource URLs referenced by ``<link>`` and ``<script>`` tags.

    After ``feed()`` the ``urls`` attribute lists, in document order:

    * the ``src`` of every matching tag, unless it is empty or contains
      the substring ``google``;
    * the ``href`` of every matching tag without a ``src`` attribute that
      has some attribute whose value is ``stylesheet``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Resource URLs exactly as written in the page (may be relative).
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource URL of a ``link``/``script`` start tag.

        ``tag`` is the tag name and ``attrs`` the list of ``(name, value)``
        pairs supplied by ``HTMLParser``.
        """
        if tag not in ('link', 'script'):
            return
        attrs = dict(attrs)
        if 'src' in attrs:
            src = attrs.get('src')
            # Skip empty values and anything that mentions google.
            if src and 'google' not in src:
                self.urls.append(src)
        elif 'stylesheet' in attrs.values():
            href = attrs.get('href')
            # A stylesheet link without an href has nothing to download;
            # appending None here would break the code that fetches the URLs.
            if href:
                self.urls.append(href)
class URL2MHT(object):
    """Build an MHT (``multipart/related``) message for a web page.

    ``url`` is the address of the page. ``mht()`` downloads the page and the
    resources reported by ``MHTHTMLParser`` and returns them as one
    ``email.message.Message``.
    """

    def __init__(self, url):
        parts = urlparse(url)
        # scheme://host of the page. Splitting the URL on its path fails
        # when the path is empty (ValueError) or is just "/".
        self.domain = parts.scheme + "://" + parts.netloc
        self.url = url

    def _head(self):
        """Return the empty top-level ``multipart/related`` message."""
        a = email.message.Message()
        a["MIME-Version"] = "1.0"
        a["X-UnMHT-Save-State"] = "Current-State"
        a.add_header("Content-Type",
                     "multipart/related",
                     type="text/html",
                     boundary="----=_Part_7C84B8F2_5B84C39F.150DBE9AC97")
        return a

    def _resolve(self, url):
        """Return ``url`` as an absolute URL, resolved against the page URL."""
        return urljoin(self.url, url)

    def mht(self):
        """Download the page and its resources; return the assembled message.

        Network errors raised by ``urllib2.urlopen`` propagate to the caller.
        """
        response = urllib2.urlopen(self.url)
        try:
            page = response.read()
        finally:
            response.close()
        parser = MHTHTMLParser()
        parser.feed(page)
        parser.close()
        head = self._head()
        # The page itself is the first part; _add downloads it again so the
        # part carries the Content-Type sent by the server.
        head.attach(self._add(self.url))
        for url in parser.urls:
            if url:  # ignore tags that carried no usable URL
                head.attach(self._add(url))
        return head

    def _add(self, url):
        """Download ``url`` and return it as one MIME part.

        ``text/*`` responses are encoded as quoted-printable, everything else
        as base64. ``Content-Location`` is the absolute URL that was fetched.
        """
        local_url = self._resolve(url)
        response = urllib2.urlopen(local_url)
        try:
            content_type = response.headers.get('content-type')
            content = response.read()
        finally:
            response.close()
        m = email.message.Message()
        if content_type and content_type.startswith("text/"):
            m["Content-Transfer-Encoding"] = "quoted-printable"
            m.set_payload(quopri.encodestring(content).decode("ascii"))
        else:
            m["Content-Transfer-Encoding"] = "base64"
            m.set_payload(base64.b64encode(content).decode("ascii"))
        m["Content-Location"] = local_url
        # Never store None as a header value when the server sent no type.
        m["Content-Type"] = content_type or "application/octet-stream"
        return m
url = 'http://www.cnblogs.com/weixliu/p/3554868.html'    
print URL2MHT(url).mht()
# encoding=utf-8
import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urlparse
import chardet
# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
# sys.setdefaultencoding is removed from the sys namespace at startup, so
# the module is reloaded first to make the function available again.
# NOTE(review): presumably needed so implicit str/unicode conversions of
# non-ASCII page content do not fail -- confirm before removing; the setting
# affects the whole process.
reload(sys)
sys.setdefaultencoding('utf-8')
class MHTHTMLParser(HTMLParser):
    """Collect ``(url, mime_type)`` pairs for resources named by ``<link>`` tags.

    After ``feed()`` the ``urls`` attribute lists the pairs in document
    order. Only ``link`` tags are inspected; ``script`` handling is
    disabled in this version.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # (url, mime_type) tuples; URLs are kept as written in the page.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource referenced by a ``link`` start tag, if any.

        ``tag`` is the tag name and ``attrs`` the list of ``(name, value)``
        pairs supplied by ``HTMLParser``.
        """
        if tag != 'link':  # 'script' is intentionally not handled
            return
        attrs = dict(attrs)
        src = attrs.get('src')
        if src and 'google' not in src:
            # A valueless type attribute falls back to the default as well.
            self.urls.append((src, attrs.get('type') or 'text/javascript'))
        elif attrs.get('rel') == 'stylesheet':
            href = attrs.get('href')
            # Without an href there is nothing to download; a None URL would
            # break the code that fetches the collected resources.
            if href:
                self.urls.append((href, attrs.get('type') or 'text/css'))
class URL2MHT(object):
    """Build an MHT (``multipart/related``) message for a web page.

    ``url`` is the address of the page. Every request is sent with the
    browser-like ``User-Agent`` stored in ``self.header``. ``mht()`` returns
    the page and its linked resources as one ``email.message.Message``.
    """

    def __init__(self, url):
        uparse = urlparse(url)
        # scheme://host of the page, kept as a public attribute.
        self.domain = uparse.scheme + "://" + uparse.netloc
        self.url = url
        self.header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}

    def _head(self):
        """Return the empty top-level ``multipart/related`` message."""
        a = email.message.Message()
        a["MIME-Version"] = "1.0"
        a["X-UnMHT-Save-State"] = "Current-State"
        a.add_header("Content-Type",
                     "multipart/related",
                     type="text/html")
        return a

    def _resolve(self, url):
        """Return ``url`` as an absolute URL, resolved against the page URL.

        Handles root-relative (``/a.css``), document-relative (``a.css``)
        and protocol-relative (``//host/a.css``) references.
        """
        return urljoin(self.url, url)

    def _fetch(self, url):
        """Download ``url`` with the configured headers and return the body."""
        response = urllib2.urlopen(urllib2.Request(url, None, self.header))
        try:
            return response.read()
        finally:
            response.close()

    def mht(self):
        """Download the page and its resources; return the assembled message.

        Network errors raised by ``urllib2.urlopen`` propagate to the caller.
        """
        content = self._fetch(self.url)
        pmht = MHTHTMLParser()
        pmht.feed(content)
        pmht.close()
        head = self._head()
        # The page itself is the first part (it is downloaded again by _add).
        head.attach(self._add(self.url, utype='text/html'))
        for url, utype in pmht.urls:
            if url:  # ignore tags that carried no usable URL
                head.attach(self._add(url, utype))
        return head

    def _add(self, url, utype=None):
        """Download ``url`` and return it as one MIME part.

        ``utype`` is the MIME type recorded for the part. ``text/*`` types
        are encoded as quoted-printable, everything else as base64.
        ``Content-Location`` is the absolute URL that was fetched.
        """
        local_url = self._resolve(url)
        content = self._fetch(local_url)
        if utype and utype.startswith("text/"):
            ecd = "quoted-printable"
            ctn = quopri.encodestring(content)
        else:
            ecd = "base64"
            ctn = base64.b64encode(content)
        m = email.message.Message()
        m["Content-Transfer-Encoding"] = ecd
        m["Content-Location"] = local_url
        # Never store None as a header value when no type was supplied.
        m["Content-Type"] = utype or "application/octet-stream"
        m.set_payload(ctn)
        return m
# url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
url = 'http://blog.csdn.net/zhaoyl03/article/details/8631645'
# a = URL2MHT(url).mht().as_string(unixfrom=False)
# print a
# import codecs
# fh = codecs.open("hello.mht", mode="wb", encoding="utf-8")
# fh.write(a)
# fh.close()
x = open('hello.mht').read()
print type(x)
print chardet.detect(x)
x = x.decode('utf-8')
print type(x)
print chardet.detect(x)
将一个网络链接保存为mht格式的文件

内容目录