html转txt:python html转TXT python读取html指定区域文本内容转成txt文件

1、首先,通过python读取并遍历程序目录中【html】文件夹里面的文件、文件夹以及各级子目录,获取到该目录下所有的【.html】文件后,返回一个list对象。

2、遍历完成后得到一个html文件列表对象,将该列表交给html_to_txt方法;html_to_txt方法里面循环逐个读取每个html文件中指定区域(例如 <div class='article-content'>)内的 <h1> 和 <p> 等指定标签里面的文字,并将其提取出来。

3、读取到的文本内容输出到txt文件中。输出前可以用replace把不需要的内容替换掉(可以做多次替换),也可以加上换行之类的处理,再进行输出;具体可根据自己的需求修改,如有不明之处,欢迎提问。

资源

main.py

```pythonimport globimport osimport reimport pypandocfrom selectolax.parser import HTMLParserfrom html.parser import HTMLParserfrom lxml import etreefrom Html_To_txt import html_to_txt# 解析本地html,返回字典数据类型def parse_html(file_path, vla=None): for ff in file_path: val = [] with open(ff, 'r', encoding='gbk') as f: html = etree.HTML(f.read()) Title = html.xpath("//*[@id='left']/div/div[2]/h1/text()") contents = html.xpath("//td[@class='info_content']/*") val.append(Title) for td in contents: val.append(td.text) a = html_for(td.xpath("./strong/a")) b = html_for(td.xpath("./strong/tail()")) vla.append(a.text) vla.append(b.text) txt = open(os.getcwd() + "\\txt\\" + ff.split('\\')[-1], 'w', encoding="utf-8") txt.write(val) txt.close() # res = {} # for div in divs: # key = div.xpath("./span[1]/text()")[0].replace('/', '_') # value = div.xpath("./span[2]/text()")[0] # res[key] = valuedef html_for(html_obj): aaa_ls = [] if len(html_obj) > 2: for bbb in html_obj: if len(bbb) > 2: aaa_ls.append(html_for(bbb).text) else: aaa_ls.append(bbb.text) else: a = html_obj[0].text b = html_obj[0].tail aaa_ls.append(str(html_obj[0].text + html_obj[0].tail)) return aaa_lsdef search_dir(pathstr, file_all=[]): files = os.listdir(pathstr) # 得到文件夹下的所有文件名称 # print(files) for file_str in files: # 遍历该文件夹 if os.path.isdir(pathstr + "\\" + file_str): # 是子文件夹 search_dir(pathstr + "\\" + file_str) else: # 是文件 if os.path.splitext(file_str)[1] == '.html': print(pathstr + "\\" + file_str) file_all.append(pathstr + "\\" + file_str) return file_all# 按间距中的绿色按钮以运行脚本。if __name__ == '__main__': path = os.getcwd() + r'\html' aaa = search_dir(path) html_to_txt(aaa)

Html_To_txt.py

"""Html_To_txt.py — pull the interesting text out of a saved HTML page."""
import os
from html.parser import HTMLParser


class GetIdList(HTMLParser):
    """Collect the text of <h1>/<p> tags inside <div class='article-content'>.

    Ported from the Python-2-only sgmllib.SGMLParser (removed in Python 3) to
    the stdlib html.parser.HTMLParser; the reset/feed/handle_data/printID
    interface and the div-nesting state machine are preserved.
    """

    def reset(self):
        # HTMLParser.__init__ calls reset(), so this doubles as initialisation.
        self.IDlist = []      # collected text fragments, in document order
        self.flag = False     # True while inside the target div
        self.getdata = False  # True while inside an <h1>/<p> we want text from
        self.verbatim = 0     # nesting depth of divs inside the target div
        HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            if self.flag:
                self.verbatim += 1  # entered a nested div inside the target
                return
            for k, v in attrs:  # look for the target container
                if k == 'class' and v == 'article-content':
                    self.flag = True
                    return
        elif tag in ('h1', 'p'):
            # only collect headings/paragraphs inside the target div
            if self.flag:
                self.getdata = True

    def handle_endtag(self, tag):
        if tag == 'div':
            # NOTE(review): logic preserved from the original — the outermost
            # </div> clears flag; nested closes decrement the depth counter.
            if self.verbatim == 0:
                self.flag = False
            if self.flag:
                self.verbatim -= 1
        elif tag in ('h1', 'p'):
            if self.getdata:
                self.getdata = False

    def handle_data(self, text):
        # Character data is collected only while inside a wanted tag.
        if self.getdata:
            self.IDlist.append(text)

    def printID(self, new_file):
        """Write the collected fragments to ./txt/<name>.txt (gbk-encoded).

        :param new_file: source path; its final '\\'-separated component names
            the output file.
        """
        out = os.path.join(os.getcwd(), "txt", new_file.split('\\')[-1] + '.txt')
        # 'with' guarantees the handle is closed (the original leaked it)
        with open(out, 'w', encoding='gbk') as f:
            j = 0
            for i in self.IDlist:
                print(i)
                # the first fragment, and fragments containing a full stop
                # ('。'), get their own line; everything else runs together
                if '。' in i or j == 0:
                    f.write(i + '\n')
                    j += 1
                else:
                    f.write(i)
# NOTE(review): a commented-out Python-2 urllib2 snippet (fetching the page
# from http://www.nod32id.org by date offset) was removed here; pages are now
# read from local files only.


def html_to_txt(html_list):
    """Parse each HTML file in *html_list* and write its extracted text to ./txt.

    :param html_list: iterable of paths to utf-8-encoded .html files.
    """
    for page_path in html_list:
        # 'with' guarantees the handle is closed (the original leaked it)
        with open(page_path, 'r', encoding='utf-8') as fh:
            html = fh.read()
        lister = GetIdList()
        lister.feed(html)
        lister.printID(page_path)  # writes ./txt/<name>.txt

相关推荐

相关文章