

open ( file_name, 'rb' ) t0 = time () n_documents = 0 for i, record in enumerate ( warc_file ): url, doc = read_doc ( record, parser ) if not doc or not url : continue n_documents += 1 if i > limit : break warc_file.

strip () if len ( html ) > 0 : text = parser ( html ) return url, text def process_warc ( file_name, parser, limit = 10000 ): warc_file = warc. split ( b ' \r\n\r\n ', maxsplit = 1 ) html = html. url text = None if url : payload = record.

text ( separator = ' \n ' ) return text def read_doc ( record, parser = get_text_selectolax ): url = record. body is None : return None for tag in tree. get_text ( separator = ' \n ' ) return text def get_text_selectolax ( html ): tree = HTMLParser ( html ) if tree. body if body is None : return None for tag in body. # coding: utf-8 from time import time import warc from bs4 import BeautifulSoup from selectolax.parser import HTMLParser def get_text_bs ( html ): tree = BeautifulSoup ( html, 'lxml' ) body = tree.
