index.py (3547B)
#!/usr/bin/env python3
"""Build an index page out of a set of html article fragments.

Every source fragment is rendered through the list-element template, the
rendered elements are concatenated newest-first, and the result is placed in
the index template under the `content` key.
"""

import argparse
import html.parser
import os
import pathlib

import template


def dirpath(path):
    """Argparse `type=` helper: return `path` if it is an existing directory.

    Raises:
        argparse.ArgumentTypeError: if `path` is not a directory. This is the
            exception argparse turns into a regular usage error when it is
            raised from a `type=` callable.
    """
    if os.path.isdir(path):
        return path

    # `argparse.ArgumentError` needs an (argument, message) pair and cannot be
    # constructed bare; `ArgumentTypeError` is the one meant for type helpers.
    raise argparse.ArgumentTypeError(f'{path!r} is not a directory')


argparser = argparse.ArgumentParser(
    description='Indexes a given set of html fragments')

argparser.add_argument('dst', type=argparse.FileType('w'),
                       help='The destination html fragment')
argparser.add_argument('art_tpl', metavar='element-tpl', type=argparse.FileType('r'),
                       help='The html fragment template for each list element')
argparser.add_argument('idx_tpl', metavar='index-tpl', type=argparse.FileType('r'),
                       help='The html fragment template for the index page')
argparser.add_argument('urlroot', type=str,
                       help='The base url for each article (e.g. "/sport")')
argparser.add_argument('srcs', nargs='*',
                       help='A list of html fragments to index')


class TagParser(html.parser.HTMLParser):
    """Locate the first tag matching an optional tag name and/or id.

    Keyword arguments:
        tag: tag name to match; any tag matches when omitted or falsy.
        id: value of the `id` attribute to match; any id matches when
            omitted or falsy.

    After `feed()`:
        have_tag:    True once a matching tag was seen.
        found_attrs: the attribute list of that tag, as (name, value) pairs.
        have_data:   True once a text chunk was reported after the match.
        found_data:  the first text chunk reported after the matching tag.
    """

    def __init__(self, **kwargs):
        super().__init__()

        self.filter_tag = kwargs.get('tag', None)
        self.filter_id = kwargs.get('id', None)
        self.found_data = ''
        self.found_attrs = []

        self.have_tag = False
        self.have_data = False

    @staticmethod
    def get_attr(key, attrs, default):
        """Return the value of the first (name, value) pair in `attrs` whose
        name equals `key`, or `default` if there is no such pair."""
        for name, value in attrs:
            if name == key:
                return value

        return default

    def handle_starttag(self, tag, attrs):
        # Only the first matching tag is recorded; later ones are ignored.
        if self.have_tag:
            return

        if self.filter_tag and self.filter_tag != tag:
            return

        if self.filter_id and self.filter_id != self.get_attr('id', attrs, ''):
            return

        self.have_tag = True
        self.found_attrs = attrs

    def handle_startendtag(self, tag, attrs):
        # Self-closing tags (`<time ... />`) are matched like opening tags.
        self.handle_starttag(tag, attrs)

    def handle_data(self, data):
        # Keep only the first text chunk that follows the matching tag.
        if not self.have_tag or self.have_data:
            return

        self.have_data = True
        self.found_data = data


def index(srcs, article_tpl, index_tpl, urlroot):
    """Render the index page for the given article fragments.

    Args:
        srcs: paths of the html fragments to index.
        article_tpl: template text instantiated once per article.
        index_tpl: template text for the page; receives the concatenated
            article elements under the `content` key.
        urlroot: prefix of each article url; the url handed to the article
            template is `{urlroot}/{file name of the fragment}`.

    Returns:
        Whatever `template.instantiate` produces for `index_tpl`.

    Raises:
        Exception: if an article has no `<time id="post-date">` element, or
            that element carries no `datetime` attribute.
    """

    def by_post_date(srcpath):
        # NOTE: this parser expects at least one element of type `<time>`
        # with an id of `post-date` to be present in the given article.
        # If this requirement is not satisfied, it throws an error.
        parser = TagParser(tag='time', id='post-date')
        parser.feed(template.content(srcpath))

        if not parser.have_tag or not parser.have_data:
            raise Exception(f'Article: {srcpath} lacks element <time id="post-date" ... />')

        isodate = TagParser.get_attr('datetime', parser.found_attrs, None)
        if isodate is None:
            # Fail with a message naming the article instead of handing
            # `None` to the date conversion below.
            raise Exception(
                f'Article: {srcpath} has a <time id="post-date"> element '
                'without a datetime attribute')

        return template.fromisodate(isodate)

    elements = []

    # Newest article first.
    for article in sorted(srcs, key=by_post_date, reverse=True):
        srcpath = pathlib.Path(article)

        keys = template.article_keys(srcpath)
        keys |= {
            'url': f'{urlroot}/{srcpath.name}',
        }

        elements.append(template.instantiate(article_tpl, keys))

    return template.instantiate(index_tpl, {'content': ''.join(elements)})


if __name__ == '__main__':
    args = argparser.parse_args()

    with args.art_tpl as tpl:
        article_tpl = tpl.read()

    with args.idx_tpl as tpl:
        index_tpl = tpl.read()

    with args.dst as dst:
        dst.write(index(args.srcs, article_tpl, index_tpl, args.urlroot))