lektura

lektura.git
git clone git://git.lenczewski.org/lektura.git
Log | Files | Refs

index.py (3547B)


      1 #!/usr/bin/env python3
      2 
      3 import argparse
      4 import html.parser
      5 import os
      6 import pathlib
      7 
      8 import template
      9 
     10 
     11 def dirpath(path):
     12     if os.path.isdir(path):
     13         return path
     14 
     15     raise argparse.ArgumentError()
     16 
     17 
     18 argparser = argparse.ArgumentParser(
     19         description='Indexes a given set of html fragments')
     20 
     21 argparser.add_argument('dst', type=argparse.FileType('w'),
     22                        help='The destination html fragment')
     23 argparser.add_argument('art_tpl', metavar='element-tpl', type=argparse.FileType('r'),
     24                        help='The html fragment template for each list element')
     25 argparser.add_argument('idx_tpl', metavar='index-tpl', type=argparse.FileType('r'),
     26                        help='The html fragment template for the index page')
     27 argparser.add_argument('urlroot', type=str,
     28                        help='The base url for each article (e.g. "/sport")')
     29 argparser.add_argument('srcs', nargs='*',
     30                        help='A list of html fragments to index')
     31 
     32 
     33 class TagParser(html.parser.HTMLParser):
     34     def __init__(self, **kwargs):
     35         super().__init__()
     36 
     37         self.filter_tag = kwargs.get('tag', None)
     38         self.filter_id = kwargs.get('id', None)
     39         self.found_data = ''
     40         self.found_attrs = []
     41 
     42         self.have_tag = False
     43         self.have_data = False
     44 
     45     @staticmethod
     46     def get_attr(key, attrs, default):
     47         for attr in attrs:
     48             if attr[0] == key:
     49                 return attr[1]
     50 
     51         return default
     52 
     53     def handle_starttag(self, tag, attrs):
     54         if self.have_tag:
     55             return
     56 
     57         if self.filter_tag and self.filter_tag != tag:
     58             return
     59 
     60         if self.filter_id and self.filter_id != self.get_attr('id', attrs, ''):
     61             return
     62 
     63         self.have_tag = True
     64         self.found_attrs = attrs
     65 
     66     def handle_startendtag(self, tag, attrs):
     67         self.handle_starttag(tag, attrs)
     68 
     69     def handle_data(self, data):
     70         if not self.have_tag:
     71             return
     72 
     73         if self.have_tag and self.have_data:
     74             return
     75 
     76         self.have_data = True
     77         self.found_data = data
     78 
     79 
     80 
     81 def index(srcs, article_tpl, index_tpl, urlroot):
     82     content = ''
     83 
     84     def by_post_date(srcpath):
     85         # NOTE: this parser expects at least one element of type `<time>` and
     86         #       with an id of `post-date` to be present in the given article.
     87         #       if this requirement is not satisfied, it throws an error
     88         parser = TagParser(tag='time', id='post-date')     
     89         content = template.content(srcpath)
     90         parser.feed(content)
     91 
     92         if not parser.have_tag or not parser.have_data:
     93             raise Exception(f'Article: {srcpath} lacks element <time id="post-date" ... />')
     94 
     95         isodate = TagParser.get_attr('datetime', parser.found_attrs, None)
     96 
     97         return template.fromisodate(isodate)
     98 
     99     for article in sorted(srcs, key=by_post_date, reverse=True):
    100         srcpath = pathlib.Path(article)
    101 
    102         keys = template.article_keys(srcpath)
    103         keys |= {
    104                 'url': f'{urlroot}/{srcpath.name}',
    105         }
    106 
    107         content += template.instantiate(article_tpl, keys)
    108 
    109     keys = { 'content': content }
    110 
    111     return template.instantiate(index_tpl, keys)
    112 
    113 
    114 if __name__ == '__main__':
    115     args = argparser.parse_args()
    116 
    117     article_tpl = ''
    118     with args.art_tpl as tpl:
    119         article_tpl = tpl.read()
    120 
    121     index_tpl = ''
    122     with args.idx_tpl as tpl:
    123         index_tpl = tpl.read()
    124 
    125     with args.dst as dst:
    126         dst.write(index(args.srcs, article_tpl, index_tpl, args.urlroot))
    127