diff --git a/.gitignore b/.gitignore index 10aaf4a..0e031c6 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,5 @@ .coverage *.pyc .tox/ -legi.sqlite* +*.sqlite* /tarballs/ diff --git a/README.md b/README.md index 4a06145..b50744c 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,14 @@ exemple avec [cron][cron] : (`chronic` fait partie des [`moreutils`](http://joeyh.name/code/moreutils/).) +L'option `--base JORF` permet de créer une base JORF au lieu d'une base LEGI. +Noter que l'option `--raw` est obligatoire pour les bases autres que LEGI. + +Une fois la base créée, l'option `--base` n'est plus nécessaire car sa +valeur est enregistrée dans les métadonnées de la base et est utilisée comme +valeur par défaut. Toutefois, il peut être vérifié que la base à mettre à +jour est du bon type en donnant ce paramètre `--base`. + ## Fonctionnalités ### Normalisation des titres et numéros diff --git a/legi/download.py b/legi/download.py index 284a5bb..ff52fa3 100644 --- a/legi/download.py +++ b/legi/download.py @@ -9,18 +9,21 @@ DILA_FTP_HOST = 'echanges.dila.gouv.fr' DILA_FTP_PORT = 21 -DILA_LEGI_DIR = '/LEGI' +DILA_LEGI_DIR = { + 'LEGI': '/LEGI', + 'JORF': '/JORF', +} -def download_legi(dst_dir): +def download_legi(dst_dir, base='LEGI'): if not os.path.exists(dst_dir): os.mkdir(dst_dir) local_files = {filename: {} for filename in os.listdir(dst_dir)} ftph = ftplib.FTP() ftph.connect(DILA_FTP_HOST, DILA_FTP_PORT) ftph.login() - ftph.cwd(DILA_LEGI_DIR) - remote_files = [filename for filename in ftph.nlst() if '.tar.gz' in filename and ('legi_' in filename or 'LEGI_' in filename)] + ftph.cwd(DILA_LEGI_DIR[base]) + remote_files = [filename for filename in ftph.nlst() if '.tar.gz' in filename and (base.lower()+'_' in filename or base+'_' in filename)] common_files = [f for f in remote_files if f in local_files] missing_files = [f for f in remote_files if f not in local_files] remote_files = {filename: {} for filename in remote_files} @@ -64,5 +67,9 @@ def 
download_legi(dst_dir): if __name__ == '__main__': p = argparse.ArgumentParser() p.add_argument('directory') + p.add_argument('--base', default='LEGI') args = p.parse_args() - download_legi(args.directory) + if args.base not in DILA_LEGI_DIR.keys(): + print('!> Non-existing database "'+args.base+'".') + raise SystemExit(1) + download_legi(args.directory, args.base) diff --git a/legi/sql/schema.sql b/legi/sql/schema.sql index f8853c8..70cedb7 100644 --- a/legi/sql/schema.sql +++ b/legi/sql/schema.sql @@ -18,7 +18,7 @@ CREATE TABLE textes CREATE TABLE textes_structs ( id char(20) unique not null , versions text -, dossier text not null +, dossier text , cid char(20) not null , mtime int not null ); @@ -49,7 +49,7 @@ CREATE TABLE textes_versions , nota text , abro text , rect text -, dossier text not null +, dossier text , cid char(20) not null , mtime int not null , texte_id int references textes @@ -63,7 +63,7 @@ CREATE TABLE sections , titre_ta text , commentaire text , parent char(20) -- REFERENCES sections(id) -, dossier text not null +, dossier text , cid char(20) not null , mtime int not null ); @@ -78,7 +78,7 @@ CREATE TABLE articles , type text , nota text , bloc_textuel text -, dossier text not null +, dossier text , cid char(20) not null , mtime int not null ); @@ -114,11 +114,11 @@ CREATE TABLE duplicate_files ( id char(20) not null , sous_dossier text not null , cid char(20) not null -, dossier text not null +, dossier text , mtime int not null , data text not null , other_cid char(20) not null -, other_dossier text not null +, other_dossier text , other_mtime int not null , UNIQUE (id, sous_dossier, cid, dossier) ); @@ -132,7 +132,7 @@ CREATE TABLE textes_versions_brutes , autorite text , num text , date_texte day -, dossier text not null +, dossier text , cid char(20) not null , mtime int not null ); diff --git a/legi/tar2sqlite.py b/legi/tar2sqlite.py index 8d559c8..ffe9248 100755 --- a/legi/tar2sqlite.py +++ b/legi/tar2sqlite.py @@ -42,13 +42,15 @@ 
def scrape_tags(attrs, root, wanted_tags, unwrap=False): ) -def suppress(get_table, db, liste_suppression): +def suppress(base, get_table, db, liste_suppression): counts = {} for path in liste_suppression: parts = path.split('/') - assert parts[0] == 'legi' - text_cid = parts[11] + if parts[0] == 'null': + continue + assert parts[0] == base.lower() text_id = parts[-1] + text_cid = parts[11] if base == 'LEGI' else text_id assert len(text_id) == 20 table = get_table(parts) db.run(""" @@ -124,7 +126,7 @@ def suppress(get_table, db, liste_suppression): """, (parts[3], text_cid, text_id)) count(counts, 'delete from duplicate_files', db.changes()) total = sum(counts.values()) - print("made", total, "changes in the database based on liste_suppression_legi.dat:", + print("made", total, "changes in the database based on liste_suppression_"+base.lower()+".dat:", json.dumps(counts, indent=4, sort_keys=True)) @@ -171,9 +173,14 @@ def process_archive(db, archive_path, process_links=True): update = db.update def get_table(parts): + if parts[-1][4:8] not in TABLES_MAP: + return None table = TABLES_MAP[parts[-1][4:8]] if table == 'textes_': - table += parts[13] + 's' + if parts[0] == 'legi': + table += parts[13] + 's' + elif parts[0] == 'jorf': + table += parts[3] + 's' return table counts = {} @@ -183,6 +190,8 @@ def count_one(k): except KeyError: counts[k] = 1 + base = db.one("SELECT value FROM db_meta WHERE key = 'base'") or 'LEGI' + skipped = 0 unknown_folders = {} liste_suppression = [] @@ -193,27 +202,53 @@ def count_one(k): if path[-1] == '/': continue parts = path.split('/') - if parts[-1] == 'liste_suppression_legi.dat': + if parts[-1] == 'liste_suppression_'+base.lower()+'.dat': liste_suppression += b''.join(entry.get_blocks()).decode('ascii').split() continue - if parts[1] == 'legi': + if parts[1] == base.lower(): path = path[len(parts[0])+1:] parts = parts[1:] - if not parts[2].startswith('code_et_TNC_'): + if parts[0] not in ['legi', 'jorf'] or \ + (parts[0] == 'legi' 
and not parts[2].startswith('code_et_TNC_')) or \ + (parts[0] == 'jorf' and parts[2] not in ['article', 'section_ta', 'texte']): # https://github.com/Legilibre/legi.py/issues/23 try: unknown_folders[parts[2]] += 1 except KeyError: unknown_folders[parts[2]] = 1 continue - dossier = parts[3] - text_cid = parts[11] + dossier = parts[3] if base == 'LEGI' else None + text_cid = parts[11] if base == 'LEGI' else None text_id = parts[-1][:-4] mtime = entry.mtime + # Read the file + xml.feed(b''.join(entry.get_blocks())) + root = xml.close() + tag = root.tag + meta = root.find('META') + + # Obtain the CID when database is not LEGI + if base != 'LEGI': + if tag in ['ARTICLE', 'SECTION_TA']: + contexte = root.find('CONTEXTE/TEXTE') + text_cid = attr(contexte, 'cid') + elif tag in ['TEXTELR', 'TEXTE_VERSION']: + meta_spec = meta.find('META_SPEC') + meta_chronicle = meta_spec.find('META_TEXTE_CHRONICLE') + text_cid = meta_chronicle.find('CID').text + else: + raise Exception('unexpected tag: '+tag) + # Skip the file if it hasn't changed, store it if it's a duplicate duplicate = False table = get_table(parts) + if table is None: + try: + unknown_folders[text_id] += 1 + except KeyError: + unknown_folders[text_id] = 1 + continue prev_row = db.one(""" SELECT mtime, dossier, cid FROM {0} @@ -270,11 +305,6 @@ def count_one(k): skipped += 1 continue - xml.feed(b''.join(entry.get_blocks())) - root = xml.close() - tag = root.tag - meta = root.find('META') - # Check the ID if tag == 'SECTION_TA': assert root.find('ID').text == text_id @@ -323,6 +353,9 @@ def count_one(k): ] elif tag == 'TEXTELR': assert table == 'textes_structs' + meta_spec = meta.find('META_SPEC') + meta_chronicle = meta_spec.find('META_TEXTE_CHRONICLE') + assert meta_chronicle.find('CID').text == text_cid scrape_tags(attrs, root, TEXTELR_TAGS) sommaires = [ { @@ -454,7 +487,7 @@ def count_one(k): print("skipped", x, "files in unknown folder `%s`" % d) if liste_suppression: - suppress(get_table, db, liste_suppression) + 
suppress(base, get_table, db, liste_suppression) def main(): @@ -467,6 +500,7 @@ def main(): p.add_argument('--pragma', action='append', default=[], help="Doc: https://www.sqlite.org/pragma.html | Example: journal_mode=WAL") p.add_argument('--raw', default=False, action='store_true') + p.add_argument('--base', choices=["LEGI", "JORF"]) p.add_argument('--skip-links', default=False, action='store_true', help="if set, all link metadata will be ignored (the `liens` table will be empty)") args = p.parse_args() @@ -475,7 +509,18 @@ def main(): os.mkdir(args.anomalies_dir) db = connect_db(args.db, pragmas=args.pragma) + base = db.one("SELECT value FROM db_meta WHERE key = 'base'") last_update = db.one("SELECT value FROM db_meta WHERE key = 'last_update'") + if not base: + base = args.base.upper() if args.base and not last_update else 'LEGI' + db.insert('db_meta', dict(key='base', value=base)) + if args.base and base != args.base.upper(): + print('!> Wrong database: requested '+args.base.upper()+' but existing database is '+base+'.') + raise SystemExit(1) + + if base != 'LEGI' and args.anomalies: + print("!> The --anomalies option can only be used with the LEGI base") + raise SystemExit(1) # Check and record the data mode db_meta_raw = db.one("SELECT value FROM db_meta WHERE key = 'raw'") @@ -488,6 +533,10 @@ def main(): if db_meta_raw != args.raw: db.insert('db_meta', dict(key='raw', value=args.raw), replace=True) + if base != 'LEGI' and not args.raw: + print("!> You need to use the --raw option when working with bases other than LEGI.") + raise SystemExit(1) + # Handle the --skip-links option has_links = bool(db.one("SELECT 1 FROM liens LIMIT 1")) if not args.skip_links and not has_links and last_update is not None: @@ -499,12 +548,12 @@ def main(): # Look for new archives in the given directory print("> last_update is", last_update) - archive_re = re.compile(r'(.+_)?legi(?P_global)?_(?P[0-9]{8}-[0-9]{6})\..+', flags=re.IGNORECASE) + archive_re = 
re.compile(r'(.+_)?'+base.lower()+r'(?P_global)?_(?P[0-9]{8}-[0-9]{6})\..+', flags=re.IGNORECASE) skipped = 0 archives = sorted([ (m.group('date'), bool(m.group('global')), m.group(0)) for m in [ archive_re.match(fn) for fn in os.listdir(args.directory) - if fnmatch(fn.lower(), '*legi_*.tar.*') + if fnmatch(fn.lower(), '*'+base.lower()+'_*.tar.*') ] ]) most_recent_global = [t[0] for t in archives if t[1]][-1]