From a70a8c138fb55d447c5605ecc9b0a618b8cc3c84 Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Wed, 17 Jun 2026 19:08:31 -0300 Subject: [PATCH 1/5] Altera script para nome do arquivo kbart para manter melhor conformidade --- run.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/run.sh b/run.sh index 5e4afca..68d8ab7 100755 --- a/run.sh +++ b/run.sh @@ -21,6 +21,7 @@ readonly MAX_RETRIES="${MAX_RETRIES:-3}" readonly RETRY_DELAY="${RETRY_DELAY:-5}" readonly EXIT_ON_FAILURE="${EXIT_ON_FAILURE:-true}" readonly TIMESTAMP="$(date +%Y%m%d_%H%M%S)" +readonly REPORT_DATE="$(date +%F)" readonly MASTER_LOG="$LOG_DIR/master_$TIMESTAMP.log" readonly SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}" @@ -41,9 +42,13 @@ readonly CSV_FILES=( "documents_licenses" "journals" "journals_status_changes" - "journals_kbart" ) +kbart_csv_file() { + local acronym=$1 + echo "SciELO_${acronym}_AllTitles_${REPORT_DATE}.csv" +} + notify_slack() { local message=$1 [[ -z "$SLACK_WEBHOOK_URL" ]] && return 0 @@ -149,7 +154,7 @@ process_collection() { local item=$1 local is_network_mode=$2 local counter=$3 - local acron nationality acrond tail_head=1 + local acron nationality acrond kbart_file tail_head=1 validate_acronym "$item" || return 1 @@ -157,6 +162,7 @@ process_collection() { nationality=$(echo "$item" | cut -f2 -d- -s) acrond="$acron" [[ "$acron" == "scl" ]] && acrond="bra" + kbart_file=$(kbart_csv_file "$acrond") [[ $counter -gt 1 ]] && tail_head=2 log_message "Processando coleção: $item" @@ -182,7 +188,7 @@ process_collection() { has_errors=1 critical_errors=1 } - run_processing_command "export_kbart" "$acron" "$acrond" "" "journals_kbart.csv" || { + run_processing_command "export_kbart" "$acron" "$acrond" "" "$kbart_file" || { has_errors=1 critical_errors=1 } @@ -200,6 +206,7 @@ process_collection() { for csv in "${CSV_FILES[@]}"; do [[ -f "${csv}.csv" ]] && zip_files+=("${csv}.csv") done + [[ -f "$kbart_file" ]] && zip_files+=("$kbart_file") if [[ 
${#zip_files[@]} -eq 0 ]]; then log_error "Nenhum arquivo CSV encontrado para $acron" @@ -216,6 +223,7 @@ process_collection() { for csv in "${CSV_FILES[@]}"; do [[ -f "${csv}.csv" ]] && tail -n +"$tail_head" "${csv}.csv" >> "$WORK_DIR/$NETWORK_DIR/${csv}.csv" done + [[ -f "$kbart_file" ]] && tail -n +"$tail_head" "$kbart_file" >> "$WORK_DIR/$NETWORK_DIR/$(kbart_csv_file network)" fi cd "$WORK_DIR" @@ -230,6 +238,9 @@ process_network_zip() { for csv in "${CSV_FILES[@]}"; do [[ -f "${csv}.csv" ]] && network_files+=("${csv}.csv") done + local network_kbart_file + network_kbart_file=$(kbart_csv_file network) + [[ -f "$network_kbart_file" ]] && network_files+=("$network_kbart_file") if [[ ${#network_files[@]} -eq 0 ]]; then log_error "Nenhum arquivo encontrado para criar tabs_network.zip" From 31c2d261318ac08579e2c82ada3d47b80932990f Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Wed, 17 Jun 2026 19:09:00 -0300 Subject: [PATCH 2/5] change nome do arquivo kbart --- docs/source/public_reports.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/public_reports.rst b/docs/source/public_reports.rst index f56b3f6..391926f 100644 --- a/docs/source/public_reports.rst +++ b/docs/source/public_reports.rst @@ -208,7 +208,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. Relatório de periódicos em formato Kbart ======================================== -**nome do arquivo:** journals_kbart.csv +**nome do arquivo:** SciELO_ACRONYM_AllTitles_YYYY-MM-DD.csv **finalidade:** Relatório de periódicos no formato Kbart. 
From b545350247642c12ea6853a2ec19ca0655dfdd66 Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Wed, 17 Jun 2026 19:09:09 -0300 Subject: [PATCH 3/5] black --- export/kbart.py | 204 +++++++++++++++++++++++++----------------------- 1 file changed, 107 insertions(+), 97 deletions(-) diff --git a/export/kbart.py b/export/kbart.py index 7134fbb..7ec5ba0 100644 --- a/export/kbart.py +++ b/export/kbart.py @@ -3,17 +3,18 @@ Este processamento gera uma tabulação de periódicos seguindo o formato Kbart. Formato de saída (headers em inglês, conforme diretrizes KBART): -publication_title, print_identifier, online_identifier, date_first_issue_online, -num_first_vol_online, num_first_issue_online, date_last_issue_online, -num_last_vol_online, num_last_issue_online, title_url, first_author, title_id, -embargo_info, coverage_depth, coverage_notes, publisher_name, publication_type, -date_monograph_published_print, date_monograph_published_online, monograph_volume, -monograph_edition, first_editor, parent_publication_title_id, +publication_title, print_identifier, online_identifier, date_first_issue_online, +num_first_vol_online, num_first_issue_online, date_last_issue_online, +num_last_vol_online, num_last_issue_online, title_url, first_author, title_id, +embargo_info, coverage_depth, coverage_notes, publisher_name, publication_type, +date_monograph_published_print, date_monograph_published_online, monograph_volume, +monograph_edition, first_editor, parent_publication_title_id, preceding_publication_title_id, access_type """ + import argparse -import logging import codecs +import logging import re import utils @@ -23,37 +24,39 @@ # ISSN redirects for journals that changed their ISSN in URLs # Maps old ISSN to new ISSN ISSN_URL_REDIRECTS = { - '1575-0620': '2013-6463', # Revista española de sanidad penitenciaria (SciELO Spain) + "1575-0620": "2013-6463", # Revista española de sanidad penitenciaria (SciELO Spain) } # Pre-compile regex patterns for ISSN redirects for better 
performance _ISSN_REDIRECT_PATTERNS = { - old_issn: re.compile(r'([?&]pid=)' + re.escape(old_issn) + r'(&|$)') + old_issn: re.compile(r"([?&]pid=)" + re.escape(old_issn) + r"(&|$)") for old_issn in list(ISSN_URL_REDIRECTS.keys()) } -def _config_logging(logging_level='INFO', logging_file=None): +def _config_logging(logging_level="INFO", logging_file=None): allowed_levels = { - 'DEBUG': logging.DEBUG, - 'INFO': logging.INFO, - 'WARNING': logging.WARNING, - 'ERROR': logging.ERROR, - 'CRITICAL': logging.CRITICAL + "DEBUG": logging.DEBUG, + "INFO": logging.INFO, + "WARNING": logging.WARNING, + "ERROR": logging.ERROR, + "CRITICAL": logging.CRITICAL, } - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) - logger.setLevel(allowed_levels.get(logging_level, 'INFO')) + logger.setLevel(allowed_levels.get(logging_level, "INFO")) if logging_file: - hl = logging.FileHandler(logging_file, mode='a') + hl = logging.FileHandler(logging_file, mode="a") else: hl = logging.StreamHandler() hl.setFormatter(formatter) - hl.setLevel(allowed_levels.get(logging_level, 'INFO')) + hl.setLevel(allowed_levels.get(logging_level, "INFO")) logger.addHandler(hl) @@ -61,7 +64,6 @@ def _config_logging(logging_level='INFO', logging_file=None): class Dumper(object): - def __init__(self, collection, issns=None, output_file=None): self._ratchet = utils.ratchet_server() @@ -69,7 +71,11 @@ def __init__(self, collection, issns=None, output_file=None): self._publicationstats = utils.publicationstats_server() self.collection = collection self.issns = issns - self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file + self.output_file = ( + codecs.open(output_file, "w", encoding="utf-8") + if output_file + else output_file + ) header = [ "publication_title", "print_identifier", @@ -95,21 +101,21 @@ def __init__(self, collection, issns=None, 
output_file=None): "first_editor", "parent_publication_title_id", "preceding_publication_title_id", - "access_type" - + "access_type", ] - self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) + self.write(",".join(['"%s"' % i.replace('"', '""') for i in header])) def _first_included_document_by_journal(self, issn, collection): fid = self._publicationstats.first_included_document_by_journal( - issn, collection) + issn, collection + ) if not fid: return None - document = self._articlemeta.document(fid['pid'], fid['collection']) + document = self._articlemeta.document(fid["pid"], fid["collection"]) if not document.data: return None @@ -118,13 +124,12 @@ def _first_included_document_by_journal(self, issn, collection): def _last_included_document_by_journal(self, issn, collection): - lid = self._publicationstats.last_included_document_by_journal( - issn, collection) + lid = self._publicationstats.last_included_document_by_journal(issn, collection) if not lid: return None - document = self._articlemeta.document(lid['pid'], lid['collection']) + document = self._articlemeta.document(lid["pid"], lid["collection"]) if not document.data: return None @@ -135,7 +140,7 @@ def write(self, line): if not self.output_file: print(line) else: - self.output_file.write('%s\r\n' % line) + self.output_file.write("%s\r\n" % line) def run(self): for item in list(self.items()): @@ -147,65 +152,86 @@ def items(self): self.issns = [None] for issn in self.issns: - for data in self._articlemeta.journals(collection=self.collection, issn=issn): - current_status = utils.get_metadata_value(data, 'current_status') - if current_status != 'current': - logger.debug('Skipping non-active journal: %s (status: %s)' % (data.scielo_issn, current_status)) + for data in self._articlemeta.journals( + collection=self.collection, issn=issn + ): + current_status = utils.get_metadata_value(data, "current_status") + if current_status != "current": + logger.debug( + "Skipping non-active journal: %s 
(status: %s)" + % (data.scielo_issn, current_status) + ) continue - logger.debug('Reading document: %s' % data.scielo_issn) + logger.debug("Reading document: %s" % data.scielo_issn) yield self.fmt_csv(data) def fmt_csv(self, data): line = [] - first_document = self._first_included_document_by_journal(data.scielo_issn, data.collection_acronym) - last_document = self._last_included_document_by_journal(data.scielo_issn, data.collection_acronym) + first_document = self._first_included_document_by_journal( + data.scielo_issn, data.collection_acronym + ) + last_document = self._last_included_document_by_journal( + data.scielo_issn, data.collection_acronym + ) line.append(data.title) - line.append(data.print_issn or '') - line.append(data.electronic_issn or '') - line.append( - first_document.publication_date or '' if first_document else '') + line.append(data.print_issn or "") + line.append(data.electronic_issn or "") + line.append(first_document.publication_date or "" if first_document else "") line.append( - first_document.issue.volume or '' if first_document and first_document.issue else '') + first_document.issue.volume or "" + if first_document and first_document.issue + else "" + ) line.append( - first_document.issue.number or '' if first_document and first_document.issue else '') - if utils.get_metadata_value(data, 'current_status') != 'current': - line.append( - last_document.publication_date or '' if last_document else '') + first_document.issue.number or "" + if first_document and first_document.issue + else "" + ) + if utils.get_metadata_value(data, "current_status") != "current": + line.append(last_document.publication_date or "" if last_document else "") line.append( - last_document.issue.volume or '' if last_document and last_document.issue else '') + last_document.issue.volume or "" + if last_document and last_document.issue + else "" + ) line.append( - last_document.issue.number or '' if last_document and last_document.issue else '') + 
last_document.issue.number or "" + if last_document and last_document.issue + else "" + ) else: - line += ['', '', ''] + line += ["", "", ""] # Generate the URL - url = data.url().replace('sci_serial', 'sci_issues') - + url = data.url().replace("sci_serial", "sci_issues") + # Apply ISSN redirects for journals that changed their ISSN in URLs # This is necessary for journals that no longer use their print ISSN for old_issn, new_issn in list(ISSN_URL_REDIRECTS.items()): # Use pre-compiled regex pattern for better performance pattern = _ISSN_REDIRECT_PATTERNS[old_issn] - url = pattern.sub(r'\g<1>' + new_issn + r'\2', url) - + url = pattern.sub(r"\g<1>" + new_issn + r"\2", url) + line.append(url) - line.append('') # first_author - line.append(data.scielo_issn or '') - line.append('') # embargo_info - line.append('fulltext') # coverage_depth - line.append('') # coverage_notes - line.append(' '.join(data.publisher_name) if data.publisher_name else '') # publisher_name - line.append('Serial') # publication_type - line.append('') # date_monograph_published_print - line.append('') # date_monograph_published_online - line.append('') # monograph_volume - line.append('') # monograph_edition - line.append('') # first_editor - line.append('') # parent_publication_title_id - line.append('') # preceding_publication_title_id - line.append('F') # access_type - - joined_line = ','.join(['"%s"' % i.replace('"', '""') for i in line]) + line.append("") # first_author + line.append(data.scielo_issn or "") + line.append("") # embargo_info + line.append("fulltext") # coverage_depth + line.append("") # coverage_notes + line.append( + " ".join(data.publisher_name) if data.publisher_name else "" + ) # publisher_name + line.append("Serial") # publication_type + line.append("") # date_monograph_published_print + line.append("") # date_monograph_published_online + line.append("") # monograph_volume + line.append("") # monograph_edition + line.append("") # first_editor + line.append("") # 
parent_publication_title_id + line.append("") # preceding_publication_title_id + line.append("F") # access_type + + joined_line = ",".join(['"%s"' % i.replace('"', '""') for i in line]) return joined_line @@ -213,44 +239,28 @@ def fmt_csv(self, data): def main(): parser = argparse.ArgumentParser( - description='Export journals list in Kabart format' + description="Export journals list in Kabart format" ) - parser.add_argument( - 'issns', - nargs='*', - help='ISSN\'s separated by spaces' - ) + parser.add_argument("issns", nargs="*", help="ISSN's separated by spaces") - parser.add_argument( - '--collection', - '-c', - help='Collection Acronym' - ) + parser.add_argument("--collection", "-c", help="Collection Acronym") - parser.add_argument( - '--output_file', - '-r', - help='File to receive the dumped data' - ) + parser.add_argument("--output_file", "-r", help="File to receive the dumped data") - parser.add_argument( - '--logging_file', - '-o', - help='Full path to the log file' - ) + parser.add_argument("--logging_file", "-o", help="Full path to the log file") parser.add_argument( - '--logging_level', - '-l', - default='DEBUG', - choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], - help='Logggin level' + "--logging_level", + "-l", + default="DEBUG", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Logggin level", ) args = parser.parse_args() _config_logging(args.logging_level, args.logging_file) - logger.info('Dumping data for: %s' % args.collection) + logger.info("Dumping data for: %s" % args.collection) issns = None if len(args.issns) > 0: From 5f8c1215a3399d7e6bdf527614626259737c36bc Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 18 Jun 2026 15:01:10 -0300 Subject: [PATCH 4/5] =?UTF-8?q?Mant=C3=A9m=20protocolo=20HTTP=20para=20col?= =?UTF-8?q?e=C3=A7=C3=B5es=20definidas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- export/kbart.py | 9 +++++++++ tests/test_kbart.py | 35 
+++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 tests/test_kbart.py diff --git a/export/kbart.py b/export/kbart.py index 7ec5ba0..7cf0299 100644 --- a/export/kbart.py +++ b/export/kbart.py @@ -27,6 +27,8 @@ "1575-0620": "2013-6463", # Revista española de sanidad penitenciaria (SciELO Spain) } +HTTP_ONLY_COLLECTIONS = set(["bol", "col", "per", "cub", "sss", "ury"]) + # Pre-compile regex patterns for ISSN redirects for better performance _ISSN_REDIRECT_PATTERNS = { old_issn: re.compile(r"([?&]pid=)" + re.escape(old_issn) + r"(&|$)") @@ -34,6 +36,11 @@ } +def title_url_for_collection(url, collection): + if collection not in HTTP_ONLY_COLLECTIONS and url.startswith("http://"): + return "https://" + url[len("http://"):] + return url + def _config_logging(logging_level="INFO", logging_file=None): allowed_levels = { @@ -212,6 +219,8 @@ def fmt_csv(self, data): pattern = _ISSN_REDIRECT_PATTERNS[old_issn] url = pattern.sub(r"\g<1>" + new_issn + r"\2", url) + url = title_url_for_collection(url, self.collection) + line.append(url) line.append("") # first_author line.append(data.scielo_issn or "") diff --git a/tests/test_kbart.py b/tests/test_kbart.py new file mode 100644 index 0000000..5e91c6d --- /dev/null +++ b/tests/test_kbart.py @@ -0,0 +1,35 @@ +# coding: utf-8 +import unittest + +from export import kbart + + +class KbartTest(unittest.TestCase): + + def test_title_url_uses_https_for_updated_collections(self): + url = "http://www.scielo.br/scielo.php?script=sci_issues&pid=0100-879X" + + result = kbart.title_url_for_collection(url, "scl") + + self.assertEqual( + result, + "https://www.scielo.br/scielo.php?script=sci_issues&pid=0100-879X" + ) + + def test_title_url_keeps_http_for_collections_without_https(self): + url = "http://www.scielo.org.bo/scielo.php?script=sci_issues&pid=2077-3323" + + result = kbart.title_url_for_collection(url, "bol") + + self.assertEqual(result, url) + + def test_title_url_keeps_existing_https(self): + url 
= "https://www.scielo.br/scielo.php?script=sci_issues&pid=0100-879X" + + result = kbart.title_url_for_collection(url, "scl") + + self.assertEqual(result, url) + + +if __name__ == "__main__": + unittest.main() From 156b40fdbb3e5cba70b8845e83cd601bbe93bebd Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Fri, 19 Jun 2026 15:43:20 -0300 Subject: [PATCH 5/5] Update version xylose to version 1.35.14 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a1e278d..6bf3ee0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ lxml>=6.0.0 requests>=2.32.0 scieloh5m5 @ git+https://github.com/scieloorg/scieloh5m5.git@1.9.6 thriftpy2>=0.5.2 -xylose @ git+https://github.com/scieloorg/xylose.git@1.35.13 +xylose @ git+https://github.com/scieloorg/xylose.git@1.35.14