diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..2e524b0 --- /dev/null +++ b/.env.example @@ -0,0 +1,12 @@ +PROCESSING_SETTINGS_FILE= +ARTICLEMETA_THRIFTSERVER= +ARTICLEMETA_ADMINTOKEN= +RATCHET_THRIFTSERVER= +ACCESSSTATS_THRIFTSERVER= +CITEDBY_THRIFTSERVER= +PUBLICATIONSTATS_THRIFTSERVER= +SOLR_SEARCH_SCIELO_ORG= +SOLR_SEARCH_SCIELO_ORG_INDEX= +PUBLICATIONSTATS_TIMEOUT_MS=60000 +EXIT_ON_FAILURE=true +SLACK_WEBHOOK_URL= diff --git a/.github/workflows/master-quality.yml b/.github/workflows/master-quality.yml new file mode 100644 index 0000000..d1cd830 --- /dev/null +++ b/.github/workflows/master-quality.yml @@ -0,0 +1,26 @@ +name: Master Quality + +on: + push: + branches: + - codex/python3-14-migration + +jobs: + sonar: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: SonarQube Scan + uses: SonarSource/sonarqube-scan-action@v6 + env: + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} + SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }} + + - name: Quality Gate + uses: SonarSource/sonarqube-quality-gate-action@v1 + env: + SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..9b762fc --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,117 @@ +name: Release + +on: + push: + tags: + - 'v*' + +jobs: + release: + runs-on: ubuntu-latest + permissions: + id-token: write + contents: read + + steps: + - uses: actions/checkout@v4 + + - name: Set version + run: echo "VERSION=${GITHUB_REF_NAME}" >> $GITHUB_ENV + + - name: Docker Login + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USER }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build image + run: | + docker build \ + -t infrascielo/processing:${VERSION} \ + -t infrascielo/processing:latest \ + . + + # 🔐 Scan único (policy) + - name: Trivy Image Scan + uses: aquasecurity/trivy-action@v0.36.0 + with: + image-ref: infrascielo/processing:${{ env.VERSION }} + severity: HIGH,CRITICAL + exit-code: 0 + + - name: Install Trivy CLI + run: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin + + # 📄 Relatório (evidência) + - name: Trivy Report + run: | + trivy image \ + --scanners vuln \ + --severity HIGH,CRITICAL \ + --format table \ + --output trivy-report.txt \ + infrascielo/processing:${VERSION} + + - uses: actions/upload-artifact@v4 + with: + name: trivy-report + path: trivy-report.txt + + # 📦 SBOM + - name: Generate SBOM (CycloneDX) + run: | + trivy image \ + --scanners vuln \ + --format cyclonedx \ + --output sbom-${VERSION}.json \ + infrascielo/processing:${VERSION} + + - uses: actions/upload-artifact@v4 + with: + name: sbom-${{ env.VERSION }} + path: sbom-${{ env.VERSION }}.json + + - name: Push image + run: | + docker push infrascielo/processing:${VERSION} + docker push infrascielo/processing:latest + + - name: Push image + run: | + docker push infrascielo/processing:${VERSION} + docker push infrascielo/processing:latest + + - name: Get image digest + run: | + DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' infrascielo/processing:${VERSION}) + echo "IMAGE_DIGEST=${DIGEST}" >> $GITHUB_ENV + + - name: Install Cosign + uses: sigstore/cosign-installer@v3 + + - name: Sign image with Cosign + env: + COSIGN_EXPERIMENTAL: "1" + COSIGN_YES: "true" + run: | + cosign sign ${IMAGE_DIGEST} + + - name: Verify image signature + env: + COSIGN_EXPERIMENTAL: "1" + run: | + cosign verify \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com \ + --certificate-identity-regexp "https://github.com/${{ github.repository }}/*" \ + ${IMAGE_DIGEST} + + - name: Attach SBOM attestation + env: + COSIGN_EXPERIMENTAL: "1" + COSIGN_YES: "true" + run: | + cosign attest \ + --predicate sbom-${VERSION}.json \ + --type cyclonedx \ + ${IMAGE_DIGEST} diff --git a/.gitignore b/.gitignore index 9dec496..4632b0e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ .DS_Store +*.swp +.env +.env.* +!.env.example # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -57,5 +61,11 @@ docs/_build/ # PyBuilder target/ +# Runtime processing workspace +tmp_*/ +network/ +var/ +k8s/*secret.yaml + # pip source directory src/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8316d70 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,40 @@ +FROM python:3.14-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PROCESSING_SETTINGS_FILE=/app/config.ini \ + DOCKER_ENV=1 \ + WORK_DIR=/app \ + LOG_DIR=/var/log/processing \ + TABS_DIR=/var/www/static_scielo_org/tabs \ + PYTHONPATH=/app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + git \ + libxml2-dev \ + libxslt1-dev \ + zip \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt setup.py ./ +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && pip install --no-deps \ + accessstatsapi==1.2.1 \ + articlemetaapi==1.26.7 \ + citedbyapi==1.11.3 \ + publicationstatsapi==1.2.2 \ + packtools==2.6.4 + +COPY . . +RUN cp config.ini-TEMPLATE config.ini \ + && mkdir -p /var/log/processing /var/www/static_scielo_org/tabs \ + && pip install --no-deps -e . + +ENTRYPOINT ["/bin/bash", "/app/run.sh"] diff --git a/README.md b/README.md index f362c62..c91dc86 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,76 @@ e também para o envio de dados à parceiros. ## Requisitos: -* Python 2.7 +* Python 3.14 + +## Docker + +Para instalar as dependências e executar a suíte de testes: + +```bash +docker compose run --rm tests +``` + +Para executar o processamento via Docker, configure as variáveis em um arquivo +`.env` local, baseado em `.env.example`. O `.env` não deve ser versionado. + +```bash +cp .env.example .env +``` + +Preencha no `.env` os valores equivalentes ao antigo `config.ini`: + +```ini +ARTICLEMETA_THRIFTSERVER= +ARTICLEMETA_ADMINTOKEN= +RATCHET_THRIFTSERVER= +ACCESSSTATS_THRIFTSERVER= +CITEDBY_THRIFTSERVER= +PUBLICATIONSTATS_THRIFTSERVER= +SOLR_SEARCH_SCIELO_ORG= +SOLR_SEARCH_SCIELO_ORG_INDEX= +PUBLICATIONSTATS_TIMEOUT_MS=60000 +SLACK_WEBHOOK_URL= +``` + +Depois execute: + +```bash +docker compose run --rm processing "scl-BR" +``` + +Os logs ficam persistidos em `var/log/processing` e os arquivos ZIP gerados em +`var/tabs`. + +## Kubernetes / Argo + +O agendamento para Argo Workflows está em `k8s/argo-cronworkflow.yaml` com o cron: + +```text +0 3 1,8,15,22 1-12 * +``` + +O workflow executa as coleções sequencialmente. Quando uma coleção falha, o +script envia a notificação de erro e continua para a próxima; ao final, o job +termina com sucesso para não interromper as próximas execuções agendadas. + +Os valores sensíveis devem ser criados em um Secret do Kubernetes chamado +`processing-env`, baseado em `k8s/processing-env.secret.example.yaml`. Não +versione o Secret real. + +Volumes esperados: + +```text +processing-data -> /var/www/static_scielo_org/tabs +processing-logs -> /var/log/processing +``` + +Aplicação: + +```bash +kubectl apply -f k8s/processing-env.secret.yaml +kubectl apply -f k8s/argo-cronworkflow.yaml +``` ## Exportação de dados ao DOAJ diff --git a/accesses/documents_by_journals.py b/accesses/documents_by_journals.py index 7d2c4b1..49afc56 100644 --- a/accesses/documents_by_journals.py +++ b/accesses/documents_by_journals.py @@ -51,35 +51,35 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"publishing year") - header.append(u"accesses year") - header.append(u"accesses to html") - header.append(u"accesses to abstract") - header.append(u"accesses to pdf") - header.append(u"accesses to epdf") - header.append(u"total accesses") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("publishing year") + header.append("accesses year") + header.append("accesses to html") + header.append("accesses to abstract") + header.append("accesses to pdf") + header.append("accesses to epdf") + header.append("total accesses") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -103,17 +103,17 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'journal') + line.append('journal') line.append(data.collection_acronym) line.append(data.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.title) - line.append(u';'.join(data.subject_areas or [])) + line.append(';'.join(data.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.subject_areas or []) > 2 else '0') line.append(data.current_status) diff --git a/accesses/dumpdata.py b/accesses/dumpdata.py index 542742d..1d0be66 100644 --- a/accesses/dumpdata.py +++ b/accesses/dumpdata.py @@ -61,7 +61,7 @@ def pdf_keys(fulltexts): if not 'pdf' in fulltexts: return keys - for language, url in fulltexts['pdf'].items(): + for language, url in list(fulltexts['pdf'].items()): path = REGEX_PDF_PATH.search(url) if path: keys.append(path.group()) @@ -166,7 +166,7 @@ def join_metadata_with_accesses(document, accesses_date, accesses): if document.original_title(): data['document_title'] = document.original_title() elif document.translated_titles(): - for language, title in document.translated_titles().items(): + for language, title in list(document.translated_titles().items()): if title: data['document_title'] = title break @@ -192,7 +192,7 @@ def join_metadata_with_accesses(document, accesses_date, accesses): data['access_html'] = accesses.get('html', 0) data['access_pdf'] = accesses.get('pdf', 0) data['access_epdf'] = accesses.get('readcube', 0) - data['access_total'] = sum([v for i, v in accesses.items()]) + data['access_total'] = sum([v for i, v in list(accesses.items())]) return data @@ -216,7 +216,7 @@ def joining_monthly(joined_data, atype, data): if 'total' in data: del(data['total']) - for year, months in data.items(): + for year, months in list(data.items()): del(months['total']) for month in months: @@ -234,9 +234,9 @@ def joining_dayly(joined_data, atype, data): if 'total' in data: del(data['total']) - for year, months in data.items(): + for year, months in list(data.items()): del(months['total']) - for month, days in months.items(): + for month, days in list(months.items()): del(days['total']) for day in days: dt = '%s-%s-%s' % (year[1:], month[1:], day[1:]) @@ -253,7 +253,7 @@ def joining_dayly(joined_data, atype, data): joining = joining_dayly for data in accesses: - for key, value in data.items(): + for key, value in list(data.items()): if not key in ['abstract', 'html', 'pdf', 'readcube']: continue joined_data = joining(joined_data, key, value) @@ -280,37 +280,37 @@ def __init__(self, collection, issns=None, from_date=FROM, until_date=UNTIL, else: self.fmt = self.fmt_csv header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publishing ID (PID SciELO)") - header.append(u"document publishing year") - header.append(u"document type") - header.append(u'document is citable') - header.append(u"issue") - header.append(u"issue title") - header.append(u"document title") - header.append(u"processing date") - header.append(u"publication date at SciELO") - header.append(u"publication date") - header.append(u"access date") - header.append(u"access year") - header.append(u"access month") - header.append(u"access to abstract") - header.append(u"access to html") - header.append(u"access to pdf") - header.append(u"access to epdf") - header.append(u"access total") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publishing ID (PID SciELO)") + header.append("document publishing year") + header.append("document type") + header.append('document is citable') + header.append("issue") + header.append("issue title") + header.append("document title") + header.append("processing date") + header.append("publication date at SciELO") + header.append("publication date") + header.append("access date") + header.append("access year") + header.append("access month") + header.append("access to abstract") + header.append("access to html") + header.append("access to pdf") + header.append("access to epdf") + header.append("access total") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def get_accesses(self, issn): for document in self._articlemeta.documents(collection=self.collection, issn=issn): @@ -333,7 +333,7 @@ def get_accesses(self, issn): accesses, self.from_date, self.until_date, self.dayly_granularity) - for adate, adata in joined_accesses.items(): + for adate, adata in list(joined_accesses.items()): try: yield join_metadata_with_accesses(document, adate, adata) except Exception as e: @@ -341,7 +341,7 @@ def get_accesses(self, issn): def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) @@ -358,20 +358,20 @@ def fmt_csv(self, data): line.append('document') line.append(data['collection']) line.append(data['issn']) - line.append(u';'.join(data['issns'])) + line.append(';'.join(data['issns'])) line.append(data['journal_title']) line.append(', '.join(data['subject_areas'])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data['subject_areas'] or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data['subject_areas'] or []) > 2 else '0') line.append(data['journal_current_status']) line.append(data['pid']) line.append(data['publication_year']) line.append(data['document_type']) - line.append(u'1' if data['document_type'].lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append('1' if data['document_type'].lower() in choices.CITABLE_DOCUMENT_TYPES else '0') line.append(data['issue']) line.append(data['issue_title']) line.append(data['document_title']) @@ -397,7 +397,7 @@ def run(self): if not self.output_file: for issn in self.issns: for data in self.get_accesses(issn=issn): - print(self.fmt(data)) + print((self.fmt(data))) exit() for issn in self.issns: diff --git a/bibliometric/citedby_document.py b/bibliometric/citedby_document.py index a584b34..e51f42b 100644 --- a/bibliometric/citedby_document.py +++ b/bibliometric/citedby_document.py @@ -59,38 +59,38 @@ def __init__(self, collection, issns=None, output_file=None, output_format=OUTPU if output_format != 'json': header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publication ID (PID SciELO)") - header.append(u"document publication year") - header.append(u"document type") - header.append(u"document is citable") - header.append(u"document title") - header.append(u"cited publication ID (PID SciELO)") - header.append(u"cited by issn") - header.append(u"cited by journal") - header.append(u"cited by document publication year") - header.append(u"cited by document title") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publication ID (PID SciELO)") + header.append("document publication year") + header.append("document type") + header.append("document is citable") + header.append("document title") + header.append("cited publication ID (PID SciELO)") + header.append("cited by issn") + header.append("cited by journal") + header.append("cited by document publication year") + header.append("cited by document title") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -130,23 +130,23 @@ def fmt_csv(self, content): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'document') + line.append('document') line.append(data.collection_acronym) line.append(data.journal.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.journal.title) - line.append(u';'.join(data.journal.subject_areas or [])) + line.append(';'.join(data.journal.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.journal.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) line.append(data.document_type) - line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append('1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') line.append(data.original_title() or '') line.append(citedby.get('code', '')) line.append(citedby.get('issn', '')) diff --git a/bibliometric/citedby_journal.py b/bibliometric/citedby_journal.py index 970f1c8..da36b6b 100644 --- a/bibliometric/citedby_journal.py +++ b/bibliometric/citedby_journal.py @@ -70,32 +70,32 @@ def __init__(self, collection, issns=None, output_file=None, output_format=OUTPU if output_format != 'json': header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"has optimized queries") - header.append(u"publications from (year)") - header.append(u"cited publications from (year)") - header.append(u"total of citations") + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("has optimized queries") + header.append("publications from (year)") + header.append("cited publications from (year)") + header.append("total of citations") - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -135,17 +135,17 @@ def fmt_csv(self, content): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'journal') + line.append('journal') line.append(data.collection_acronym) line.append(data.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.title) - line.append(u';'.join(data.subject_areas or [])) + line.append(';'.join(data.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.subject_areas or []) > 2 else '0') line.append(data.current_status) line.append('1' if self._citedby.has_optmized_journal_queries(data.scielo_issn) else '0') diff --git a/bibliometric/impact_factor.py b/bibliometric/impact_factor.py index 0a0a594..632acda 100644 --- a/bibliometric/impact_factor.py +++ b/bibliometric/impact_factor.py @@ -51,35 +51,35 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"base year") - header.append(u"imediacity") - header.append(u"SciELO impact 1 year") - header.append(u"SciELO impact 2 years") - header.append(u"SciELO impact 3 years") - header.append(u"SciELO impact 4 years") - header.append(u"SciELO impact 5 years") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("base year") + header.append("imediacity") + header.append("SciELO impact 1 year") + header.append("SciELO impact 2 years") + header.append("SciELO impact 3 years") + header.append("SciELO impact 4 years") + header.append("SciELO impact 5 years") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -103,17 +103,17 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'journal') + line.append('journal') line.append(data.collection_acronym) line.append(data.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.title) - line.append(u';'.join(data.subject_areas)) + line.append(';'.join(data.subject_areas)) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.subject_areas]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.subject_areas or []) > 2 else '0') line.append(data.current_status) diff --git a/choices.py b/choices.py index 3436185..802cbf1 100644 --- a/choices.py +++ b/choices.py @@ -1,299 +1,299 @@ # coding: utf-8 THEMATIC_AREAS = sorted([ - u"agricultural sciences", - u"applied social sciences", - u"biological sciences", - u"engineering", - u"exact and earth sciences", - u"health sciences", - u"human sciences", - u"linguistics, letters and arts" + "agricultural sciences", + "applied social sciences", + "biological sciences", + "engineering", + "exact and earth sciences", + "health sciences", + "human sciences", + "linguistics, letters and arts" ]) CITABLE_DOCUMENT_TYPES = ( 'data-article', - u'article-commentary', - u'brief-report', - u'case-report', - u'rapid-communication', - u'research-article', - u'review-article' + 'article-commentary', + 'brief-report', + 'case-report', + 'rapid-communication', + 'research-article', + 'review-article' ) ISO_3166 = { - 'BD': u'Bangladesh', - 'BE': u'Belgium', - 'BF': u'Burkina Faso', - 'BG': u'Bulgaria', - 'BA': u'Bosnia and Herzegovina', - 'BB': u'Barbados', - 'WF': u'Wallis and Futuna', - 'BL': u'Saint Barthélemy', - 'BM': u'Bermuda', - 'BN': u'Brunei Darussalam', - 'BO': u'Bolivia', - 'BH': u'Bahrain', - 'BI': u'Burundi', - 'BJ': u'Benin', - 'BT': u'Bhutan', - 'BU': u'Burma', - 'BV': u'Bouvet Island', - 'BW': u'Botswana', - 'WS': u'Samoa', - 'BQ': u'British Antarctic Territory', - 'BR': u'Brazil', - 'BS': u'Bahamas', - 'JE': u'Jersey', - 'WK': u'Wake Island', - 'BY': u'Byelorussian SSR', - 'BZ': u'Belize', - 'RU': u'Russian Federation', - 'RW': u'Rwanda', - 'PC': u'Pacific Islands', - 'TL': u'Timor-Leste', - 'JT': u'Johnston Island', - 'TM': u'Turkmenistan', - 'TJ': u'Tajikistan', - 'RO': u'Romania', - 'RH': u'Southern Rhodesia', - 'TK': u'Tokelau', - 'GW': u'Guinea-Bissau', - 'GU': u'Guam', - 'GT': u'Guatemala', - 'GS': u'South Georgia and the South Sandwich Islands', - 'GR': u'Greece', - 'GQ': u'Equatorial Guinea', - 'GP': u'Guadeloupe', - 'JP': u'Japan', - 'GY': u'Guyana', - 'GG': u'Guernsey', - 'GF': u'French Guiana', - 'GE': u'Gilbert and Ellice Islands', - 'GD': u'Grenada', - 'GB': u'United Kingdom', - 'GA': u'Gabon', - 'SV': u'El Salvador', - 'GN': u'Guinea', - 'GM': u'Gambia', - 'GL': u'Greenland', - 'GI': u'Gibraltar', - 'GH': u'Ghana', - 'OM': u'Oman', - 'TN': u'Tunisia', - 'JM': u'Jamaica', - 'JO': u'Jordan', - 'HR': u'Croatia', - 'HV': u'Upper Volta', - 'HT': u'Haiti', - 'HU': u'Hungary', - 'HK': u'Hong Kong', - 'HN': u'Honduras', - 'HM': u'Heard Island and McDonald Islands', - 'VD': u'Viet-Nam', - 'VE': u'Venezuela', - 'PR': u'Puerto Rico', - 'PS': u'Palestine', - 'UA': u'Ukraine', - 'PW': u'Palau', - 'PT': u'Portugal', - 'PU': u'United States Miscellaneous Pacific Islands', - 'PZ': u'Panama Canal Zone', - 'PY': u'Paraguay', - 'IQ': u'Iraq', - 'PA': u'Panama', - 'PF': u'French Polynesia', - 'PG': u'Papua New Guinea', - 'PE': u'Peru', - 'PK': u'Pakistan', - 'PH': u'Philippines', - 'PN': u'Pitcairn', - 'PL': u'Poland', - 'PM': u'Saint Pierre and Miquelon', - 'ZM': u'Zambia', - 'EH': u'Western Sahara', - 'EE': u'Estonia', - 'EG': u'Egypt', - 'ZA': u'South Africa', - 'EC': u'Ecuador', - 'IT': u'Italy', - 'VN': u'Viet Nam', - 'SB': u'Solomon Islands', - 'ET': u'Ethiopia', - 'SO': u'Somalia', - 'ZW': u'Zimbabwe', - 'SA': u'Saudi Arabia', - 'ES': u'Spain', - 'ER': u'Eritrea', - 'ME': u'Montenegro', - 'MD': u'Moldova', - 'MG': u'Madagascar', - 'MF': u'Saint Martin', - 'MA': u'Morocco', - 'MC': u'Monaco', - 'UZ': u'Uzbekistan', - 'MM': u'Myanmar', - 'ML': u'Mali', - 'MO': u'Macao', - 'MN': u'Mongolia', - 'MI': u'Midway Islands', - 'MH': u'Marshall Islands', - 'MK': u'Macedonia', - 'MU': u'Mauritius', - 'MT': u'Malta', - 'MW': u'Malawi', - 'MV': u'Maldives', - 'MQ': u'Martinique', - 'MP': u'Northern Mariana Islands', - 'MS': u'Montserrat', - 'MR': u'Mauritania', - 'IM': u'Isle of Man', - 'UG': u'Uganda', - 'TZ': u'Tanzania', - 'MY': u'Malaysia', - 'MX': u'Mexico', - 'IL': u'Israel', - 'FQ': u'French Southern and Antarctic Territories', - 'FR': u'France', - 'IO': u'British Indian Ocean Territory', - 'SH': u'Saint Helena', - 'RE': u'Réunion', - 'SJ': u'Svalbard and Jan Mayen', - 'FI': u'Finland', - 'FJ': u'Fiji', - 'FK': u'Falkland Islands', - 'FM': u'Micronesia', - 'FO': u'Faroe Islands', - 'NH': u'New Hebrides', - 'NI': u'Nicaragua', - 'NL': u'Netherlands', - 'NO': u'Norway', - 'NA': u'Namibia', - 'VU': u'Vanuatu', - 'NC': u'New Caledonia', - 'NE': u'Niger', - 'NF': u'Norfolk Island', - 'NG': u'Nigeria', - 'NZ': u'New Zealand', - 'ZR': u'Zaire', - 'NP': u'Nepal', - 'NQ': u'Dronning Maud Land', - 'NR': u'Nauru', - 'NT': u'Neutral Zone', - 'NU': u'Niue', - 'CK': u'Cook Islands', - 'CI': u"Côte d'Ivoire", - 'CH': u'Switzerland', - 'CO': u'Colombia', - 'CN': u'China', - 'CM': u'Cameroon', - 'CL': u'Chile', - 'CC': u'Cocos', - 'CA': u'Canada', - 'CG': u'Congo', - 'CF': u'Central African Republic', - 'CD': u'Congo', - 'CZ': u'Czech Republic', - 'CY': u'Cyprus', - 'CX': u'Christmas Island', - 'CS': u'Czechoslovakia', - 'CR': u'Costa Rica', - 'CW': u'Curaçao', - 'CV': u'Cabo Verde', - 'CU': u'Cuba', - 'CT': u'Canton and Enderbury Islands', - 'SZ': u'Swaziland', - 'SY': u'Syrian Arab Republic', - 'SX': u'Sint Maarten', - 'KG': u'Kyrgyzstan', - 'KE': u'Kenya', - 'SS': u'South Sudan', - 'SR': u'Suriname', - 'KI': u'Kiribati', - 'KH': u'Cambodia', - 'KN': u'Saint Kitts and Nevis', - 'KM': u'Comoros', - 'ST': u'Sao Tome and Principe', - 'SK': u'Slovakia', - 'KR': u'Korea', - 'SI': u'Slovenia', - 'KP': u'Korea', - 'KW': u'Kuwait', - 'SN': u'Senegal', - 'SM': u'San Marino', - 'SL': u'Sierra Leone', - 'SC': u'Seychelles', - 'KZ': u'Kazakhstan', - 'KY': u'Cayman Islands', - 'SG': u'Singapore', - 'SE': u'Sweden', - 'SD': u'Sudan', - 'DO': u'Dominican Republic', - 'DM': u'Dominica', - 'DJ': u'Djibouti', - 'DK': u'Denmark', - 'VG': u'Virgin Islands', - 'DD': u'German Democratic Republic', - 'DE': u'Germany', - 'YE': u'Yemen', - 'YD': u'Yemen', - 'DZ': u'Algeria', - 'US': u'United States', - 'DY': u'Dahomey', - 'UY': u'Uruguay', - 'YU': u'Yugoslavia', - 'YT': u'Mayotte', - 'UM': u'United States Minor Outlying Islands', - 'LB': u'Lebanon', - 'LC': u'Saint Lucia', - 'LA': u"Lao People's Democratic Republic", - 'TV': u'Tuvalu', - 'TW': u'Taiwan', - 'TT': u'Trinidad and Tobago', - 'TR': u'Turkey', - 'LK': u'Sri Lanka', - 'TP': u'East Timor', - 'LI': u'Liechtenstein', - 'LV': u'Latvia', - 'TO': u'Tonga', - 'LT': u'Lithuania', - 'LU': u'Luxembourg', - 'LR': u'Liberia', - 'LS': u'Lesotho', - 'TH': u'Thailand', - 'TF': u'French Southern Territories', - 'TG': u'Togo', - 'TD': u'Chad', - 'TC': u'Turks and Caicos Islands', - 'LY': u'Libya', - 'VA': u'Holy See', - 'VC': u'Saint Vincent and the Grenadines', - 'AE': u'United Arab Emirates', - 'AD': u'Andorra', - 'AG': u'Antigua and Barbuda', - 'AF': u'Afghanistan', - 'AI': u'Anguilla', - 'VI': u'Virgin Islands', - 'IS': u'Iceland', - 'IR': u'Iran', - 'AM': u'Armenia', - 'AL': u'Albania', - 'AO': u'Angola', - 'AN': u'Netherlands Antilles', - 'AQ': u'Antarctica', - 'AS': u'American Samoa', - 'AR': u'Argentina', - 'AU': u'Australia', - 'AT': u'Austria', - 'AW': u'Aruba', - 'IN': u'India', - 'AX': u'Âland Islands', - 'AZ': u'Azerbaijan', - 'IE': u'Ireland', - 'ID': u'Indonesia', - 'RS': u'Serbia', - 'QA': u'Qatar', - 'MZ': u'Mozambique' + 'BD': 'Bangladesh', + 'BE': 'Belgium', + 'BF': 'Burkina Faso', + 'BG': 'Bulgaria', + 'BA': 'Bosnia and Herzegovina', + 'BB': 'Barbados', + 'WF': 'Wallis and Futuna', + 'BL': 'Saint Barthélemy', + 'BM': 'Bermuda', + 'BN': 'Brunei Darussalam', + 'BO': 'Bolivia', + 'BH': 'Bahrain', + 'BI': 'Burundi', + 'BJ': 'Benin', + 'BT': 'Bhutan', + 'BU': 'Burma', + 'BV': 'Bouvet Island', + 'BW': 'Botswana', + 'WS': 'Samoa', + 'BQ': 'British Antarctic Territory', + 'BR': 'Brazil', + 'BS': 'Bahamas', + 'JE': 'Jersey', + 'WK': 'Wake Island', + 'BY': 'Byelorussian SSR', + 'BZ': 'Belize', + 'RU': 'Russian Federation', + 'RW': 'Rwanda', + 'PC': 'Pacific Islands', + 'TL': 'Timor-Leste', + 'JT': 'Johnston Island', + 'TM': 'Turkmenistan', + 'TJ': 'Tajikistan', + 'RO': 'Romania', + 'RH': 'Southern Rhodesia', + 'TK': 'Tokelau', + 'GW': 'Guinea-Bissau', + 'GU': 'Guam', + 'GT': 'Guatemala', + 'GS': 'South Georgia and the South Sandwich Islands', + 'GR': 'Greece', + 'GQ': 'Equatorial Guinea', + 'GP': 'Guadeloupe', + 'JP': 'Japan', + 'GY': 'Guyana', + 'GG': 'Guernsey', + 'GF': 'French Guiana', + 'GE': 'Gilbert and Ellice Islands', + 'GD': 'Grenada', + 'GB': 'United Kingdom', + 'GA': 'Gabon', + 'SV': 'El Salvador', + 'GN': 'Guinea', + 'GM': 'Gambia', + 'GL': 'Greenland', + 'GI': 'Gibraltar', + 'GH': 'Ghana', + 'OM': 'Oman', + 'TN': 'Tunisia', + 'JM': 'Jamaica', + 'JO': 'Jordan', + 'HR': 'Croatia', + 'HV': 'Upper Volta', + 'HT': 'Haiti', + 'HU': 'Hungary', + 'HK': 'Hong Kong', + 'HN': 'Honduras', + 'HM': 'Heard Island and McDonald Islands', + 'VD': 'Viet-Nam', + 'VE': 'Venezuela', + 'PR': 'Puerto Rico', + 'PS': 'Palestine', + 'UA': 'Ukraine', + 'PW': 'Palau', + 'PT': 'Portugal', + 'PU': 'United States Miscellaneous Pacific Islands', + 'PZ': 'Panama Canal Zone', + 'PY': 'Paraguay', + 'IQ': 'Iraq', + 'PA': 'Panama', + 'PF': 'French Polynesia', + 'PG': 'Papua New Guinea', + 'PE': 'Peru', + 'PK': 'Pakistan', + 'PH': 'Philippines', + 'PN': 'Pitcairn', + 'PL': 'Poland', + 'PM': 'Saint Pierre and Miquelon', + 'ZM': 'Zambia', + 'EH': 'Western Sahara', + 'EE': 'Estonia', + 'EG': 'Egypt', + 'ZA': 'South Africa', + 'EC': 'Ecuador', + 'IT': 'Italy', + 'VN': 'Viet Nam', + 'SB': 'Solomon Islands', + 'ET': 'Ethiopia', + 'SO': 'Somalia', + 'ZW': 'Zimbabwe', + 'SA': 'Saudi Arabia', + 'ES': 'Spain', + 'ER': 'Eritrea', + 'ME': 'Montenegro', + 'MD': 'Moldova', + 'MG': 'Madagascar', + 'MF': 'Saint Martin', + 'MA': 'Morocco', + 'MC': 'Monaco', + 'UZ': 'Uzbekistan', + 'MM': 'Myanmar', + 'ML': 'Mali', + 'MO': 'Macao', + 'MN': 'Mongolia', + 'MI': 'Midway Islands', + 'MH': 'Marshall Islands', + 'MK': 'Macedonia', + 'MU': 'Mauritius', + 'MT': 'Malta', + 'MW': 'Malawi', + 'MV': 'Maldives', + 'MQ': 'Martinique', + 'MP': 'Northern Mariana Islands', + 'MS': 'Montserrat', + 'MR': 'Mauritania', + 'IM': 'Isle of Man', + 'UG': 'Uganda', + 'TZ': 'Tanzania', + 'MY': 'Malaysia', + 'MX': 'Mexico', + 'IL': 'Israel', + 'FQ': 'French Southern and Antarctic Territories', + 'FR': 'France', + 'IO': 'British Indian Ocean Territory', + 'SH': 'Saint Helena', + 'RE': 'Réunion', + 'SJ': 'Svalbard and Jan Mayen', + 'FI': 'Finland', + 'FJ': 'Fiji', + 'FK': 'Falkland Islands', + 'FM': 'Micronesia', + 'FO': 'Faroe Islands', + 'NH': 'New Hebrides', + 'NI': 'Nicaragua', + 'NL': 'Netherlands', + 'NO': 'Norway', + 'NA': 'Namibia', + 'VU': 'Vanuatu', + 'NC': 'New Caledonia', + 'NE': 'Niger', + 'NF': 'Norfolk Island', + 'NG': 'Nigeria', + 'NZ': 'New Zealand', + 'ZR': 'Zaire', + 'NP': 'Nepal', + 'NQ': 'Dronning Maud Land', + 'NR': 'Nauru', + 'NT': 'Neutral Zone', + 'NU': 'Niue', + 'CK': 'Cook Islands', + 'CI': "Côte d'Ivoire", + 'CH': 'Switzerland', + 'CO': 'Colombia', + 'CN': 'China', + 'CM': 'Cameroon', + 'CL': 'Chile', + 'CC': 'Cocos', + 'CA': 'Canada', + 'CG': 'Congo', + 'CF': 'Central African Republic', + 'CD': 'Congo', + 'CZ': 'Czech Republic', + 'CY': 'Cyprus', + 'CX': 'Christmas Island', + 'CS': 'Czechoslovakia', + 'CR': 'Costa Rica', + 'CW': 'Curaçao', + 'CV': 'Cabo Verde', + 'CU': 'Cuba', + 'CT': 'Canton and Enderbury Islands', + 'SZ': 'Swaziland', + 'SY': 'Syrian Arab Republic', + 'SX': 'Sint Maarten', + 'KG': 'Kyrgyzstan', + 'KE': 'Kenya', + 'SS': 'South Sudan', + 'SR': 'Suriname', + 'KI': 'Kiribati', + 'KH': 'Cambodia', + 'KN': 'Saint Kitts and Nevis', + 'KM': 'Comoros', + 'ST': 'Sao Tome and Principe', + 'SK': 'Slovakia', + 'KR': 'Korea', + 'SI': 'Slovenia', + 'KP': 'Korea', + 'KW': 'Kuwait', + 'SN': 'Senegal', + 'SM': 'San Marino', + 'SL': 'Sierra Leone', + 'SC': 'Seychelles', + 'KZ': 'Kazakhstan', + 'KY': 'Cayman Islands', + 'SG': 'Singapore', + 'SE': 'Sweden', + 'SD': 'Sudan', + 'DO': 'Dominican Republic', + 'DM': 'Dominica', + 'DJ': 'Djibouti', + 'DK': 'Denmark', + 'VG': 'Virgin Islands', + 'DD': 'German Democratic Republic', + 'DE': 'Germany', + 'YE': 'Yemen', + 'YD': 'Yemen', + 'DZ': 'Algeria', + 'US': 'United States', + 'DY': 'Dahomey', + 'UY': 'Uruguay', + 'YU': 'Yugoslavia', + 'YT': 'Mayotte', + 'UM': 'United States Minor Outlying Islands', + 'LB': 'Lebanon', + 'LC': 'Saint Lucia', + 'LA': "Lao People's Democratic Republic", + 'TV': 'Tuvalu', + 'TW': 'Taiwan', + 'TT': 'Trinidad and Tobago', + 'TR': 'Turkey', + 'LK': 'Sri Lanka', + 'TP': 'East Timor', + 'LI': 'Liechtenstein', + 'LV': 'Latvia', + 'TO': 'Tonga', + 'LT': 'Lithuania', + 'LU': 'Luxembourg', + 'LR': 'Liberia', + 'LS': 'Lesotho', + 'TH': 'Thailand', + 'TF': 'French Southern Territories', + 'TG': 'Togo', + 'TD': 'Chad', + 'TC': 'Turks and Caicos Islands', + 'LY': 'Libya', + 'VA': 'Holy See', + 'VC': 'Saint Vincent and the Grenadines', + 'AE': 'United Arab Emirates', + 'AD': 'Andorra', + 'AG': 'Antigua and Barbuda', + 'AF': 'Afghanistan', + 'AI': 'Anguilla', + 'VI': 'Virgin Islands', + 'IS': 'Iceland', + 'IR': 'Iran', + 'AM': 'Armenia', + 'AL': 'Albania', + 'AO': 'Angola', + 'AN': 'Netherlands Antilles', + 'AQ': 'Antarctica', + 'AS': 'American Samoa', + 'AR': 'Argentina', + 'AU': 'Australia', + 'AT': 'Austria', + 'AW': 'Aruba', + 'IN': 'India', + 'AX': 'Âland Islands', + 'AZ': 'Azerbaijan', + 'IE': 'Ireland', + 'ID': 'Indonesia', + 'RS': 'Serbia', + 'QA': 'Qatar', + 'MZ': 'Mozambique' } -ISO_3166_COUNTRY_AS_KEY = {value: key for key, value in ISO_3166.items()} \ No newline at end of file +ISO_3166_COUNTRY_AS_KEY = {value: key for key, value in list(ISO_3166.items())} \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..b0c2439 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,34 @@ +services: + processing: + build: + context: . + image: scielo-processing:python3.14 + environment: + PROCESSING_SETTINGS_FILE: ${PROCESSING_SETTINGS_FILE:-} + ARTICLEMETA_THRIFTSERVER: ${ARTICLEMETA_THRIFTSERVER:-} + ARTICLEMETA_ADMINTOKEN: ${ARTICLEMETA_ADMINTOKEN:-} + RATCHET_THRIFTSERVER: ${RATCHET_THRIFTSERVER:-} + ACCESSSTATS_THRIFTSERVER: ${ACCESSSTATS_THRIFTSERVER:-} + CITEDBY_THRIFTSERVER: ${CITEDBY_THRIFTSERVER:-} + PUBLICATIONSTATS_THRIFTSERVER: ${PUBLICATIONSTATS_THRIFTSERVER:-} + SOLR_SEARCH_SCIELO_ORG: ${SOLR_SEARCH_SCIELO_ORG:-} + SOLR_SEARCH_SCIELO_ORG_INDEX: ${SOLR_SEARCH_SCIELO_ORG_INDEX:-} + PUBLICATIONSTATS_TIMEOUT_MS: ${PUBLICATIONSTATS_TIMEOUT_MS:-60000} + EXIT_ON_FAILURE: ${EXIT_ON_FAILURE:-true} + SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL:-} + LOG_DIR: /var/log/processing + TABS_DIR: /var/www/static_scielo_org/tabs + volumes: + - .:/app + - ./var/log/processing:/var/log/processing + - ./var/tabs:/var/www/static_scielo_org/tabs + + tests: + image: scielo-processing:python3.14 + build: + context: . + entrypoint: ["python", "-m", "unittest", "discover", "-s", "tests", "-v"] + environment: + PROCESSING_SETTINGS_FILE: /app/config.ini-TEMPLATE + volumes: + - .:/app diff --git a/docs/source/public_reports.rst b/docs/source/public_reports.rst index f56b3f6..391926f 100644 --- a/docs/source/public_reports.rst +++ b/docs/source/public_reports.rst @@ -208,7 +208,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. Relatório de periódicos em formato Kbart ======================================== -**nome do arquivo:** journals_kbart.csv +**nome do arquivo:** SciELO__AllTitles_YYYY-MM-DD.csv **finalidade:** Relatório de periódicos no formato Kbart. diff --git a/evaluation/altmetrics.py b/evaluation/altmetrics.py index 5e4c625..8e972a8 100644 --- a/evaluation/altmetrics.py +++ b/evaluation/altmetrics.py @@ -11,7 +11,7 @@ # Python 3 and 2 Compatibilility try: - import urlparse as parse # Python 2 + import urllib.parse as parse # Python 2 except: from urllib import parse # Python3 @@ -69,34 +69,34 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publishing ID (PID SciELO)") - header.append(u"document publishing year") - header.append(u"document type") - header.append(u"document is citable") - header.append(u"score") - header.append(u'altmetrics url') - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publishing ID (PID SciELO)") + header.append("document publishing year") + header.append("document type") + header.append("document is citable") + header.append("score") + header.append('altmetrics url') + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) def altmetrics_items_by_journals(self, issn): @@ -153,9 +153,9 @@ def fmt_csv(self, data, altmetrics): if doi: article = self._articlemeta.document(doi.upper(), self.collection) - publication_date = article.publication_date if article and article.data else u'not defined' - publisher_id = article.publisher_id if article and article.data else u'not defined' - document_type = article.document_type if article and article.data else u'not defined' + publication_date = article.publication_date if article and article.data else 'not defined' + publisher_id = article.publisher_id if article and article.data else 'not defined' + document_type = article.document_type if article and article.data else 'not defined' score = altmetrics.get('score', None) @@ -167,33 +167,33 @@ def fmt_csv(self, data, altmetrics): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'document') + line.append('document') line.append(data.collection_acronym) line.append(data.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.title) - line.append(u';'.join(data.subject_areas or [])) + line.append(';'.join(data.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.subject_areas or []) > 2 else '0') line.append(data.current_status) line.append(publisher_id) - if publication_date == u'not define': + if publication_date == 'not define': line.append(document_type) else: line.append(publication_date[0:4]) line.append(document_type) - if document_type == u'not define': + if document_type == 'not define': line.append(document_type) else: - line.append(u'1' if document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else u'0') - line.append(str(score) or u'0') - line.append(details_url or u'not defined') + line.append('1' if document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append(str(score) or '0') + line.append(details_url or 'not defined') - return u','.join([u'"%s"' % i.replace(u'"', u'""') for i in line]) + return ','.join(['"%s"' % i.replace('"', '""') for i in line]) def main(): diff --git a/export/doaj_journals.py b/export/doaj_journals.py index 11ee731..7a8ba36 100644 --- a/export/doaj_journals.py +++ b/export/doaj_journals.py @@ -77,9 +77,9 @@ def __init__(self, collection, issns=None, output_file=None): self.doaj_journals = Journals() self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file - header = [u"coleção",u"issn scielo",u"issn impresso",u"issn eletrônico",u"título",u"ID no DOAJ",u"Provider no DOAJ",u"Status no DOAJ"] + header = ["coleção","issn scielo","issn impresso","issn eletrônico","título","ID no DOAJ","Provider no DOAJ","Status no DOAJ"] - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def get_doaj_journal(self, issns): data = {} @@ -110,12 +110,12 @@ def get_doaj_journal(self, issns): def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) def items(self): diff --git a/export/dump_articles.py b/export/dump_articles.py index 42dfdda..9f136cf 100644 --- a/export/dump_articles.py +++ b/export/dump_articles.py @@ -86,22 +86,22 @@ def run(self): logger.info('XML Format: %s', self.xml_format) with zipfile.ZipFile(self.zip_name, 'w', compression=zipfile.ZIP_DEFLATED, allowZip64=True) as thezip: - for pid, collection, document in self.items(): + for pid, collection, document in list(self.items()): logger.debug('Loading XML file for %s', '_'.join([collection, pid])) collection = trans_acronym.get(collection, collection) issn = pid[1:10] xml_file = '{0}/{1}/{2}.xml'.format(collection, issn, pid) - thezip.writestr(xml_file, bytes(document.encode('utf-8'))) + thezip.writestr(xml_file, document.encode('utf-8')) readmef = open(os.path.dirname(__file__)+'/templates/dumparticle_readme.txt', 'r').read() readme = '{0}\r\n* Documents updated at: {1}\r\n'.format(readmef, datetime.datetime.now().isoformat()) - thezip.writestr("README.txt", bytes(readme.encode('utf-8'))) + thezip.writestr("README.txt", readme.encode('utf-8')) if self.xml_format == 'xmlwos': xsd = getschema() if xsd: - thezip.writestr("schema/ThomsonReuters_publishing.xsd", bytes(xsd.encode('utf-8'))) + thezip.writestr("schema/ThomsonReuters_publishing.xsd", xsd.encode('utf-8')) logger.info('Zip created: %s', self.zip_name) logger.info('Processing finished') diff --git a/export/exdoaj.py b/export/exdoaj.py index 1148617..6c94cc1 100644 --- a/export/exdoaj.py +++ b/export/exdoaj.py @@ -75,7 +75,7 @@ def _doaj_id_by_meta(self, issn, publication_year, title): for char in title: if char in ['+','-','&','|','!','(',')','{','}','[',']','^','"','~','*','?',':','\\']: - escaped_title += u'\\'+char + escaped_title += '\\'+char continue escaped_title += char @@ -152,7 +152,7 @@ def authenticated_session(self): logger.debug('Authentication attempt done') return None - if u'Incorrect' in request.text: + if 'Incorrect' in request.text: logger.debug('Incorrect username or password') return None @@ -192,7 +192,7 @@ def send_xml(self, file_name, file_data): logger.debug('Fail to send document to DOAJ') return False - if u'File uploaded and waiting to be processed' in response.text: + if 'File uploaded and waiting to be processed' in response.text: logger.info('Document Sent') return True else: @@ -232,7 +232,7 @@ def run(self): except Exception as e: logger.exception(e) logger.error('Fail to read document: %s_%s' % (document.publisher_id, document.collection_acronym)) - xml = u'' + xml = '' if self.validate_schema and not self.xml_is_valid(xml): logger.error('Fail to parse xml document: %s_%s' % (document.publisher_id, document.collection_acronym)) diff --git a/export/gen_doaj_correctionsdb.py b/export/gen_doaj_correctionsdb.py index 2d9a4c0..8f161f7 100644 --- a/export/gen_doaj_correctionsdb.py +++ b/export/gen_doaj_correctionsdb.py @@ -6,7 +6,7 @@ import logging import argparse import functools -from urllib import quote +from urllib.parse import quote import requests diff --git a/export/kbart.py b/export/kbart.py index 5811528..7cf0299 100644 --- a/export/kbart.py +++ b/export/kbart.py @@ -3,17 +3,18 @@ Este processamento gera uma tabulação de periódicos seguindo o formato Kbart. Formato de saída (headers em inglês, conforme diretrizes KBART): -publication_title, print_identifier, online_identifier, date_first_issue_online, -num_first_vol_online, num_first_issue_online, date_last_issue_online, -num_last_vol_online, num_last_issue_online, title_url, first_author, title_id, -embargo_info, coverage_depth, coverage_notes, publisher_name, publication_type, -date_monograph_published_print, date_monograph_published_online, monograph_volume, -monograph_edition, first_editor, parent_publication_title_id, +publication_title, print_identifier, online_identifier, date_first_issue_online, +num_first_vol_online, num_first_issue_online, date_last_issue_online, +num_last_vol_online, num_last_issue_online, title_url, first_author, title_id, +embargo_info, coverage_depth, coverage_notes, publisher_name, publication_type, +date_monograph_published_print, date_monograph_published_online, monograph_volume, +monograph_edition, first_editor, parent_publication_title_id, preceding_publication_title_id, access_type """ + import argparse -import logging import codecs +import logging import re import utils @@ -23,37 +24,46 @@ # ISSN redirects for journals that changed their ISSN in URLs # Maps old ISSN to new ISSN ISSN_URL_REDIRECTS = { - '1575-0620': '2013-6463', # Revista española de sanidad penitenciaria (SciELO Spain) + "1575-0620": "2013-6463", # Revista española de sanidad penitenciaria (SciELO Spain) } +HTTP_ONLY_COLLECTIONS = set(["bol", "col", "per", "cub", "sss", "ury"]) + # Pre-compile regex patterns for ISSN redirects for better performance _ISSN_REDIRECT_PATTERNS = { - old_issn: re.compile(r'([?&]pid=)' + re.escape(old_issn) + r'(&|$)') - for old_issn in ISSN_URL_REDIRECTS.keys() + old_issn: re.compile(r"([?&]pid=)" + re.escape(old_issn) + r"(&|$)") + for old_issn in list(ISSN_URL_REDIRECTS.keys()) } -def _config_logging(logging_level='INFO', logging_file=None): +def title_url_for_collection(url, collection): + if collection not in HTTP_ONLY_COLLECTIONS and url.startswith("http://"): + return "https://" + url[len("http://"):] + return url + +def _config_logging(logging_level="INFO", logging_file=None): allowed_levels = { - 'DEBUG': logging.DEBUG, - 'INFO': logging.INFO, - 'WARNING': logging.WARNING, - 'ERROR': logging.ERROR, - 'CRITICAL': logging.CRITICAL + "DEBUG": logging.DEBUG, + "INFO": logging.INFO, + "WARNING": logging.WARNING, + "ERROR": logging.ERROR, + "CRITICAL": logging.CRITICAL, } - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) - logger.setLevel(allowed_levels.get(logging_level, 'INFO')) + logger.setLevel(allowed_levels.get(logging_level, "INFO")) if logging_file: - hl = logging.FileHandler(logging_file, mode='a') + hl = logging.FileHandler(logging_file, mode="a") else: hl = logging.StreamHandler() hl.setFormatter(formatter) - hl.setLevel(allowed_levels.get(logging_level, 'INFO')) + hl.setLevel(allowed_levels.get(logging_level, "INFO")) logger.addHandler(hl) @@ -61,7 +71,6 @@ def _config_logging(logging_level='INFO', logging_file=None): class Dumper(object): - def __init__(self, collection, issns=None, output_file=None): self._ratchet = utils.ratchet_server() @@ -69,47 +78,51 @@ def __init__(self, collection, issns=None, output_file=None): self._publicationstats = utils.publicationstats_server() self.collection = collection self.issns = issns - self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file + self.output_file = ( + codecs.open(output_file, "w", encoding="utf-8") + if output_file + else output_file + ) header = [ - u"publication_title", - u"print_identifier", - u"online_identifier", - u"date_first_issue_online", - u"num_first_vol_online", - u"num_first_issue_online", - u"date_last_issue_online", - u"num_last_vol_online", - u"num_last_issue_online", - u"title_url", - u"first_author", - u"title_id", - u"embargo_info", - u"coverage_depth", - u"coverage_notes", - u"publisher_name", - u"publication_type", - u"date_monograph_published_print", - u"date_monograph_published_online", - u"monograph_volume", - u"monograph_edition", - u"first_editor", - u"parent_publication_title_id", - u"preceding_publication_title_id", - u"access_type" - + "publication_title", + "print_identifier", + "online_identifier", + "date_first_issue_online", + "num_first_vol_online", + "num_first_issue_online", + "date_last_issue_online", + "num_last_vol_online", + "num_last_issue_online", + "title_url", + "first_author", + "title_id", + "embargo_info", + "coverage_depth", + "coverage_notes", + "publisher_name", + "publication_type", + "date_monograph_published_print", + "date_monograph_published_online", + "monograph_volume", + "monograph_edition", + "first_editor", + "parent_publication_title_id", + "preceding_publication_title_id", + "access_type", ] - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + self.write(",".join(['"%s"' % i.replace('"', '""') for i in header])) def _first_included_document_by_journal(self, issn, collection): fid = self._publicationstats.first_included_document_by_journal( - issn, collection) + issn, collection + ) if not fid: return None - document = self._articlemeta.document(fid['pid'], fid['collection']) + document = self._articlemeta.document(fid["pid"], fid["collection"]) if not document.data: return None @@ -118,13 +131,12 @@ def _first_included_document_by_journal(self, issn, collection): def _last_included_document_by_journal(self, issn, collection): - lid = self._publicationstats.last_included_document_by_journal( - issn, collection) + lid = self._publicationstats.last_included_document_by_journal(issn, collection) if not lid: return None - document = self._articlemeta.document(lid['pid'], lid['collection']) + document = self._articlemeta.document(lid["pid"], lid["collection"]) if not document.data: return None @@ -133,12 +145,12 @@ def _last_included_document_by_journal(self, issn, collection): def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: - self.output_file.write('%s\r\n' % line) + self.output_file.write("%s\r\n" % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) def items(self): @@ -147,64 +159,88 @@ def items(self): self.issns = [None] for issn in self.issns: - for data in self._articlemeta.journals(collection=self.collection, issn=issn): - if data.current_status != 'current': - logger.debug('Skipping non-active journal: %s (status: %s)' % (data.scielo_issn, data.current_status)) + for data in self._articlemeta.journals( + collection=self.collection, issn=issn + ): + current_status = utils.get_metadata_value(data, "current_status") + if current_status != "current": + logger.debug( + "Skipping non-active journal: %s (status: %s)" + % (data.scielo_issn, current_status) + ) continue - logger.debug('Reading document: %s' % data.scielo_issn) + logger.debug("Reading document: %s" % data.scielo_issn) yield self.fmt_csv(data) def fmt_csv(self, data): line = [] - first_document = self._first_included_document_by_journal(data.scielo_issn, data.collection_acronym) - last_document = self._last_included_document_by_journal(data.scielo_issn, data.collection_acronym) + first_document = self._first_included_document_by_journal( + data.scielo_issn, data.collection_acronym + ) + last_document = self._last_included_document_by_journal( + data.scielo_issn, data.collection_acronym + ) line.append(data.title) - line.append(data.print_issn or '') - line.append(data.electronic_issn or '') - line.append( - first_document.publication_date or '' if first_document else '') + line.append(data.print_issn or "") + line.append(data.electronic_issn or "") + line.append(first_document.publication_date or "" if first_document else "") line.append( - first_document.issue.volume or '' if first_document and first_document.issue else '') + first_document.issue.volume or "" + if first_document and first_document.issue + else "" + ) line.append( - first_document.issue.number or '' if first_document and first_document.issue else '') - if data.current_status != 'current': + first_document.issue.number or "" + if first_document and first_document.issue + else "" + ) + if utils.get_metadata_value(data, "current_status") != "current": + line.append(last_document.publication_date or "" if last_document else "") line.append( - last_document.publication_date or '' if last_document else '') + last_document.issue.volume or "" + if last_document and last_document.issue + else "" + ) line.append( - last_document.issue.volume or '' if last_document and last_document.issue else '') - line.append( - last_document.issue.number or '' if last_document and last_document.issue else '') + last_document.issue.number or "" + if last_document and last_document.issue + else "" + ) else: - line += ['', '', ''] + line += ["", "", ""] # Generate the URL - url = data.url().replace('sci_serial', 'sci_issues') - + url = data.url().replace("sci_serial", "sci_issues") + # Apply ISSN redirects for journals that changed their ISSN in URLs # This is necessary for journals that no longer use their print ISSN - for old_issn, new_issn in ISSN_URL_REDIRECTS.items(): + for old_issn, new_issn in list(ISSN_URL_REDIRECTS.items()): # Use pre-compiled regex pattern for better performance pattern = _ISSN_REDIRECT_PATTERNS[old_issn] - url = pattern.sub(r'\g<1>' + new_issn + r'\2', url) - + url = pattern.sub(r"\g<1>" + new_issn + r"\2", url) + + url = title_url_for_collection(url, self.collection) + line.append(url) - line.append('') # first_author - line.append(data.scielo_issn or '') - line.append('') # embargo_info - line.append('fulltext') # coverage_depth - line.append('') # coverage_notes - line.append(' '.join(data.publisher_name) if data.publisher_name else '') # publisher_name - line.append('Serial') # publication_type - line.append('') # date_monograph_published_print - line.append('') # date_monograph_published_online - line.append('') # monograph_volume - line.append('') # monograph_edition - line.append('') # first_editor - line.append('') # parent_publication_title_id - line.append('') # preceding_publication_title_id - line.append('F') # access_type - - joined_line = ','.join(['"%s"' % i.replace('"', '""') for i in line]) + line.append("") # first_author + line.append(data.scielo_issn or "") + line.append("") # embargo_info + line.append("fulltext") # coverage_depth + line.append("") # coverage_notes + line.append( + " ".join(data.publisher_name) if data.publisher_name else "" + ) # publisher_name + line.append("Serial") # publication_type + line.append("") # date_monograph_published_print + line.append("") # date_monograph_published_online + line.append("") # monograph_volume + line.append("") # monograph_edition + line.append("") # first_editor + line.append("") # parent_publication_title_id + line.append("") # preceding_publication_title_id + line.append("F") # access_type + + joined_line = ",".join(['"%s"' % i.replace('"', '""') for i in line]) return joined_line @@ -212,44 +248,28 @@ def fmt_csv(self, data): def main(): parser = argparse.ArgumentParser( - description='Export journals list in Kabart format' + description="Export journals list in Kabart format" ) - parser.add_argument( - 'issns', - nargs='*', - help='ISSN\'s separated by spaces' - ) + parser.add_argument("issns", nargs="*", help="ISSN's separated by spaces") - parser.add_argument( - '--collection', - '-c', - help='Collection Acronym' - ) + parser.add_argument("--collection", "-c", help="Collection Acronym") - parser.add_argument( - '--output_file', - '-r', - help='File to receive the dumped data' - ) + parser.add_argument("--output_file", "-r", help="File to receive the dumped data") - parser.add_argument( - '--logging_file', - '-o', - help='Full path to the log file' - ) + parser.add_argument("--logging_file", "-o", help="Full path to the log file") parser.add_argument( - '--logging_level', - '-l', - default='DEBUG', - choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], - help='Logggin level' + "--logging_level", + "-l", + default="DEBUG", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Logggin level", ) args = parser.parse_args() _config_logging(args.logging_level, args.logging_file) - logger.info('Dumping data for: %s' % args.collection) + logger.info("Dumping data for: %s" % args.collection) issns = None if len(args.issns) > 0: diff --git a/export/natural_keys.py b/export/natural_keys.py index 4b8311f..3bb2707 100644 --- a/export/natural_keys.py +++ b/export/natural_keys.py @@ -59,7 +59,7 @@ def __init__(self, collection, issns=None, output_file=None): def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) @@ -165,23 +165,23 @@ def run(self, output_fmt='json'): if output_fmt == 'csv': header = [ - u"coleção", - u"pid", - u"título", + "coleção", + "pid", + "título", "acrônimo do título", - u"volume", - u"número", - u"suplemento", - u"ano de publicação", - u"primeira página", - u"primeria página seq" - u"última página", - u"e-location", - u"chave natural", - u"url natural" + "volume", + "número", + "suplemento", + "ano de publicação", + "primeira página", + "primeria página seq" + "última página", + "e-location", + "chave natural", + "url natural" ] self.write( - u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header]) + ','.join(['"%s"' % i.replace('"', '""') for i in header]) ) if output_fmt == 'csv': diff --git a/export/normalize_affiliations.py b/export/normalize_affiliations.py index add2d94..12c6dfc 100644 --- a/export/normalize_affiliations.py +++ b/export/normalize_affiliations.py @@ -52,13 +52,13 @@ def __init__(self, collection, issns=None, output_file=None, not_normalized=True def run(self): - header = [u"coleção", u"PID", u"ano de publicação", u"tipo de documento",u"título", u"número", u"normalizado", u"id de afiliação", u"instituição original", u"paises original", u"instituição normalizada", u"país normalizado ISO-3661", u"código de país normalizado ISO-3166", u"estado normalizado ISO-3166", u"código de estado normalizado ISO-3166"] + header = ["coleção", "PID", "ano de publicação", "tipo de documento","título", "número", "normalizado", "id de afiliação", "instituição original", "paises original", "instituição normalizada", "país normalizado ISO-3661", "código de país normalizado ISO-3166", "estado normalizado ISO-3166", "código de estado normalizado ISO-3166"] if not self.issns: self.issns = [None] if not self.output_file: - print('%s\r\n' % ','.join(header)) + print(('%s\r\n' % ','.join(header))) for issn in self.issns: for data in self.get_data(issn=issn): for item in self.fmt_csv(data): diff --git a/export/xml_rsps.py b/export/xml_rsps.py index 12239da..8f352a3 100644 --- a/export/xml_rsps.py +++ b/export/xml_rsps.py @@ -12,12 +12,17 @@ from io import StringIO import itertools -import packtools -from packtools.catalogs import XML_CATALOG +try: + import packtools + from packtools.catalogs import XML_CATALOG +except ImportError: + packtools = None + XML_CATALOG = None from lxml.etree import XMLSyntaxError import utils -os.environ['XML_CATALOG_FILES'] = XML_CATALOG +if XML_CATALOG: + os.environ['XML_CATALOG_FILES'] = XML_CATALOG logger = logging.getLogger(__name__) @@ -87,6 +92,8 @@ def _make_err_message(err): def analyze_xml(xml): """Analyzes `file` against packtools' XMLValidator. """ + if packtools is None: + raise RuntimeError('packtools is required to validate XML RSPS files') f = StringIO(xml) @@ -167,7 +174,7 @@ def summaryze_xml_validation(self, pid, collection_acronym, output_format): except Exception as e: logger.exception(e) logger.error('Fail to read document: %s_%s' % (pid, collection_acronym)) - xml = u'' + xml = '' logger.debug('Reading document: %s' % pid) diff --git a/k8s/argo-cronworkflow.yaml b/k8s/argo-cronworkflow.yaml new file mode 100644 index 0000000..801a995 --- /dev/null +++ b/k8s/argo-cronworkflow.yaml @@ -0,0 +1,55 @@ +apiVersion: argoproj.io/v1alpha1 +kind: CronWorkflow +metadata: + name: processing-tabs +spec: + schedule: "0 3 1,8,15,22 1-12 *" + timezone: America/Sao_Paulo + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + workflowSpec: + entrypoint: processing-tabs + templates: + - name: processing-tabs + container: + image: scielo-processing:python3.14 + imagePullPolicy: IfNotPresent + command: ["/bin/bash", "-lc"] + args: + - | + if [ -n "${PROCESSING_COLLECTIONS:-}" ]; then + /app/run.sh "$PROCESSING_COLLECTIONS" + else + /app/run.sh + fi + envFrom: + - secretRef: + name: processing-env + env: + - name: DOCKER_ENV + value: "1" + - name: WORK_DIR + value: /app + - name: LOG_DIR + value: /var/log/processing + - name: TABS_DIR + value: /var/www/static_scielo_org/tabs + - name: EXIT_ON_FAILURE + value: "false" + - name: PUBLICATIONSTATS_TIMEOUT_MS + value: "120000" + volumeMounts: + - name: processing-data + mountPath: /var/www/static_scielo_org/tabs + - name: processing-logs + mountPath: /var/log/processing + volumes: + - name: processing-data + persistentVolumeClaim: + claimName: processing-data + - name: processing-logs + persistentVolumeClaim: + claimName: processing-logs + retryStrategy: + limit: "0" diff --git a/k8s/processing-env.secret.example.yaml b/k8s/processing-env.secret.example.yaml new file mode 100644 index 0000000..9fd9500 --- /dev/null +++ b/k8s/processing-env.secret.example.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Secret +metadata: + name: processing-env +type: Opaque +stringData: + ARTICLEMETA_THRIFTSERVER: "" + ARTICLEMETA_ADMINTOKEN: "" + RATCHET_THRIFTSERVER: "" + ACCESSSTATS_THRIFTSERVER: "" + CITEDBY_THRIFTSERVER: "" + PUBLICATIONSTATS_THRIFTSERVER: "" + SOLR_SEARCH_SCIELO_ORG: "" + SOLR_SEARCH_SCIELO_ORG_INDEX: "" + SLACK_WEBHOOK_URL: "" + PROCESSING_COLLECTIONS: "" diff --git a/publication/documents_affiliations.py b/publication/documents_affiliations.py index 5cc300f..5164c29 100644 --- a/publication/documents_affiliations.py +++ b/publication/documents_affiliations.py @@ -51,42 +51,42 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publishing ID (PID SciELO)") - header.append(u"document publishing year") - header.append(u"document type") - header.append(u"document is citable") - header.append(u"document affiliation instituition") - header.append(u"document affiliation country") - header.append(u"document affiliation country ISO 3166") - header.append(u"document affiliation state") - header.append(u"document affiliation city") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publishing ID (PID SciELO)") + header.append("document publishing year") + header.append("document type") + header.append("document is citable") + header.append("document affiliation instituition") + header.append("document affiliation country") + header.append("document affiliation country ISO 3166") + header.append("document affiliation state") + header.append("document affiliation city") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, lines): - if isinstance(lines, unicode): + if isinstance(lines, str): lines = [lines] for line in lines: if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -113,23 +113,23 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'document') + line.append('document') line.append(data.collection_acronym) line.append(data.journal.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.journal.title) - line.append(u';'.join(data.journal.subject_areas or [])) + line.append(';'.join(data.journal.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.journal.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) line.append(data.document_type) - line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append('1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') if data.mixed_affiliations: for aff in data.mixed_affiliations: aff_line = [] diff --git a/publication/documents_affiliations_nationality.py b/publication/documents_affiliations_nationality.py index 82f1c25..5d0d518 100644 --- a/publication/documents_affiliations_nationality.py +++ b/publication/documents_affiliations_nationality.py @@ -52,38 +52,38 @@ def __init__(self, home_nationality, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publishing ID (PID SciELO)") - header.append(u"document publishing year") - header.append(u'document is citable') - header.append(u"document type") - header.append(u"home nationality") - header.append(u"total of affiliations") - header.append(u"national") - header.append(u"foreign") - header.append(u"undefined") - header.append(u"empty") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publishing ID (PID SciELO)") + header.append("document publishing year") + header.append('document is citable') + header.append("document type") + header.append("home nationality") + header.append("total of affiliations") + header.append("national") + header.append("foreign") + header.append("undefined") + header.append("empty") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) def items(self): @@ -93,11 +93,11 @@ def items(self): for issn in self.issns: for data in self._articlemeta.documents(collection=self.collection, issn=issn): - logger.debug(u'Reading document: %s' % data.publisher_id) + logger.debug('Reading document: %s' % data.publisher_id) yield self.fmt_csv(data) def fmt_csv(self, data): - know_languages = set([u'pt', u'es', u'en']) + know_languages = set(['pt', 'es', 'en']) languages = set(data.languages()) issns = [] @@ -108,22 +108,22 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'document') + line.append('document') line.append(data.collection_acronym) line.append(data.journal.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.journal.title) - line.append(u';'.join(data.journal.subject_areas or [])) + line.append(';'.join(data.journal.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.journal.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) - line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append('1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') line.append(data.document_type) line.append(self.home_nationality) line.append(str(len(data.mixed_affiliations)) if data.mixed_affiliations else '0') @@ -144,11 +144,11 @@ def fmt_csv(self, data): national += 1 continue - if aff_value in choices.ISO_3166.keys() and aff_value != self.home_nationality: + if aff_value in list(choices.ISO_3166.keys()) and aff_value != self.home_nationality: foreign += 1 continue - if aff_value not in choices.ISO_3166.keys(): + if aff_value not in list(choices.ISO_3166.keys()): undefined += 1 continue diff --git a/publication/documents_authors.py b/publication/documents_authors.py index 2842f7b..f5cffe2 100644 --- a/publication/documents_authors.py +++ b/publication/documents_authors.py @@ -51,42 +51,42 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publishing ID (PID SciELO)") - header.append(u"document publishing year") - header.append(u"document type") - header.append(u"document is citable") - header.append(u"document author") - header.append(u"document author institution") - header.append(u"document author affiliation country") - header.append(u"document author affiliation state") - header.append(u"document author affiliation city") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publishing ID (PID SciELO)") + header.append("document publishing year") + header.append("document type") + header.append("document is citable") + header.append("document author") + header.append("document author institution") + header.append("document author affiliation country") + header.append("document author affiliation state") + header.append("document author affiliation city") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, lines): - if isinstance(lines, unicode): + if isinstance(lines, str): lines = [lines] for line in lines: if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -118,23 +118,23 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'document') + line.append('document') line.append(data.collection_acronym) line.append(data.journal.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.journal.title) - line.append(u';'.join(data.journal.subject_areas or [])) + line.append(';'.join(data.journal.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.journal.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) line.append(data.document_type) - line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append('1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') if data.authors: for author in data.authors: author_line = [' '.join([author.get('given_names', ''), author.get('surname', '')])] diff --git a/publication/documents_counts.py b/publication/documents_counts.py index dcb10c9..2d87ace 100644 --- a/publication/documents_counts.py +++ b/publication/documents_counts.py @@ -65,42 +65,42 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publishing ID (PID SciELO)") - header.append(u"document publishing year") - header.append(u"document type") - header.append(u"document is citable") - header.append(u"authors") - header.append(u"0 authors") - header.append(u"1 author") - header.append(u"2 authors") - header.append(u"3 authors") - header.append(u"4 authors") - header.append(u"5 authors") - header.append(u"+6 authors") - header.append(u"pages") - header.append(u"references") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publishing ID (PID SciELO)") + header.append("document publishing year") + header.append("document type") + header.append("document is citable") + header.append("authors") + header.append("0 authors") + header.append("1 author") + header.append("2 authors") + header.append("3 authors") + header.append("4 authors") + header.append("5 authors") + header.append("+6 authors") + header.append("pages") + header.append("references") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -130,35 +130,35 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'document') + line.append('document') line.append(data.collection_acronym) line.append(data.journal.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.journal.title) - line.append(u';'.join(data.journal.subject_areas or [])) + line.append(';'.join(data.journal.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.journal.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) line.append(data.document_type) - line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') - line.append(unicode(tot_authors)) - line.append(u'1' if tot_authors == 0 else u'0') # total de autores - line.append(u'1' if tot_authors == 1 else u'0') # total de autores - line.append(u'1' if tot_authors == 2 else u'0') # total de autores - line.append(u'1' if tot_authors == 3 else u'0') # total de autores - line.append(u'1' if tot_authors == 4 else u'0') # total de autores - line.append(u'1' if tot_authors == 5 else u'0') # total de autores - line.append(u'1' if tot_authors >= 6 else u'0') # total de autores - line.append(unicode(pages(data.start_page, data.end_page))), # total de páginas - line.append(unicode(len(data.citations or []))) # total de citações - - joined_line = u','.join([u'"%s"' % i.replace(u'"', u'""') for i in line]) + line.append('1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append(str(tot_authors)) + line.append('1' if tot_authors == 0 else '0') # total de autores + line.append('1' if tot_authors == 1 else '0') # total de autores + line.append('1' if tot_authors == 2 else '0') # total de autores + line.append('1' if tot_authors == 3 else '0') # total de autores + line.append('1' if tot_authors == 4 else '0') # total de autores + line.append('1' if tot_authors == 5 else '0') # total de autores + line.append('1' if tot_authors >= 6 else '0') # total de autores + line.append(str(pages(data.start_page, data.end_page))), # total de páginas + line.append(str(len(data.citations or []))) # total de citações + + joined_line = ','.join(['"%s"' % i.replace('"', '""') for i in line]) return joined_line diff --git a/publication/documents_dates.py b/publication/documents_dates.py index c27a9ae..6648e5e 100644 --- a/publication/documents_dates.py +++ b/publication/documents_dates.py @@ -53,60 +53,60 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publishing ID (PID SciELO)") - header.append(u"document publishing year") - header.append(u"document type") - header.append(u"document is citable") - header.append(u"document submitted at") - header.append(u"document submitted at year") - header.append(u"document submitted at month") - header.append(u"document submitted at day") - header.append(u"document accepted at") - header.append(u"document accepted at year") - header.append(u"document accepted at month") - header.append(u"document accepted at day") - header.append(u"document reviewed at") - header.append(u"document reviewed at year") - header.append(u"document reviewed at month") - header.append(u"document reviewed at day") - header.append(u"document published as ahead of print at") - header.append(u"document published as ahead of print at year") - header.append(u"document published as ahead of print at month") - header.append(u"document published as ahead of print at day") - header.append(u"document published at") - header.append(u"document published at year") - header.append(u"document published at month") - header.append(u"document published at day") - header.append(u"document published in SciELO at") - header.append(u"document published in SciELO at year") - header.append(u"document published in SciELO at month") - header.append(u"document published in SciELO at day") - header.append(u"document updated in SciELO at") - header.append(u"document updated in SciELO at year") - header.append(u"document updated in SciELO at month") - header.append(u"document updated in SciELO at day") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publishing ID (PID SciELO)") + header.append("document publishing year") + header.append("document type") + header.append("document is citable") + header.append("document submitted at") + header.append("document submitted at year") + header.append("document submitted at month") + header.append("document submitted at day") + header.append("document accepted at") + header.append("document accepted at year") + header.append("document accepted at month") + header.append("document accepted at day") + header.append("document reviewed at") + header.append("document reviewed at year") + header.append("document reviewed at month") + header.append("document reviewed at day") + header.append("document published as ahead of print at") + header.append("document published as ahead of print at year") + header.append("document published as ahead of print at month") + header.append("document published as ahead of print at day") + header.append("document published at") + header.append("document published at year") + header.append("document published at month") + header.append("document published at day") + header.append("document published in SciELO at") + header.append("document published in SciELO at year") + header.append("document published in SciELO at month") + header.append("document published in SciELO at day") + header.append("document updated in SciELO at") + header.append("document updated in SciELO at year") + header.append("document updated in SciELO at month") + header.append("document updated in SciELO at day") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -142,23 +142,23 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'document') + line.append('document') line.append(data.collection_acronym) line.append(data.journal.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.journal.title) - line.append(u';'.join(data.journal.subject_areas or [])) + line.append(';'.join(data.journal.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.journal.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(document_publication_date_splitted[0]) line.append(data.document_type) - line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append('1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') line.append(data.receive_date or '') receive_splited = utils.split_date(data.receive_date or '') line.append(receive_splited[0]) # year @@ -177,7 +177,7 @@ def fmt_csv(self, data): try: aop_pubdate = data.ahead_publication_date or '' - except xylose.scielodocument.UnavailableMetadataException: + except (KeyError, xylose.scielodocument.UnavailableMetadataException): aop_pubdate = '' line.append(aop_pubdate) diff --git a/publication/documents_languages.py b/publication/documents_languages.py index 88f8252..1dab758 100644 --- a/publication/documents_languages.py +++ b/publication/documents_languages.py @@ -51,37 +51,37 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publishing ID (PID SciELO)") - header.append(u"document publishing year") - header.append(u'document is citable') - header.append(u"document type") - header.append(u"document languages") - header.append(u"document pt") - header.append(u"document es") - header.append(u"document en") - header.append(u"document other languages") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publishing ID (PID SciELO)") + header.append("document publishing year") + header.append('document is citable') + header.append("document type") + header.append("document languages") + header.append("document pt") + header.append("document es") + header.append("document en") + header.append("document other languages") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) def items(self): @@ -91,11 +91,11 @@ def items(self): for issn in self.issns: for data in self._articlemeta.documents(collection=self.collection, issn=issn): - logger.debug(u'Reading document: %s' % data.publisher_id) + logger.debug('Reading document: %s' % data.publisher_id) yield self.fmt_csv(data) def fmt_csv(self, data): - know_languages = set([u'pt', u'es', u'en']) + know_languages = set(['pt', 'es', 'en']) languages = set(data.languages()) issns = [] @@ -106,22 +106,22 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'document') + line.append('document') line.append(data.collection_acronym) line.append(data.journal.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.journal.title) - line.append(u';'.join(data.journal.subject_areas or [])) + line.append(';'.join(data.journal.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.journal.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) - line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append('1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') line.append(data.document_type) line.append(';'.join(languages)) line.append('1' if 'pt' in languages else '0') # PT diff --git a/publication/documents_licenses.py b/publication/documents_licenses.py index 593a654..5fe5ab3 100644 --- a/publication/documents_licenses.py +++ b/publication/documents_licenses.py @@ -51,33 +51,33 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"document publishing ID (PID SciELO)") - header.append(u"document publishing year") - header.append(u"document type") - header.append(u"document is citable") - header.append(u"document license") + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("document publishing ID (PID SciELO)") + header.append("document publishing year") + header.append("document type") + header.append("document is citable") + header.append("document license") - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -100,23 +100,23 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'document') + line.append('document') line.append(data.collection_acronym) line.append(data.journal.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.journal.title) - line.append(u';'.join(data.journal.subject_areas or [])) + line.append(';'.join(data.journal.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.journal.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) line.append(data.document_type) - line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append('1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') perm = '' if data.permissions: perm = data.permissions.get('id' or '') diff --git a/publication/journals.py b/publication/journals.py index 57dec6f..34886ab 100644 --- a/publication/journals.py +++ b/publication/journals.py @@ -75,56 +75,56 @@ def __init__(self, collection, issns=None, output_file=None, years=6): now = datetime.date.today().year self.years_range = [i for i in range(now, now-self._years, -1)] header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"title + subtitle SciELO") - header.append(u"short title SciELO") - header.append(u"short title ISO") - header.append(u"title PubMed") - header.append(u"publisher name") - header.append(u"use license") - header.append(u"alpha frequency") - header.append(u"numeric frequency (in months)") - header.append(u"inclusion year at SciELO") - header.append(u"stopping year at SciELO") - header.append(u"stopping reason") - header.append(u"date of the first document") - header.append(u"volume of the first document") - header.append(u"issue of the first document") - header.append(u"date of the last document") - header.append(u"volume of the last document") - header.append(u"issue of the last document") - header.append(u"total of issues") - header += [u"issues at %s" % str(i) for i in self.years_range] - header.append(u"total of regular issues") - header += [u"regular issues at %s" % str(i) for i in self.years_range] - header.append(u"total of documents") - header += [u"documents at %s" % str(i) for i in self.years_range] - header.append(u"citable documents") - header += [u"citable documents at %s" % str(i) for i in self.years_range] + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("title + subtitle SciELO") + header.append("short title SciELO") + header.append("short title ISO") + header.append("title PubMed") + header.append("publisher name") + header.append("use license") + header.append("alpha frequency") + header.append("numeric frequency (in months)") + header.append("inclusion year at SciELO") + header.append("stopping year at SciELO") + header.append("stopping reason") + header.append("date of the first document") + header.append("volume of the first document") + header.append("issue of the first document") + header.append("date of the last document") + header.append("volume of the last document") + header.append("issue of the last document") + header.append("total of issues") + header += ["issues at %s" % str(i) for i in self.years_range] + header.append("total of regular issues") + header += ["regular issues at %s" % str(i) for i in self.years_range] + header.append("total of documents") + header += ["documents at %s" % str(i) for i in self.years_range] + header.append("citable documents") + header += ["citable documents at %s" % str(i) for i in self.years_range] for year in self.years_range: - header.append(u'portuguese documents at %s ' % year) + header.append('portuguese documents at %s ' % year) for year in self.years_range: - header.append(u'spanish documents at %s ' % year) + header.append('spanish documents at %s ' % year) for year in self.years_range: - header.append(u'english documents at %s ' % year) + header.append('english documents at %s ' % year) for year in self.years_range: - header.append(u'other language documents at %s ' % year) + header.append('other language documents at %s ' % year) for year in self.years_range: - header.append(u'google scholar h5 %s ' % year) + header.append('google scholar h5 %s ' % year) for year in self.years_range: - header.append(u'google scholar m5 %s ' % year) + header.append('google scholar m5 %s ' % year) - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def _documents_languages_by_year(self, issn, collection, years=None): @@ -201,12 +201,12 @@ def _impact_factor(self, issn, collection): def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -236,38 +236,38 @@ def fmt_csv(self, data): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'journal') + line.append('journal') line.append(data.collection_acronym) line.append(data.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.title) - line.append(u';'.join(data.subject_areas or [])) + line.append(';'.join(data.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.subject_areas or []) > 2 else '0') - line.append(data.current_status) - line.append(u' '.join([data.title or u'', data.subtitle or u''])) - line.append(data.abbreviated_title or u'') - line.append(data.abbreviated_iso_title or u'') - line.append(data.title_nlm or u'') - line.append(u'; '.join(data.publisher_name or [])) - line.append(data.permissions.get('id', u'') if data.permissions else u'') - line.append(data.periodicity[1] or u'') - line.append(data.periodicity_in_months or u'') + line.append(utils.get_metadata_value(data, 'current_status')) + line.append(' '.join([data.title or '', data.subtitle or ''])) + line.append(data.abbreviated_title or '') + line.append(data.abbreviated_iso_title or '') + line.append(data.title_nlm or '') + line.append('; '.join(data.publisher_name or [])) + line.append(data.permissions.get('id', '') if data.permissions else '') + line.append(data.periodicity[1] or '') + line.append(data.periodicity_in_months or '') line.append(data.creation_date[:4]) - line.append(interruption[0][:4] if interruption else u'') - line.append(interruption[2][:4] if interruption else u'') - line.append(first_document.publication_date or u'' if first_document else u'') - line.append(first_document.issue.volume or u'' if first_document and first_document.issue else u'') - line.append(first_document.issue.number or u'' if first_document and first_document.issue else u'') - line.append(last_document.publication_date or u'' if last_document else u'') - line.append(last_document.issue.volume or u'' if last_document and last_document.issue else u'') - line.append(last_document.issue.number or u'' if last_document and last_document.issue else u'') - - line.append(unicode(self._number_of_issues_by_year( + line.append(interruption[0][:4] if interruption else '') + line.append(interruption[2][:4] if interruption else '') + line.append(first_document.publication_date or '' if first_document else '') + line.append(first_document.issue.volume or '' if first_document and first_document.issue else '') + line.append(first_document.issue.number or '' if first_document and first_document.issue else '') + line.append(last_document.publication_date or '' if last_document else '') + line.append(last_document.issue.volume or '' if last_document and last_document.issue else '') + line.append(last_document.issue.number or '' if last_document and last_document.issue else '') + + line.append(str(self._number_of_issues_by_year( data.scielo_issn, data.collection_acronym, years=0 @@ -281,9 +281,9 @@ def fmt_csv(self, data): ) for issue in issues: - line.append(unicode(issue[1])) + line.append(str(issue[1])) - line.append(unicode(self._number_of_issues_by_year( + line.append(str(self._number_of_issues_by_year( data.scielo_issn, data.collection_acronym, years=0, @@ -299,7 +299,7 @@ def fmt_csv(self, data): ) for issue in regular_issues: - line.append(unicode(issue[1])) + line.append(str(issue[1])) line.append(str(self._number_of_articles_by_year( data.scielo_issn, @@ -314,7 +314,7 @@ def fmt_csv(self, data): ) for document in documents: - line.append(unicode(document[1])) + line.append(str(document[1])) line.append(str(self._number_of_articles_by_year( data.scielo_issn, @@ -331,7 +331,7 @@ def fmt_csv(self, data): )] for document in documents: - line.append(unicode(document)) + line.append(str(document)) languages = self._documents_languages_by_year( data.scielo_issn, @@ -339,14 +339,14 @@ def fmt_csv(self, data): years=self._years ) - for years, values in sorted(languages.items(), reverse=True): - line.append(unicode(values['pt'])) - for years, values in sorted(languages.items(), reverse=True): - line.append(unicode(values['es'])) - for years, values in sorted(languages.items(), reverse=True): - line.append(unicode(values['en'])) - for years, values in sorted(languages.items(), reverse=True): - line.append(unicode(values['other'])) + for years, values in sorted(list(languages.items()), reverse=True): + line.append(str(values['pt'])) + for years, values in sorted(list(languages.items()), reverse=True): + line.append(str(values['es'])) + for years, values in sorted(list(languages.items()), reverse=True): + line.append(str(values['en'])) + for years, values in sorted(list(languages.items()), reverse=True): + line.append(str(values['other'])) for year in self.years_range: h5 = h5m5.get(data.scielo_issn, str(year)) @@ -358,7 +358,7 @@ def fmt_csv(self, data): m5 = m5.get('m5', None) if m5 else None line.append(m5 or '') - joined_line = u','.join([u'"%s"' % i.replace(u'"', u'""') for i in line]) + joined_line = ','.join(['"%s"' % i.replace('"', '""') for i in line]) return joined_line diff --git a/publication/journals_status_changes.py b/publication/journals_status_changes.py index ba09971..83489b8 100644 --- a/publication/journals_status_changes.py +++ b/publication/journals_status_changes.py @@ -52,34 +52,34 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] - header.append(u"extraction date") - header.append(u"study unit") - header.append(u"collection") - header.append(u"ISSN SciELO") - header.append(u"ISSN\'s") - header.append(u"title at SciELO") - header.append(u"title thematic areas") + header.append("extraction date") + header.append("study unit") + header.append("collection") + header.append("ISSN SciELO") + header.append("ISSN\'s") + header.append("title at SciELO") + header.append("title thematic areas") for area in choices.THEMATIC_AREAS: - header.append(u"title is %s" % area.lower()) - header.append(u"title is multidisciplinary") - header.append(u"title current status") - header.append(u"status change date") - header.append(u"status change year") - header.append(u"status change month") - header.append(u"status change day") - header.append(u"status changed to") - header.append(u"status change reason") - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + header.append("title is %s" % area.lower()) + header.append("title is multidisciplinary") + header.append("title current status") + header.append("status change date") + header.append("status change year") + header.append("status change month") + header.append("status change day") + header.append("status changed to") + header.append("status change reason") + + self.write(','.join(['"%s"' % i.replace('"', '""') for i in header])) def write(self, line): if not self.output_file: - print(line.encode('utf-8')) + print(line) else: self.output_file.write('%s\r\n' % line) def run(self): - for item in self.items(): + for item in list(self.items()): self.write(item) logger.info('Export finished') @@ -105,17 +105,17 @@ def fmt_csv(self, data, history): line = [] line.append(datetime.datetime.now().isoformat()[0:10]) - line.append(u'journal') + line.append('journal') line.append(data.collection_acronym) line.append(data.scielo_issn) - line.append(u';'.join(issns)) + line.append(';'.join(issns)) line.append(data.title) - line.append(u';'.join(data.subject_areas or [])) + line.append(';'.join(data.subject_areas or [])) for area in choices.THEMATIC_AREAS: if area.lower() in [i.lower() for i in data.subject_areas or []]: - line.append(u'1') + line.append('1') else: - line.append(u'0') + line.append('0') line.append('1' if len(data.subject_areas or []) > 2 else '0') line.append(data.current_status) line.append(hist) diff --git a/requirements.txt b/requirements.txt index e87df26..6bf3ee0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,7 @@ -accessstatsapi==1.2.1 -articlemetaapi==1.26.5 -certifi==2019.11.28 -chardet==3.0.4 -citedbyapi==1.11.3 --e git+https://github.com/fabiobatalha/doaj_client@27c2c17dc6d0d9ee3aa12283b20c1ac6170869b6#egg=doaj_client-master -idna==2.7 -legendarium==2.0.2 -lxml==4.5.0 -packtools==2.5.3 -pathlib==1.0.1 -picles.plumber==0.11 -ply==3.11 -publicationstatsapi==1.2.1 -requests==2.19.1 --e git+https://github.com/scieloorg/scieloh5m5.git@1.9.6#egg=scieloh5m5 -thriftpy==0.3.9 -urllib3==1.23 -wsgiref==0.1.2 --e git+https://github.com/scieloorg/xylose.git@1.35.13#egg=xylose +doaj_client @ git+https://github.com/fabiobatalha/doaj_client@0.2 +legendarium>=2.0.2 +lxml>=6.0.0 +requests>=2.32.0 +scieloh5m5 @ git+https://github.com/scieloorg/scieloh5m5.git@1.9.6 +thriftpy2>=0.5.2 +xylose @ git+https://github.com/scieloorg/xylose.git@1.35.14 diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..68d8ab7 --- /dev/null +++ b/run.sh @@ -0,0 +1,305 @@ +#!/bin/bash +# +# Script para tabulação de acessos e publicação de artigos SciELO. +# Envia notificações ao Slack quando SLACK_WEBHOOK_URL estiver configurado +# e copia os arquivos gerados para um diretório persistente. + +set -euo pipefail + +readonly DOCKER_ENV="${DOCKER_ENV:-0}" +if [[ "$DOCKER_ENV" == "1" ]]; then + readonly PROCESSING_SETTINGS_FILE="${PROCESSING_SETTINGS_FILE-}" +else + readonly PROCESSING_SETTINGS_FILE="${PROCESSING_SETTINGS_FILE:-/etc/scieloapps/processing.ini}" +fi +readonly VENV_DIR="${VENV_DIR:-/var/www/.venvs/processing}" +readonly WORK_DIR="${WORK_DIR:-/var/www/processing}" +readonly LOG_DIR="${LOG_DIR:-/var/log/processing}" +readonly NETWORK_DIR="network" +readonly TABS_DIR="${TABS_DIR:-/var/www/static_scielo_org/tabs}" +readonly MAX_RETRIES="${MAX_RETRIES:-3}" +readonly RETRY_DELAY="${RETRY_DELAY:-5}" +readonly EXIT_ON_FAILURE="${EXIT_ON_FAILURE:-true}" +readonly TIMESTAMP="$(date +%Y%m%d_%H%M%S)" +readonly REPORT_DATE="$(date +%F)" +readonly MASTER_LOG="$LOG_DIR/master_$TIMESTAMP.log" +readonly SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}" + +readonly DEFAULT_ACRONYMS=( + "scl-BR" "arg-AR" "bol-BO" "chl-CL" "cub-CU" "col-CO" + "cri-CR" "ecu-EC" "esp-ES" "mex-MX" "per-PE" "prt-PR" + "pry-PY" "psi-BR" "rve-BR" "rvt-BR" "spa-BR" "sss-BR" + "sza-SZ" "ury-UY" "ven-VE" "wid-WI" "dom-DO" +) + +readonly CSV_FILES=( + "documents_affiliations" + "documents_affiliation_nationality" + "documents_authors" + "documents_counts" + "documents_dates" + "documents_languages" + "documents_licenses" + "journals" + "journals_status_changes" +) + +kbart_csv_file() { + local acronym=$1 + echo "SciELO_${acronym}_AllTitles_${REPORT_DATE}.csv" +} + +notify_slack() { + local message=$1 + [[ -z "$SLACK_WEBHOOK_URL" ]] && return 0 + + local payload + payload=$(python -c 'import json, sys; print(json.dumps({"text": sys.argv[1]}))' "$message") + + curl -sS -o /dev/null -X POST \ + -H "Content-type: application/json" \ + --data "$payload" \ + "$SLACK_WEBHOOK_URL" || log_error "Falha ao enviar notificação para o Slack" +} + +log_message() { + local level="${2:-INFO}" + mkdir -p "$LOG_DIR" + echo "[$(date '+%Y-%m-%d %H:%M:%S')] [$level] $1" | tee -a "$MASTER_LOG" +} + +log_error() { + log_message "$1" "ERROR" >&2 +} + +log_success() { + log_message "$1" "SUCCESS" +} + +validate_prerequisites() { + mkdir -p "$LOG_DIR" "$TABS_DIR" + + if [[ -n "$PROCESSING_SETTINGS_FILE" && ! -f "$PROCESSING_SETTINGS_FILE" ]]; then + log_error "Arquivo de configuração não encontrado: $PROCESSING_SETTINGS_FILE" + exit 1 + fi + + if [[ "$DOCKER_ENV" != "1" && ! -f "$VENV_DIR/bin/activate" ]]; then + log_error "Virtualenv Python 3 não encontrado: $VENV_DIR" + exit 1 + fi +} + +validate_acronym() { + local item=$1 + if [[ ! $item =~ ^[a-z]{3}(-[A-Z]{2})?(-)?$ ]]; then + log_error "Formato inválido de acrônimo: $item (esperado: abc, abc-XY ou abc-)" + return 1 + fi +} + +setup_environment() { + export PROCESSING_SETTINGS_FILE + + if [[ "$DOCKER_ENV" != "1" ]]; then + # shellcheck disable=SC1090 + source "$VENV_DIR/bin/activate" + fi + + cd "$WORK_DIR" + log_success "Ambiente configurado (Python $(python --version))" +} + +prepare_network_directory() { + rm -rf "$WORK_DIR/$NETWORK_DIR" + mkdir -p "$WORK_DIR/$NETWORK_DIR" +} + +run_processing_command() { + local cmd=$1 + local acron=$2 + local acrond=$3 + local extra_args="${4:-}" + local output_file="${5:-}" + local log_file="$LOG_DIR/${cmd}_${acrond}.log" + local full_cmd="processing_${cmd} -c $acron $extra_args -o $log_file -l ERROR" + + if [[ -n "$output_file" ]]; then + full_cmd="$full_cmd -r $output_file" + fi + + log_message "Executando: $cmd para $acron" + + local attempt=1 + while [[ $attempt -le $MAX_RETRIES ]]; do + if [[ $attempt -gt 1 ]]; then + log_message "Tentativa $attempt/$MAX_RETRIES para $cmd ($acron)" + sleep "$RETRY_DELAY" + fi + + if eval "$full_cmd" 2>&1 | tee -a "$log_file"; then + log_success "Comando $cmd concluído para $acron" + return 0 + fi + + log_error "Falha no comando $cmd para $acron (tentativa $attempt/$MAX_RETRIES)" + attempt=$((attempt + 1)) + done + + log_error "Comando $cmd falhou após $MAX_RETRIES tentativas para $acron" + return 1 +} + +process_collection() { + local item=$1 + local is_network_mode=$2 + local counter=$3 + local acron nationality acrond kbart_file tail_head=1 + + validate_acronym "$item" || return 1 + + acron=$(echo "$item" | cut -f1 -d-) + nationality=$(echo "$item" | cut -f2 -d- -s) + acrond="$acron" + [[ "$acron" == "scl" ]] && acrond="bra" + kbart_file=$(kbart_csv_file "$acrond") + [[ $counter -gt 1 ]] && tail_head=2 + + log_message "Processando coleção: $item" + + local temp_dir + temp_dir=$(mktemp -d -p "$WORK_DIR" "tmp_${acrond}_XXXXXX") + cd "$temp_dir" + + local has_errors=0 + local critical_errors=0 + + if [[ -n "$nationality" ]]; then + run_processing_command "publication_all" "$acron" "$acrond" "-n $nationality" || has_errors=1 + else + run_processing_command "publication_all" "$acron" "$acrond" "" || has_errors=1 + fi + + run_processing_command "publication_journals" "$acron" "$acrond" "" "journals.csv" || { + has_errors=1 + critical_errors=1 + } + run_processing_command "publication_journals_status_changes" "$acron" "$acrond" "" "journals_status_changes.csv" || { + has_errors=1 + critical_errors=1 + } + run_processing_command "export_kbart" "$acron" "$acrond" "" "$kbart_file" || { + has_errors=1 + critical_errors=1 + } + + if [[ $critical_errors -gt 0 ]]; then + notify_slack ":x: Processing: erro crítico na coleção \`$acron\`. ZIP não gerado. Log: \`$MASTER_LOG\`" + cd "$WORK_DIR" + rm -rf "$temp_dir" + return 1 + fi + + [[ $has_errors -gt 0 ]] && notify_slack ":warning: Processing: erro não-crítico na coleção \`$acron\`. Log: \`$MASTER_LOG\`" + + local zip_files=() + for csv in "${CSV_FILES[@]}"; do + [[ -f "${csv}.csv" ]] && zip_files+=("${csv}.csv") + done + [[ -f "$kbart_file" ]] && zip_files+=("$kbart_file") + + if [[ ${#zip_files[@]} -eq 0 ]]; then + log_error "Nenhum arquivo CSV encontrado para $acron" + cd "$WORK_DIR" + rm -rf "$temp_dir" + return 1 + fi + + zip -q "tabs_${acrond}.zip" "${zip_files[@]}" + cp "tabs_${acrond}.zip" "$TABS_DIR/" + log_success "Arquivo tabs_${acrond}.zip copiado para $TABS_DIR" + + if [[ "$is_network_mode" == "true" ]]; then + for csv in "${CSV_FILES[@]}"; do + [[ -f "${csv}.csv" ]] && tail -n +"$tail_head" "${csv}.csv" >> "$WORK_DIR/$NETWORK_DIR/${csv}.csv" + done + [[ -f "$kbart_file" ]] && tail -n +"$tail_head" "$kbart_file" >> "$WORK_DIR/$NETWORK_DIR/$(kbart_csv_file network)" + fi + + cd "$WORK_DIR" + rm -rf "$temp_dir" + return 0 +} + +process_network_zip() { + cd "$WORK_DIR/$NETWORK_DIR" + + local network_files=() + for csv in "${CSV_FILES[@]}"; do + [[ -f "${csv}.csv" ]] && network_files+=("${csv}.csv") + done + local network_kbart_file + network_kbart_file=$(kbart_csv_file network) + [[ -f "$network_kbart_file" ]] && network_files+=("$network_kbart_file") + + if [[ ${#network_files[@]} -eq 0 ]]; then + log_error "Nenhum arquivo encontrado para criar tabs_network.zip" + return 1 + fi + + zip -q tabs_network.zip "${network_files[@]}" + cp tabs_network.zip "$TABS_DIR/" + cd "$WORK_DIR" + log_success "Arquivo tabs_network.zip copiado para $TABS_DIR" +} + +main() { + local start_time end_time duration + start_time=$(date +%s) + + validate_prerequisites + setup_environment + + notify_slack ":hourglass_flowing_sand: Processing iniciado em \`$(hostname)\`." + + local acronyms_to_process=() + local is_network_mode="false" + + if [[ $# -eq 0 ]]; then + acronyms_to_process=("${DEFAULT_ACRONYMS[@]}") + is_network_mode="true" + prepare_network_directory + else + IFS=' ' read -ra acronyms_to_process <<< "$1" + fi + + local counter=0 + local failed_collections=() + + for item in "${acronyms_to_process[@]}"; do + counter=$((counter + 1)) + process_collection "$item" "$is_network_mode" "$counter" || failed_collections+=("$item") + done + + if [[ "$is_network_mode" == "true" ]]; then + process_network_zip || failed_collections+=("network") + fi + + end_time=$(date +%s) + duration=$((end_time - start_time)) + + if [[ ${#failed_collections[@]} -gt 0 ]]; then + log_error "Coleções falhadas: ${failed_collections[*]}" + notify_slack ":x: Processing finalizado com falhas (${#failed_collections[@]}/$counter) em ${duration}s. Coleções: ${failed_collections[*]}. Logs: \`$LOG_DIR\`" + if [[ "$EXIT_ON_FAILURE" == "true" ]]; then + exit 1 + fi + log_success "Processing finalizado com falhas reportadas, sem interromper o job" + exit 0 + fi + + log_success "Processing finalizado em ${duration}s. Arquivos persistidos em $TABS_DIR" + notify_slack ":white_check_mark: Processing finalizado com sucesso. Coleções: $counter. Tempo: ${duration}s. Arquivos: \`$TABS_DIR\`" +} + +main "$@" diff --git a/setup.py b/setup.py index 9e06469..bf5490a 100644 --- a/setup.py +++ b/setup.py @@ -2,17 +2,12 @@ from setuptools import setup, find_packages install_requires = [ - 'thriftpy', - 'packtools<=2.5.3', + 'thriftpy2', 'requests', 'lxml', - 'doaj_client', + 'doaj_client @ git+https://github.com/fabiobatalha/doaj_client@0.2', 'scieloh5m5', 'xylose', - 'articlemetaapi<=1.26.5', - 'publicationstatsapi>=1.2.1', - 'accessstatsapi>=1.2.1', - 'citedbyapi>=1.11.3', 'legendarium>=2.0.2', ] @@ -31,11 +26,6 @@ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 2.7", - ], - dependency_links=[ - "git+https://github.com/fabiobatalha/doaj_client@0.2#egg=doaj_client", - "git+https://github.com/scieloorg/xylose@1.35.8#egg=xylose", ], tests_require=tests_require, test_suite='tests', diff --git a/sonar-project.properties b/sonar-project.properties new file mode 100644 index 0000000..ca9760b --- /dev/null +++ b/sonar-project.properties @@ -0,0 +1 @@ +sonar.projectKey=scieloorg_processing_993ae0b3-768b-41c4-bf2b-7374dbb84dc8 diff --git a/tests/test_accesses_dumpdata.py b/tests/test_accesses_dumpdata.py index 092e2b6..8c59b56 100644 --- a/tests/test_accesses_dumpdata.py +++ b/tests/test_accesses_dumpdata.py @@ -44,8 +44,8 @@ def __init__(self): result = dumpdata.website_2018_urls(document) self.assertEqual( result, - [u'/article/abcd/2018.v22n3suppl0/e707/', - u'/pdf/abcd/2018.v22n3suppl0/e707/'] + ['/article/abcd/2018.v22n3suppl0/e707/', + '/pdf/abcd/2018.v22n3suppl0/e707/'] ) def test_pdf_keys(self): @@ -384,7 +384,7 @@ def test_join_metadata_with_accesses(self): 'issns': {'0102-6720'}, 'document_type': 'research-article', 'aff_countries': ['undefined'], - 'document_title': 'An\u00e1lise de custos entre a raquianestesia e a anestesia venosa com propofol associada ao bloqueio perianal local em opera\u00e7\u00f5es anorretais', + 'document_title': 'Análise de custos entre a raquianestesia e a anestesia venosa com propofol associada ao bloqueio perianal local em operações anorretais', 'issue_title': 'ABCD, arq. bras. cir. dig., 2009, v22n3', 'access_total': 14, 'access_abstract': 3, @@ -401,10 +401,10 @@ def test_join_metadata_with_accesses(self): 'publication_year': '2009', 'publication_date_at_scielo': '2010-05-14', 'journal_current_status': 'current', - 'journal_title': 'ABCD. Arquivos Brasileiros de Cirurgia Digestiva (S\u00e3o Paulo)', + 'journal_title': 'ABCD. Arquivos Brasileiros de Cirurgia Digestiva (São Paulo)', 'processing_date': '2010-05-14', 'publication_date': '2009-09', 'issue': '0102-672020090003' } - self.assertEqual(sorted([k+str(v) for k, v in expected.items()]), sorted([k+str(v) for k, v in result.items()])) + self.assertEqual(sorted([k+str(v) for k, v in list(expected.items())]), sorted([k+str(v) for k, v in list(result.items())])) diff --git a/tests/test_clients.py b/tests/test_clients.py index 4c6bf94..7b361c3 100644 --- a/tests/test_clients.py +++ b/tests/test_clients.py @@ -54,7 +54,7 @@ def test_compute_first_included_document_by_journal(self): "citations": 0, "issue": "scl_S1678-532020030001", "doi_prefix": "10.1590", - "journal_title": "ARS (S\u00e3o Paulo)", + "journal_title": "ARS (S\\u00e3o Paulo)", "collection": "scl", "authors": 1, "publication_date": "2003", @@ -99,7 +99,7 @@ def test_compute_first_included_document_by_journal(self): "citations": 0, "issue": "scl_S1678-532020030001", "doi_prefix": "10.1590", - "journal_title": "ARS (S\u00e3o Paulo)", + "journal_title": "ARS (S\\u00e3o Paulo)", "collection": "scl", "authors": 1, "publication_date": "2003", @@ -172,7 +172,7 @@ def test_compute_first_included_document_by_journal(self): "citations": 0, "issue": "scl_S1678-532020030001", "doi_prefix": "10.1590", - "journal_title": "ARS (S\u00e3o Paulo)", + "journal_title": "ARS (S\\u00e3o Paulo)", "collection": "scl", "authors": 1, "publication_date": "2003", @@ -217,7 +217,7 @@ def test_compute_first_included_document_by_journal(self): "citations": 0, "issue": "scl_S1678-532020030001", "doi_prefix": "10.1590", - "journal_title": "ARS (S\u00e3o Paulo)", + "journal_title": "ARS (S\\u00e3o Paulo)", "collection": "scl", "authors": 1, "publication_date": "2003", diff --git a/tests/test_kbart.py b/tests/test_kbart.py new file mode 100644 index 0000000..5e91c6d --- /dev/null +++ b/tests/test_kbart.py @@ -0,0 +1,35 @@ +# coding: utf-8 +import unittest + +from export import kbart + + +class KbartTest(unittest.TestCase): + + def test_title_url_uses_https_for_updated_collections(self): + url = "http://www.scielo.br/scielo.php?script=sci_issues&pid=0100-879X" + + result = kbart.title_url_for_collection(url, "scl") + + self.assertEqual( + result, + "https://www.scielo.br/scielo.php?script=sci_issues&pid=0100-879X" + ) + + def test_title_url_keeps_http_for_collections_without_https(self): + url = "http://www.scielo.org.bo/scielo.php?script=sci_issues&pid=2077-3323" + + result = kbart.title_url_for_collection(url, "bol") + + self.assertEqual(result, url) + + def test_title_url_keeps_existing_https(self): + url = "https://www.scielo.br/scielo.php?script=sci_issues&pid=0100-879X" + + result = kbart.title_url_for_collection(url, "scl") + + self.assertEqual(result, url) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_publication.py b/tests/test_publication.py index 1fd5050..1fc9d91 100644 --- a/tests/test_publication.py +++ b/tests/test_publication.py @@ -1,6 +1,7 @@ # coding: utf-8 import unittest +from publication import documents_dates from publication import journals @@ -9,8 +10,8 @@ class PublicationTest(unittest.TestCase): def test_interruption_status(self): data = [ - (u'2010', 'current', ''), - (u'2013-10', 'suspended', 'suspended-by-committee') + ('2010', 'current', ''), + ('2013-10', 'suspended', 'suspended-by-committee') ] expected = ('2013-10', 'suspended', 'suspended-by-committee') @@ -30,7 +31,7 @@ def test_interruption_status_0(self): def test_interruption_status_1(self): data = [ - (u'2010', 'current', '') + ('2010', 'current', '') ] result = journals.interruption_status(data) @@ -40,10 +41,10 @@ def test_interruption_status_1(self): def test_interruption_status_2(self): data = [ - (u'2010', 'current', ''), - (u'2013-10', 'suspended', 'suspended-by-committee'), - (u'2014-10', 'current', ''), - (u'2015-10', 'deceased', '') + ('2010', 'current', ''), + ('2013-10', 'suspended', 'suspended-by-committee'), + ('2014-10', 'current', ''), + ('2015-10', 'deceased', '') ] @@ -52,3 +53,38 @@ def test_interruption_status_2(self): result = journals.interruption_status(data) self.assertEqual(expected, result) + + def test_document_dates_missing_issue_when_reading_aop_date(self): + + class Metadata(object): + document_publication_date = '2020' + creation_date = '' + update_date = '' + issue_publication_date = '2020' + collection_acronym = 'dom' + publisher_id = 'S0000-00002020000100001' + document_type = 'research-article' + receive_date = '' + acceptance_date = '' + review_date = '' + + @property + def ahead_publication_date(self): + raise KeyError('issue') + + class Journal(object): + print_issn = '0000-0000' + electronic_issn = '' + scielo_issn = '0000-0000' + title = 'Journal' + subject_areas = ['Health Sciences'] + current_status = 'current' + + data = Metadata() + data.journal = Journal() + + dumper = documents_dates.Dumper.__new__(documents_dates.Dumper) + + result = dumper.fmt_csv(data) + + self.assertIn('"S0000-00002020000100001"', result) diff --git a/tests/test_utils.py b/tests/test_utils.py index 3dd3a3d..ac1b149 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,7 @@ # coding: utf-8 +import os import unittest +from unittest import mock import utils @@ -53,3 +55,38 @@ def test_split_date_5(self): result = utils.split_date('') self.assertEqual(result, ('', '', '')) + + def test_get_settings_from_environment(self): + + env = { + 'ARTICLEMETA_THRIFTSERVER': 'articlemeta.example.invalid:11621', + 'RATCHET_THRIFTSERVER': 'ratchet.example.invalid:11649', + 'ACCESSSTATS_THRIFTSERVER': 'access.example.invalid:11660', + } + + with mock.patch.dict(os.environ, env, clear=True): + result = utils.get_settings() + + self.assertEqual( + result['app:main']['articlemeta_thriftserver'], + 'articlemeta.example.invalid:11621' + ) + self.assertEqual( + result['app:main']['ratchet_thriftserver'], + 'ratchet.example.invalid:11649' + ) + self.assertEqual( + result['app:main']['accessstats_thriftserver'], + 'access.example.invalid:11660' + ) + + def test_get_metadata_value_returns_default_for_incomplete_metadata(self): + + class Metadata(object): + @property + def current_status(self): + raise IndexError('list index out of range') + + result = utils.get_metadata_value(Metadata(), 'current_status') + + self.assertEqual(result, '') diff --git a/thrift/clients.py b/thrift/clients.py index 1b5bcd0..206e81c 100644 --- a/thrift/clients.py +++ b/thrift/clients.py @@ -1,6 +1,5 @@ # coding: utf-8 import os -import thriftpy import json import logging from datetime import date @@ -10,11 +9,17 @@ from accessstats.client import ThriftClient as AccessesThriftClient from publicationstats.client import ThriftClient as PublicationThriftClient from citedby.custom_query import journal_titles -from thriftpy.rpc import make_client from xylose.scielodocument import Article, Journal import utils +try: + import thriftpy2 as thriftpy + from thriftpy2.rpc import make_client +except ImportError: + import thriftpy + from thriftpy.rpc import make_client + LIMIT = 1000 logger = logging.getLogger(__name__) @@ -322,6 +327,19 @@ def document_access_monthnyear(self, code): class PublicationStats(PublicationThriftClient): + def __init__(self, domain=None, timeout=None): + super(PublicationStats, self).__init__(domain=domain) + self.timeout = int(timeout or os.environ.get('PUBLICATIONSTATS_TIMEOUT_MS', 60000)) + + @property + def client(self): + return make_client( + self.PUBLICATIONSTATS_THRIFT.PublicationStats, + self._address, + self._port, + timeout=self.timeout + ) + def _compute_documents_languages_by_year(self, query_result, years=0): year = date.today().year @@ -412,7 +430,7 @@ def _compute_number_of_articles_by_year(self, query_result, years=0): years[item['key']] = item.get('doc_count', 0) - return [(k, v) for k, v in sorted(years.items(), reverse=True)] + return [(k, v) for k, v in sorted(list(years.items()), reverse=True)] def number_of_articles_by_year(self, issn, collection, document_types=None, years=0): @@ -506,7 +524,7 @@ def _compute_number_of_issues_by_year(self, query_result, years=0): continue years[item['key']] = item.get('issue', {}).get('value', 0) - return [(k, v) for k, v in sorted(years.items(), reverse=True)] + return [(k, v) for k, v in sorted(list(years.items()), reverse=True)] def number_of_issues_by_year(self, issn, collection, years=0, type=None): """ diff --git a/thriftpy/__init__.py b/thriftpy/__init__.py new file mode 100644 index 0000000..271771c --- /dev/null +++ b/thriftpy/__init__.py @@ -0,0 +1,2 @@ +from thriftpy2 import * # noqa +from thriftpy2 import load diff --git a/thriftpy/rpc.py b/thriftpy/rpc.py new file mode 100644 index 0000000..179436a --- /dev/null +++ b/thriftpy/rpc.py @@ -0,0 +1 @@ +from thriftpy2.rpc import * # noqa diff --git a/utils.py b/utils.py index e2ca973..9e36f2b 100644 --- a/utils.py +++ b/utils.py @@ -7,17 +7,22 @@ import logging import string -from thrift import clients - -try: - from configparser import ConfigParser -except: - from ConfigParser import ConfigParser +from configparser import ConfigParser logger = logging.getLogger(__name__) REGEX_ISSN = re.compile(r"^[0-9]{4}-[0-9]{3}[0-9xX]$") TAG_RE = re.compile(r'<[^>]+>') +ENV_SETTINGS = { + 'ARTICLEMETA_THRIFTSERVER': 'articlemeta_thriftserver', + 'ARTICLEMETA_ADMINTOKEN': 'articlemeta_admintoken', + 'RATCHET_THRIFTSERVER': 'ratchet_thriftserver', + 'ACCESSSTATS_THRIFTSERVER': 'accessstats_thriftserver', + 'CITEDBY_THRIFTSERVER': 'citedby_thriftserver', + 'PUBLICATIONSTATS_THRIFTSERVER': 'publicationstats_thriftserver', + 'SOLR_SEARCH_SCIELO_ORG': 'solr_search_scielo_org', + 'SOLR_SEARCH_SCIELO_ORG_INDEX': 'solr_search_scielo_org_index', +} def remove_tags(text): @@ -26,12 +31,11 @@ def remove_tags(text): def cleanup_string(text): - try: - nfd_form = unicodedata.normalize('NFD', text.strip().lower()) - except TypeError: - nfd_form = unicodedata.normalize('NFD', unicode(text.strip().lower())) + if not isinstance(text, str): + text = str(text) + nfd_form = unicodedata.normalize('NFD', text.strip().lower()) - cleaned_str = u''.join(x for x in nfd_form if x in string.ascii_letters or x == ' ') + cleaned_str = ''.join(x for x in nfd_form if x in string.ascii_letters or x == ' ') return remove_tags(cleaned_str).lower() @@ -106,9 +110,8 @@ def from_file(cls, filepath): ``filepath`` is a text string. """ - fp = open(filepath, 'r') - - return cls(fp) + with open(filepath, 'r') as fp: + return cls(fp) def __getattr__(self, attr): return getattr(self.conf, attr) @@ -120,35 +123,87 @@ def items(self): section in [section for section in self.conf.sections()]] -config = Configuration.from_env() -settings = dict(config.items()) +def get_settings(): + settings = {'app:main': {}} + + filepath = os.environ.get('PROCESSING_SETTINGS_FILE') + if filepath: + settings.update(dict(Configuration.from_file(filepath).items())) + + for env_name, setting_name in ENV_SETTINGS.items(): + value = os.environ.get(env_name) + if value is not None: + settings['app:main'][setting_name] = value + + if not settings['app:main']: + raise ValueError( + 'missing PROCESSING_SETTINGS_FILE or service environment variables' + ) + + return settings + + +def get_metadata_value(obj, attr, default=''): + try: + return getattr(obj, attr) + except (AttributeError, IndexError, KeyError): + return default + + +def get_service_setting(settings, name, aliases=None): + aliases = aliases or [] + app_settings = settings['app:main'] + for key in [name] + aliases: + value = app_settings.get(key) + if value: + return value + raise ValueError('missing required setting: %s' % name) def publicationstats_server(): - server = settings['app:main'].get('publicationstats_thriftserver', 'publication.scielo.org:11620') + from thrift import clients + + settings = get_settings() + server = get_service_setting(settings, 'publicationstats_thriftserver') return clients.PublicationStats(server) def citedby_server(): - server = settings['app:main'].get('citedby_thriftserver', 'citedby.scielo.org:11610') + from thrift import clients + + settings = get_settings() + server = get_service_setting(settings, 'citedby_thriftserver') return clients.Citedby(domain=server) def ratchet_server(): - server = settings['app:main'].get('ratchet_thriftserver', 'ratchet.scielo.org:11630').split(':') + from thrift import clients + + settings = get_settings() + server = get_service_setting(settings, 'ratchet_thriftserver').split(':') host = server[0] port = int(server[1]) return clients.Ratchet(host, port) def articlemeta_server(): - server = settings['app:main'].get('articlemeta_thriftserver', 'articlemeta.scielo.org:11621') + from thrift import clients + + settings = get_settings() + server = get_service_setting(settings, 'articlemeta_thriftserver') admintoken = settings['app:main'].get('articlemeta_admintoken', None) return clients.ArticleMeta(domain=server, admintoken=admintoken) def accessstats_server(): - server = settings['app:main'].get('accessesstats_thriftserver', 'ratchet.scielo.org:11660') + from thrift import clients + + settings = get_settings() + server = get_service_setting( + settings, + 'accessstats_thriftserver', + aliases=['accessesstats_thriftserver'] + ) return clients.AccessStats(server)