Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
9b4f012
Port codebase syntax to Python 3
rondinelisaad Apr 29, 2026
82e85c7
Add Python 3.14 Docker runtime
rondinelisaad Apr 29, 2026
d2c81d5
Add processing entrypoint with Slack notifications
rondinelisaad Apr 29, 2026
86ed031
Allow service configuration from environment
rondinelisaad Apr 29, 2026
749657e
Make publication stats timeout configurable
rondinelisaad Apr 29, 2026
f626f02
Handle missing issue metadata in date export
rondinelisaad Apr 29, 2026
4b5a16e
Handle missing journal status metadata
rondinelisaad Apr 29, 2026
29d6182
adicao do sonarqube
rondinelisaad Apr 29, 2026
f67ea05
adicao do build da imagem, scan usando o trivy
rondinelisaad Apr 29, 2026
63296c4
adicao do build da imagem, scan usando o trivy-1
rondinelisaad Apr 29, 2026
d51f544
Keep service endpoints out of repository config
rondinelisaad Apr 29, 2026
e1cc2c0
Add Argo CronWorkflow for scheduled processing
rondinelisaad Apr 29, 2026
a70a8c1
Altera script para nome do arquivo kbart para manter melhor conformidade
samuelveigarangel Jun 17, 2026
31c2d26
change nome do arquivo kbart
samuelveigarangel Jun 17, 2026
b545350
black
samuelveigarangel Jun 17, 2026
5f8c121
Mantém protocolo HTTP para coleções definidas
samuelveigarangel Jun 18, 2026
156b40f
Update version xylose to version 1.35.14
samuelveigarangel Jun 19, 2026
930149c
Merge pull request #96 from scieloorg/issue-72
samuelveigarangel Jun 19, 2026
15e23eb
Merge pull request #97 from scieloorg/update-xylose-to-1.35.14
samuelveigarangel Jun 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Template for the local `.env` file: copy it to `.env` and fill in the values.
# The filled-in `.env` must not be committed.

# Path of the processing settings file. The Dockerfile sets /app/config.ini as
# the image default.
# NOTE(review): confirm that an empty value here does not override that default.
PROCESSING_SETTINGS_FILE=

# Service endpoints and credentials, equivalent to the values that used to live
# in config.ini.
ARTICLEMETA_THRIFTSERVER=
ARTICLEMETA_ADMINTOKEN=
RATCHET_THRIFTSERVER=
ACCESSSTATS_THRIFTSERVER=
CITEDBY_THRIFTSERVER=
PUBLICATIONSTATS_THRIFTSERVER=
SOLR_SEARCH_SCIELO_ORG=
SOLR_SEARCH_SCIELO_ORG_INDEX=

# Publication stats timeout; presumably milliseconds, going by the _MS suffix.
PUBLICATIONSTATS_TIMEOUT_MS=60000

# NOTE(review): presumably controls whether a failed collection ends the run
# with an error; confirm against run.sh.
EXIT_ON_FAILURE=true

# Slack webhook URL; presumably the target of the processing notifications
# (TODO confirm against run.sh).
SLACK_WEBHOOK_URL=
26 changes: 26 additions & 0 deletions .github/workflows/master-quality.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SonarQube analysis followed by a quality gate check, triggered by pushes to
# the codex/python3-14-migration branch.
name: Master Quality

on:
  push:
    branches: [codex/python3-14-migration]

jobs:
  sonar:
    runs-on: ubuntu-latest

    steps:
      # fetch-depth 0 checks out the full history instead of a shallow clone.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: SonarQube Scan
        env:
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
          SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}
        uses: SonarSource/sonarqube-scan-action@v6

      - name: Quality Gate
        env:
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
        uses: SonarSource/sonarqube-quality-gate-action@v1
117 changes: 117 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# Release pipeline: for every pushed v* tag, build the Docker image, scan it
# with Trivy, publish it to Docker Hub, then sign it and attach the SBOM
# attestation with Cosign.
name: Release

on:
  push:
    tags:
      - 'v*'

jobs:
  release:
    runs-on: ubuntu-latest
    permissions:
      # OIDC token used by the keyless Cosign sign/attest steps below.
      id-token: write
      contents: read

    steps:
      - uses: actions/checkout@v4

      # Expose the tag name (e.g. v1.2.3) as VERSION to all following steps.
      - name: Set version
        run: echo "VERSION=${GITHUB_REF_NAME}" >> "$GITHUB_ENV"

      - name: Docker Login
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USER }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build image
        run: |
          docker build \
            -t infrascielo/processing:${VERSION} \
            -t infrascielo/processing:latest \
            .

      # Single policy scan.
      # NOTE(review): exit-code 0 means HIGH/CRITICAL findings are reported but
      # never fail the release; confirm this is the intended policy.
      - name: Trivy Image Scan
        uses: aquasecurity/trivy-action@v0.36.0
        with:
          image-ref: infrascielo/processing:${{ env.VERSION }}
          severity: HIGH,CRITICAL
          exit-code: 0

      # NOTE(review): the installer script is fetched from the main branch and
      # piped to a root shell without a pinned version or checksum; consider
      # pinning a Trivy release.
      - name: Install Trivy CLI
        run: |
          curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin

      # Report (evidence), uploaded as a build artifact.
      - name: Trivy Report
        run: |
          trivy image \
            --scanners vuln \
            --severity HIGH,CRITICAL \
            --format table \
            --output trivy-report.txt \
            infrascielo/processing:${VERSION}

      - uses: actions/upload-artifact@v4
        with:
          name: trivy-report
          path: trivy-report.txt

      # SBOM, uploaded as an artifact and reused by the attestation step.
      - name: Generate SBOM (CycloneDX)
        run: |
          trivy image \
            --scanners vuln \
            --format cyclonedx \
            --output sbom-${VERSION}.json \
            infrascielo/processing:${VERSION}

      - uses: actions/upload-artifact@v4
        with:
          name: sbom-${{ env.VERSION }}
          path: sbom-${{ env.VERSION }}.json

      # Pushed exactly once; the original workflow repeated this step verbatim.
      - name: Push image
        run: |
          docker push infrascielo/processing:${VERSION}
          docker push infrascielo/processing:latest

      # RepoDigests is only populated once the image has been pushed, so this
      # step must stay after "Push image". Signing by digest pins the exact
      # image content rather than a movable tag.
      - name: Get image digest
        run: |
          DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' infrascielo/processing:${VERSION})
          echo "IMAGE_DIGEST=${DIGEST}" >> "$GITHUB_ENV"

      - name: Install Cosign
        uses: sigstore/cosign-installer@v3

      # COSIGN_YES skips the interactive confirmation prompt.
      - name: Sign image with Cosign
        env:
          COSIGN_EXPERIMENTAL: "1"
          COSIGN_YES: "true"
        run: |
          cosign sign ${IMAGE_DIGEST}

      # Checks that the signature was issued to a workflow of this repository
      # through the GitHub Actions OIDC issuer.
      - name: Verify image signature
        env:
          COSIGN_EXPERIMENTAL: "1"
        run: |
          cosign verify \
            --certificate-oidc-issuer https://token.actions.githubusercontent.com \
            --certificate-identity-regexp "https://github.com/${{ github.repository }}/*" \
            ${IMAGE_DIGEST}

      - name: Attach SBOM attestation
        env:
          COSIGN_EXPERIMENTAL: "1"
          COSIGN_YES: "true"
        run: |
          cosign attest \
            --predicate sbom-${VERSION}.json \
            --type cyclonedx \
            ${IMAGE_DIGEST}
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
.DS_Store
*.swp
.env
.env.*
!.env.example
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down Expand Up @@ -57,5 +61,11 @@ docs/_build/
# PyBuilder
target/

# Runtime processing workspace
tmp_*/
network/
var/
k8s/*secret.yaml

# pip source directory
src/
40 changes: 40 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Image for the processing jobs: Python 3.14 on the slim Debian base.
FROM python:3.14-slim

# Interpreter and pip behaviour, plus the default locations the application is
# configured with (settings file, working, log and output directories).
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PROCESSING_SETTINGS_FILE=/app/config.ini \
    DOCKER_ENV=1 \
    WORK_DIR=/app \
    LOG_DIR=/var/log/processing \
    TABS_DIR=/var/www/static_scielo_org/tabs \
    PYTHONPATH=/app

WORKDIR /app

# OS packages for the Python dependencies. update, install and the apt list
# cleanup share one layer so no stale package index ends up in the image.
# NOTE(review): build-essential and the -dev headers stay in the final image;
# a multi-stage build would keep them out if image size matters.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        libxml2-dev \
        libxslt1-dev \
        zip \
    && rm -rf /var/lib/apt/lists/*

# Copy the dependency manifests before the sources so this layer and the pip
# layer below are reused while only application code changes.
COPY requirements.txt setup.py ./

# --no-cache-dir keeps pip's download/wheel cache out of the image layers.
# The last group is installed with --no-deps, so only the pinned packages
# themselves are added and their own requirements are not resolved.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel \
    && pip install --no-cache-dir -r requirements.txt \
    && pip install --no-cache-dir --no-deps \
        accessstatsapi==1.2.1 \
        articlemetaapi==1.26.7 \
        citedbyapi==1.11.3 \
        publicationstatsapi==1.2.2 \
        packtools==2.6.4

COPY . .

# Create the default settings file from the template, create the log/output
# directories declared in ENV above, and install the project in editable mode.
RUN cp config.ini-TEMPLATE config.ini \
    && mkdir -p "${LOG_DIR}" "${TABS_DIR}" \
    && pip install --no-cache-dir --no-deps -e .

# NOTE(review): no USER is set, so the entrypoint runs as root. Switching to a
# non-root user needs the mounted log/tabs volumes to be writable by that user.
ENTRYPOINT ["/bin/bash", "/app/run.sh"]
71 changes: 70 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,76 @@ e também para o envio de dados à parceiros.


## Requisitos:
* Python 2.7
* Python 3.14

## Docker

Para instalar as dependências e executar a suíte de testes:

```bash
docker compose run --rm tests
```

Para executar o processamento via Docker, configure as variáveis em um arquivo
`.env` local, baseado em `.env.example`. O `.env` não deve ser versionado.

```bash
cp .env.example .env
```

Preencha no `.env` os valores equivalentes ao antigo `config.ini`:

```ini
ARTICLEMETA_THRIFTSERVER=
ARTICLEMETA_ADMINTOKEN=
RATCHET_THRIFTSERVER=
ACCESSSTATS_THRIFTSERVER=
CITEDBY_THRIFTSERVER=
PUBLICATIONSTATS_THRIFTSERVER=
SOLR_SEARCH_SCIELO_ORG=
SOLR_SEARCH_SCIELO_ORG_INDEX=
PUBLICATIONSTATS_TIMEOUT_MS=60000
SLACK_WEBHOOK_URL=
```

Depois execute:

```bash
docker compose run --rm processing "scl-BR"
```

Os logs ficam persistidos em `var/log/processing` e os arquivos ZIP gerados em
`var/tabs`.

## Kubernetes / Argo

O agendamento para Argo Workflows está em `k8s/argo-cronworkflow.yaml` com o cron abaixo (execução às 03:00 nos dias 1, 8, 15 e 22 de cada mês):

```text
0 3 1,8,15,22 1-12 *
```

O workflow executa as coleções sequencialmente. Quando uma coleção falha, o
script envia a notificação de erro e continua para a próxima; ao final, o job
termina com sucesso para não interromper as próximas execuções agendadas.

Os valores sensíveis devem ser criados em um Secret do Kubernetes chamado
`processing-env`, baseado em `k8s/processing-env.secret.example.yaml`. Não
versione o Secret real.

Volumes esperados:

```text
processing-data -> /var/www/static_scielo_org/tabs
processing-logs -> /var/log/processing
```

Aplicação:

```bash
kubectl apply -f k8s/processing-env.secret.yaml
kubectl apply -f k8s/argo-cronworkflow.yaml
```


## Exportação de dados ao DOAJ
Expand Down
52 changes: 26 additions & 26 deletions accesses/documents_by_journals.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,35 +51,35 @@ def __init__(self, collection, issns=None, output_file=None):
self.issns = issns
self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file
header = []
header.append(u"extraction date")
header.append(u"study unit")
header.append(u"collection")
header.append(u"ISSN SciELO")
header.append(u"ISSN\'s")
header.append(u"title at SciELO")
header.append(u"title thematic areas")
header.append("extraction date")
header.append("study unit")
header.append("collection")
header.append("ISSN SciELO")
header.append("ISSN\'s")
header.append("title at SciELO")
header.append("title thematic areas")
for area in choices.THEMATIC_AREAS:
header.append(u"title is %s" % area.lower())
header.append(u"title is multidisciplinary")
header.append(u"title current status")
header.append(u"publishing year")
header.append(u"accesses year")
header.append(u"accesses to html")
header.append(u"accesses to abstract")
header.append(u"accesses to pdf")
header.append(u"accesses to epdf")
header.append(u"total accesses")

self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header]))
header.append("title is %s" % area.lower())
header.append("title is multidisciplinary")
header.append("title current status")
header.append("publishing year")
header.append("accesses year")
header.append("accesses to html")
header.append("accesses to abstract")
header.append("accesses to pdf")
header.append("accesses to epdf")
header.append("total accesses")

self.write(','.join(['"%s"' % i.replace('"', '""') for i in header]))

def write(self, line):
if not self.output_file:
print(line.encode('utf-8'))
print(line)
else:
self.output_file.write('%s\r\n' % line)

def run(self):
for item in self.items():
for item in list(self.items()):
self.write(item)
logger.info('Export finished')

Expand All @@ -103,17 +103,17 @@ def fmt_csv(self, data):

line = []
line.append(datetime.datetime.now().isoformat()[0:10])
line.append(u'journal')
line.append('journal')
line.append(data.collection_acronym)
line.append(data.scielo_issn)
line.append(u';'.join(issns))
line.append(';'.join(issns))
line.append(data.title)
line.append(u';'.join(data.subject_areas or []))
line.append(';'.join(data.subject_areas or []))
for area in choices.THEMATIC_AREAS:
if area.lower() in [i.lower() for i in data.subject_areas or []]:
line.append(u'1')
line.append('1')
else:
line.append(u'0')
line.append('0')
line.append('1' if len(data.subject_areas or []) > 2 else '0')
line.append(data.current_status)

Expand Down
Loading
Loading