diff --git a/.gitignore b/.gitignore index ba74660..9dec496 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.DS_Store # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -55,3 +56,6 @@ docs/_build/ # PyBuilder target/ + +# pip source directory +src/ diff --git a/.travis.yml b/.travis.yml index 5182fb5..4626a9d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,7 @@ language: python python: - 2.7 - - 3.4 + - 3.5 env: - PROCESSING_SETTINGS_FILE=./config.ini install: diff --git a/README.md b/README.md new file mode 100644 index 0000000..f362c62 --- /dev/null +++ b/README.md @@ -0,0 +1,45 @@ +# SciELO Processing + +Conjunto de utilitários para a produção de tabulações com metadados e métricas +e também para o envio de dados a parceiros. + + +## Requisitos: +* Python 2.7 + + +## Exportação de dados ao DOAJ + +Esta integração é realizada por meio de um robô que obtém os metadados dos +documentos a partir do ArticleMeta, já codificados em XML conforme o _schema_ +do DOAJ, e os envia, um a um, através do formulário de submissão de artigos em +XML, do site do DOAJ. Este processo não é ótimo, mas é o único que nos permite +enviar metadados em múltiplos idiomas. + +Ao depositar um documento, o DOAJ verifica automaticamente se os _ISSNs_ +informados correspondem exatamente aos cadastrados em sua base de dados e, +no caso de qualquer divergência, o depósito é rejeitado. A fim de contornar +este problema, é possível construir uma _base de correções_ que poderá ser +utilizada pelo utilitário de depósito. A _base de correções_ é produzida a +partir de consultas às bases do DOAJ e, quando aplicada, modifica os metadados +dos documentos de forma que os _ISSNs_ fiquem conforme esperado pelo DOAJ.
+ +Para produzir a _base de correções_ e armazená-la em `/tmp/corr.jsonl`: + +```bash +python export/gen_doaj_correctionsdb.py gen-correctionsdb > /tmp/corr.jsonl +``` + +Para executar o depósito dos documentos no DOAJ: + +```bash +PROCESSING_SETTINGS_FILE=myconfig.ini processing_export_doaj --corrections_db /tmp/corr.jsonl --user US3R --password P4SSW0RD +``` + +Note que: (a) será necessário criar um arquivo de configurações e atribuí-lo +à variável `PROCESSING_SETTINGS_FILE`. Este arquivo poderá ser baseado no +[exemplo](https://raw.githubusercontent.com/scieloorg/processing/master/config.ini-TEMPLATE) +fornecido no projeto e (b) que o utilitário `processing_export_doaj` aceita +opções que podem ser usadas de forma a delimitar o conjunto de dados que serão +depositados no DOAJ. Para mais detalhes execute +`PROCESSING_SETTINGS_FILE=myconfig.ini processing_export_doaj --help`. diff --git a/README.rst b/README.rst deleted file mode 100644 index 8adcd07..0000000 --- a/README.rst +++ /dev/null @@ -1,12 +0,0 @@ -Processamentos SciELO -===================== - -Utilitário para produção de tabulações de metadados e estatísticas. Este -utilitário foi implementado utulizando os serviços RPC de coleta de indicadores -bibliométricos, de publicação e de acessos aos documentos da Rede SciELO. - -Build status -============ - -.. image:: https://travis-ci.org/scieloorg/processing.svg?branch=master - :target: https://travis-ci.org/scieloorg/processing diff --git a/accesses/documents_by_journals.py b/accesses/documents_by_journals.py index e14e2a7..7d2c4b1 100644 --- a/accesses/documents_by_journals.py +++ b/accesses/documents_by_journals.py @@ -1,7 +1,7 @@ # coding: utf-8 """ Este processamento gera uma tabulação de acessos por periódico, ano de publicação e ano de -acesso para o texto completo em HTML, resumo em HTML, PDF e EPDF. +acesso para o texto completo em HTML, resumo em HTML, PDF e EPDF. 
""" import argparse @@ -60,6 +60,7 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"publishing year") header.append(u"accesses year") @@ -113,6 +114,7 @@ def fmt_csv(self, data): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.subject_areas or []) > 2 else '0') line.append(data.current_status) acessos = self._accessstats.access_lifetime(data.scielo_issn, self.collection) @@ -127,7 +129,7 @@ def fmt_csv(self, data): def main(): parser = argparse.ArgumentParser( - description='Dump languages distribution by article' + description='Dump accesses distribution by journals' ) parser.add_argument( diff --git a/accesses/dumpdata.py b/accesses/dumpdata.py index 66ae209..542742d 100644 --- a/accesses/dumpdata.py +++ b/accesses/dumpdata.py @@ -2,7 +2,6 @@ """ Esse processamento condença os metadados de documentos com os dados de acessos. 
""" -import sys import argparse import logging import re @@ -10,8 +9,9 @@ import codecs import datetime -import choices +from legendarium.urlegendarium import URLegendarium +import choices import utils __version__ = 0.1 @@ -64,7 +64,7 @@ def pdf_keys(fulltexts): for language, url in fulltexts['pdf'].items(): path = REGEX_PDF_PATH.search(url) if path: - keys.append(path.group().upper()) + keys.append(path.group()) return keys @@ -92,8 +92,37 @@ def eligible_match_keys(document): if document.doi: keys.append(document.doi) keys += pdf_keys(document.fulltexts()) + keys.extend(website_2018_urls(document)) + + # há registros no ratchet cuja chave possui apenas caracteres maiúsculos + ci_keys = list(set(keys + [key.upper() for key in keys])) + return ci_keys - return keys + +def website_2018_urls(document): + try: + suppl = document.issue.supplement_volume or \ + document.issue.supplement_number or '' + leg = URLegendarium( + acron=document.journal.acronym, + year_pub=document.publication_date[:4], + volume=document.issue.volume, + number=document.issue.number, + fpage=document.start_page, + fpage_sequence=document.start_page_sequence, + lpage=document.end_page, + article_id=document.elocation, + suppl_number=suppl, + doi=document.doi, + order=document.issue.order) + return ['/article/%s/' % leg.url_article, '/pdf/%s/' % leg.url_article] + except ValueError as e: + logger.error( + 'Fail to build legendarium eligible match key for %s_%s', + document.collection_acronym, document.publisher_id + ) + logger.exception(e) + return [] def country(country): @@ -116,43 +145,23 @@ def get_date_timestamp(date): return date -def issuelabel(document): - label_volume = document.issue.volume if document.issue.volume else '' - label_issue = document.issue.number if document.issue.number else '' - - label_suppl_issue = ' suppl %s' % document.issue.supplement_number if document.issue.supplement_number else '' - - if label_suppl_issue: - label_issue += label_suppl_issue - - 
label_suppl_volume = ' suppl %s' % document.issue.supplement_volume if document.issue.supplement_volume else '' - - if label_suppl_volume: - label_issue += label_suppl_volume - - label_issue = SUPPLBEG_REGEX.sub('', label_issue) - label_issue = SUPPLEND_REGEX.sub('', label_issue) - - label_volume = 'n.' + label_volume - label_issue = 'v.' + label_issue - - itens = [ - document.journal.abbreviated_title, - ' '.join([label_volume, label_issue]), - document.publication_date[0:4] - ] - - return ', '.join(itens) - - def join_metadata_with_accesses(document, accesses_date, accesses): + issns = set() + issns.add(document.journal.scielo_issn) + if document.journal.print_issn: + issns.add(document.journal.print_issn) + if document.journal.electronic_issn: + issns.add(document.journal.electronic_issn) + data = {} data['id'] = '_'.join([document.collection_acronym, document.publisher_id]) data['pid'] = document.publisher_id data['issn'] = document.journal.scielo_issn + data['issns'] = issns data['journal_title'] = document.journal.title - data['issue'] = document.publisher_id[0:18] + data['journal_current_status'] = document.journal.current_status + data['issue'] = document.issue.publisher_id data['document_title'] = '' if document.original_title(): data['document_title'] = document.original_title() @@ -162,13 +171,13 @@ def join_metadata_with_accesses(document, accesses_date, accesses): data['document_title'] = title break - - data['issue_title'] = issuelabel(document) + data['issue_title'] = ', '.join([document.journal.abbreviated_title, document.issue.publication_date[:4], document.issue.label]) data['processing_date'] = document.processing_date + data['publication_date_at_scielo'] = document.creation_date data['publication_date'] = document.publication_date data['publication_year'] = document.publication_date[0:4] - subject_areas = document.journal.subject_areas or ['undefined'] - data['subject_areas'] = [i for i in subject_areas] + data['subject_areas'] = 
document.journal.subject_areas or ['undefined'] + data['subject_areas'] = ['Multidisciplinary'] if len(data['subject_areas']) > 2 else data['subject_areas'] data['collection'] = document.collection_acronym data['document_type'] = document.document_type data['languages'] = list(set([i for i in document.languages()]+[document.original_language() or 'undefined'])) @@ -262,21 +271,59 @@ def __init__(self, collection, issns=None, from_date=FROM, until_date=UNTIL, self.from_date = from_date self.until_date = until_date self.dayly_granularity = dayly_granularity - self.output_file=output_file + self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file self.issns = issns self.collection = collection - self.fmt = self.fmt_csv if fmt == 'json': self.fmt = self.fmt_json - + else: + self.fmt = self.fmt_csv + header = [] + header.append(u"extraction date") + header.append(u"study unit") + header.append(u"collection") + header.append(u"ISSN SciELO") + header.append(u"ISSN\'s") + header.append(u"title at SciELO") + header.append(u"title thematic areas") + for area in choices.THEMATIC_AREAS: + header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") + header.append(u"title current status") + header.append(u"document publishing ID (PID SciELO)") + header.append(u"document publishing year") + header.append(u"document type") + header.append(u'document is citable') + header.append(u"issue") + header.append(u"issue title") + header.append(u"document title") + header.append(u"processing date") + header.append(u"publication date at SciELO") + header.append(u"publication date") + header.append(u"access date") + header.append(u"access year") + header.append(u"access month") + header.append(u"access to abstract") + header.append(u"access to html") + header.append(u"access to pdf") + header.append(u"access to epdf") + header.append(u"access total") + + self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in 
header])) def get_accesses(self, issn): - for document in self._articlemeta.documents(collection=self.collection, issn=issn): accesses = [] - keys = eligible_match_keys(document) - logger.debug('keys to join for %s: %s' % (document.publisher_id, str(keys))) + + try: + keys = eligible_match_keys(document) + except Exception as e: + logger.error('Error ao ler: %s_%s', document.collection_acronym, document.publisher_id) + logger.exception(e) + continue + + logger.debug('keys to join for %s: %s', document.publisher_id, str(keys)) for key in keys: data = self._ratchet.document(key) jdata = json.loads(data) @@ -287,40 +334,60 @@ def get_accesses(self, issn): self.dayly_granularity) for adate, adata in joined_accesses.items(): - yield join_metadata_with_accesses(document, adate, adata) + try: + yield join_metadata_with_accesses(document, adate, adata) + except Exception as e: + logger.exception(e) + + def write(self, line): + if not self.output_file: + print(line.encode('utf-8')) + else: + self.output_file.write('%s\r\n' % line) def fmt_json(self, data): + del(data['issns']) + del(data['journal_current_status']) + return json.dumps(data) def fmt_csv(self, data): - line = [ - data['collection'], - data['pid'], - data['issn'], - data['journal_title'], - data['issue'], - data['issue_title'], - data['document_title'], - data['processing_date'], - data['publication_date'], - data['publication_year'], - data['document_type'], - ', '.join(data['subject_areas']), - ', '.join(data['languages']), - ', '.join(data['aff_countries']), - data['access_date'], - data['access_date'][:4], - data['access_date'][5:7], - data['access_date'][8:], - data.get('access_abstract', 0), - data.get('access_html', 0), - data.get('access_pdf', 0), - data.get('access_epdf', 0), - data['access_total'] - ] - - return ','.join(['"%s"' % i for i in line]) + line = [] + line.append(datetime.datetime.now().isoformat()[0:10]) + line.append('document') + line.append(data['collection']) + 
line.append(data['issn']) + line.append(u';'.join(data['issns'])) + line.append(data['journal_title']) + line.append(', '.join(data['subject_areas'])) + for area in choices.THEMATIC_AREAS: + if area.lower() in [i.lower() for i in data['subject_areas'] or []]: + line.append(u'1') + else: + line.append(u'0') + line.append('1' if len(data['subject_areas'] or []) > 2 else '0') + line.append(data['journal_current_status']) + line.append(data['pid']) + line.append(data['publication_year']) + line.append(data['document_type']) + line.append(u'1' if data['document_type'].lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append(data['issue']) + line.append(data['issue_title']) + line.append(data['document_title']) + line.append(data['processing_date']) + line.append(data['publication_date_at_scielo']) + line.append(data['publication_date']) + line.append(data['access_date']) + line.append(data['access_year']) + line.append(data['access_month']) + line.append(str(data.get('access_abstract', 0))) + line.append(str(data.get('access_html', 0))) + line.append(str(data.get('access_pdf', 0))) + line.append(str(data.get('access_epdf', 0))) + line.append(str(data['access_total'])) + + return ','.join(['"%s"' % i.replace('"', '""') for i in line]) def run(self): @@ -333,10 +400,9 @@ def run(self): print(self.fmt(data)) exit() - with codecs.open(self.output_file, 'w', encoding='utf-8') as f: - for issn in self.issns: - for data in self.get_accesses(issn=issn): - f.write(u'%s\r\n' % self.fmt(data)) + for issn in self.issns: + for data in self.get_accesses(issn=issn): + self.write(self.fmt(data)) def main(): @@ -408,7 +474,7 @@ def main(): args = parser.parse_args() _config_logging(args.logging_level, args.logging_file) logger.info('Dumping data for: %s' % args.collection) - + issns = None if len(args.issns) > 0: issns = utils.ckeck_given_issns(args.issns) diff --git a/bibliometric/citedby_document.py b/bibliometric/citedby_document.py new file mode 100644 index 
0000000..a584b34 --- /dev/null +++ b/bibliometric/citedby_document.py @@ -0,0 +1,225 @@ +# coding: utf-8 +""" +Este processamento gera uma tabulação de citações concedidas no SciELO por +artigos da coleção SciELO. +Formato de saída: +"PID","ISSN","título","área temática","ano de publicação","tipo de documento","título do documento","citado por PID","citado por ISSN","citado por título","citado por título do documento" +""" +import argparse +import logging +import codecs +import json +import datetime + +import utils +import choices + +logger = logging.getLogger(__name__) + +OUTPUT_FORMAT = 'csv' + + +def _config_logging(logging_level='INFO', logging_file=None): + + allowed_levels = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL + } + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + logger.setLevel(allowed_levels.get(logging_level, 'INFO')) + + if logging_file: + hl = logging.FileHandler(logging_file, mode='a') + else: + hl = logging.StreamHandler() + + hl.setFormatter(formatter) + hl.setLevel(allowed_levels.get(logging_level, 'INFO')) + + logger.addHandler(hl) + + return logger + + +class Dumper(object): + + def __init__(self, collection, issns=None, output_file=None, output_format=OUTPUT_FORMAT): + + self._citedby = utils.citedby_server() + self._articlemeta = utils.articlemeta_server() + self.collection = collection + self.issns = issns + self.output_format = output_format + self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file + + if output_format != 'json': + header = [] + header.append(u"extraction date") + header.append(u"study unit") + header.append(u"collection") + header.append(u"ISSN SciELO") + header.append(u"ISSN\'s") + header.append(u"title at SciELO") + header.append(u"title thematic areas") + for area in choices.THEMATIC_AREAS: + header.append(u"title is %s" % area.lower()) 
+ header.append(u"title is multidisciplinary") + header.append(u"title current status") + header.append(u"document publication ID (PID SciELO)") + header.append(u"document publication year") + header.append(u"document type") + header.append(u"document is citable") + header.append(u"document title") + header.append(u"cited publication ID (PID SciELO)") + header.append(u"cited by issn") + header.append(u"cited by journal") + header.append(u"cited by document publication year") + header.append(u"cited by document title") + + self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + + def write(self, line): + if not self.output_file: + print(line.encode('utf-8')) + else: + self.output_file.write('%s\r\n' % line) + + def run(self): + for item in self.items(): + self.write(item) + logger.info('Export finished') + + def items(self): + + if not self.issns: + self.issns = [None] + + for issn in self.issns: + for data in self._articlemeta.documents(collection=self.collection, issn=issn): + logger.debug('Reading document: %s' % data.publisher_id) + + citedby = self._citedby.citedby_pid(data.publisher_id, metaonly=False) + if self.output_format == 'json' and isinstance(citedby, dict): + yield self.fmt_json(citedby) + continue + + for item in citedby.get('cited_by', []): + yield self.fmt_csv((data, item)) + + def fmt_json(self, content): + + return json.dumps(content) + + def fmt_csv(self, content): + + data, citedby = content + + know_languages = set(['pt', 'es', 'en']) + languages = set(data.languages()) + + issns = [] + if data.journal.print_issn: + issns.append(data.journal.print_issn) + if data.journal.electronic_issn: + issns.append(data.journal.electronic_issn) + + line = [] + line.append(datetime.datetime.now().isoformat()[0:10]) + line.append(u'document') + line.append(data.collection_acronym) + line.append(data.journal.scielo_issn) + line.append(u';'.join(issns)) + line.append(data.journal.title) + line.append(u';'.join(data.journal.subject_areas or 
[])) + for area in choices.THEMATIC_AREAS: + if area.lower() in [i.lower() for i in data.journal.subject_areas or []]: + line.append(u'1') + else: + line.append(u'0') + line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') + line.append(data.journal.current_status) + line.append(data.publisher_id) + line.append(data.publication_date[0:4]) + line.append(data.document_type) + line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') + line.append(data.original_title() or '') + line.append(citedby.get('code', '')) + line.append(citedby.get('issn', '')) + line.append(citedby.get('source', '')) + + citedby_publication_year = citedby.get('code', None) + citedby_publication_year = citedby_publication_year[10:14] if citedby_publication_year else '' + line.append(citedby_publication_year) + + if 'titles' in citedby and len(citedby['titles']) > 0: + line.append(citedby['titles'][0]) + else: + line.append('') + + joined_line = ','.join(['"%s"' % i.replace('"', '""') for i in line]) + + return joined_line + + +def main(): + + parser = argparse.ArgumentParser( + description='Dump languages distribution by article' + ) + + parser.add_argument( + 'issns', + nargs='*', + help='ISSN\'s separated by spaces' + ) + + parser.add_argument( + '--collection', + '-c', + help='Collection Acronym' + ) + + parser.add_argument( + '--output_format', + '-f', + choices=['json', 'csv'], + default=OUTPUT_FORMAT, + help='Output format' + ) + + parser.add_argument( + '--output_file', + '-r', + help='File to receive the dumped data' + ) + + parser.add_argument( + '--logging_file', + '-o', + help='Full path to the log file' + ) + + parser.add_argument( + '--logging_level', + '-l', + default='DEBUG', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logggin level' + ) + + args = parser.parse_args() + _config_logging(args.logging_level, args.logging_file) + logger.info('Dumping data for: %s' % args.collection) + + issns = None + if 
len(args.issns) > 0: + issns = utils.ckeck_given_issns(args.issns) + + dumper = Dumper(args.collection, issns, args.output_file, args.output_format) + + dumper.run() diff --git a/bibliometric/citedby_journal.py b/bibliometric/citedby_journal.py new file mode 100644 index 0000000..970f1c8 --- /dev/null +++ b/bibliometric/citedby_journal.py @@ -0,0 +1,224 @@ +# coding: utf-8 +""" +Este processamento gera uma tabulação de citações concedidas no SciELO por +artigos da coleção SciELO. +Formato de saída: +"PID","ISSN","título","área temática","ano de publicação","tipo de documento","título do documento","citado por PID","citado por ISSN","citado por título","citado por título do documento" +""" +import argparse +import logging +import codecs +import json +import datetime + +import utils +import choices + +logger = logging.getLogger(__name__) + +OUTPUT_FORMAT = 'csv' + + +def _config_logging(logging_level='INFO', logging_file=None): + + allowed_levels = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL + } + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + logger.setLevel(allowed_levels.get(logging_level, 'INFO')) + + if logging_file: + hl = logging.FileHandler(logging_file, mode='a') + else: + hl = logging.StreamHandler() + + hl.setFormatter(formatter) + hl.setLevel(allowed_levels.get(logging_level, 'INFO')) + + logger.addHandler(hl) + + return logger + + +def compute_citations(raw_data): + + data = [] + for publication_year in raw_data['aggregations']['publication_year']['buckets']: + for reference_publication_year in publication_year['reference_publication_year']['buckets']: + result = (publication_year['key'], (reference_publication_year['key'], reference_publication_year['doc_count'])) + data.append(result) + + return data + + +class Dumper(object): + + def __init__(self, collection, issns=None, output_file=None, 
output_format=OUTPUT_FORMAT, with_ref_links=False): + + self._citedby = utils.citedby_server() + self._articlemeta = utils.articlemeta_server() + self.collection = collection + self.issns = issns + self.output_format = output_format + self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file + + if output_format != 'json': + header = [] + header.append(u"extraction date") + header.append(u"study unit") + header.append(u"collection") + header.append(u"ISSN SciELO") + header.append(u"ISSN\'s") + header.append(u"title at SciELO") + header.append(u"title thematic areas") + for area in choices.THEMATIC_AREAS: + header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") + header.append(u"title current status") + header.append(u"has optimized queries") + header.append(u"publications from (year)") + header.append(u"cited publications from (year)") + header.append(u"total of citations") + + self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) + + def write(self, line): + if not self.output_file: + print(line.encode('utf-8')) + else: + self.output_file.write('%s\r\n' % line) + + def run(self): + for item in self.items(): + self.write(item) + logger.info('Export finished') + + def items(self): + + if not self.issns: + self.issns = [None] + + for issn in self.issns: + for data in self._articlemeta.journals( + collection=self.collection, issn=issn): + logger.debug('Reading journal: %s' % data.scielo_issn) + + titles = [] + titles.append(data.title) + titles.append(data.title_nlm) + titles.append(data.fulltitle) + titles.append(data.abbreviated_title) + titles.append(data.abbreviated_iso_title) + titles += data.other_titles or [] + titles = [i for i in set(titles) if i] + + citedby = self._citedby.publication_and_citing_years(data.scielo_issn, titles) + + for item in compute_citations(citedby) or []: + yield self.fmt_csv((data, item)) + + def fmt_csv(self, content): + + data, citedby = 
content + + issns = [] + if data.print_issn: + issns.append(data.print_issn) + if data.electronic_issn: + issns.append(data.electronic_issn) + + line = [] + line.append(datetime.datetime.now().isoformat()[0:10]) + line.append(u'journal') + line.append(data.collection_acronym) + line.append(data.scielo_issn) + line.append(u';'.join(issns)) + line.append(data.title) + line.append(u';'.join(data.subject_areas or [])) + for area in choices.THEMATIC_AREAS: + if area.lower() in [i.lower() for i in data.subject_areas or []]: + line.append(u'1') + else: + line.append(u'0') + line.append('1' if len(data.subject_areas or []) > 2 else '0') + line.append(data.current_status) + line.append('1' if self._citedby.has_optmized_journal_queries(data.scielo_issn) else '0') + line.append(str(citedby[0])) + line.append(str(citedby[1][0])) + line.append(str(citedby[1][1])) + + joined_line = ','.join(['"%s"' % i.replace('"', '""') for i in line]) + + return joined_line + + +def main(): + + parser = argparse.ArgumentParser( + description='Dump languages distribution by article' + ) + + parser.add_argument( + 'issns', + nargs='*', + help='ISSN\'s separated by spaces' + ) + + parser.add_argument( + '--collection', + '-c', + help='Collection Acronym' + ) + + parser.add_argument( + '--with_ref_links', + '-x', + action="store_false", + help='Include reference links for citing documents. 
It increase drastically the size of the report' + ) + + parser.add_argument( + '--output_format', + '-f', + choices=['json', 'csv'], + default=OUTPUT_FORMAT, + help='Output format' + ) + + parser.add_argument( + '--output_file', + '-r', + help='File to receive the dumped data' + ) + + parser.add_argument( + '--logging_file', + '-o', + help='Full path to the log file' + ) + + parser.add_argument( + '--logging_level', + '-l', + default='DEBUG', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logggin level' + ) + + args = parser.parse_args() + _config_logging(args.logging_level, args.logging_file) + logger.info('Dumping data for: %s' % args.collection) + + issns = None + if len(args.issns) > 0: + issns = utils.ckeck_given_issns(args.issns) + + dumper = Dumper(args.collection, issns, args.output_file, args.output_format, args.with_ref_links) + + dumper.run() diff --git a/bibliometric/impact_factor.py b/bibliometric/impact_factor.py index 109e8e6..0a0a594 100644 --- a/bibliometric/impact_factor.py +++ b/bibliometric/impact_factor.py @@ -60,6 +60,7 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"base year") header.append(u"imediacity") @@ -113,6 +114,7 @@ def fmt_csv(self, data): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.subject_areas or []) > 2 else '0') line.append(data.current_status) impact_factor = self._analytics.impact_factor(data.scielo_issn, self.collection) diff --git a/choices.py b/choices.py index 46ffb95..3436185 100644 --- a/choices.py +++ b/choices.py @@ -8,10 +8,11 @@ u"exact and earth sciences", u"health sciences", u"human sciences", - u"linguistics, letters and arts", + u"linguistics, letters and arts" ]) CITABLE_DOCUMENT_TYPES = ( + 'data-article', 
u'article-commentary', u'brief-report', u'case-report', diff --git a/clients/search.py b/clients/search.py index 3a1ee22..b5a1f05 100644 --- a/clients/search.py +++ b/clients/search.py @@ -17,10 +17,13 @@ API_DOMAIN = utils.settings.get('app:main', {}).get( 'solr_search_scielo_org', 'localhost:8080') +API_INDEX = utils.settings.get('app:main', {}).get( + 'solr_search_scielo_org_index', 'search-scielo') + class Search(object): - UPDATE_ENDPOINT = 'http://%s/solr/scielo-articles/update' % API_DOMAIN + UPDATE_ENDPOINT = 'http://%s/solr/%s/update' % (API_DOMAIN, API_INDEX) def _do_request(self, url, params=None, data=None, headers=None): """ diff --git a/config.ini-TEMPLATE b/config.ini-TEMPLATE index 4a22773..9935074 100644 --- a/config.ini-TEMPLATE +++ b/config.ini-TEMPLATE @@ -1,7 +1,9 @@ [app:main] articlemeta_thriftserver = 127.0.0.1:11720 +articlemeta_admintoken = ratchet_thriftserver = 127.0.0.1:11630 accessstats_thriftserver = 127.0.0.1:11660 citedby_thriftserver = 127.0.0.1:11610 publicationstats_thriftserver = 127.0.0.1:11620 -solr_search_scielo_org = 127.0.0.1:8080 \ No newline at end of file +solr_search_scielo_org = 127.0.0.1:8080 +solr_search_scielo_org_index = search-scielo \ No newline at end of file diff --git a/docs/source/public_reports.rst b/docs/source/public_reports.rst index 9c7acf8..f56b3f6 100644 --- a/docs/source/public_reports.rst +++ b/docs/source/public_reports.rst @@ -5,7 +5,7 @@ Relatórios Públicos Alguns relatórios são produzidos mensalmente e publicados on-line para que usuários, instituições e outros interessados possam fazer download. -Para o download desses relatórios acessar: http://analytics.scielo.org/w/downloads +Para o download desses relatórios acessar: http://analytics.scielo.org/w/reports ------------------------ Relatórios de Periódicos @@ -39,24 +39,27 @@ Os formatos de saída disponíveis para este relatório são: CSV. 
* título é de engenharias (**engineering**) * título é de ciências exatas e da terra (**exact and earth science**) * título é de ciências da saúde (**health sciences**) - * título é de ciências humanas(**humanities**) - * título é de linguistica, letras e artes (**literature and arts**) + * título é de ciências humanas (**human sciences**) + * título é de linguística, letras e artes (**linguistics, letters and arts**) + * título é multidisciplinar (**title is multidisciplinary**) * situação corrente do título (**title current status**) * título + sub-título na SciELO (**title + subtitle at SciELO**) * título abreviado na SciELO (**short title SciELO**) + * título abreviado conforme norma ISO (**short title ISO**) * título abreviado no PubMed (**short title PubMed**) * nome do publicador (**publisher name**) * licença de uso (**use license**) * frequencia alpha (:ref:`alpha frequency`) + * periodicidade numérica (em meses) (**numeric frequency (in months)**) * ano de inclusão na SciELO (**inclusion year at SciELO**) * ano de paralização na SciELO (**stopping year at SciELO**) * motivo de paralização (**stopping reason**) * data do primeiro documento (**date of the first document**) * volume do primeiro documento (**volume of the first document**) - * número do primeiro documento (**número of the first document**) + * número do primeiro documento (**issue of the first document**) * data do último documento (**date of the last document**) * volume do último documento (**volume of the last document**) - * número do último documento (**número of the last document**) + * número do último documento (**issue of the last document**) * total de fascículos (**total of issues**) * fascículos em (**issues at **) * fascículos em (**issues at **) @@ -78,6 +81,13 @@ Os formatos de saída disponíveis para este relatório são: CSV.
* documentos em (**documents at **) * documentos em (**documents at **) * documentos em (**documents at **) + * documentos citáveis (**citable documents**) + * documentos citáveis em (**citable at **) + * documentos citáveis em (**citable at **) + * documentos citáveis em (**citable at **) + * documentos citáveis em (**citable at **) + * documentos citáveis em (**citable at **) + * documentos citáveis em (**citable at **) * documentos em português em (**portuguese documents at **) * documentos em português em (**portuguese documents at **) * documentos em português em (**portuguese documents at **) @@ -102,6 +112,19 @@ Os formatos de saída disponíveis para este relatório são: CSV. * documentos em outros idiomas em (**other language documents at **) * documentos em outros idiomas em (**other languages documents at **) * documentos em outros idiomas em (**other languages documents at **) + * google scholar h5 (**google scholar h5 **) + * google scholar h5 (**google scholar h5 **) + * google scholar h5 (**google scholar h5 **) + * google scholar h5 (**google scholar h5 **) + * google scholar h5 (**google scholar h5 **) + * google scholar h5 (**google scholar h5 **) + * google scholar m5 (**google scholar m5 **) + * google scholar m5 (**google scholar m5 **) + * google scholar m5 (**google scholar m5 **) + * google scholar m5 (**google scholar m5 **) + * google scholar m5 (**google scholar m5 **) + * google scholar m5 (**google scholar m5 **) + Relatório de histórico de mudanças de status dos periódicos =========================================================== @@ -133,6 +156,7 @@ Os formatos de saída disponíveis para este relatório são: CSV.
* título é de ciências da saúde (**health sciences**) * título é de ciências humanas(**humanities**) * título é de linguistica, letras e artes (**literature and arts**) + * título é multidisciplinar (**title is multidisciplinary**) * situação corrente do título (**title current status**) * data da mudança de status (**status change date**) * ano da mudança de status (**status change year**) @@ -171,6 +195,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. * título é de ciências da saúde (**health sciences**) * título é de ciências humanas(**humanities**) * título é de linguistica, letras e artes (**literature and arts**) + * título é multidisciplinar (**title is multidisciplinary**) * situação corrente do título (**title current status**) * ano de publicação (**publishing year**) * ano dos acessos (**accesses year**) @@ -258,6 +283,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. * título é de ciências da saúde (**health sciences**) * título é de ciências humanas(**humanities**) * título é de linguistica, letras e artes (**literature and arts**) + * título é multidisciplinar (**title is multidisciplinary**) * situação corrente do título (**title current status**) * ID de publicação do documento (PID SciELO) (**document publishing ID (PID SciELO)**) * ano de publicação do documento (**document publishing year**) @@ -315,7 +341,7 @@ Relatório de afiliações dos documentos **nome do arquivo:** documents_affiliations.csv -**finalidade:** Relatório com autores dos documentos, para extração +**finalidade:** Relatório com afiliação dos documentos, para extração de indicadores de publicação. Formatos de saída @@ -340,6 +366,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. 
* título é de ciências da saúde (**health sciences**) * título é de ciências humanas(**humanities**) * título é de linguistica, letras e artes (**literature and arts**) + * título é multidisciplinar (**title is multidisciplinary**) * situação corrente do título (**title current status**) * ID de publicação do documento (PID SciELO) (**document publishing ID (PID SciELO)**) * ano de publicação do documento (**document publishing year**) @@ -347,7 +374,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. * documento citável (**document is citable**, :ref:`citable documents`) * instituição de Afiliação do documento (**document affiliation institution**) * país de afiliação do documento (**document affiliation country**) - * país de afiliação do documento ISO-3166 (**document addiliation country ISO-3166**, :ref:`languages`) + * país de afiliação do documento ISO-3166 (**document affiliation country ISO-3166**) * estado de afiliação do documento (**document affiliation state**) * cidade de afiliação do documento (**document affiliation city**) @@ -360,6 +387,66 @@ Os formatos de saída disponíveis para este relatório são: CSV. devido aos processos estabelecidos por cada uma delas para a garantia de qualidade de seus metadados. + +Relatório de nacionalidade de afiliações dos documentos +======================================================= + +**nome do arquivo:** documents_affiliations_nationality.csv + +**finalidade:** Relatório com contagem de nacionalidade dos documentos com base +em uma nacionalidade de origem, para extração de indicadores de publicação. + +Formatos de saída +----------------- + +Os formatos de saída disponíveis para este relatório são: CSV. 
+ +**Formato CSV** + + * data de extração (**extraction date**) + * unidade de estudo (:ref:`study unity`) + * coleção (:ref:`collection`) + * ISSN SciELO (**ISSN SciELO**) + * ISSN's (**ISSN's**) + * título na SciELO (**title at SciELO**) + * areas temáticas do título (:ref:`title thematic areas`) + * título é de ciências agrárias (**agricultural sciences**) + * título é de ciências sociais aplicadas (**applied social sciences**) + * título é de ciências biológicas (**biological sciences**) + * título é de engenharias (**engineering**) + * título é de ciências exatas e da terra (**exact and earth science**) + * título é de ciências da saúde (**health sciences**) + * título é de ciências humanas(**humanities**) + * título é de linguistica, letras e artes (**literature and arts**) + * título é multidisciplinar (**title is multidisciplinary**) + * situação corrente do título (**title current status**) + * ID de publicação do documento (PID SciELO) (**document publishing ID (PID SciELO)**) + * ano de publicação do documento (**document publishing year**) + * tipo de documento (:ref:`document type`) + * documento citável (**document is citable**, :ref:`citable documents`) + * nacionalidade de referência (**home nationality ISO-3166**) + * total de afiliações do documento + * total nacional + * total estrangeira + * total indefinida + * total inexistente + +.. hint:: + + Os dados de afiliação não estão disponíveis para todos os documentos. Por + se tratar de um processo com alto índice de trabalho manual, é reconhecida, + mesmo que em baixas proporções, a existência de metadados errados ou não + normalizados. A qualidade dos metadados podem variar de uma coleção para outra + devido aos processos estabelecidos por cada uma delas para a garantia de + qualidade de seus metadados. + + +.. 
hint:: + + A definição de afiliação nacional ou estrangeira é produzida comparando os + dados de país de afiliação dos documentos com o dado definido como **nacionalidade + de referência** no momento da extração do relatório. + Relatório de contagens gerais relacionadas aos dos documentos ============================================================= @@ -391,6 +478,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. * título é de ciências da saúde (**health sciences**) * título é de ciências humanas(**humanities**) * título é de linguistica, letras e artes (**literature and arts**) + * título é multidisciplinar (**title is multidisciplinary**) * situação corrente do título (**title current status**) * ID de publicação do documento (PID SciELO) (**document publishing ID (PID SciELO)**) * ano de publicação do documento (**document publishing year**) @@ -436,6 +524,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. * título é de ciências da saúde (**health sciences**) * título é de ciências humanas(**humanities**) * título é de linguistica, letras e artes (**literature and arts**) + * título é multidisciplinar (**title is multidisciplinary**) * situação corrente do título (**title current status**) * ID de publicação do documento (PID SciELO) (**document publishing ID (PID SciELO)**) * ano de publicação do documento (**document publishing year**) @@ -453,6 +542,10 @@ Os formatos de saída disponíveis para este relatório são: CSV.
* documento revisado no ano (document reviewed at year) * documento revisado no mês (document reviewed at month) * documento revisado no dia (document reviewed at day) + * documento publicado como ahead of print em (document published as ahead of print at) + * documento publicado como ahead of print no ano (document published as ahead of print at year) + * documento publicado como ahead of print no mês (document published as ahead of print at month) + * documento publicado como ahead of print no dia (document published as ahead of print at day) * documento publicado em (published reviewed at) * documento publicado no ano (document published at year) * documento publicado no mês (document published at month) @@ -495,6 +588,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. * título é de ciências da saúde (**health sciences**) * título é de ciências humanas(**humanities**) * título é de linguistica, letras e artes (**literature and arts**) + * título é multidisciplinar (**title is multidisciplinary**) * situação corrente do título (**title current status**) * ID de publicação do documento (PID SciELO) (**document publishing ID (PID SciELO)**) * ano de publicação do documento (**document publishing year**) @@ -535,6 +629,7 @@ Os formatos de saída disponíveis para este relatório são: CSV. 
* título é de ciências da saúde (**health sciences**) * título é de ciências humanas(**humanities**) * título é de linguistica, letras e artes (**literature and arts**) + * título é multidisciplinar (**title is multidisciplinary**) * situação corrente do título (**title current status**) * ID de publicação do documento (PID SciELO) (**document publishing ID (PID SciELO)**) * ano de publicação do documento (**document publishing year**) diff --git a/evaluation/altmetrics.py b/evaluation/altmetrics.py index e5c0cea..5e4c625 100644 --- a/evaluation/altmetrics.py +++ b/evaluation/altmetrics.py @@ -7,9 +7,14 @@ import logging import codecs import requests -import urlparse import datetime +# Python 3 and 2 Compatibilility +try: + import urlparse as parse # Python 2 +except: + from urllib import parse # Python3 + import utils import choices @@ -73,6 +78,7 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"document publishing ID (PID SciELO)") header.append(u"document publishing year") @@ -142,14 +148,15 @@ def fmt_csv(self, data, altmetrics): title = altmetrics.get('title', '').replace('\n', '') doi = altmetrics.get('doi', get_doi_from_url(url)) details_url = altmetrics.get('details_url', None) - pid = urlparse.parse_qs(urlparse.urlparse(url).query).get('pid', None) if url else None + pid = parse.parse_qs(parse.urlparse(url).query).get('pid', None) if url else None if doi: article = self._articlemeta.document(doi.upper(), self.collection) - publication_date = article.publication_date if article else u'not defined' - publisher_id = article.publisher_id if article else u'not defined' - document_type = article.document_type if article else u'not defined' + publication_date = article.publication_date if article and article.data else u'not 
defined' + publisher_id = article.publisher_id if article and article.data else u'not defined' + document_type = article.document_type if article and article.data else u'not defined' + score = altmetrics.get('score', None) issns = [] @@ -171,6 +178,7 @@ def fmt_csv(self, data, altmetrics): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.subject_areas or []) > 2 else '0') line.append(data.current_status) line.append(publisher_id) if publication_date == u'not define': diff --git a/export/doaj_journals.py b/export/doaj_journals.py index d6307ae..11ee731 100644 --- a/export/doaj_journals.py +++ b/export/doaj_journals.py @@ -133,7 +133,7 @@ def items(self): jissns.add(data.scielo_issn) in_doaj = self.get_doaj_journal(list(jissns)) yield self.fmt_csv(data, in_doaj) - + def fmt_csv(self, data, in_doaj): line = [ @@ -197,6 +197,6 @@ def main(): if len(args.issns) > 0: issns = utils.ckeck_given_issns(args.issns) - dumper = Dumper(args.collection, issns, args.output_file) + dumper = Dumper(args.collection, issns) dumper.run() diff --git a/export/dump_articles.py b/export/dump_articles.py new file mode 100644 index 0000000..42dfdda --- /dev/null +++ b/export/dump_articles.py @@ -0,0 +1,172 @@ +# coding: utf-8 +""" +This scripts uses the Article Meta API to harvest all SciELO Network Documents +They are stored into a zip file. +This processing always harvest the entire database to garantee that all the +documents are up to date. 
+""" +import os +import logging +import zipfile +import datetime +import argparse +from lxml import etree + +import requests +from articlemeta.client import ThriftClient + +import utils + +logger = logging.getLogger(__name__) + + +def _config_logging(logging_level='INFO', logging_file=None): + + allowed_levels = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + 'ERROR': logging.ERROR, + 'CRITICAL': logging.CRITICAL + } + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + logger.setLevel(allowed_levels.get(logging_level, 'INFO')) + + if logging_file: + hl = logging.FileHandler(logging_file, mode='a') + else: + hl = logging.StreamHandler() + + hl.setFormatter(formatter) + hl.setLevel(allowed_levels.get(logging_level, 'INFO')) + + logger.addHandler(hl) + + return logger + +trans_acronym = {'scl': 'bra'} + + +def getschema(): + + try: + xsd = requests.get('https://raw.githubusercontent.com/scieloorg/articles_meta/master/tests/xsd/scielo_sci/ThomsonReuters_publishing.xsd').text + logger.debug('Schema download') + return xsd + except: + logger.error('Schema download fail') + + +class Dumper(object): + + def __init__(self, collection, issns=None, xml_format='xmlwos', zip_name='file.zip'): + self._articlemeta = utils.articlemeta_server() + self.collection = collection + self.issns = issns + self.zip_name = zip_name + self.xml_format = xml_format + + def items(self): + + if not self.issns: + self.issns = [None] + + for issn in self.issns: + for document in self._articlemeta.documents(collection=self.collection, issn=issn, only_identifiers=True): + xml = self._articlemeta.document(code=document.code, collection=document.collection, fmt=self.xml_format) + yield (document.code, document.collection, xml) + + def run(self): + + client = ThriftClient() + + logger.info('Creating zip file: %s', self.zip_name) + logger.info('XML Format: %s', self.xml_format) + + with zipfile.ZipFile(self.zip_name, 'w', 
compression=zipfile.ZIP_DEFLATED, allowZip64=True) as thezip: + for pid, collection, document in self.items(): + logger.debug('Loading XML file for %s', '_'.join([collection, pid])) + collection = trans_acronym.get(collection, collection) + issn = pid[1:10] + xml_file = '{0}/{1}/{2}.xml'.format(collection, issn, pid) + thezip.writestr(xml_file, bytes(document.encode('utf-8'))) + + readmef = open(os.path.dirname(__file__)+'/templates/dumparticle_readme.txt', 'r').read() + readme = '{0}\r\n* Documents updated at: {1}\r\n'.format(readmef, datetime.datetime.now().isoformat()) + + thezip.writestr("README.txt", bytes(readme.encode('utf-8'))) + + if self.xml_format == 'xmlwos': + xsd = getschema() + if xsd: + thezip.writestr("schema/ThomsonReuters_publishing.xsd", bytes(xsd.encode('utf-8'))) + + logger.info('Zip created: %s', self.zip_name) + logger.info('Processing finished') + + +def main(): + + parser = argparse.ArgumentParser( + description="Dump SciELO Network metadata" + ) + + parser.add_argument( + 'issns', + nargs='*', + help='ISSN\'s separated by spaces' + ) + + parser.add_argument( + '--collection', + '-c', + help='Collection Acronym' + ) + + parser.add_argument( + '--zip_file', + '-f', + default='dumpdata.zip', + help='Full path to the zip file that will receive the documents' + ) + + parser.add_argument( + '--xml_format', + '-x', + default='xmlwos', + choices=['xmlwos', 'xmlrsps'], + help='XML output format' + ) + + parser.add_argument( + '--logging_file', + '-o', + help='Full path to the log file' + ) + + parser.add_argument( + '--logging_level', + '-l', + default='DEBUG', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logggin level' + ) + + args = parser.parse_args() + _config_logging(args.logging_level, args.logging_file) + logger.info('Dumping data for: %s' % args.collection) + + issns = None + if len(args.issns) > 0: + issns = utils.ckeck_given_issns(args.issns) + + dumper = Dumper( + args.collection, + issns, + args.xml_format, + 
args.zip_file + ) + + dumper.run() diff --git a/export/exdoaj.py b/export/exdoaj.py index ed6e7d0..1148617 100644 --- a/export/exdoaj.py +++ b/export/exdoaj.py @@ -20,7 +20,7 @@ FROM = datetime.now() - timedelta(days=30) FROM = FROM.isoformat()[:10] -DOAJ_XSD = open(os.path.dirname(__file__)+'/xsd/doajArticles.xsd', 'r').read() +DOAJ_XSD = open(os.path.dirname(__file__)+'/xsd/doaj/doajArticles.xsd', 'r').read() logger = logging.getLogger(__name__) @@ -54,7 +54,7 @@ def _config_logging(logging_level='INFO', logging_file=None): class Dumper(object): def __init__(self, collection, issns=None, output_file=None, from_date=FROM, - user=None, password=None, api_token=None): + user=None, password=None, api_token=None, corrections_db=None, validate_schema=False): self._articlemeta = utils.articlemeta_server() self.collection = collection @@ -63,8 +63,10 @@ def __init__(self, collection, issns=None, output_file=None, from_date=FROM, self.password = password self.issns = issns or [None] self.session = self.authenticated_session() - self.parse_schema() + self.validate_schema = validate_schema + self.doaj_schema = self.parse_schema() if self.validate_schema else None self.doaj_articles = Articles(usertoken=api_token) + self.corrections_db = corrections_db def _doaj_id_by_meta(self, issn, publication_year, title): ### Query by metadata @@ -133,7 +135,7 @@ def parse_schema(self): logger.error('Fail to parse XML') return False - self.doaj_schema = sch + return sch def authenticated_session(self): auth_url = 'https://doaj.org/account/login' @@ -159,7 +161,6 @@ def authenticated_session(self): return session def xml_is_valid(self, xml): - try: xml = StringIO(xml) xml_doc = etree.parse(xml) @@ -191,7 +192,7 @@ def send_xml(self, file_name, file_data): logger.debug('Fail to send document to DOAJ') return False - if u'successfully uploaded' in response.text: + if u'File uploaded and waiting to be processed' in response.text: logger.info('Document Sent') return True else: @@ -233,15 
+234,103 @@ def run(self): logger.error('Fail to read document: %s_%s' % (document.publisher_id, document.collection_acronym)) xml = u'' - if not self.xml_is_valid(xml): + if self.validate_schema and not self.xml_is_valid(xml): logger.error('Fail to parse xml document: %s_%s' % (document.publisher_id, document.collection_acronym)) continue logger.info('Sending document: %s_%s' % (document.publisher_id, document.collection_acronym)) filename = '%s_%s.xml' % (document.publisher_id, document.collection_acronym) + # aplica a correção aos ISSNs. + if self.corrections_db is not None: + xml = self._fix_issns(xml) + self.send_xml(filename, xml) + def _get_doaj_issns(self, issn): + corrected = self.corrections_db.find(issn) + result = {} + for issn in corrected.get("issns", []): + result[issn["type"]] = issn["id"] + + return result + + def _fix_issns(self, xml_data): + et = etree.parse(BytesIO(xml_data.encode("utf-8"))) + issns = [i.text for i in et.xpath("/records/record/issn | /records/record/eissn")] + for issn in issns: + try: + issns_from_doaj = self._get_doaj_issns(issn) + except ValueError: + logger.info('could not find corrected issns for "%s"', issn) + continue + else: + result = replace_issns(et, pissn=issns_from_doaj.get("pissn"), eissn=issns_from_doaj.get("eissn")) + logger.debug('the xml "%s" was replaced by "%s"', xml_data, result) + return result + else: + return xml_data + + +def replace_issns(et, pissn=None, eissn=None): + assert any([pissn, eissn]) + + record_node = et.find("record") + journalTitle_node = et.xpath("/records/record/journalTitle")[0] + + for node in et.xpath("/records/record/issn | /records/record/eissn"): + del(record_node[record_node.index(node)]) + + if eissn: + new_eissn_node = etree.Element("eissn") + new_eissn_node.text = eissn + record_node.insert( + record_node.index(journalTitle_node) + 1, + new_eissn_node + ) + + if pissn: + new_issn_node = etree.Element("issn") + new_issn_node.text = pissn + record_node.insert( + 
record_node.index(journalTitle_node) + 1, + new_issn_node + ) + + return etree.tostring(et, encoding="unicode", pretty_print=False) + + +class CorrectionsDB(object): + def __init__(self, data): + self._data = tuple(data) + def _make_issn_getter(issn_type): + def _issn_getter(item): + for issn_data in item.get('issns', []): + if issn_data['type'] == issn_type: + return issn_data['id'] + + return _issn_getter + + self._index = self._create_index(self._data, _make_issn_getter('eissn')) + self._index.update(self._create_index(self._data, _make_issn_getter('pissn'))) + + def _create_index(self, data, func): + """Este índice mapeia cada ISSN a uma posição na lista `data`, de forma + que não seja necessário iterar sobre a lista em busca do item desejado. + """ + result = {func(item): i for i, item in enumerate(data)} + try: + del(result[None]) + except KeyError: + pass + return result + + def find(self, issn): + pos = self._index.get(issn) + if pos is None: + raise ValueError + return self._data[pos] + def main(): @@ -303,6 +392,18 @@ def main(): help='Logggin level' ) + parser.add_argument( + '--validate_schema', + action='store_true', + help='Validate each document against the DOAJ Schema before submitting', + ) + + parser.add_argument( + '--corrections_db', + help='Path to the corrections database file', + type=argparse.FileType("r"), + ) + args = parser.parse_args() _config_logging(args.logging_level, args.logging_file) logger.info('Dumping data for: %s' % args.collection) @@ -321,8 +422,16 @@ def main(): else: issns = issns_from_file if issns_from_file else [] + if args.corrections_db: + corrections_data = [json.loads(line) for line in args.corrections_db.readlines()] + corrections_db = CorrectionsDB(corrections_data) + logger.info('a database of corrections will be used to fix ISSNs before sending the data') + else: + corrections_db = None + logger.info('no database of corrections was given, the processing will proceed anyway') + dumper = Dumper( args.collection, 
issns, from_date=args.from_date, user=args.user, - password=args.password) + password=args.password, corrections_db=corrections_db) dumper.run() diff --git a/export/gen_doaj_correctionsdb.py b/export/gen_doaj_correctionsdb.py new file mode 100644 index 0000000..2d9a4c0 --- /dev/null +++ b/export/gen_doaj_correctionsdb.py @@ -0,0 +1,200 @@ +# coding: utf-8 +import sys +import json +import math +import time +import logging +import argparse +import functools +from urllib import quote + +import requests + + +LOGGER = logging.getLogger(__name__) + +MAX_RETRIES = 4 + +BACKOFF_FACTOR = 1.9 + +DEFAULT_DOAJ_API_URL="https://doaj.org/api/v1/" + +EPILOG = """\ +Copyright 2020 SciELO . +Licensed under the terms of the BSD license. Please see LICENSE in the source +code for more information. +""" + +LOGGER_FMT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s" + + +class RetryableError(Exception): + pass + + +class NonRetryableError(Exception): + pass + + +class retry_gracefully: + """Produz decorador que torna o objeto decorado resiliente às exceções dos + tipos informados em `exc_list`. Tenta no máximo `max_retries` vezes com + intervalo exponencial entre as tentativas. + """ + + def __init__( + self, + max_retries=MAX_RETRIES, + backoff_factor=BACKOFF_FACTOR, + exc_list=(RetryableError,), + ): + self.max_retries = int(max_retries) + self.backoff_factor = float(backoff_factor) + self.exc_list = tuple(exc_list) + + def _sleep(self, seconds): + time.sleep(seconds) + + def __call__(self, func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + retry = 1 + while True: + try: + return func(*args, **kwargs) + except self.exc_list as exc: + if retry <= self.max_retries: + wait_seconds = self.backoff_factor ** retry + LOGGER.info( + 'could not get the result for "%s" with *args "%s" ' + 'and **kwargs "%s". 
retrying in %s seconds ' + "(retry #%s): %s", + func.__qualname__, + args, + kwargs, + str(wait_seconds), + retry, + exc, + ) + self._sleep(wait_seconds) + retry += 1 + else: + raise + + return wrapper + + +@retry_gracefully() +def get(url, timeout=2): + try: + response = requests.get(url, timeout=timeout) + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as exc: + raise RetryableError(exc) + except ( + requests.exceptions.InvalidSchema, + requests.exceptions.MissingSchema, + requests.exceptions.InvalidURL, + ) as exc: + raise NonRetryableError(exc) + else: + try: + response.raise_for_status() + except requests.HTTPError as exc: + if 400 <= exc.response.status_code < 500: + raise NonRetryableError(exc) + elif 500 <= exc.response.status_code < 600: + raise RetryableError(exc) + else: + raise + + return response.content + + +class DOAJ: + def __init__(self, api_url=DEFAULT_DOAJ_API_URL): + self.api_url = api_url + + def search_journals(self, search_query, page=None): + url = self.api_url + "search/journals/" + quote(search_query) + if page: + url += "?page=%s" % page + + return get(url) + + def iter_search_journals(self, search_query): + results = json.loads(self.search_journals(search_query)) + total_pages = math.ceil(results["total"] / results["pageSize"]) + + for journal in results.get("results", []): + yield journal + + for page in range(2, int(total_pages + 1)): + results = json.loads(self.search_journals(search_query, page=page)) + for journal in results.get("results", []): + yield journal + + +def iter_journals_by_provider(name): + search_query = 'bibjson.provider:"%s"' % name + doaj = DOAJ() + return doaj.iter_search_journals(search_query) + + +def gen_corrections_db(args): + for journal in iter_journals_by_provider("scielo"): + bibjson = journal.get("bibjson", {}) + selected_fields = { + "doaj_id": journal.get("id"), + "title": bibjson.get("title"), + "alternative_title": bibjson.get("alternative_title"), + "is_active": 
bibjson.get("active"), + "provider": bibjson.get("provider"), + "issns": bibjson.get("identifier"), + } + + args.output.write(json.dumps(selected_fields) + "\n") + + +def cli(argv=None): + if argv is None: + argv = sys.argv[1:] + parser = argparse.ArgumentParser( + description="Tool to send documents from SciELO to DOAJ.", epilog=EPILOG, + ) + parser.add_argument("--loglevel", default="") + subparsers = parser.add_subparsers() + + parser_gen_correctionsdb = subparsers.add_parser( + "gen-correctionsdb", help="Generate corrections DB from DOAJ API.", + ) + parser_gen_correctionsdb.add_argument( + "--output", default=sys.stdout, type=argparse.FileType("w"), required=False, + ) + parser.set_defaults(func=gen_corrections_db) + + args = parser.parse_args(argv) + # todas as mensagens serão omitidas se level > 50 + logging.basicConfig( + level=getattr(logging, args.loglevel.upper(), 999), format=LOGGER_FMT + ) + try: + return args.func(args) + except AttributeError: + parser.print_usage() + + +def main(): + try: + sys.exit(cli()) + except KeyboardInterrupt: + LOGGER.info("Got a Ctrl+C. Terminating the program.") + # É convencionado no shell que o programa finalizado pelo signal de + # código N deve retornar o código N + 128. + sys.exit(130) + except Exception as exc: + LOGGER.exception(exc) + sys.exit("An unexpected error has occurred: %s" % exc) + + +if __name__ == "__main__": + main() diff --git a/export/kbart.py b/export/kbart.py index b150d71..5811528 100644 --- a/export/kbart.py +++ b/export/kbart.py @@ -2,17 +2,36 @@ """ Este processamento gera uma tabulação de periódicos seguindo o formato Kbart. 
-Formato de saída: -"Título do Periódico","ISSN impresso","ISSN online","Data do primeiro número","volume","número","Data do último número publicado","volume","número","url issues","ID SciELO" +Formato de saída (headers em inglês, conforme diretrizes KBART): +publication_title, print_identifier, online_identifier, date_first_issue_online, +num_first_vol_online, num_first_issue_online, date_last_issue_online, +num_last_vol_online, num_last_issue_online, title_url, first_author, title_id, +embargo_info, coverage_depth, coverage_notes, publisher_name, publication_type, +date_monograph_published_print, date_monograph_published_online, monograph_volume, +monograph_edition, first_editor, parent_publication_title_id, +preceding_publication_title_id, access_type """ import argparse import logging import codecs +import re import utils logger = logging.getLogger(__name__) +# ISSN redirects for journals that changed their ISSN in URLs +# Maps old ISSN to new ISSN +ISSN_URL_REDIRECTS = { + '1575-0620': '2013-6463', # Revista española de sanidad penitenciaria (SciELO Spain) +} + +# Pre-compile regex patterns for ISSN redirects for better performance +_ISSN_REDIRECT_PATTERNS = { + old_issn: re.compile(r'([?&]pid=)' + re.escape(old_issn) + r'(&|$)') + for old_issn in ISSN_URL_REDIRECTS.keys() +} + def _config_logging(logging_level='INFO', logging_file=None): @@ -52,31 +71,31 @@ def __init__(self, collection, issns=None, output_file=None): self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [ - u"Título do Periódico (publication_title)", - u"ISSN impresso (print_identifier)", - u"ISSN online (online_identifier)", - u"Data do primeiro fascículo (date_first_issue_online)", - u"volume do primeiro fascículo (num_first_vol_online)", - u"número do primeiro fascículo (num_first_issue_online)", - u"Data do último fascículo publicado (date_last_issue_online)", - u"volume do último fascículo publicado 
(num_last_vol_online)", - u"número do último fascículo publicado (num_last_issue_online)", - u"url de fascículos (title_url)", - u"primeiro autor (first_author)", - u"ID do periódico no SciELO (title_id)", - u"informação de embargo (embargo_info)", - u"cobertura (coverage_depth)", - u"informação sobre cobertura (coverage_notes)", - u"nome do publicador (publisher_name)", - u"tipo de publicação (publication_type)", - u"data de publicação monográfica impressa (date_monograph_published_print)", - u"data de publicação monográfica online (date_monograph_published_online)", - u"volume de monografia (monograph_volume)", - u"edição de monografia (monograph_edition)", - u"primeiro editor (first_editor)", - u"ID de publicação pai (parent_publication_title_id)", - u"ID de publicação prévia (preceding_publication_title_id)", - u"tipo de acesso (access_type)" + u"publication_title", + u"print_identifier", + u"online_identifier", + u"date_first_issue_online", + u"num_first_vol_online", + u"num_first_issue_online", + u"date_last_issue_online", + u"num_last_vol_online", + u"num_last_issue_online", + u"title_url", + u"first_author", + u"title_id", + u"embargo_info", + u"coverage_depth", + u"coverage_notes", + u"publisher_name", + u"publication_type", + u"date_monograph_published_print", + u"date_monograph_published_online", + u"monograph_volume", + u"monograph_edition", + u"first_editor", + u"parent_publication_title_id", + u"preceding_publication_title_id", + u"access_type" ] @@ -92,6 +111,9 @@ def _first_included_document_by_journal(self, issn, collection): document = self._articlemeta.document(fid['pid'], fid['collection']) + if not document.data: + return None + return document def _last_included_document_by_journal(self, issn, collection): @@ -104,6 +126,9 @@ def _last_included_document_by_journal(self, issn, collection): document = self._articlemeta.document(lid['pid'], lid['collection']) + if not document.data: + return None + return document def write(self, line): @@ -123,6 
+148,9 @@ def items(self): for issn in self.issns: for data in self._articlemeta.journals(collection=self.collection, issn=issn): + if data.current_status != 'current': + logger.debug('Skipping non-active journal: %s (status: %s)' % (data.scielo_issn, data.current_status)) + continue logger.debug('Reading document: %s' % data.scielo_issn) yield self.fmt_csv(data) @@ -137,25 +165,35 @@ def fmt_csv(self, data): line.append( first_document.publication_date or '' if first_document else '') line.append( - first_document.issue.volume or '' if first_document else '') + first_document.issue.volume or '' if first_document and first_document.issue else '') line.append( - first_document.issue.number or '' if first_document else '') + first_document.issue.number or '' if first_document and first_document.issue else '') if data.current_status != 'current': line.append( last_document.publication_date or '' if last_document else '') line.append( - last_document.issue.volume or '' if last_document else '') + last_document.issue.volume or '' if last_document and last_document.issue else '') line.append( - last_document.issue.number or '' if last_document else '') + last_document.issue.number or '' if last_document and last_document.issue else '') else: line += ['', '', ''] - line.append(data.url().replace('sci_serial', 'sci_issues')) + # Generate the URL + url = data.url().replace('sci_serial', 'sci_issues') + + # Apply ISSN redirects for journals that changed their ISSN in URLs + # This is necessary for journals that no longer use their print ISSN + for old_issn, new_issn in ISSN_URL_REDIRECTS.items(): + # Use pre-compiled regex pattern for better performance + pattern = _ISSN_REDIRECT_PATTERNS[old_issn] + url = pattern.sub(r'\g<1>' + new_issn + r'\2', url) + + line.append(url) line.append('') # first_author line.append(data.scielo_issn or '') line.append('') # embargo_info - line.append('') # coverage_depth + line.append('fulltext') # coverage_depth line.append('') # 
coverage_notes - line.append(' '.join(data.publisher_name) if data.publisher_name else []) # publisher_name + line.append(' '.join(data.publisher_name) if data.publisher_name else '') # publisher_name line.append('Serial') # publication_type line.append('') # date_monograph_published_print line.append('') # date_monograph_published_online diff --git a/export/natural_keys.py b/export/natural_keys.py index 9bf36bf..4b8311f 100644 --- a/export/natural_keys.py +++ b/export/natural_keys.py @@ -15,12 +15,9 @@ from io import StringIO -import packtools -from packtools.catalogs import XML_CATALOG - +from legendarium.urlegendarium import URLegendarium import utils -os.environ['XML_CATALOG_FILES'] = XML_CATALOG logger = logging.getLogger(__name__) @@ -59,9 +56,6 @@ def __init__(self, collection, issns=None, output_file=None): self.collection = collection self.issns = issns or [None] self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file - header = [u"coleção", u"pid", u"título", u"volume", u"número", u"ano de publicação", u"primeira página", u"primeria página seq" u"última página", u"e-location", u"ahead of print id", u"chave"] - - self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) def write(self, line): if not self.output_file: @@ -69,104 +63,140 @@ def write(self, line): else: self.output_file.write('%s\r\n' % line) - def fmt_json(self, data, xml_etree): - - parsed_xml = xml_etree.lxml - - def get_value(expression): - try: - first_occ = parsed_xml.xpath(expression)[0] - except IndexError: - return None - - try: - value = first_occ.text - except AttributeError: - # valor de atributo - value = first_occ - - try: - return value.strip() - except AttributeError: - return value - - line = [] - - journal_title = get_value('/article/front/journal-meta/journal-title-group/journal-title') - volume = get_value('/article/front/article-meta/volume') - issue = get_value('/article/front/article-meta/issue') - year = 
get_value('/article/front/article-meta/pub-date/year') - fpage = get_value('/article/front/article-meta/fpage') - fpage_seq = get_value('/article/article/front/article-meta/fpage/@seq') - lpage = get_value('/article/front/article-meta/lpage') - elocation = get_value('/article/front/article-meta/elocation') - aop_id = get_value('/article/front/article-meta/article-id[@pub-id-type="other"]') + def fmt_json(self, data): + + item = { + 'collection_acronym': data.collection_acronym, + 'publisher_id': data.publisher_id, + 'journal_title': data.journal.title, + 'journal_acronym': data.journal.acronym, + 'volume': data.issue.volume, + 'number': data.issue.number, + 'supplement': (data.issue.supplement_volume or '') + (data.issue.supplement_number or ''), + 'publication_year': data.publication_date[:4], + 'first_page': data.start_page or '', + 'first_page_seq': data.start_page_sequence or '', + 'last_page': data.end_page or '', + 'elocation': data.elocation or '', + 'doi': data.doi or '', + 'order': data.internal_sequence_id or '' + } + + # legendarium natural_url + natural_url = URLegendarium( + item['journal_acronym'], + item['publication_year'], + item['volume'], + item['number'], + item['first_page'], + item['first_page_seq'], + item['last_page'], + item['elocation'], + item['supplement'], + item['doi'], + item['order'] + ).url_article + + natural_key = self.build_key([ + item['journal_acronym'], + item['volume'], + item['number'], + item['supplement'], + item['publication_year'], + item['first_page'], + item['first_page_seq'], + item['last_page'], + item['elocation'], + ]) + + item['natural_key'] = natural_key + item['natural_url'] = natural_url + + return json.dumps(item) + + def fmt_csv(self, data): line = [ data.collection_acronym, data.publisher_id, - journal_title if journal_title else '', - volume if volume else '', - issue if issue else '', - year if year else '', - fpage if fpage else '', - fpage_seq if fpage_seq else '', - lpage if lpage else '', - elocation 
if elocation else '', - aop_id if aop_id else '' + data.journal.title, + data.journal.acronym, + data.issue.volume, + data.issue.number, + (data.issue.supplement_volume or '') + (data.issue.supplement_number or ''), + data.publication_date[:4], + data.start_page or '', + data.start_page_sequence or '', + data.end_page or '', + data.elocation or '' ] - natural_key = self.build_key(line[2:]) + # legendarium natural_url + natural_url = URLegendarium( + data.journal.acronym, + data.publication_date[:4], + data.issue.volume, + data.issue.number, + data.start_page or '', + data.start_page_sequence or '', + data.end_page or '', + data.elocation or '', + (data.issue.supplement_volume or '') + (data.issue.supplement_number or ''), + ).url_article + + natural_key = self.build_key(line[3:]) line.append(natural_key) + line.append(natural_url) joined_line = ','.join(['"%s"' % i.replace('"', '""') for i in line]) return joined_line - def parse(self, xml): - f = StringIO(xml) - try: - tree = packtools.XMLValidator(f) - except Exception as e: - logger.exception(e) - logger.error('Fail to parse XML') - tree = None - - return tree - def build_key(self, data): values = (i for i in data) text_values = (value if value else 'none' for value in values) joined_values = '_'.join(text_values) - return utils.call_django_slugify(joined_values) + return utils.slugify(joined_values) + + def run(self, output_fmt='json'): + + if output_fmt == 'csv': + header = [ + u"coleção", + u"pid", + u"título", + "acrônimo do título", + u"volume", + u"número", + u"suplemento", + u"ano de publicação", + u"primeira página", + u"primeria página seq" + u"última página", + u"e-location", + u"chave natural", + u"url natural" + ] + self.write( + u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header]) + ) + + if output_fmt == 'csv': + output_fmt = self.fmt_csv + else: + output_fmt = self.fmt_json - def run(self): for issn in self.issns: for document in self._articlemeta.documents( - collection=self.collection, 
issn=issn): - logger.debug('Reading document: %s' % document.publisher_id) - - try: - xml = self._articlemeta.document( - document.publisher_id, document.collection_acronym, - fmt='xmlrsps') - except Exception as e: - logger.exception(e) - logger.error('Fail to read document: %s_%s' % ( - document.publisher_id, document.collection_acronym)) - xml = u'' + collection=self.collection, issn=issn + ): - et = self.parse(xml) - - if not et: - logger.error('Fail to parse xml document: %s_%s' % ( - document.publisher_id, document.collection_acronym)) - continue + logger.debug('Reading document: %s' % document.publisher_id) - self.write(self.fmt_json(document, et)) + self.write(output_fmt(document)) def main(): @@ -187,6 +217,14 @@ def main(): help='Collection Acronym' ) + parser.add_argument( + '--format', + '-f', + choices=['json', 'csv'], + default='json', + help='Collection Acronym' + ) + parser.add_argument( '--output_file', '-r', @@ -217,4 +255,4 @@ def main(): dumper = Dumper(args.collection, issns, args.output_file) - dumper.run() \ No newline at end of file + dumper.run(args.format) diff --git a/export/normalize_affiliations.py b/export/normalize_affiliations.py index 5f65808..add2d94 100644 --- a/export/normalize_affiliations.py +++ b/export/normalize_affiliations.py @@ -71,16 +71,20 @@ def run(self): for data in self.get_data(issn=issn): for item in self.fmt_csv(data): f.write('%s\r\n' % item) - + def fmt_csv(self, data): + issue_label = '' + if data.issue: + issue_label = data.issue.label or '' + line = [ data.collection_acronym, data.publisher_id, data.publication_date[0:4], data.document_type, data.journal.title, - data.issue_label + issue_label ] if len(data.mixed_affiliations) == 0: diff --git a/export/search_update_indicators.py b/export/search_update_indicators.py deleted file mode 100644 index 5423d7b..0000000 --- a/export/search_update_indicators.py +++ /dev/null @@ -1,173 +0,0 @@ -# coding: utf-8 -""" -Este processamento realiza a exportação/atualização 
de número de citações e -acessos no índice da ferramenta de busca search.scielo.org -""" -import argparse -import logging -import json - -import requests - -from clients.search import Search -import utils - -logger = logging.getLogger(__name__) - - -def _config_logging(logging_level='INFO', logging_file=None): - - allowed_levels = { - 'DEBUG': logging.DEBUG, - 'INFO': logging.INFO, - 'WARNING': logging.WARNING, - 'ERROR': logging.ERROR, - 'CRITICAL': logging.CRITICAL - } - - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - - logger.setLevel(allowed_levels.get(logging_level, 'INFO')) - - if logging_file: - hl = logging.FileHandler(logging_file, mode='a') - else: - hl = logging.StreamHandler() - - hl.setFormatter(formatter) - hl.setLevel(allowed_levels.get(logging_level, 'INFO')) - - logger.addHandler(hl) - - return logger - - -class Dumper(object): - - def __init__(self, collection, issns=None, citations_mode='pid'): - - self._articlemeta = utils.articlemeta_server() - self._accessstats = utils.accessstats_server() - self._cited_by = utils.citedby_server() - self._search = Search() - self._load_citations = self._load_citations_by_meta if citations_mode == 'meta' else self._load_citations_by_pid - self.collection = collection - self.issns = issns or [None] - - def _load_citations_by_pid(self, item): - - response = self._cited_by.citedby_pid(item.publisher_id) - - try: - total = json.loads(response)['article']['total_received'] - except: - total = None - - return total - - def _load_citations_by_meta(self, item): - - title = item.original_title() if item.original_title() else None - year = item.publication_date[0:4] if item.publication_date else None - surname = item.authors[0].get('surname', None) if item.authors else None - - if title and surname and year: - response = self._cited_by.citedby_meta(title, surname, int(year)) - - try: - total = json.loads(response)['article']['total_received'] - except: - total = None - - return 
total - - def _load_accesses(self, item): - - response = self._accessstats.client.document( - item.publisher_id, item.collection_acronym) - - try: - total = int(json.loads(response)['access_total']['value']) - except: - total = 0 - - return total - - def run(self): - logger.info('Export started') - - for item in self.items(): - citations = self._load_citations(item) - accesses = self._load_accesses(item) - - item_id = '-'.join([item.publisher_id, item.collection_acronym]) - self._search.update_document_indicators( - item_id, citations, accesses) - - self._search.deploy() - - logger.info('Export finished') - - def items(self): - - if not self.issns: - self.issns = [None] - - for issn in self.issns: - for data in self._articlemeta.documents( - collection=self.collection, issn=issn): - logger.debug('Reading document: %s' % data.publisher_id) - yield data - - -def main(): - - parser = argparse.ArgumentParser( - description='Update the citing numbers of each documents in search.scielo.org' - ) - - parser.add_argument( - 'issns', - nargs='*', - help='ISSN\'s separated by spaces' - ) - - parser.add_argument( - '--collection', - '-c', - help='Collection Acronym' - ) - - parser.add_argument( - '--citations_mode', - '-m', - default='pid', - choices=['meta', 'pid'], - help='Mode to retrieve received citations.' 
- ) - - parser.add_argument( - '--logging_file', - '-o', - help='Full path to the log file' - ) - - parser.add_argument( - '--logging_level', - '-l', - default='DEBUG', - choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], - help='Logggin level' - ) - - args = parser.parse_args() - _config_logging(args.logging_level, args.logging_file) - logger.info('Dumping data for: %s' % args.collection) - - issns = None - if len(args.issns) > 0: - issns = utils.ckeck_given_issns(args.issns) - - dumper = Dumper(args.collection, issns, args.citations_mode) - - dumper.run() diff --git a/export/templates/dumparticle_readme.txt b/export/templates/dumparticle_readme.txt new file mode 100644 index 0000000..87f2eec --- /dev/null +++ b/export/templates/dumparticle_readme.txt @@ -0,0 +1,32 @@ +SciELO Network Dump Data +======================== + +Use Lincense +------------ + +All content of this zipfile are under the license (CC-BY - http://creativecommons.org/licenses/by/4.0/). + + +Collections acronym +------------------- + +arg: Argentina +bol: Bolivia +bra: Brazil +chl: Chile +col: Colombia +cri: Costa Rica +cub: Cuba +esp: Spain +mex: Mexico +per: Peru +prt: Portugal +sza: South Africa +ven: Venezuela +spa: Public Health +sss: Social Sciences + +About +----- + +* Schema is available in folder "schema" \ No newline at end of file diff --git a/export/xml_rsps.py b/export/xml_rsps.py index 26b0ddf..12239da 100644 --- a/export/xml_rsps.py +++ b/export/xml_rsps.py @@ -6,18 +6,21 @@ import os import argparse import logging -import codecs import json +import threading +import multiprocessing from io import StringIO +import itertools import packtools from packtools.catalogs import XML_CATALOG - +from lxml.etree import XMLSyntaxError import utils os.environ['XML_CATALOG_FILES'] = XML_CATALOG logger = logging.getLogger(__name__) + def _config_logging(logging_level='INFO', logging_file=None): allowed_levels = { @@ -44,6 +47,7 @@ def _config_logging(logging_level='INFO', 
logging_file=None): return logger + def summarize(validator): def _make_err_message(err): @@ -65,7 +69,6 @@ def _make_err_message(err): return err_msg - dtd_is_valid, dtd_errors = validator.validate() sps_is_valid, sps_errors = validator.validate_style() @@ -74,33 +77,64 @@ def _make_err_message(err): 'sps_errors': [_make_err_message(err) for err in sps_errors], } - summary['dtd_is_valid'] = validator.validate()[0] - summary['sps_is_valid'] = validator.validate_style()[0] - summary['is_valid'] = bool(validator.validate()[0] and validator.validate_style()[0]) + summary['dtd_is_valid'] = dtd_is_valid + summary['sps_is_valid'] = sps_is_valid + summary['is_valid'] = bool(dtd_is_valid and sps_is_valid) return summary -def analyze_xml(xml, document): + +def analyze_xml(xml): """Analyzes `file` against packtools' XMLValidator. """ f = StringIO(xml) try: - xml = packtools.XMLValidator(f) - except: - logger.error('Could not read file %s' % document.publisher_id) + xml = packtools.XMLValidator.parse(f, sps_version='sps-1.4') + except packtools.exceptions.PacktoolsError as e: + logger.exception(e) + summary = {} + summary['dtd_is_valid'] = False + summary['sps_is_valid'] = False + summary['is_valid'] = False + summary['parsing_error'] = True + summary['dtd_errors'] = [] + summary['sps_errors'] = [] + return summary + except XMLSyntaxError as e: + logger.exception(e) summary = {} summary['dtd_is_valid'] = False summary['sps_is_valid'] = False summary['is_valid'] = False summary['parsing_error'] = True + summary['dtd_errors'] = [e.message] + summary['sps_errors'] = [] return summary else: summary = summarize(xml) + return summary +class ThreadSafeIter(object): + """Wraps an iterable for safe use in a threaded environment. 
+ """ + def __init__(self, it): + self.it = iter(it) + self.lock = threading.Lock() + + def __iter__(self): + return self + + def __next__(self): + with self.lock: + return next(self.it) + + next = __next__ + + class Dumper(object): def __init__(self, collection, issns=None): @@ -109,32 +143,68 @@ def __init__(self, collection, issns=None): self.collection = collection self.issns = issns or [None] - def fmt_json(self, data, xml_result): + def fmt_json(self, data): fmt = {} fmt['code'] = data.publisher_id - fmt['collection'] = data.collection_acronym + fmt['collection'] = data.collection_acronym fmt['id'] = '_'.join([data.collection_acronym, data.publisher_id]) fmt['document_type'] = data.document_type fmt['publication_year'] = data.publication_date[0:4] - fmt['document_type'] = data.document_type + fmt['journal'] = data.journal.title + fmt['issn'] = data.journal.scielo_issn + fmt['issue_label'] = data.issue.label + fmt['subject_areas'] = data.journal.subject_areas fmt['data_version'] = 'legacy' if data.data_model_version == 'html' else 'xml' - fmt.update(xml_result) - return json.dumps(fmt) - - def run(self): - for issn in self.issns: - for document in self._articlemeta.documents(collection=self.collection, issn=issn): - try: - xml = self._articlemeta.document(document.publisher_id, document.collection_acronym, fmt='xmlrsps') - except Exception as e: - logger.exception(e) - logger.error('Fail to read document: %s_%s' % (document.publisher_id, document.collection_acronym)) - xml = u'' - logger.debug('Reading document: %s' % document.publisher_id) - validation_result = analyze_xml(xml, document) - print(self.fmt_json(document, validation_result)) + + return fmt + + def summaryze_xml_validation(self, pid, collection_acronym, output_format): + + try: + xml = self._articlemeta.document(pid, collection_acronym, fmt='xmlrsps') + except Exception as e: + logger.exception(e) + logger.error('Fail to read document: %s_%s' % (pid, collection_acronym)) + xml = u'' + + 
logger.debug('Reading document: %s' % pid) + + output_format.update(analyze_xml(xml)) + + print(json.dumps(output_format)) + + def _worker(self, docs, t): + for doc in docs: + logger.debug('Running thread %s' % t) + self.summaryze_xml_validation(doc['code'], doc['collection'], doc) + + def run(self, processes): + max_threads = multiprocessing.cpu_count() * processes + + def _gen_iterdocs(): + """Produz um gerador de geradores de documentos. + """ + for issn in self.issns: + iterdocs = (self.fmt_json(doc) + for doc in self._articlemeta.documents( + collection=self.collection, issn=issn)) + yield iterdocs + + iterdocs = itertools.chain.from_iterable(_gen_iterdocs()) + safe_iterdocs = ThreadSafeIter(iterdocs) + + jobs = [] + for t in range(max_threads): + thread = threading.Thread(target=self._worker, + args=(safe_iterdocs, t)) + jobs.append(thread) + thread.start() + logger.info('Thread running %s' % thread) + + for job in jobs: + job.join() def main(): @@ -149,12 +219,27 @@ def main(): help='ISSN\'s separated by spaces' ) + parser.add_argument( + '--issns_file', + '-i', + default=None, + help='Full path to a txt file within a list of ISSNs to be exported' + ) + parser.add_argument( '--collection', '-c', help='Collection Acronym' ) + parser.add_argument( + '--processes', + '-p', + type=int, + default=1, + help='Number of processes per CPU' + ) + parser.add_argument( '--logging_file', '-o', @@ -177,6 +262,16 @@ def main(): if len(args.issns) > 0: issns = utils.ckeck_given_issns(args.issns) + issns_from_file = None + if args.issns_file: + with open(args.issns_file, 'r') as f: + issns_from_file = utils.ckeck_given_issns([i.strip() for i in f]) + + if issns: + issns += issns_from_file if issns_from_file else [] + else: + issns = issns_from_file if issns_from_file else [] + dumper = Dumper(args.collection, issns) - dumper.run() \ No newline at end of file + dumper.run(args.processes) diff --git a/export/xsd/crossref/AccessIndicators.xsd 
b/export/xsd/crossref/AccessIndicators.xsd new file mode 100644 index 0000000..1ae7aa0 --- /dev/null +++ b/export/xsd/crossref/AccessIndicators.xsd @@ -0,0 +1,64 @@ + + + + + Version: 1.1 This is CrossRef's schema for defining the applicable + licenses for a given item. This schema was available and in use prior to the completion + of the NISO working group Access and License Indicators + (http://www.niso.org/publications/rp/rp-22-2015). That effort produced a schema + (http://www.niso.org/schemas/ali/1.0/ali.xsd) that extended the CrossRef definition but + at the same time omitted necessary CrossRef features. This schema will continue as the + basis for CrossRef metadata deposits, but will incorporate the NISO work where possible. + Change history: 2/23/15 CSK added Niso free_to_read element + 4/21/15 CSK added start and end attributes to the free-to-read element as in the Niso ALI schema + but will make both attributes optional. + + + + + + Accommodates deposit of license metadata. The license_ref value will + be a URL. Values for the "applies_to" attribute are vor (version of record),am + (accepted manuscript), and tdm (text and data mining). 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/export/xsd/crossref/JATS-journalpublishing1.xsd b/export/xsd/crossref/JATS-journalpublishing1.xsd new file mode 100644 index 0000000..83b6b6d --- /dev/null +++ b/export/xsd/crossref/JATS-journalpublishing1.xsd @@ -0,0 +1,6176 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/export/xsd/crossref/clinicaltrials.xsd b/export/xsd/crossref/clinicaltrials.xsd new file mode 100644 index 0000000..9ff423a --- /dev/null +++ b/export/xsd/crossref/clinicaltrials.xsd @@ -0,0 +1,61 @@ + + + + + + Linked Clinical Trials is a CrossRef initiative helping to connect the published literature to registered clinical trials associated with the research, + + - version 1.0 (initial release) September 24, 2015 + + + + + + Accommodates deposit of linked clincal trials metadata. The clinical-trial-number value will + be a string that must match a specific pattern appropriate for a given clinical trial registry. The + registry is identified in the required attribute 'registry' and must be the DOI of a recognized registry + (see http://dx.doi.org/10.18810/registries) + + + + + + + + + + + + + + + The clinical trial identifier related to the article. 
+ + + + + + + + + + + + + Used to identify the article publication date in relation to the issuance of the trial results + + + + + + + + + + + + + + diff --git a/export/xsd/crossref/common4.3.5.xsd b/export/xsd/crossref/common4.3.5.xsd new file mode 100644 index 0000000..9e07c14 --- /dev/null +++ b/export/xsd/crossref/common4.3.5.xsd @@ -0,0 +1,2719 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + language attributes are based on iso 639 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Mime types for component format. For mime types refer to + http://www.iana.org/assignments/media-types/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Use to flag metadata for distribution. "query" is the default and + follows current protocol - bibliographic metadata is distributed to anyone in a + query response, bulk distribution is only allowed per CMS rules. "any" allows bulk + distribution of metadata to anyone using OAI-PMH queries. + + + + + + + + + + + + + Use to flag references for distribution. "none" is the default and + follows current protocol - references are only distributed to everyone if the prefix + level permission is set, otherwise reference distribution is limited to the DOI + owner. Setting the value to "query" releases references to anyone making a query + request (this overrides any established prefix level permission). 
Value "any" allows + bulk distribution to anyone (using a CrossRef query account) using the OAI-PMH + protocol, and also releases references to anyone making a query + request. + + + + + + + + + + + + + + + + + + + + The following are basic data types for face markup. Face markup that + appears in the title, subtitle, and original_language_title elements should be + retained when depositing metadata. Face markup in other elements (e.g. small caps in + author names) must be dropped. Face markup support includes bold (b), italic (i), + underline (u), over-line (ovl), superscript (sup), subscript (sub), small caps + (scp), and typewriter text (tt). See + http://help.crossref.org/#face_markup + +MathML may also be included using the 'mml' namespace prefix. + + + + + + + + + + + + + + + + + + + + + + + + + + The following are basic data types for date + parts. + + + + + + + + + + + + + + + + + + + + + + + + + Publisher generated ID that uniquely identifies the DOI submission + batch. It will be used as a reference in error messages sent by the MDDB, and can be + used for submission tracking. The publisher must insure that this number is unique + for every submission to CrossRef. + + + + + + + + + + + Indicates version of a batch file instance or DOI. timestamp is used + to uniquely identify batch files and DOI values when a DOI has been updated one or + more times. timestamp is an integer representation of date and time that serves as a + version number for the record that is being deposited. Because CrossRef uses it as a + version number, the format need not follow any public standard and therefore the + publisher can determine the internal format. The schema format is a double of at + least 64 bits, insuring that a fully qualified date/time stamp of 19 digits can be + submitted. When depositing data, CrossRef will check to see if a DOI has already + been deposited for the specific doi value. 
If the newer data carries a time stamp + value that is equal to or greater than the old data based on a strict numeric + comparison, the new data will replace the old data. If the new data value is less + than the old data value, the new data will not replace the old data. timestamp is + optional in doi_data and required in head. The value from the head instance + timestamp will be used for all instances of doi_data that do not include a timestamp + element. + + + + + Information about the organization submitting DOI metadata to + CrossRef + + + + + + + + + + + Name of the organization registering the DOIs. The name placed in + this element should match the name under which a depositing organization has + registered with CrossRef. + + + + + + + + + + + e-mail address to which batch success and/or error messages are sent. + It is recommended that this address be unique to a position within the organization + submitting data (e.g. "doi@...") rather than unique to a person. In this way, the + alias for delivery of this mail can be changed as responsibility for submission of + DOI data within the organization changes from one person to another. + + + + + + + + + + + + + The organization that owns the information being registered. + + + + + + + + + + + + + The chapter, section, part, etc. number for a content item in a book. + Unlike volume and edition_number, component_number should include any additional + text that helps identify the type of component. In the example above, the text + "Section 8" appeared on the table of contents and it is reflected here. "8" is also + acceptable, however the former treatment is preferred. The type of the component is + given the component_type attribute of content_item. + + + + + + + + + + + The edition number of a book. edition_number should include only a + number and not additional text such as "edition". For example, you should submit + "3", not "third edition" or "3rd edition". Roman numerals are acceptable. 
Publishers + will update a print edition with a new edition number when more than ten percent of + the content has changed. However, publishers expect to continuously update online + editions of books without changing the edition number. The ability to update the + electronic version independent of the print version could be problematic for + researchers. For example, if a research article cites the print version of a + chapter, and a researcher subsequently links to the online version of the same + chapter, the content may be different from the print version without the typical + indication of a new edition. This topic requires further discussion outside of + the scope of this specification. + + + + + + + + + + + + The issue number in which an article is published. Only one issue + name should be used for the issue. The issue number takes precedence over any other + name. For example, if an issue has only a seasonal name, then the season should be + listed in issue. However, if an issue has a number and a season, then only the + number should be listed in issue, and the season should be placed in month (see the + table in month, below, for proper encoding of the season) if the specific month of + publication is not known. Do not include the words "issue", "No" or "number" in this + element. When submitting DOIs for journal articles published online ahead of print, + you should submit the issue number, when known, even if the pagination information + for the entity is not yet known. Data may be alpha, numeric or a combination. + Examples: 74(3):1999 + + 1999 + + + 74 + + 3 + Volume 74, Spring 1999 + + 1999 + + + 74 + + Spring + Volume 74, issue 3 Spring 1999 + + 21 + 1999 + + + 74 + + 3 + + + + + + + + + + + + + The container for elements related directly to a DOI. doi_data + contains the doi, timestamp (version) and corresponding resource (URI) data for the + doi. Cases of single-resolution (i.e. 
one DOI with a single corresponding URI) + should be tagged with a doi/resource pair in doi_data. If additional resources are + to be provided the <collection> element may also be used. The single URL + provided in the <resource> is mandatory and serves as the single resolution + target for the DOI. Note: A timestamp value placed inside doi_data will override any + timestamp value placed in the <head> element. + + + + + + + + + + + + + The element that contains a URI associated with a DOI. URLs are + referred to as resources in the 2.0 CrossRef schema because they can be any valid + URI. Cases of single-resolution (i.e. one DOI with a single corresponding URI) + should be tagged with a doi/resource pair in doi_data. Only one resource is allowed + per doi_data, the exception being resource elements within a collection element. + Values for the "content_version" attribute are vor (version of record) and am + (advance manuscript). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A collection is a container for one or more items each holding a doi + or a resource (URI) which is related to the DOI in the ancestor <doi_data> + element. A collection must be qualified by a property attribute or the + multi-resolution attribute. property attributes: list-based: uses an interim page + and presents the list of items to the user (via Multiple Resolution) country-based: + proxy picks destination based on the country code of the user's location (this + option is not currently active, contact support@crossref.org for more info) + crawler-based: identifies resource to be crawled by the specified crawlers. + text-mining: identifies resource to be used for text and data mining unspecified: + identifies resource with unspecified usage syndication: identifies resource to be + used for syndication The multi-resolution attribute may be used to lock or unlock + DOIs for multiple resolution. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A container used to associate a collection, doi, or resource (URI) + with zero or more property elements. item is currently used for supplying as-crawled + URLs (http://help.crossref.org/#as-crawled-urls) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + property elements qualify the semantic meaning of a item or + collection. property elements consist of a type/value pair where the property type + is found in the type attribute and the value is found in the element content. The + property element is not currently in use. + + + + + + + + + + + + The container for all who contributed to authoring or editing an + entity. + + + + + + + + + + + + + + + + + + + + The name of an organization (as opposed to a person) that contributed + to authoring an entity. If multiple organizations authored an entity, each one + should be captured in a unique organization element. If an entity was authored by + individuals in addition to one or more organizations, person_name and organization + may be freely intermixed within contributors. contributor_role should be set, as + appropriate, to author or editor. When a contributor translated a work, set + contributor_role to "translator". "chair" should only be used for conference + proceedings to indicate a conference chair. + + + + + + + + + + + + The name of a person (as opposed to an organization) that contributed + to authoring an entity. 
Authors with name suffixes should have the suffix captured + in suffix, not in surname. Author prefixes such as "Dr.", "Prof.", or "President" + should not be included in person_name or any other element. Author degrees (e.g. + M.D., Ph.D.) also should not be included in CrossRef submissions. contributor_role + should be set, as appropriate, to author or editor. When a contributor translated a + work, set contributor_role to "translator". "chair" should only be used for + conference proceedings to indicate a conference chair. + + + + + + + + + + + + + + + + A contributor's given name. The given_name, combined with surname, + forms the name of an author or editor. given_name may be submitted as either + initials or a full name. Do not place given_name within the surname unless it is + unclear how to distinguish the given name from the surname, as may be the case in + non-Western names. Do not include titles such as "Dr.", "Prof.", or "President" in + given_name. These titles should not be submitted to CrossRef. + + + + + + + + + + + + + The surname of an author or editor. The surname, combined with + given_name, forms the name of an author or editor. Whenever possible, the given name + should not be included in the surname element. In cases where the given name is not + clear, as may happen with non-Western names or some societies in which surnames are + not distinguished, you may place the entire name in surname, e.g.: Leonardo + da Vinci If an author is an organization, you should use organization, + not surname. Suffixes should be tagged with suffix. Author degrees (e.g. M.D., + Ph.D.) should not be included in CrossRef submissions. + + + + + + + + + + + + + The suffix of an author name, e.g. junior or senior. A name suffix, + that typically denotes a generation distinction, is tagged with suffix. Author + degrees (e.g. M.D., Ph.D.) should not be included in CrossRef + submissions. + + + + + + + + + + + + + + + + + The ORCID for an author. 
The schema performs basic pattern validation, checksum validation is performed upon deposit via a system check. + + + + + + + + + + + + + The institution(s) with which a contributor is affiliated. This + element may hold the name and location of an affiliation with which a contributor is + affiliated. Please note the following points when using this element: 1. A + contributor may have up to five affiliations. Each affiliation should be in a unique + <affiliation> element. The following is correct: <affiliation>University + of New Mexico</affiliation> <affiliation>Sandia National + Laboratories</affiliation> The following is NOT correct + <affiliation>University of New Mexico; Sandia National + Laboratories</affiliation> 2. The name of the institution is required in this + element. The location is optional. Both of the following are correct: + <affiliation>Harvard University</affiliation> <affiliation>Harvard + University, Cambridge, MA</affiliation> 3. Additional address information such + as a URL or email address should NOT be deposited in this element 4. Visual linking + indicators used in publication to connect authors with their affiliations such as + footnote symbols or initials should NOT be included in the <affiliation> + element 5. If you have only a single string that has the affiliation for multiple + contributors to a work and that string is not broken out into the individual + affliations for each author, please do NOT deposit the affilation information. This + element is to be used only for affiliation information that is directly connected to + the author with whom this information is included within the person_name element. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A container for the title and original language title + elements. + + + + + + + + + + + + + + + + + The title of the entity being registered. 
When a title contains a + subtitle, it is preferable to capture the subtitle portion in the subtitle element. + Only minimal face markup is supported, see http://help.crossref.org/#face_markup + + + + + + + + + + The title of an entity in its original language if the registration + is for a translation of a work. When providing the original language of a title, you + should set the language attribute. + + + + + + + + + + + + The sub-title portion of an entity title. When possible, it is better + to tag a title and subtitle with separate elements. If this information is not + available, it is acceptable to submit the title and subtitle all within the title + element with punctuation (preferably a colon) used to separate the subtitle from the + title. When a subtitle is tagged, the space and punctuation between the title and + subtitle text should not be included. The following examples illustrate correct and + incorrect tagging practices: Correct and optimal tagging: The Human + Brain + A Handbook Correct but not optimal tagging: The Human + Brain: A Handbook Incorrect: The Human Brain: + A Handbook + The Human Brain + : A Handbook + + + + + + + + + + + Month of publication. The month must be expressed in numeric format + rather than spelling out the name (e.g. submit "10", not "October"). The month must be + expressed with a leading zero if it is less than 10 (e.g. submit "05", not "5"). + When a journal issue has both an issue number and a season, the issue number should + be placed in issue. If the month of publication is not known, the season should be + placed in month as a two-digit value as follows: Season Value Spring 21 Summer 22 + Autumn 23 Winter 24 First Quarter 31 Second Quarter 32 Third Quarter 33 Fourth + Quarter 34 In cases when an issue covers multiple months, e.g. "March-April", + include only the digits for the first month of the range. + + + + + Day of publication. The day must be expressed with a leading zero + if it is less than 10 (e.g. 
submit "05", not "5"). + + + + + Year of publication. + + + + + The date of publication. In all cases, multiple dates are allowed to + allow for different dates of publication for online and print versions. This element + was previously called date, but was renamed publication_date to distinguish more + clearly from conference_date. + + + + + + + + + + + + + + + + + + + + + + + + + + The container for information about page ranges. When an entity has + non-contiguous page information, you should capture the first page range in + first_page and last_page. Any additional page information should be captured in + other_pages. Punctuation is only allowed in other_pages. It should not appear in + first_page and last_page. Page number letter prefixes or suffixes should be + included. Roman numeral pages are permitted in both upper case and lower case. Data + may be alpha, numeric or a combination. + + + + + + + + + + + + First page number where an entity is located. Data may be alpha, + numeric or a combination. + + + + + + + + + + + The last page number of an entity. last_page should not be used when + the last page number is the same as the first page number (i.e. when the entire + entity fits on one page). Do not include punctuation for a page range in last_page. + If the entity has non-contiguous paging, use last_page for the last page of the + first range and place all other page information into other_pages. Data may be + alpha, numeric or a combination. + + + + + + + + + + + Used to capture additional page information when items do not + encompass contiguous page ranges. When an entity has non-contiguous page + information, you should capture the first page range in first_page and last_page. + Any additional page information should be captured in other_pages. You should + include commas or hyphens to express discrete pages or page ranges. endash entities + should be converted to ASCII hyphens. Spaces should not be included. 
Note that + punctuation should never appear in first_page and last_page. Data may be alpha, + numeric or a combination. + + + + + + + + + + + + + + + + + DOI for an entity being registered with CrossRef. In 2008 CrossRef restricted DOI suffix + characters to the following: "a-z", "A-Z", "0-9" and "-._;()/" + + Existing DOIs with suffix characters outside of the allowed set are still supported. For additional + information on DOI syntax, see http://help.crossref.org/#ID5755 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The ISBN assigned to an entity. If a multi-volume work has one ISBN + per volume and a unique ISBN for the series, all may be registered. The ISBN for the + series must be in series_metadata, and the ISBN for each volume in + proceedings_metadata, or book_metadata, respectively. The text "ISBN" should not be + included in the ISBN element in CrossRef submissions. Although not required, the + ISBN number should retain spaces or hyphens that appear in the formatted number + because they aid in human-readability. For more information, please see + http://www.isbn.org/standards/home/isbn/international/hyphenation- instructions.asp + or http://www.isbn.org. + + + + + + + + + + + + Identifies books or conference proceedings that have no ISBN + assigned. In very limited cases a book may never have an ISBN, this is particularly + true for older texts. Conference proceedings, however, may regularly have a volume + number but no ISBN or volume title. + + + + + + + + + + + + + + + + + + + + + + + + The ISSN assigned to an entity. The ISSN must consist of eight digits + (where the last digit may be an X), or it must consist of eight digits in two groups + of four with a hyphen between the two groups. Spaces or other delimiters should not + be included in an ISSN. 
For more information, please see + http://www.issn.org:8080/English/pub/getting- checking/checking or + http://www.issn.org. The text "ISSN" should not be included in the issn element in + CrossRef submissions. CrossRef validates all ISSNs supplied in deposits, only valid + ISSNs will be accepted. + + + + + + + + + + + + The coden assigned to a journal or conference + proceedings. + + + + + + + + + + + The volume number of a published journal, or the number of a printed + volume for a book or conference proceedings. A journal volume is contained in the + journal_volume element to allow for the assignment of a DOI to an entire journal + volume. Do not include the words "Volume" or "vol." in this element. Data may be + alpha, numeric or a combination. Roman numerals are acceptable. + + + + + + + + + + + + + + Container element for archive. + + + + + + + + + + Used to indicate the designated archiving organization(s) for an + item. Values for the name attribute are CLOCKSS, LOCKSS, Portico, KB, DWT (Deep Web + Technologies), Internet Archive, WebCite + + + + + + + + + + + + + + + + + + + + + The date on which a dissertation was accepted by the institution + awarding the degree, a report was approved, or a standard was accepted. + approval_date includes the same elements as publication_date, but it has no + attributes. It is a distinct element from publication_date to reflect an + important but different semantic meaning from publication_date + + + + + + + + + + + + A list of articles, books, and other items cited by the parent item + for which the DOI in the doi_data is being deposited. Some articles may have + multiple lists of citations (e.g. main reference list, appendix reference list, + etc.). All citations for one article should be included in a single citation_list + regardless of whether one or more citation lists were in the original item. 
When + combining multiple reference lists from an item into one citation_list element, be + sure to give each citation a unique key attribute value. For example, if an appendix + in an item has a separate citation list that restarts numbering at 1, these + citations should be given key attributes such as "ab1" rather than "b1". Some + articles may contain "Further Reading" or "Bibliography" lists. The distinguishing + factor in these lists is that the references have not been cited from the + article—they only provide a list of additional related reading material. It will be + left to the discretion of the publisher if these items are to be considered + citations and should be deposited. NOTE: If a citation_list element is given and is + empty then all citations for the given DOI will be deleted, otherwise any existing + citations for the given DOI are left intact in the database. It is quite common that + a publisher wants to fix the DOI's metadata without resubmitting the citations. + Leaving out the citation_list element will do that. Also note that any given + citations will override older citations for the given DOI so citation_lists are not + cumulative over multiple records or submissions. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + citation is used to deposit each citation in the reference list of + the item for which the DOI is being deposited. The citations in the list will be run + by the CrossRef system as queries looking for the DOI of the articles being cited. + NOTE: Because the citation list is used to support forward linking, the more + information supplied in the citation the better the chance of finding a match. For + each citation that is deposited, one of four models should be used: 1. Parsed + journal data 2. Parsed book or conference data 3. DOI 4. 
Unstructured citation (not + yet supported for resolution) When parsed journal, book or conference data is + deposited, CrossRef will perform a lookup to find the DOI. Each citation must be + given a unique ID in the key attribute. It is recommended that this number be the + citation number if the reference list is numbered in the published article, or the + underlying XML ID if the reference list is name/date style in article. When + submitting a journal citation, it should include an issn, journal_title or both. + journal_title only is preferred over issn only. In addition the first author and + first_page number should be submitted. The first_page number is preferred, but for + those citations that are "in press", the author should be submitted. All elements + are optional, however for best linking results, as much information as is known + should be submitted. When submitting a book or conference citation, it should + include an isbn, series_title, volume_title, or any combination of these three + elements as may be available. All elements are optional, however for best linking + results, as much information as is known should be submitted. When a DOI is already + known for a citation, you may submit just the doi without additional information. + When parsed information is not available for a citation, or the citation is of a + type other than journal, book, or conference proceeding that is supported by + CrossRef (e.g. standard, patent, thesis, newspaper, personal communication, etc.), + it may be submitted using the unstructured_citation element. CrossRef is able to + process some unstructured citations. When submitting unstructured citations, it is + helpful, but not required to include all available face markup (e.g. bold, italic, + etc) as this will make possible future parsing of the unstructured citation more + accurate. 
In such cases, it is preferred, but not required, if the citation number + (when Vancouver style is used) be removed from the unparsed citation. This number + can be submitted using the key attribute Only the first author of a citation should + be submitted, not the entire author list. Only the surname is required. Initials may + be included, but are not recommended because the best linking results can be + provided if initials are omitted. Author titles, roles and generation information + should not be included. If the first author is an organization, the organization + name should be submitted in the author element. cYear has a loose text model that + can accommodate non-standard years such as year ranges such as "1998-1999". Note + that years such as "1998a" or "1999b" should be deposited without the letter, e.g. + "1998" or "1999", whenever possible. Citations that are "in press" should be + submitted with as much information as is available. + + + + + + + + + + + + + + citation_key allows the publisher to assign a unique ID to each + citation that is deposited. It is recommended that this attribute be given the + reference number if the publication uses reference numbers. For those publications + that use name/date style citations, it is recommended that this attribute be used to + indicate the sequential number of the citation in the reference list. However, some + schema must be utilized as this is a required attribute. The system will use this + key value to track the specific reference query and will return this value along + with the DOI. + + + + + + + + + + + + + + + A citation that is to an item other than a journal article, book, or + conference paper and cannot be structured with the CrossRef citation model. Also, it + is used for a citation to a journal article, book, or conference paper for which the + depositing publisher does not have structured information. 
unstructured_citation + allows a publisher to deposit references for which no structural information is + available. These may be journal, book, or conference references for which the + supplier did not provide markup, or other types of references (e.g. standards, + patents, etc) which are not supported by CrossRef. This structure permits publishers + to deposit complete reference lists, without regard to the availability of markup, + or the need to parse references beyond those types that CrossRef supports. + CrossRef's ability to process unstructured citations is limited, for details see + http://help.crossref.org/#ID38855 + + + + + + + + + + Journal title in a citation. Only used in the citation element. + Journal title in citation deposits is used for both abbreviated and spelled out + journal names. No attribute is required to distinguish between name types. Both + Proc. Natl. Acad. Sci. U.S.A. and + Proceedings of the National Academy of Sciences of the United + States of America are valid journal titles to use in this + element. + + + + + Book series title in a citation. Only used in the citation element. + series_title is an element for the deposit of book or conference series titles in + citations without the hierarchy required by the series_metadata element. Note that + face markup is not permitted when this element is deposited as part of a + citation. + + + + + Book volume title in a citation. Only used in the citation element. + volume_title is an element for the deposit of book or conference volume titles in + citations without the hierarchy required by the titles element. Note that face + markup is not permitted when this element is deposited as part of a + citation. + + + + + First author in a citation. Only used in the citation element. The + author element tags one author name in a citation without the hierarchy required by + the contributors or person_name elements Only the first author should be deposited + for each item. 
The author surname is required. Author initials may be added but are + not recommended because queries work best when only the last name is provided. For + example, the author "John Doe" can be deposited as Doe or + Doe J, but the former style is recommended. If the author of a + work is an organization rather than a person, the organization may be deposited as + in: World Health Organization + + + + + Year of publication in citation. Unlike the year element, cYear has a + loose text model that can accommodate non-standard years such as year ranges such as + "1998-1999". Note that years such as "1998a" or "1999b" should be deposited without + the letter, e.g. "1998" or "1999". The letter is used for internal source document + linking in name/date (Harvard) style documents rather than external cross reference + linking to the original item. + + + + + Article title in a citation. Use care to remove face markup (such as + italic applied to genus or species names) from article titles as this markup is not + supported by CrossRef. + + + + + + + The element for depositing a stand alone component. The parent DOI + must already exist (created in an earlier deposit or via some other registration + process). + + + + + + + + + + + + + + + + + + The wrapper element for including a group of components under a + journal article, conference proceeding, book chapter, stand alone component, + dissertation, technical report or working paper, standard, or + database. + + + + + + + + + + A container element that allows registration of supplemental + information for a journal article, book chapter, or conference paper such as + figures, tables, videos, or data sets. Currently, the deposit of components + primarily achieves only the first objective as the CrossRef system is not setup yet + to support queries for components. The metadata associated with a component is + intended to enable simple lookup searches of components in the future. 
When + deposited as part of the metadata for a higher level work the parent DOI is + implicitly known via the XML hierarchy. When deposited separately the DOI of the + higher level work must be provided explicitly (see sa_component) All descriptive + elements are optional allowing for the creation of simple anonymous DOIs. The + 'parent_relation' attribute is mandatory and refers to the DOI described in the + component's direct parent. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Normally book content that is published as a series is required to + have a series title with an ISSN and a book title and/or a book volume number along + with a book ISBN. An exception is when book chapters are published on line first + prior to being assigned to a specific book in which case only the series title (and + ISSN) is known at time of DOI registration. Element unassigned_content is used as a + placeholder to force recognition of this condition and thus prevent accidental + omission of book level title information. When unassigned_content is present the + system will allow omission of the ISBN. If unassigned_content is not present the + system will require an ISBN for the book title. + + + + + + + + + + + + + + A narrative description of a file (e.g. a figure caption or + video description) which may be independent of the host document context. The + description element may be present more than once to provide alternative language + values. + + + + + + + + + + + + + + + + + A narrative description of a component's file format and/or the file + extension (for mime types refer to http://www.iana.org/assignments/media-types/) The + format element may contain only the mime_type attribute, or in addition it may + contain a narrative description of the file format. 
Be sure to use the narrative + portion to description only the format of the component and not the actual content + of the component (use description to describe the component's + content). + + + + + + + + + + + + + + Container element for CrossMark data. + + + + + + + + + Required element. Some publishers encourage broad third + party hosting of the publisher's content. Other publishers do not. And + still others vary their policy depending on whether a particular article + has been published under an OA policy or not. This boolean flag allows + the publisher to indicate whether the CrossMarked content will only + legitimately be updated on the CrossMark domain (true) or whether the + publisher encourages updating the content on other sites as well + (false). + + + + + + + + + + + + A DOI which points to a publisher's CrossMark policy document. + Publishers might have different policies for different + publications. + + + + + + Container element for crossmark_domain. A list of domains where the + publisher maintains updates and corrections to their content. Minimally, one of + these should include the Internet domain name of the publisher's web site(s), but + the publisher might also decide to include 3rd party aggregators (e.g. Ebsco, + IngentaConnect) or archives with which the publisher has agreements to update the + content + + + + + + + + + + This should be a simple Internet domain name or subdomain name (e.g. + www.psychoceramics.org or psychoceramics.org). It is used to identify when a + referring URL is coming from a CrossMark domain. A "crossmark_domain" is made up of + two subelements; a "domain" and a "filter". The domain is required but the filter is + optional and is only needed for use in situations where content from multiple + publishers/publications is on the same host with the same domain name (e.g. an + aggregator) and one needs to use the referrer's URI "path" to further determine + whether the content in a crossmark domain. 
+ + + + + + + + + + + Required element. This should be a simple Internet domain name or + subdomain name (e.g. www.psychoceramics.org or psychoceramics.org). It is used to + identify when a referring URL is coming from a CrossMark domain. + + + + + Optional element. The filter element is used to disambiguate content + in situations where multiple publishers share the same host (e.g. when on an + aggregated platform). It should contain a substring of the path that can be used to + uniquely identify a publisher's or publication's content. For instance, using the + string "alpsp" here would help the CrossMark system distinguish between ALPSP + publications on the ingentaconnect host and other publications on the same + host. + + + + + + Optional element. A document might provide updates (e.g. corrections, + clarifications, retractions) to several other documents. When this is the case, the + DOIs of the documents that are being *updated* should be listed + here. + + + + + + + + + + The DOI of the content being updated (e.g. corrected, retracted, + etc.) In the CrossMark Terms and Conditions "updates" are defined as changes that + are likely to "change the reader’s interpretation or crediting of the work." That + is, *editorially significant* changes. "Updates" should not include minor changes to + spelling, punctuation, formatting, etc. Attributes: label: Required attribute. This + should be a human-readable version of the "type" attribute. This is what gets + displayed in the CrossMark dialog when there is an update. type: Required attribute. + This attribute should be used to give the machine-readable name of the update type. + The human-readable version of the type should be but in the "label" attribute. There + are many "types" of updates. "Corrections, "clarifications", "retractions" and + "withdrawals" are just a few of the better-known types. 
For these common types we + recommend you use the values "correction", "clarification", "retraction" and + "withdrawal" respectively as per your editorial policy. However, different + publishers sometimes have to support different, custom update types- for instance, + "protocol amendments", "letters of concern", "comments", etc. The attribute supports + custom types as well. date: The date of the update will be displayed in the + CrossMark dialog and can help the researcher easily tell whether they are likely to + have seen the update. + + + + + + + Required attribute. This attribute should be used to + list the update type. Allowed update types are: +
    +
  • addendum
  • +
  • clarification
  • +
  • correction
  • +
  • corrigendum
  • +
  • erratum
  • +
  • expression_of_concern
  • +
  • new_edition
  • +
  • new_version
  • +
  • partial_retraction
  • +
  • removal
  • +
  • retraction
  • +
  • withdrawal
  • +
+ +
+
+
+ + + Required attribute. The date of the update will be + displayed in the CrossMark dialog and can help the researcher easily + tell whether they are likely to have seen the + update. + + +
+
+
+
+ + + Optional element. Publishers are encouraged to provided any + non-bibliographical metadata that they feel might help the researcher evaluate and + make better use of the content that the Crossmark record refers to. For example, + publishers might want to provide funding information, clinical trial numbers, + information about the peer-review process or a summary of the publication history of + the document. + + + + + + + + + + + + + + + + + + + An assertion is a piece of custom, non-bibliographic metadata that + the publisher is asserting about the content to which the CrossMark refers. + assertion attributes: explanation: If the publisher wants to provide a further + explanation of what the particular "assertion" means, they can link to such an + explanation by providing an appropriate url on the "explanation" attribute. + group_label: This is the human-readable form of the "group_name" attribute. This is + what will be displayed in the group headings on the CrossMark metadata record + dialog. group_name: Some assertions could be logically "grouped" together in the + CrossMark dialog. For instance, if the publisher is recording several pieces of + metadata related to funding sources (source name, percentage, grant number), they + may want to make sure that these three assertions are grouped next to each-other in + the CrossMark dialog. The group_name attribute is the machine-readable value that + will be used for grouping such assertions. label: This is the human-readable version + of the name attribute which will be displayed in the CrossMark dialog. If this + attribute is missing, then the value of the assertion will *not* be displayed in the + dialog. Publishers may want to "hide" assertions this way in cases where the + assertion value is too large or too complex to display in the dialog, but where the + assertion is nonetheless valuable enough to include in API queries and metadata + dumps (e.g. detailed licensing terms). 
name: This is the machine-readable name of + the assertion. Use the "label" attribute to provide a human-readable version of the + name. order: The publisher may want to control the order in which assertions are + displayed to the user in the CrossMark dialog. All assertions will be sorted by this + element if it is present. + + + + + + + + Optional attribute. If the publisher wants to provide a + further explanation of what the particular "assertion" means, they can link + to such an explanation by providing an appropriate url on the "explanation" + attribute. + + + + + Optional attribute. This is the human-readable form of the + "group_name" attribute. This is what will be displayed in the group headings + on the CrossMark metadata record dialog. + + + + + Optional attribute. Some assertions could be logically + "grouped" together in the CrossMark dialog. For instance, if the publisher + is recording several pieces of metadata related to funding sources (source + name, percentage, grant number), they may want to make sure that these three + assertions are grouped next to each-other in the CrossMark dialog. The + group_name attribute is the machine-readable value that will be used for + grouping such assertions. + + + + + Optional attribute. This is the human-readable version of the + name attribute which will be displayed in the CrossMark dialog. If this + attribute is missing, then the value of the assertion will *not* be + displayed in the dialog. Publishers may want to "hide" assertions this way + in cases where the assertion value is too large or too complex to display in + the dialog, but where the assertion is nonetheless valuable enough to + include in API queries and metadata dumps (e.g. detailed licensing + terms) + + + + + Required attribute. This is the machine-readable name of the + assertion. Use the "label" attribute to provide a human-readable version of + the name. + + + + + Optional attribute. 
The publisher may want to control the + order in which assertions are displayed to the user in the CrossMark dialog. + All assertions will be sorted by this element if it is + present. + + + + + Optional attribute + + + + + + + + + A wrapper for designators or other primary identifiers for a + standard. + + + + + + + + + + + + + + Designator or other primary identifier for the standard being + deposited. Required. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Designator for standard replacing the standard being deposited. + + + + + + + + + + + + + + Designator for standard from which the current deposit is adopted. + + + + + + + + + + + + + + Designator for the previous revision of the standard being deposited. + + + + + + + + + + + + + + A wrapper for standards body information. + + + + + + + + + + + Name of the standards organization / publisher. + Required. + + + + + Acronym for standards body. Will be used for query matching - + required. + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/export/xsd/crossref/common4.4.0.xsd b/export/xsd/crossref/common4.4.0.xsd new file mode 100644 index 0000000..859ac41 --- /dev/null +++ b/export/xsd/crossref/common4.4.0.xsd @@ -0,0 +1,2846 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + language attributes are based on iso 639 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Mime types for component format. For mime types refer to + http://www.iana.org/assignments/media-types/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Use to flag metadata for distribution. "query" is the default and + follows current protocol - bibliographic metadata is distributed to anyone in a + query response, bulk distribution is only allowed per CMS rules. "any" allows bulk + distribution of metadata to anyone using OAI-PMH queries. + + + + + + + + + + + + + Use to flag references for distribution. "none" is the default and + follows current protocol - references are only distributed to everyone if the prefix + level permission is set, otherwise reference distribution is limited to the DOI + owner. Setting the value to "query" releases references to anyone making a query + request (this overrides any established prefix level permission). Value "any" allows + bulk distribution to anyone (using a CrossRef query account) using the OAI-PMH + protocol, and also releases references to anyone making a query + request. 
+ + + + + + + + + + + + + + + + + + + + The following are basic data types for face markup. Face markup that + appears in the title, subtitle, and original_language_title elements should be + retained when depositing metadata. Face markup in other elements (e.g. small caps in + author names) must be dropped. Face markup support includes bold (b), italic (i), + underline (u), over-line (ovl), superscript (sup), subscript (sub), small caps + (scp), and typewriter text (tt). See + http://help.crossref.org/#face_markup + +MathML may also be included using the 'mml' namespace prefix. + + + + + + + + + + + + + + + + + + + + + + + + + + The following are basic data types for date + parts. + + + + + + + + + + + + + + + + + + + + + + + + + Publisher generated ID that uniquely identifies the DOI submission + batch. It will be used as a reference in error messages sent by the MDDB, and can be + used for submission tracking. The publisher must insure that this number is unique + for every submission to CrossRef. + + + + + + + + + + + Indicates version of a batch file instance or DOI. timestamp is used + to uniquely identify batch files and DOI values when a DOI has been updated one or + more times. timestamp is an integer representation of date and time that serves as a + version number for the record that is being deposited. Because CrossRef uses it as a + version number, the format need not follow any public standard and therefore the + publisher can determine the internal format. The schema format is a double of at + least 64 bits, insuring that a fully qualified date/time stamp of 19 digits can be + submitted. When depositing data, CrossRef will check to see if a DOI has already + been deposited for the specific doi value. If the newer data carries a time stamp + value that is equal to or greater than the old data based on a strict numeric + comparison, the new data will replace the old data. 
If the new data value is less + than the old data value, the new data will not replace the old data. timestamp is + optional in doi_data and required in head. The value from the head instance + timestamp will be used for all instances of doi_data that do not include a timestamp + element. + + + + + Information about the organization submitting DOI metadata to + CrossRef + + + + + + + + + + + Name of the organization registering the DOIs. The name placed in + this element should match the name under which a depositing organization has + registered with CrossRef. + + + + + + + + + + + e-mail address to which batch success and/or error messages are sent. + It is recommended that this address be unique to a position within the organization + submitting data (e.g. "doi@...") rather than unique to a person. In this way, the + alias for delivery of this mail can be changed as responsibility for submission of + DOI data within the organization changes from one person to another. + + + + + + + + + + + + + The organization that owns the information being registered. + + + + + + + + + + + + + The chapter, section, part, etc. number for a content item in a book. + Unlike volume and edition_number, component_number should include any additional + text that helps identify the type of component. In the example above, the text + "Section 8" appeared on the table of contents and it is reflected here. "8" is also + acceptable, however the former treatment is preferred. The type of the component is + given the component_type attribute of content_item. + + + + + + + + + + + The edition number of a book. edition_number should include only a + number and not additional text such as "edition". For example, you should submit + "3", not "third edition" or "3rd edition". Roman numerals are acceptable. Publishers + will update a print edition with a new edition number when more than ten percent of + the content has changed. 
However, publishers expect to continuously update online + editions of books without changing the edition number. The ability to update the + electronic version independent of the print version could be problematic for + researchers. For example, if a research article cites the print version of a + chapter, and a researcher subsequently links to the online version of the same + chapter, the content may be different from the print version without the typical + indication of a new edition. This topic requires further discussion outside of + the scope of this specification. + + + + + + + + + + + + The issue number in which an article is published. Only one issue + name should be used for the issue. The issue number takes precedence over any other + name. For example, if an issue has only a seasonal name, then the season should be + listed in issue. However, if an issue has a number and a season, then only the + number should be listed in issue, and the season should be placed in month (see the + table in month, below, for proper encoding of the season) if the specific month of + publication is not known. Do not include the words "issue", "No" or "number" in this + element. When submitting DOIs for journal articles published online ahead of print, + you should submit the issue number, when known, even if the pagination information + for the entity is not yet known. Data may be alpha, numeric or a combination. + Examples: 74(3):1999 + + 1999 + + + 74 + + 3 + Volume 74, Spring 1999 + + 1999 + + + 74 + + Spring + Volume 74, issue 3 Spring 1999 + + 21 + 1999 + + + 74 + + 3 + + + + + + + + + + + + + The container for elements related directly to a DOI. doi_data + contains the doi, timestamp (version) and corresponding resource (URI) data for the + doi. Cases of single-resolution (i.e. one DOI with a single corresponding URI) + should be tagged with a doi/resource pair in doi_data. If additional resources are + to be proved the <collection> element may also be used. 
The single URL + provided in the <resource> is mandatory and serves as the single resolution + target for the DOI. Note: A timestamp value placed inside doi_data will override any + timestamp value placed in the <head> element. + + + + + + + + + + + + + The element that contains a URI associated with a DOI. URLs are + referred to as resources in the 2.0 CrossRef schema because they can be any valid + URI. Cases of single-resolution (i.e. one DOI with a single corresponding URI) + should be tagged with a doi/resource pair in doi_data. Only one resource is allowed + per doi_data, the exception being resource elements within a collection element. + Values for the "content_version" attribute are vor (version of record) and am + (advance manuscript). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A collection is a container for one or more items each holding a doi + or a resource (URI) which is related to the DOI in the ancestor <doi_data> + element. A collection must be qualified by a property attibute or the + multi-resolution attribute. property attributes: list-based: uses an interim page + and presents the list of items to the user (via Multiple Resolution) country-based: + proxy picks destination based on the country code of the user's location (this + option is not currently active, contact support@crossref.org for more info) + crawler-based: identifies resource to be crawled by the specified crawlers. + text-mining: identifies resource to be used for text and data mining unspecified: + identifies resource with unspecified usage syndication: identifies resource to be + used for syndication The multi-resolution attribute may be used to lock or unlock + DOIs for multiple resolution. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A container used to associate a collection, doi, or resource (URI) + with zero or more property elements. 
item is currently used for supplying as-crawled + URLs (http://help.crossref.org/#as-crawled-urls) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + property elements qualify the semantic meaning of a item or + collection. property elements consist of a type/value pair where the property type + is found in the type attribute and the value is found in the element content. The + property element is not currently in use. + + + + + + + + + + + + The container for all who contributed to authoring or editing an + entity. + + + + + + + + + + + + + + + + + + + + The name of an organization (as opposed to a person) that contributed + to authoring an entity. If multiple organizations authored an entity, each one + should be captured in a unique organization element. If an entity was authored by + individuals in addition to one or more organizations, person_name and organization + may be freely intermixed within contributors. contributor_role should be set, as + appropriate, to author or editor. When a contributor translated a work, set + contributor_role to "translator". "chair" should only be used for conference + proceedings to indicate a conference chair. + + + + + + + + + + + + The name of a person (as opposed to an organization) that contributed + to authoring an entity. Authors with name suffixes should have the suffix captured + in suffix, not in surname. 
Author prefixes such as "Dr.", "Prof.", or "President" + should not be included in person_name or any other element. Author degrees (e.g. + M.D., Ph.D.) also should not be included in CrossRef submissions. contributor_role + should be set, as appropriate, to author or editor. When a contributor translated a + work, set contributor_role to "translator". "chair" should only be used for + conference proceedings to indicate a conference chair. + + + + + + + + + + + + + + + + A contributor's given name. The given_name, combined with surname, + forms the name of an author or editor. given_name may be submitted as either + initials or a full name. Do not place given_name within the surname unless it is + unclear how to distinguish the given name from the surname, as may be the case in + non-Western names. Do not include titles such as "Dr.", "Prof.", or "President" in + given_name. These titles should not be submitted to CrossRef. + + + + + + + + + + + + + The surname of an author or editor. The surname, combined with + given_name, forms the name of an author or editor. Whenever possible, the given name + should not be included in the surname element. In cases where the given name is not + clear, as may happen with non-Western names or some societies in which surnames are + not distinguished, you may place the entire name in surname, e.g.: Leonardo + da Vinci If an author is an organization, you should use organization, + not surname. Suffixes should be tagged with suffix. Author degrees (e.g. M.D., + Ph.D.) should not be included in CrossRef submissions. + + + + + + + + + + + + + The suffix of an author name, e.g. junior or senior. A name suffix, + that typically denotes a generation distinction, is tagged with suffix. Author + degrees (e.g. M.D., Ph.D.) should not be included in CrossRef + submissions. + + + + + + + + + + + + + + + + + The ORCID for an author. The schema performs basic pattern validation, checksum validation is performed upon deposit via a system check. 
+ + + + + + + + + + + + + The institution(s) with which a contributor is affiliated. This + element may hold the name and location of an affiliation with which a contributor is + affiliated. Please note the following points when using this element: 1. A + contributor may have up to five affiliations. Each affiliation should be in a unique + <affiliation> element. The following is correct: <affiliation>University + of New Mexico</affiliation> <affiliation>Sandia National + Laboratories</affiliation> The following is NOT correct + <affiliation>University of New Mexico; Sandia National + Laboratories</affiliation> 2. The name of the institution is required in this + element. The location is optional. Both of the following are correct: + <affiliation>Harvard University</affiliation> <affiliation>Harvard + University, Cambridge, MA</affiliation> 3. Additional address information such + as a URL or email address should NOT be deposited in this element 4. Visual linking + indicators used in publication to connect authors with their affiliations such as + footnote symbols or initials should NOT be included in the <affiliation> + element 5. If you have only a single string that has the affiliation for multiple + contributors to a work and that string is not broken out into the individual + affliations for each author, please do NOT deposit the affilation information. This + element is to be used only for affiliation information that is directly connected to + the author with whom this information is included within the person_name element. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A container for the title and original language title + elements. + + + + + + + + + + + + + + + + + The title of the entity being registered. When a title contains a + subtitle, it is preferable to capture the subtitle portion in the subtitle element. 
+ Only minimal face markup is supported, see See http://help.crossref.org/#face_markup + + + + + + + + + + The title of an entity in its original language if the registration + is for a translation of a work. When providing the original language of a title, you + should set the language attribute. + + + + + + + + + + + + The sub-title portion of an entity title. When possible, it is better + to tag a title and subtitle with separate elements. If this information is not + available, it is acceptable to submit the title and subtitle all within the title + element with punctuation (preferably a colon) used to separate the subtitle from the + title. When a subtitle is tagged, the space and punctuation between the title and + subtitle text should not be included. The following examples illustrate correct and + incorrect tagging practices: Correct and optimal tagging: The Human + Brain + A Handbook Correct but not optimal tagging: The Human + Brain: A Handbook Incorrect: The Human Brain: + A Handbook + The Human Brain + : A Handbook + + + + + + + + + + + Month of publication. The month must be expressed in numeric format + rather spelling out the name (e.g.. submit "10", not "October"). The month must be + expressed with a leading zero if it is less than 10 (e.g. submit "05", not "5"). + When a journal issue has both an issue number and a season, the issue number should + be placed in issue. If the month of publication is not known, the season should be + placed in month as a two-digit value as follows: Season Value Spring 21 Summer 22 + Autumn 23 Winter 24 First Quarter 31 Second Quarter 32 Third Quarter 33 Fourth + Quarter 34 In cases when an issue covers multiple months, e.g. "March-April", + include only the digits for the first month of the range. + + + + + + + + Day of publication. The should must be expressed with a leading zero + if it is less than 10 (e.g. submit "05", not "5"). + + + + + Year of publication. 
+ + + + + database_date records key dates in the life of a database or dataset item. + + Within database_date, creation_date is the date the item was first created, publication_date is the date the item was first published, and update_date is the date the item was last updated. + + + + + + + + + + + + The date a database or dataset item was created. + + + + + + + + + + + The date a content item was created. + + + + + + + + + + + The date a pre-print was posted to a repository. + + + + + + + + + + + The date a manuscript was accepted for publication. + + + + + + + + + + + The date a database or dataset item was updated. + + + + + + + + + + + The date of publication. In all cases, multiple dates are allowed to + allow for different dates of publication for online and print versions. This element + was previously called date, but was renamed publication_date to distinguish more + clearly from conference_date. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The container for information about page ranges. When an entity has + non-contiguous page information, you should capture the first page range in + first_page and last_page. Any additional page information should be captured in + other_pages. Punctuation is only allowed in other_pages. It should not appear in + first_page and last_page. Page number letter prefixes or suffixes should be + included. Roman numeral pages are permitted in both upper case and lower case. Data + may be alpha, numeric or a combination. + + + + + + + + + + + + First page number where an entity is located. Data may be alpha, + numeric or a combination. + + + + + + + + + + + The last page number of an entity. last_page should not be used when + the last page number is the same as the first page number (i.e. when the entire + entity fits on one page). Do not include punctuation for a page range in last_page. 
+ If the entity has non-contiguous paging, use last_page for the last page of the + first range and place all other page information into other_pages. Data may be + alpha, numeric or a combination. + + + + + + + + + + + Used to capture additional page information when items do not + encompass contiguous page ranges. When an entity has non-contiguous page + information, you should capture the first page range in first_page and last_page. + Any additional page information should be captured in other_pages. You should + include commas or hyphens to express discrete pages or page ranges. endash entities + should be converted to ASCII hyphens. Spaces should not be included. Note that + punctuation should never appear in first_page and last_page. Data may be alpha, + numeric or a combination. + + + + + + + + + + + + + + + + + DOI for an entity being registered with CrossRef. In 2008 CrossRef restricted DOI suffix + characters to the following: "a-z", "A-Z", "0-9" and "-._;()/" + + Existing DOIs with suffix characters outside of the allowed set are still supported. For additional + information on DOI syntax, see http://help.crossref.org/#ID5755 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The ISBN assigned to an entity. If a multi-volume work has one ISBN + per volume and a unique ISBN for the series, all may be registered. The ISBN for the + series must be in series_metadata, and the ISBN for each volume in + proceedings_metadata, or book_metadata, respectively. The text "ISBN" should not be + included in the ISBN element in CrossRef submissions. Although not required, the + ISBN number should retain spaces or hyphens that appear in the formatted number + because they aid in human-readability. For more information, please see + http://www.isbn.org/standards/home/isbn/international/hyphenation- instructions.asp + or http://www.isbn.org. 
+ + + + + + + + + + + + Identifies books or conference proceedings that have no ISBN + assigned. In very limited cases a book may never have an ISBN, this is particularly + true for older texts. Conference proceedings, however, may regularly have a volume + number but no ISBN or volume title. + + + + + + + + + + + + + + + + + + + + + + + + The ISSN assigned to an entity. The ISSN must consist of eight digits + (where the last digit may be an X), or it must consist of eight digits in two groups + of four with a hyphen between the two groups. Spaces or other delimiters should not + be included in an ISSN. For more information, please see + http://www.issn.org:8080/English/pub/getting- checking/checking or + http://www.issn.org. The text "ISSN" should not be included in the issn element in + CrossRef submissions. CrossRef validates all ISSNs supplied in deposits, only valid + ISSNs will be accepted. + + + + + + + + + + + + The coden assigned to a journal or conference + proceedings. + + + + + + + + + + + The volume number of a published journal, or the number of a printed + volume for a book or conference proceedings. A journal volume is contained in the + journal_volume element to allow for the assignment of a DOI to an entire journal + volume. Do not include the words "Volume" or "vol." in this element. Data may be + alpha, numeric or a combination. Roman numerals are acceptable. + + + + + + + + + + + + + + Container element for archive. + + + + + + + + + + Used to indicate the designated archiving organization(s) for an + item. Values for the name attribute are CLOCKSS, LOCKSS Portico, KB, DWT (Deep Web + Technologies), Internet Archive + + + + + + + + + + + + + + + + + + + + The date on which a dissertation was accepted by the institution + awarding the degree, a report was approved, or a standard was accepted. + approval_date includes the same elements as publication_date, but it has no + attributes. 
It is a distinct element from publication_date to reflect an + important but different semantic meaning from publication_date + + + + + + + + + + + + A list of articles, books, and other items cited by the parent item + for which the DOI in the doi_data is being deposited. Some articles may have + multiple lists of citations (e.g. main reference list, appendix reference list, + etc.). All citations for one article should be included in a single citation_list + regardless of whether one or more citation lists were in the original item. When + combining multiple reference lists from an item into one citation_list element, be + sure to give each citation a unique key attribute value. For example, if an appendix + in an item has a separate citation list that restarts numbering at 1, these + citations should be given key attributes such as "ab1" rather than "b1". Some + articles may contain "Further Reading" or "Bibliography" lists. The distinguishing + factor in these lists is that the references have not been cited from the + article—they only provide a list of additional related reading material. It will be + left to the discretion of the publisher if these items are to be considered + citations and should be deposited. NOTE: If a citation_list element is given and is + empty then all citations for the given DOI will be deleted, otherwise any existing + citations for the given DOI are left intact in the database. It is quite common that + a publisher wants to fix the DOI's metadata without resubmitting the citations. + Leaving out the citation_list element will do that. Also note that any given + citations will override older citations for the given DOI so citation_lists are not + cumulative over multiple records or submissions. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + citation is used to deposit each citation in the reference list of + the item for which the DOI is being deposited.
The citations in the list will be run + by the CrossRef system as queries looking for the DOI of the articles being cited. + NOTE: Because the citation list is used to support forward linking, the more + information supplied in the citation the better the chance of finding a match. For + each citation that is deposited, one of four models should be used: 1. Parsed + journal data 2. Parsed book or conference data 3. DOI 4. Unstructured citation (not + yet supported for resolution) When parsed journal, book or conference data is + deposited, CrossRef will perform a lookup to find the DOI. Each citation must be + given a unique ID in the key attribute. It is recommended that this number be the + citation number if the reference list is numbered in the published article, or the + underlying XML ID if the reference list is name/date style in article. When + submitting a journal citation, it should include an issn, journal_title or both. + journal_title only is preferred over issn only. In addition the first author and + first_page number should be submitted. The first_page number is preferred, but for + those citations that are "in press", the author should be submitted. All elements + are optional, however for best linking results, as much information as is known + should be submitted. When submitting a book or conference citation, it should + include an isbn, series_title, volume_title, or any combination of these three + elements as may be available. All elements are optional, however for best linking + results, as much information as is known should be submitted. When a DOI is already + known for a citation, you may submit just the doi without additional information. + When parsed information is not available for a citation, or the citation is of a + type other than journal, book, or conference proceeding that is supported by + CrossRef (e.g. 
standard, patent, thesis, newspaper, personal communication, etc.), + it may be submitted using the unstructured_citation element. CrossRef is able to + process some unstructured citations. When submitting unstructured citations, it is + helpful, but not required to include all available face markup (e.g. bold, italic, + etc) as this will make possible future parsing of the unstructured citation more + accurate. In such cases, it is preferred, but not required, if the citation number + (when Vancouver style is used) be removed from the unparsed citation. This number + can be submitted using the key attribute Only the first author of a citation should + be submitted, not the entire author list. Only the surname is required. Initials may + be included, but are not recommended because the best linking results can be + provided if initials are omitted. Author titles, roles and generation information + should not be included. If the first author is an organization, the organization + name should be submitted in the author element. cYear has a loose text model that + can accommodate non-standard years such as year ranges such as "1998-1999". Note + that years such as "1998a" or "1999b" should be deposited without the letter, e.g. + "1998" or "1999", whenever possible. Citations that are "in press" should be + submitted with as much information as is available. + + + + + + + + + + + + + + citation_key allows the publisher to assign a unique ID to each + citation that is deposited. It is recommended that this attribute be given the + reference number if the publication uses reference numbers. For those publications + that use name/date style citations, it is recommended that this attribute be used to + indicate the sequential number of the citation in the reference list. However, some + schema must be utilized as this is a required attribute. The system will use this + key value to track the specific reference query and will return this value along + with the DOI. 
+ + + + + + + A citation that is to an item other than a journal article, book, or + conference paper and cannot be structured with the CrossRef citation model. Also, it + is used for a citation to a journal article, book, or conference paper for which the + depositing publisher does not have structured information. unstructured_citation + allows a publisher to deposit references for which no structural information is + available. These may be journal, book, or conference references for which the + supplier did not provide markup, or other types of references (e.g. standards, + patents, etc) which are not supported by CrossRef. This structure permits publishers + to deposit complete reference lists, without regard to the availability of markup, + or the need to parse references beyond those types that CrossRef supports. + CrossRef's ability to process unstructured citations is limited, for details see + http://help.crossref.org/#ID38855 + + + + + + + + + + Journal title in a citation. Only used in the citation element. + Journal title in citation deposits is used for both abbreviated and spelled out + journal names. No attribute is required to distinguish between name types. Both + Proc. Natl. Acad. Sci. U.S.A. and + Proceedings of the National Academy of Sciences of the United + States of America are valid journal titles to use in this + element. + + + + + Book series title in a citation. Only used in the citation element. + series_title is an element for the deposit of book or conference series titles in + citations without the hierarchy required by the series_metadata element. Note that + face markup is not permitted when this element is deposited as part of a + citation. + + + + + Book volume title in a citation. Only used in the citation element. + volume_title is an element for the deposit of book or conference volume titles in + citations without the hierarchy required by the titles element. 
Note that face + markup is not permitted when this element is deposited as part of a + citation. + + + + + First author in a citation. Only used in the citation element. The + author element tags one author name in a citation without the hierarchy required by + the contributors or person_name elements Only the first author should be deposited + for each item. The author surname is required. Author initials may be added but are + not recommended because queries work best when only the last name is provided. For + example, the author "John Doe" can be deposited as Doe or + Doe J, but the former style is recommended. If the author of a + work is an organization rather than a person, the organization may be deposited as + in: World Health Organization + + + + + Year of publication in citation. Unlike the year element, cYear has a + loose text model that can accommodate non-standard years such as year ranges such as + "1998-1999". Note that years such as "1998a" or "1999b" should be deposited without + the letter, e.g. "1998" or "1999". The letter is used for internal source document + linking in name/date (Harvard) style documents rather than external cross reference + linking to the original item. + + + + + Article title in a citation. Use care to remove face markup (such as + italic applied to genus or species names) from article titles as this markup is not + supported by CrossRef. + + + + + + + The element for depositing a stand alone component. The parent DOI + must already exist (created in an earlier deposit or via some other registration + process). + + + + + + + + + + + + + + + + + + The wrapper element for including a group of components under a + journal article, conference proceeding, book chapter, stand alone component, + dissertation, technical report or working paper, standard, or + database. 
+ + + + + + + + + + A container element that allows registration of supplemental + information for a journal article, book chapter, or conference paper such as + figures, tables, videos, or data sets. Currently, the deposit of components + primarily achieves only the first objective as the CrossRef system is not setup yet + to support queries for components. The metadata associated with a component is + intended to enable simple lookup searches of components in the future. When + deposited as part of the metadata for a higher level work the parent DOI is + implicitly known via the XML hierarchy. When deposited separately the DOI of the + higher level work must be provided explicitly (see sa_component) All descriptive + elements are optional allowing for the creation of simple anonymous DOIs. The + 'parent_relation' attribute is mandatory and refers to the DOI described in the + component's direct parent. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Normally book content that is published as a series is required to + have a series title with an ISSN and a book title and/or a book volume number along + with a book ISBN. An exception is when book chapters are published on line first + prior to being assigned to a specific book in which case only the series title (and + ISSN) is known at time of DOI registration. Element unassigned_content is used as a + placeholder to force recognition of this condition and thus prevent accidental + omission of book level title information. When unassigned_content is present the + system will allow omission of the ISBN. If unassigned_content is not present the + system will require an ISBN for the book title. + + + + + + + + + + + + + + A narrative description of a file (e.g. a figure caption or + video description) which may be independent of the host document context. The + description element may be present more than once to provide alternative language + values. 
+ + + + + + + + + + + + Publisher's custom statement for their intent to publish content for which a pre-register DOI has been created + + + + + + + + + + + + + + + + + + A narrative description of a component's file format and/or the file + extension (for mime types refer to http://www.iana.org/assignments/media-types/) The + format element may contain only the mime_type attribute, or in addition it may + contain a narrative description of the file format. Be sure to use the narrative + portion to describe only the format of the component and not the actual content + of the component (use description to describe the component's + content). + + + + + + + + + + + + + + Container element for CrossMark data. + + + + + + + + + Some publishers encourage broad third + party hosting of the publisher's content. Other publishers do not. And + still others vary their policy depending on whether a particular article + has been published under an OA policy or not. This boolean flag allows + the publisher to indicate whether the CrossMarked content will only + legitimately be updated on the CrossMark domain (true) or whether the + publisher encourages updating the content on other sites as well + (false). + + + + + + + + + + + + A DOI which points to a publisher's CrossMark policy document. + Publishers might have different policies for different + publications. + + + + + + Container element for crossmark_domain. A list of domains where the + publisher maintains updates and corrections to their content. Minimally, one of + these should include the Internet domain name of the publisher's web site(s), but + the publisher might also decide to include 3rd party aggregators (e.g. Ebsco, + IngentaConnect) or archives with which the publisher has agreements to update the + content + + + + + + + + + + This should be a simple Internet domain name or subdomain name (e.g. + www.psychoceramics.org or psychoceramics.org).
It is used to identify when a + referring URL is coming from a CrossMark domain. A "crossmark_domain" is made up of + two subelements; a "domain" and a "filter". The domain is required but the filter is + optional and is only needed for use in situations where content from multiple + publishers/publications is on the same host with the same domain name (e.g. an + aggregator) and one needs to use the referrer's URI "path" to further determine + whether the content in a crossmark domain. + + + + + + + + + + + Required element. This should be a simple Internet domain name or + subdomain name (e.g. www.psychoceramics.org or psychoceramics.org). It is used to + identify when a referring URL is coming from a CrossMark domain. + + + + + Optional element. The filter element is used to disambiguate content + in situations where multiple publishers share the same host (e.g. when on an + aggregated platform). It should contain a substring of the path that can be used to + uniquely identify a publisher's or publication's content. For instance, using the + string "alpsp" here would help the CrossMark system distinguish between ALPSP + publications on the ingentaconnect host and other publications on the same + host. + + + + + + Optional element. A document might provide updates (e.g. corrections, + clarifications, retractions) to several other documents. When this is the case, the + DOIs of the documents that are being *updated* should be listed + here. + + + + + + + + + + The DOI of the content being updated (e.g. corrected, retracted, + etc.) In the CrossMark Terms and Conditions "updates" are defined as changes that + are likely to "change the reader’s interpretation or crediting of the work." That + is, *editorially significant* changes. "Updates" should not include minor changes to + spelling, punctuation, formatting, etc. Attributes: label: Required attribute. This + should be a human-readable version of the "type" attribute. 
This is what gets + displayed in the CrossMark dialog when there is an update. type: Required attribute. + This attribute should be used to give the machine-readable name of the update type. + The human-readable version of the type should be put in the "label" attribute. There + are many "types" of updates. "Corrections", "clarifications", "retractions" and + "withdrawals" are just a few of the better-known types. For these common types we + recommend you use the values "correction", "clarification", "retraction" and + "withdrawal" respectively as per your editorial policy. However, different + publishers sometimes have to support different, custom update types- for instance, + "protocol amendments", "letters of concern", "comments", etc. The attribute supports + custom types as well. date: The date of the update will be displayed in the + CrossMark dialog and can help the researcher easily tell whether they are likely to + have seen the update. + + + + + + + Required attribute. This attribute should be used to + list the update type. Allowed update types are: +
    +
  • addendum
  • +
  • clarification
  • +
  • correction
  • +
  • corrigendum
  • +
  • erratum
  • +
  • expression_of_concern
  • +
  • new_edition
  • +
  • new_version
  • +
  • partial_retraction
  • +
  • removal
  • +
  • retraction
  • +
  • withdrawal
  • +
+ +
+
+
+ + + Required attribute. The date of the update will be + displayed in the CrossMark dialog and can help the researcher easily + tell whether they are likely to have seen the + update. + +
+
+
+
+ + + Optional element. Publishers are encouraged to provide any + non-bibliographical metadata that they feel might help the researcher evaluate and + make better use of the content that the Crossmark record refers to. For example, + publishers might want to provide funding information, clinical trial numbers, + information about the peer-review process or a summary of the publication history of + the document. + + + + + + + + + + + + + + + + + + + + + + + + + An assertion is a piece of custom, non-bibliographic metadata that + the publisher is asserting about the content to which the CrossMark refers. + assertion attributes: explanation: If the publisher wants to provide a further + explanation of what the particular "assertion" means, they can link to such an + explanation by providing an appropriate url on the "explanation" attribute. + group_label: This is the human-readable form of the "group_name" attribute. This is + what will be displayed in the group headings on the CrossMark metadata record + dialog. group_name: Some assertions could be logically "grouped" together in the + CrossMark dialog. For instance, if the publisher is recording several pieces of + metadata related to funding sources (source name, percentage, grant number), they + may want to make sure that these three assertions are grouped next to each other in + the CrossMark dialog. The group_name attribute is the machine-readable value that + will be used for grouping such assertions. label: This is the human-readable version + of the name attribute which will be displayed in the CrossMark dialog. If this + attribute is missing, then the value of the assertion will *not* be displayed in the + dialog. Publishers may want to "hide" assertions this way in cases where the + assertion value is too large or too complex to display in the dialog, but where the + assertion is nonetheless valuable enough to include in API queries and metadata + dumps (e.g. detailed licensing terms).
name: This is the machine-readable name of + the assertion. Use the "label" attribute to provide a human-readable version of the + name. order: The publisher may want to control the order in which assertions are + displayed to the user in the CrossMark dialog. All assertions will be sorted by this + element if it is present. + + + + + + + + Optional attribute. If the publisher wants to provide a + further explanation of what the particular "assertion" means, they can link + to such an explanation by providing an appropriate url on the "explanation" + attribute. + + + + + Optional attribute. This is the human-readable form of the + "group_name" attribute. This is what will be displayed in the group headings + on the CrossMark metadata record dialog. + + + + + Optional attribute. Some assertions could be logically + "grouped" together in the CrossMark dialog. For instance, if the publisher + is recording several pieces of metadata related to funding sources (source + name, percentage, grant number), they may want to make sure that these three + assertions are grouped next to each-other in the CrossMark dialog. The + group_name attribute is the machine-readable value that will be used for + grouping such assertions. + + + + + Optional attribute. This is the human-readable version of the + name attribute which will be displayed in the CrossMark dialog. If this + attribute is missing, then the value of the assertion will *not* be + displayed in the dialog. Publishers may want to "hide" assertions this way + in cases where the assertion value is too large or too complex to display in + the dialog, but where the assertion is nonetheless valuable enough to + include in API queries and metadata dumps (e.g. detailed licensing + terms) + + + + + Required attribute. This is the machine-readable name of the + assertion. Use the "label" attribute to provide a human-readable version of + the name. + + + + + Optional attribute. 
The publisher may want to control the + order in which assertions are displayed to the user in the CrossMark dialog. + All assertions will be sorted by this element if it is + present. + + + + + Optional attribute + + + + + + + + + A wrapper for designators or other primary identifiers for a + standard. + + + + + + + + + + + + + + + + + + + + + + + + + + + Designator or other primary identifier for the standard being + deposited. Required. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Provides for defining a DOI for a broad grouping of standards. + + + + + + + Provides for defining a DOI for a set of standards (sometimes know as truncated form). + + + + + + + + + + + + + + Provides for defining a DOI for a group of closely related standard documents (undated form is a stem for any dated form) + + + + + + + + + + + + + + Designator for standard being replaced by the standard being deposited. + + + + + + Designator for standard from which the current deposit is adopted. + + + + + + Designator for the previous revision of the standard being deposited. (note: use alt_as_published for revisions within designators having common stem) + + + + + + A wrapper for standards body information. + + + + + + + + + + + Name of the standards organization / publisher. + Required. + + + + + Acronym for standards body. Will be used for query matching - + required. + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/export/xsd/crossref/crossref4.4.0.xsd b/export/xsd/crossref/crossref4.4.0.xsd new file mode 100644 index 0000000..b2de142 --- /dev/null +++ b/export/xsd/crossref/crossref4.4.0.xsd @@ -0,0 +1,2086 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Top level element for a metadata submission to CrossRef. This element + indicates the start and end of the XML file. The version number is fixed to the + version of the schema. Be sure to set the name space attributes as shown above in + order for the Xerces parser to process the instance correctly. For the purposes of + parsing, you may also set xsi:schemaLocation to http://www.crossref.org/schema/4.x.x + http://www.crossref.org /schema/4.x.x/crossref.xsd. A copy of the schema is located + on the CrossRef server at this URL and will remain constant for a given version of + the schema. This location permits you to have a constant location for the schema for + parsing without relying on a hardwired local directory on your development platform. + + + + + + + + + + + + + The container for information related to the DOI batch submission. + This element uniquely identifies the batch deposit to CrossRef and contains + information that will be used as a reference in error messages sent by the MDDB. + + + + + + + + + + + + + + The container for the main body of a DOI record submission. The body + contains a set of journal, book, conference proceedings or stand alone component + records. It is not possible to mix genres within a single DOI submission. It is + possible to include records for multiple journals, books, conferences, or stand + alone components in a single submission. + + + + + + + + + + + + + + + + + + + The container for all information about a single journal and the + articles being registered within the journal. journal is the core container for + information about a single journal and articles submitted for registration from that + journal. 
Within a journal instance, you may register articles from a single issue, + detailed in journal_issue. If you want to register items from more than one issue, + you must use multiple journal instances, which can be done within a single batch + submission. If you have articles that have not been assigned to an issue, you may + register them within a single journal instance. In this case, do not include a + journal_issue. You may chose to submit only top level journal_metadata and + journal_issue metadata for any journal or issue, allowing you to register DOIs for + an entire journal, or any issue or volume within a journal. + + + + + + + + + + + + The container for metadata that defines a + journal. + + + + + + + + + + + + + + + + + + The full title by which a journal is commonly known or cited. + full_title and abbrev_title must both be submitted even if they are identical. Note: + In version 4.1.0 and later, this element is allowed up to 10 times to allow for a) + journal name changes over time, b) translated journal names (e.g. the Japanese name + and the English equivalent), and c) common author mis-spellings of a given journal + name. + + + + + + + + + + + + This element contains the common abbreviation or abbreviations used + when citing this journal. It is preferred, but not required, that periods be + included after abbreviated words within the title. full_title and abbrev_title must + both be submitted, and they can be identical. If you do not know the abbreviated + title for a specific journal, please supply the full title in the abbrev_title + element. Note: In version 4.1.0 and later, this element is no longer required in + journal_metadata because some journals do not have abbreviated journal names. + + + + + + + + + + + + The container for metadata that defines a single issue of a journal. + Special issue numbering information for a journal should be placed in + special_numbering. 
You may register a DOI for an entire issue by including doi_data + in journal_issue. The URI should resolve to the table of contents for the issue. + contributors is included in journal_issue to allow inclusion of editors of special + issues. This element allows linking from a reference such as: R.Glaser, L.Bond + (Eds.), Testing: concepts and research, American Psychologist 36 (10-12) (1981) + (special issue). You should not include contributors for the regular editors of + regular issues. + + + + + + + + + + + Issue level numbering for supplements or special issues. + Text defining the type of special issue (e.g. "suppl") should be + included in this element along with the number. + + + + + + + + + + The container for the journal volume and DOI assigned to an entire + journal volume. You may register a DOI for an entire volume by including doi_data in + journal_volume. + + + + + + + + + + + + + Issue level numbering for supplements or special issues. Text + defining the type of special issue (e.g. "suppl") should be included in this element + along with the number. + + + + + + + + + + + The container for all information about a single journal article. A + journal article is required to have title and doi_data. All other information is + optional. When registering items that do not have titles, use the appropriate + heading from the journal section or table of contents (e.g. "Errata") in title. + journal_article allows for multiple titles per entity. In some cases it may be + helpful to submit multiple titles. For example, if an erratum carries title of the + original article and the heading "Errata", both should be submitted by using two + titles elements. + + + + + + + + The abstract element allows depositors to include + abstracts extracted from NLM or JATS XML in CrossRef deposits. The jats: + namespace prefix must be included. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + The container for all information about a single conference and its + proceedings. conference is the core container for information about a single + conference and its proceedings. If a conference proceedings spans multiple volumes, + each volume must be contained in a unique conference element. You may choose to + submit only top level contributors, event_metadata and proceedings_metadata for any + conference, or you may choose to submit these elements along with metadata for each + conference_paper. It is not necessary to submit metadata for all items listed on the + proceedings table of contents. You may choose to drop items of lesser significance + such as front and back matter. NOTE: The CrossRef system currently uses the + proceedings_title and conference_acronym in the query matching process. This system + can cause problems when the proceedings have a simple non-changing title (e.g. + Proceedings of SPIE) and the conference event name, conference_name, is used to + differentiate conference topics (e.g. Optoelectronic Integrated Circuits II). To + avoid this problem, CrossRef recommends that you make sure the conference_acronym + accurately reflects the event name (e.g. OpIC II in this + example). + + + + + + + + + + + + + + + + A container for all information that applies to a conference event. + event_metadata captures information about a conference event. Data about conference + proceedings is captured in proceedings_metadata. NOTE: The CrossRef system currently + uses the proceedings_title and conference_acronym in the query matching process. + This system can cause problems when the proceedings have a simple non-changing title + (e.g. Proceedings of SPIE) and the conference event name, conference_name, is used to + differentiate conference topics (e.g. Optoelectronic Integrated Circuits II).
To + avoid this problem, CrossRef recommends that you make sure the conference_acronym + accurately reflects the event name (e.g OpIC II in this + example). + + + + + + + + + + + + + + + + The official name of the conference. conference_name does not include + "Proceedings of". For example, "The 23rd Annual Meeting of the American Society for + Information Science" is a correct conference name. It is quite common for a + conference name to include the conference number or subject. When any of these + metadata items appear in the conference name, they should be included in this + element, and also in the respective sub-element, conference_number or + proceedings_subject. The following example shows incorrect tagging of a conference + name and then the corrected version: INCORRECT: + the second international conference + IEA/AIE '89 + 1989 + Tullahoma, TN + + + + Proceedings of the second international conference on + Industrial and engineering applications of artificial intelligence and + expert systems - IEA/AIE'89 + Industrial and engineering applications of artificial + intelligence and expert systems + CORRECT: + The second international conference on Industrial and + engineering applications of artificial intelligence and expert + systems + IEA/AIE '89 + 2 + Tullahoma, TN + + + + Proceedings of the second international conference on + Industrial and engineering applications of artificial intelligence and + expert systems - IEA/AIE '89 + Industrial and engineering applications of artificial + intelligence and expert systems + Authors commonly cite a conference by the official name, so + it is important to provide this information as accurately as + possible. + + + + + + + + + + + The theme is the slogan or special emphasis of a conference in a + particular year. The theme is the slogan of the conference. It differs from the + subject of a conference in that the subject is stable over the years while the theme + may vary from year to year. 
For example, the American Society for Information + Science and Technology conference theme was "Knowledge: Creation, Organization and + Use" in 1999 and "Defining Information Architecture" in 2000. + + + + + + + + + + + The popularly known as or jargon name (e.g. SIGGRAPH for "Special + Interest Group on Computer Graphics"). Authors commonly cite the conference acronym + rather than the full conference or proceedings name, so it is best to include this + element when it is available. The conference acronym often includes the year of the + conference (e.g. SGML '97) or, less often, the conference number. It is preferred, + but not required, that submission of metadata exclude number or year information + from the conference acronym. It is better to include such information in + conference_number, or conference_date, respectively. + + + + + + + + + + + The sponsoring organization(s) of a conference. Multiple sponsors may + be given if a conference is hosted by more than one + organization. + + + + + + + + + + + The number of a conference. conference_number should include only the + number of the conference without any extra text. For example, "The 24th Annual + Conference on..." should be tagged as shown in the example above, and "th" should + not be included. Roman numerals are acceptable. When a conference is named such that + the year of the conference indicates the number (e.g. "SGML 1994"), the year appears + in conference_name, conference_date, and conference_number, as in: + SGML 1994 + SGML + 1994 + November 7-10, 1994 + + + + + + + + + + + + The location of the conference. The city and country of the + conference. If the conference is in the United States, the appropriate state should + also be provided, and the country may be omitted. If the conference is in Canada, + the province should be provided, and the country may be omitted. The specific venue + or address within a city (e.g. conference center, hotel, etc.) should not be + provided. 
+ + + + + + + + + + + + + + + + + The start and end dates of a conference event. conference_date may be + used in three ways: 1. If publishers do not have parsed date values, provide + just text with the conference dates. The date text should be taken from the + proceedings title page. 2. If publishers have parsed date values, provide them in + the attributes. 3. If both parsed date values and the date text are available, both + should be provided. This is the preferred tagging for conference_date. For example: + Jan. 15-17, 1997 + + + + + + + + + + + + + + + + + + A container for all information that applies to a non-series + conference proceeding. proceedings_metadata captures information about conference + proceedings. Data about conference events is captured in + event_metadata + + + + + + + + + + + + + + + + + + + + + + + A container for all information that applies to a specific conference + proceeding that is part of a series. A conference proceedings published as a series + can sometimes look just like a journal in that there is no volume information (no + volume title, no ISBN). In these cases the conference proceeding may be deposited as + a journal (which more accurately should have been called a 'series_publication'). To + allow for the use of a consistent XML hierarchy we will allow a + proceedings_series_metadata root element to also describe such a publication. Note: + this structure is organized to allow backward compatibility with previous schema + versions by maintaining the prior sequence of elements. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The title of the conference proceedings as printed on title page of + the published conference proceedings. proceedings_title is the undifferentiated + title of a conference proceedings. It should generally be the title as it appears on + the cover of the printed proceedings.
In some cases, proceedings_title may differ + from conference_name only in that the text "Proceedings of" often appears at the + start of the proceedings_title, and this text should never be included in + conference_name. In other cases, the proceedings_title and conference_name may be + quite different. + + + + + + + + + + + The subject of the printed conference proceedings, e.g. "Computer + Graphics" is the subject matter of SIGGRAPH. This element is useful because an + author may cite a conference paper by the conference subject. For example, + "Proceedings of the 1999 ACM Conference on Computer Graphics" + + + + + + + + + + + The container for all information about a single conference paper. A + conference paper is required to have contributors, title and doi_data. All other + information is optional. + + + + + + + + The abstract element allows depositors to include + abstracts extracted from NLM or JATS XML in CrossRef deposits. The jats: + namespace prefix must be included. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The container for all information about a single book. book is the + core container for information about a specific book. Books may be in the form of + edited books (i.e. a contributed volume with one or more editors), monographs + (single-authored works), or reference works (e.g. encyclopedias). If a book contains + multiple volumes, each volume must be contained in a unique book element. You may + choose to submit only top level contributors and book_metadata for any book, or you + may choose to submit these elements along with metadata for each content_item. A + content item is typically any entity that is listed on the table of contents such as + a chapter, section, etc. It is not necessary to submit metadata for all items listed + on the table of contents. You may choose to drop items of lesser significance such as + front and back matter.
Book-level metadata is captured within book_metadata, + book_series_metadata, or book_set_metadata. If a book is a single-volume work, use + book_metadata. If the book is a volume from a multi-volume work that is also a + serial publication (and therefore has an ISSN), use book_series_metadata. If the + book is a volume of a non-serial publication, then it is considered a set and you + should use book_set_metadata. book_type should be set to "monograph" when the same + author or authors wrote the majority of the content. It should be set to + "edited_book" when a book primarily consists of contributed chapters, each chapter + written by different authors. It should be set to "reference" for major reference + works such as encyclopedias. Use "other" when the author of the content does not fit + any of the other categories. + + + + + + + + + + + + + + + + + + + + + + + + + A container for all information that applies to a monograph. It does + not include metadata about individual chapters. The language of the book should be + specified in the book_metadata language attribute. If a book contains items in + multiple languages this attribute should be set for the predominant language of the + book. Individual items may have their language specified in content_item. If all + content items are the same language, it is only necessary to specify the language of + the book in this element. The contributors are the author(s) or editor(s) of the + entire work. When using book_metadata, specify the title of the book within + book_metadata. edition_number, when given, should include only a number and not + additional text such as "edition" or "ed". publisher_item, when given, specifies + this information for the entire book or volume. This element also appears in + content_item. doi_data is required for each book or volume that you submit. It is + not possible to submit DOI information for individual chapters without assigning a + DOI to the entire work.
Note: citation_list should only be used in book_metadata + instead of content_item when the reference list is a separate section of the book, + and content_items are not included in the deposit (e.g. you are depositing a book + with a bibliography, but not the chapters of the book). In very limited circumstances + a book may be deposited without an ISBN, in which case the noisbn element must be + supplied to explicitly declare that an ISBN is not accidentally omitted. Great care + should be taken when choosing to use noisbn since it may adversely affect matching. + This provision is primarily being made to allow for the deposit of DOIs for + historical volumes for which ISBNs are difficult to obtain. + + + + + + + + The abstract element allows depositors to include + abstracts extracted from NLM or JATS XML in CrossRef deposits. The jats: + namespace prefix must be included. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A container for all information that applies to an individual volume + of a book series. It does not include metadata about individual chapters. The + language of the book should be specified in the book_series_metadata language + attribute. If a book contains items in multiple languages this attribute should be + set for the predominant language of the book. Individual items may have their + language specified in content_item. If all content items are the same language, it + is only necessary to specify the language of the book in this element. The + contributors are the author(s) or editor(s) of the entire work. If a multi-volume + work has separate editors for each volume, those editors should be specified in this + element, and the series editors are listed in the series_metadata contributors. + Series titles should be specified within series_metadata. Volume titles (when + present) are captured in book_series_metadata.
If the volumes of a series only have + volume numbers and not individual titles, you may specify the volume number within + volume_metadata, and no title is required. volume and edition_number, when given, + should include only a number and not additional text such as "volume" or "edition". + For example, you should submit "3", not "third edition". If a work spans multiple + volumes with a unique ISBN for each volume and the whole series, you should specify + the series ISBN in isbn in series_metadata and the volume ISBN in isbn in + book_series_metadata. WARNING: Care must be taken when submitting books with series. + If a series title is submitted and no book title is supplied but an ISBN is supplied + at the book_series_metadata level and not with the series title, the CrossRef system + will index a series title with no ISBN and an ISBN with no title. Please take care + to associate the ISBN at the correct level of the XML hierarchy. publisher_item, + when given, specifies this information for the entire book or volume. This element + also appears in content_item. doi_data is required for each book or volume that you + submit. It is not possible to submit DOI information for individual chapters without + assigning a DOI to the entire work. Note: citation_list should only be used in + book_series_metadata instead of content_item when the reference list is a separate + section of the book, and content_items are not included in the deposit (e.g. you are + depositing a book with a bibliography, but not the chapters of the book) Normally + book content that is published as a series is required to have a series title with + an ISSN and a book title and/or a book volume number along with a book ISBN. An + exception is when book chapters are published on line first prior to being assigned + to a specific book in which case only the series title (and ISSN) is known at time + of DOI registration. 
Element unassigned_content is used as a placeholder to force + recognition of this condition and thus prevent accidental omission of book level + title information. + + + + + + + + + + + + + The abstract element allows depositors to + include abstracts extracted from NLM or JATS XML in + CrossRef deposits. The jats: namespace prefix must be + included. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A container for all information that applies to an individual volume + of a book set. It does not include metadata about individual chapters. A set is a + finite series, and does not have an ISSN The language of the book should be + specified in the book_set_metadata language attribute. If a book contains items in + multiple languages this attribute should be set for the predominant language of the + book. Individual items may have their language specified in content_item. If all + content items are the same language, it is only necessary to specify the language of + the book in this element. The contributors are the author(s) or editor(s) of the + entire work. If a multi- volume work has separate editors for each volume, those + editors should be specified in this element, and the series editors are listed in + the series_metadata contributors. When using book_set_metadata, specify the title of + the entire set and the isbn of the set. Specify the title of the volume in + volume_metadata. If the volumes of a set only have volume numbers and not individual + titles, you may specify the volume number within volume_metadata, and no title is + required. volume and edition_number, when given, should include only a number and + not additional text such as "volume" or "edition". For example, you should submit + "3", not "third edition". If a work spans multiple volumes with a unique ISBN for + each volume and the whole series, you should specify the series ISBN in isbn in + series_metadata and the volume ISBN in isbn in book_series_metadata. 
publisher_item, + when given, specifies this information for the entire book or volume. This element + also appears in content_item. doi_data is required for each book or volume that you + submit. It is not possible to submit DOI information for individual chapters without + assigning a DOI to the entire work. Note: citation_list should only be used in + book_series_metadata instead of content_item when the reference list is a separate + section of the book, and content_items are not included in the deposit (e.g. you are + depositing a book with a bibliography, but not the chapters of the + book) + + + + + + + + + + + The abstract element allows depositors to include + abstracts extracted from NLM or JATS XML in CrossRef deposits. + The jats: namespace prefix must be included. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + An entity in a book, such as a chapter, for which a DOI is being + registered. A content item is typically an entity listed on the table of contents. + There need not be a one-to-one correlation between content listings and content + items (e.g. you may choose not to register front and back matter items listed in the + table of contents). The language of a content_item only need be set if it differs + from the language of book_metadata. The component_type indicates the type of content + item you are registering. Please see the example of a book submission in this + documentation for a better understanding of how this attribute may be used in nested + tables of contents. level_sequence_number indicates the level of nesting for content + items. For example, you may use it to indicate when one content item, such as a + chapter, is actually inside another content item, such as a section. Please see the + example of a book submission in this documentation for a better understanding of how + this attribute may be used in nested tables of contents. 
Note: Because the CrossRef + schema uses a flat model to indicate hierarchically nested content items, there is + an implicit assumption that content items will be listed in the CrossRef submission + in the same order in which they appear in the table of contents. Please follow this + protocol when submitting DOI data for books. This order is not required for journal + and conference data. contributors for a content_item need not be listed if all items + in a book have the same contributors listed in book_metadata. In other words, + contributors must be listed for edited books, but they should not be listed for each + content_item in a monograph. The exception case is when a content item such as a + Preface or Foreword for a monograph has a different author from that of the + monograph. In this case, the contributors should be given. The title of each content + item must be submitted. If, however, you are submitting data for a monograph that + simply has "Chapter 1", "Chapter 2", etc., you should put this information in + component_number, not titles. + + + + + + + + The abstract element allows depositors to include + abstracts extracted from NLM or JATS XML in CrossRef deposits. The jats: + namespace prefix must be included. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The container for metadata about a series publication. When a book, + conference proceeding, or report consists of multiple volumes, series_metadata is + used to describe information about the entire series. If a work spans multiple + volumes, you should use titles in series_metadata. If a work spans multiple volumes + with a unique title for each volume and the whole series, you should specify the + series title in titles in series_metadata and the volume title in titles in + book_series_metadata. If a unique ISBN has been assigned to the entire series (as + opposed to the individual volumes), it should be given in series_metadata.
You may + assign and register a DOI that encompasses an entire series by adding doi_data in + series_metadata. This element is optional for a series. + + + + + + + + The abstract element allows depositors to include + abstracts extracted from NLM or JATS XML in CrossRef deposits. The jats: + namespace prefix must be included. + + + + + + + + + + + + + + + + + + + + + When a book consists of multiple volumes that are not part of a + serial publication (series), set_metadata is used to describe information about the + entire set. + + + + + + + + + + + + + + + + + + + The series number within a specific published conference discipline. + The series number is different from the volume number. A volume number is the number + of a book in a physically printed set and typically appears in sequence. The series + number is not tied to the physical manifestation of the printed volume and need not + be strictly in sequence. It is most commonly used in "Lectures" published by + Springer-Verlag. This element is available in series_metadata, however it should + only be used for conference proceedings, not for books. + + + + + + + + + + + The part number of a given volume. Deposited within + book_set_metadata. In some cases, a book set will have multiple parts, and then one + or more volumes within each part. The part number of a given volume should be + deposited in this element. + + + + + + + + + + + A container for information about the publisher of a book or + conference proceedings. + + + + + + + + + + + The city where the publisher of this work is located. publisher_place + gives the primary city location of the publisher. When the location is a major city + (e.g. New York, Amsterdam), no qualifying country, U.S. state, or Canadian province + need be given. If the city is not a major city, the appropriate country, U.S. state, + or Canadian province should be added. + + + + + + + + + + + The name of the publisher of a book or conference proceedings. 
+ publisher_name is the imprint of the publication (what the author will likely cite), + not the organization registering the DOI, if for any reason they are different. When + registering a translation, the translation publisher, not the original publisher, + should be given. + + + + + + + + + + + + A container for item identification numbers set by a publisher. + item_number within publisher_item may also be used to provide an article number when + a first_page is not available or applicable. In certain cases it may be deemed + inappropriate to 'misuse' the first_page element to provide a value that has + meaning in an on-line only publication and does not convey any form of page number. + In these circumstances the attribute <item_number + item_number_type="article-number"> will instruct the CrossRef system to treat the + value of item_number in the same manner as first_page. This value then becomes a + critical part of the query process. If both <item_number + item_number_type="article-number"> and first_page are present, first_page will + take precedence. + + + + + + + + + + + + + + + + + A publisher identifier that can be used to uniquely identify the + entity being registered. This identifier is a publisher-assigned number that + uniquely identifies the entity being registered. This element should be used for + identifiers based on publisher internal standards. Use identifier for a publisher + identifier that is based on a public standard such as PII or SICI. If the + item_number and identifier are identical, there is no need to submit both. In this + case, the preferred element to use is identifier. Data may be alpha, numeric or a + combination. item_number has an optional attribute, item_number_type. It is assigned + by the publisher to provide context for the data in item_number. If item_number + contains only a publisher's tracking number, this attribute need not be supplied.
If + the item_number contains other data, this attribute can be used to define the + content. For example, if a journal is published online (i.e. it has no page + numbers), and each article on the table of contents is assigned a sequential number, + this article number can be placed in item_number, and the item_number_type attribute + can be set to "article_number". Although CrossRef has not provided a set of + enumerated types for this attribute, please check with CrossRef before using this + attribute to determine if a standard attribute has already been defined for your + specific needs. If a dissertation DAI has been assigned, it should be deposited in + the identifier element with the id_type attribute set to "dai". If an institution + has its own numbering system, it should be deposited in item_number, and the + item_number_type should be set to "institution" If the report number of an item + follows Z39.23, the number should be deposited in the identifier element with the + id_type attribute set to "Z39.23". If a report number uses its own numbering system, + it should be deposited in the identifier element, and the id_type should be set to + "report-number" The designation for a standard should be placed inside the + identifier element with the id_type attribute set to "ISO-std-ref" or + "std-designation" (more generic label) + + + + + + + + + + + + + + + + + + A public standard identifier that can be used to uniquely identify + the entity being registered. This identifier is a publisher-assigned number that + uniquely identifies the entity being registered. This element should be used for + identifiers based on public standards. Use item_number for a publisher identifier + that is based on a publisher's internal systems rather than on a public standard. 
+ The supported standards are: PII - Publisher Item Identifier SICI - Serial Item and + Contribution Identifier DOI - Digital Object Identifier + + + + + + + + + + + + + + + + + + + + + + + + + + + dissertation is the top level element for deposit of metadata about + one or more dissertations. The dissertation element does not have publisher, or issn + elements. It is expected that the dissertation element will be used for deposit of + items that have not been published in books or journals. If a dissertation is + published as a book or within a serial, it should be deposited using the top-level + element for the appropriate publication type. If a DAI has been assigned, it should + be deposited in the identifier element with the id_type attribute set to "dai". If + an institution has its own numbering system, it should be deposited in item_number, + and the item_number_type should be set to "institution" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + report-paper is the top level element for deposit of metadata about + one or more reports or working papers. component_list is included in report-paper to + handle items that have components but do not have content_item elements (i.e. a + report that is not divided into multiple chapters). If an item has content_item + elements, then component_list inside of content_item must be used rather than the + element available in report-paper + + + + + + + + + + + + + + + + report-paper_metadata is used as a wrapper for the metadata related + to a Technical Report or Working Paper. report-paper_metadata is almost identical to + book_metadata. It differs only in that report-paper_metadata removes the volume + number and adds the elements institution and contract_number. Please see the + comments for book_metadata about the usage of most elements in report- + paper_metadata. Reports and Working Papers are often sponsored by either + universities or by a non-academic organization (corporate or government). 
Such + institutions are not typically considered "publishers" and so the item may be + deposited using the institution element. Multiple element instances are permitted so + the sponsoring institution and publishing institution can both be deposited as + authors may cite either. If the report number of an item follows Z39.23, the number + should be deposited in the identifier element with the id_type attribute set to + "Z39.23". If a report number uses its own numbering system, it should be deposited + in item_number. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + report-paper_series_metadata is used as a wrapper for the metadata + related to a Technical Report or Working Paper that is part of a series. + report-paper_series_metadata is almost identical to book_series_metadata. It differs + only in that report-paper_metadata removes the volume number and adds the elements + institution and contract_number. Please see the comments for book_series_metadata + about the usage of most elements in report- paper_series_metadata. Reports and + Working Papers are often sponsored by either universities or by a non-academic + organization (corporate or government). Such institutions are not typically + considered "publishers" and so the item may be deposited using the institution + element. Multiple element instances are permitted so the sponsoring institution and + publishing institution can both be deposited as authors may cite either. If the + report number of an item follows Z39.23, the number should be deposited in the + identifier element with the id_type attribute set to "Z39.23". If a report number + uses its own numbering system, it should be deposited in + item_number. + + + + + + + + + + + The abstract element allows depositors to include + abstracts extracted from NLM or JATS XML in CrossRef deposits. + The jats: namespace prefix must be included. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + standard is the top level element for deposit of metadata about + standards developed by Standards Development Organizations (SDOs) or Consortia. + CrossRef does not determine if a new DOI should be created for each revision or + reaffirmation of a standard. The decision will be left to the individual standards + organizations. As of schema version 4.3.3, CrossRef recommends that the full + standard designation be placed in the as_published element (within + standard_designator). For backwards compatibility, the full designation may also be + included in the identifier element with the id_type attribute set to "ISO-std-ref". + In addition, CrossRef requires that the publisher of the standard be included in + standards_body_name, and the acronym within standards_acronym. The as_published and + standards_acronym elements will be combined to identify a standard for query + matching. component_list is included in standard to handle items that have + components but do not have content_item elements (i.e. a standard that is not + divided into multiple chapters). If an item has content_item elements, then + component_list inside of content_item must be used rather than the parent standard + element. + + + + + + + + + + + + + + + + Standard_metadata is used as a wrapper for the metadata related to a + Standard that is not part of a series. standard_metadata is similar to + book_metadata. It differs in that standard_metadata adds the elements institution + and approval_date. contributors contains the author(s) of the standard. In most + cases, it is expected that the organization element will be used rather than + person_name element for standards. However in some cases, standards are cited by + their individual authors. 
In such cases, individual authors should be deposited with + person_name, and the SDO or consortia name should be deposited with the organization + element in contributors and also the standards_body_name element in standards_body. + Note that when the organization element is used in contributors, it should have the + name of the committee (when appropriate) that developed the standard, not the name + of the Standards Development Organization (SDO) or consortia. The SDO or consortia + name should be placed in the publisher or standards_body element (as appropriate). + Standards more often have version numbers than edition numbers. However the + edition_number element can be used for deposit of the version number of a standard. + approval_date should be used for the date that a standard has been accepted or + re-affirmed if different from the date of publication. Both may be provided even if + identical. Within publisher_item, the designation should be placed inside the + item_number element, and the id_type should be set to "designation" to indicate a + standard designation. Standards are typically sponsored or hosted by SDOs or + Consortia. In some cases standards are published by a traditional publisher rather + than by the owning organization. Such cases may be deposited with one or more + publishers. + + + + + + + + The abstract element allows depositors to include + abstracts extracted from NLM or JATS XML in CrossRef deposits. The jats: + namespace prefix must be included. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + database is the top level element for deposit of metadata about one + or more datasets or records in a database. Database structures allow for the + assignment of DOIs to entire databases at the aggregate level and at two lower + levels. The top level may be a physical/functional database or a logical abstraction + acting as a collection much the same as a journal is a collection of articles.
The + need to assign specific fields of metadata at each level depends on the nature of + the top most level (e.g. publication date may be appropriate at the top level for a + physical object but only at lower levels for an abstract top level object). The first + sub-level is the dataset which may be a basic record of the top level object or a + collection in its own right. In either case dataset must represent a physical + construct. A third level is provided in the component_list. NOTE: component_list in + <database> (rather than in dataset) may be used as a second level when no third + level is required and the second level objects derive most of their qualities from + the parent. NOTE: This model is not intended to show relationships between different + dataset entries in the form of a relational database. However in the future it is + possible that multiple resolution may be used to express such + relationships. + + + + + + + + + + + + + + database_metadata contains metadata about the database. contributors + contains the author(s) of the database. In most cases, it is expected that the + organization element will be used rather than person_name element for the primary + database authoring information. contributors should not be confused with publisher + and institution. In many cases, databases are more likely to have one or both of the + latter elements rather than contributors at the top level (dataset elements are more + likely to have contributors). In most cases, the institution element may be the best + choice to deposit the database host organization because it includes the + institution_acronym element along with the name. The titles element is used to + capture the name of the database. The description element can be used to capture a + fuller description of the nature of the database than might be inferred from the + title. database_date should be used to capture the date that a database was first + created.
Whenever updated records are deposited with CrossRef, the update_date + should be set to the date of the most recent CrossRef deposit. publisher_item may be + used to record an identifying number for the database other than the + DOI. + + + + + + + + + + + + + + + + + + + + + + dataset is used to capture information about one or more database + records or collections. The dataset_type attribute should be set to either "record" + or "collection" to indicate the type of deposit. The default value of this attribute + is "record". dataset entries are not intended to contain the entire database record + or collection. They are only intended to contain the metadata for each database + record or collection. The metadata can include: contributors: the author(s) of a + database record or collection titles: the title of a database record or collection + database_date: the creation date, publication date (if different from the creation + date) and the date of last update of the record publisher_item: the record number of + the dataset item. In this context, publisher_item can be used for the record number + of each item in the database. description: a brief summary description of the + contents of the database format: the format type of the dataset item if it includes + files rather than just text. Note the format element here should not be used to + describe the format of items deposited as part of the component_list doi_data: the + doi of the item. citation_list: a list of items (e.g. journal articles) cited by the + dataset item. For example, dataset entry from a taxonomy might cite the article in + which a species was first identified. component_list: a list of components included + in the dataset item such as supporting figures + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Posted-content is for the assignment of DOIs to content that may + subsequently be formally published. 
Non-DOI identifiers associated with the content + may be recorded in the item_number element. We encourage the inclusion of an + abstract. The relation program (rel:program) should be used to link this content + item to other DOIs including the DOI of the published version of record. Pre-print + should not be used to assign DOIs to accepted manuscripts. A DOI may be assigned to + an accepted manuscript using the content type appropriate for early registration. + DOIs assigned to accepted manuscripts should be reused (e.g. reassigned to the + published article). Posted-content DOIs must be continuously supported by maintaining + their metadata and the URL at which the content is available. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Prepublication content items may be organized into groupings within a given publisher. + This element provides for naming the group. It is expected that publishers will have a small number of groups + each of which reflects a topic or subject area. + + + + + + + + + + + + Wrapper element for information about an organization that sponsored + or hosted an item but is not the publisher of the item. The institution element + should be used to deposit metadata about an organization that sponsored or hosted + the research or development of the published material but was not actually the + publisher of the information. The institution is distinctly different from the + publisher because it may not be a publishing organization. It is typically an + organization such as a university, corporation, government agency, NGO or consortia. + If the content was published by an organization other than the sponsor, the use of + both the publisher and institution elements is encouraged because authors may cite + either one in a reference, and the availability of both may allow for more precise + matching in queries. + + + + + + + + + + + + + The full name of an institution.
Examples are: World Health + Organization; University of California, Davis. Corresponding institution_acronym + content for these organizations would be WHO and UCD, + respectively. + + + + + + + + + + + The acronym of the institution. Note that authors often cite with + acronyms and this information can be important in matching a query. Examples: WHO, + UCDavis, UCD. Note: as shown above, an institution may be known by multiple acronyms, + in which case all common acronyms should be deposited. + + + + + + + + + + + The primary city location of the institution. institution_place gives + the primary city location of the institution. When the location is a major city + (e.g. New York, Amsterdam), no qualifying country or U.S. state need be given. If + the city is not a major city, the appropriate country and/or state or province + should be added. + + + + + + + + + + + The department within an institution. institution_department gives + the department within an institution. A common use is the department under which a + dissertation was completed. Note that the institution_department is repeatable. If + multiple departments are to be deposited, each one should be given in a unique + institution_department element. Example: Department of Psychology + + + + + + + + + + + + + + The degree(s) awarded for a dissertation. + + + + + + + + + + + The contract number under which a report or paper was + written. + + + + + + + + + diff --git a/export/xsd/crossref/fundref.xsd b/export/xsd/crossref/fundref.xsd new file mode 100644 index 0000000..1b2b58e --- /dev/null +++ b/export/xsd/crossref/fundref.xsd @@ -0,0 +1,85 @@ + + + + + + + + + + FundRef documentation and examples: http://help.crossref.org/#fundref + + As part of CrossMark metadata, a deposit may contain what is called FundRef info. This details the funding behind a published article. The schema is a sequence of nested <assertion> tags.
+ + If a DOI is not participating in CrossMark, FundRef data may be deposited as part of the <journal_article> metadata. + + Note: Some rules will be enforced by the deposit logic (e.g. not the schema). + + FundRef data includes one or more award numbers (award_number), each of which may have one or more funders (funder_name). Each funder may have one or more optional identifiers (funder_identifier). + + A FundRef deposit begins with a <fr:program> tag within the <crossmark> structure (where fr is the namespace for the FundRef program). + + The <program> element is an implicit funder_group and will typically contain: + + A) one or more funder_name assertions and an award_number assertion. + + or + + B) one or more funder_group assertions where each funder_group should contain one or more funder_name assertions and at least one award_number assertion. + + Multiple 'award_number's may be included in a single program or fundgroup. Deposits without an award_number will be accepted, but award_number should be provided whenever possible. Items with several award numbers associated with a single funding organization should be grouped together by enclosing the "funder_name", "funder_identifier", and award_number(s) within a "fundgroup" assertion. + + + + + + + + + + + + FundRef attributes included in assertion are: + +fundgroup: used to group funding info for items with multiple funding sources. 
Required for items with multiple award_number assertions, optional for items with a single award_number + +funder_identifier: funding agency identifier, must be nested within the funder_name assertion + +funder_name: name of the funding agency (required) + +award_number: grant number or other fund identifier + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/export/xsd/crossref/relations.xsd b/export/xsd/crossref/relations.xsd new file mode 100644 index 0000000..c5891af --- /dev/null +++ b/export/xsd/crossref/relations.xsd @@ -0,0 +1,207 @@ + + + + Version: beta 0.3 + + This schema provides for creating relationships between items represented by crossref DOIs and other items that may + be defined by a DOI (crossref or other RA) or by some other identifier. New relation types will be added as they're needed. + Please contact support@crossref.org to request additions or changes. + + Certain relationship types are covered elsewhere in the main deposit schema due primarily to specific processing or + the need to logically group those relations alongside other relevant metadata. For example cited-by relations are + created by the deposit of a citation_list. Crossmark->Updates addresses relationships between DOIs where a primary + item is updated, revised, hasErratum, withdrawn ... etc. When constructing relations please be sure to use the + most appropriate metadata structure. + + Relationships between DOIs in crossref are established bidirectionally between those DOIs making it unnecessary to + deposit relationship metadata for both DOIs. + Example: + DOI A metadata contains 'hasTranslation' with a target of DOI B will automatically + make this claim visible in metadata for B. + Seen from the perspective of B: A claims it hasTranslation of which B is the target of the claim. + + Change history: + 10/3/14 CSK removed reg-agency attribute.
This is not necessary, can be derived from the DOI + 10/3/14 CSK split into inter and intra relation elements + 10/3/14 CSK pulled in common crossref schema for description element and language attributes + 12/21/16 CSK added comments to each relation type indicating the appropriate inverse relation type + + ====== C O N V E N T I O N ============================================================================================== + Relationships between two objects have an implicit directionality that in natural language terms dictate which object is the actor + and which is acted-upon. This directionality is semantically based on the relationship name. Crossref's model makes + no attempt to automatically 'understand' this semantic. + + The identifier parent to the PROGRAM element is considered the claimant of the relationship (e.g. the entity that + establishes the relationship). + + yes: 10.1234/abcd references 10.5678/efgh => 10.1234/abcd claims that it references 10.5678/efgh + yes: 10.1234/abcd referencedBy 10.5678/efgh => 10.1234/abcd claims that it is referenced by 10.5678/efgh + + ========================================================================================================================== + + + + + + + + Accommodates deposit of relationship claims between items. + + + + + + + + + + + + + + + Description of the relationship to the target item or of the target item itself + + + + + + + + + + + + + + + Used to describe relations between items that are not the same work. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Used to define relations between items that are essentially the same work but may differ in + format, language, revision ... etc. Assigning different identifers to exactly the same item + available in one place or as copies in multiple places can be problematic and should be avoided. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + An identifier systems may require a namespace that is needed in addition to the identifer value to provide uniqueness. + + + + + + + + + + + + diff --git a/export/xsd/doajArticles.xsd b/export/xsd/doaj/doajArticles.xsd similarity index 100% rename from export/xsd/doajArticles.xsd rename to export/xsd/doaj/doajArticles.xsd diff --git a/publication/documents_affiliations.py b/publication/documents_affiliations.py index 42fa13e..5cc300f 100644 --- a/publication/documents_affiliations.py +++ b/publication/documents_affiliations.py @@ -60,6 +60,7 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"document publishing ID (PID SciELO)") header.append(u"document publishing year") @@ -90,7 +91,7 @@ def run(self): logger.info('Export finished') def items(self): - + if not self.issns: self.issns = [None] @@ -101,7 +102,6 @@ def items(self): yield item def join_line(self, line): - return ','.join(['"%s"' % i.replace('"', '""') for i in line]) def fmt_csv(self, data): @@ -124,6 +124,7 @@ def fmt_csv(self, data): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) diff --git a/bibliometric/citedby.py b/publication/documents_affiliations_nationality.py similarity index 67% rename from bibliometric/citedby.py rename to publication/documents_affiliations_nationality.py index f730438..82f1c25 100644 --- a/bibliometric/citedby.py +++ b/publication/documents_affiliations_nationality.py @@ -1,14 +1,11 @@ # coding: utf-8 """ -Este processamento gera uma 
tabulação de citações concedidas no SciELO por -artigos da coleção SciELO. -Formato de saída: -"PID","ISSN","título","área temática","ano de publicação","tipo de documento","título do documento","citado por PID","citado por ISSN","citado por título","citado por título do documento" +Este processamento gera uma tabulação de idiomas de publicação de cada artigo +da coleção SciELO. """ import argparse import logging import codecs -import json import datetime import utils @@ -46,11 +43,12 @@ def _config_logging(logging_level='INFO', logging_file=None): class Dumper(object): - def __init__(self, collection, issns=None, output_file=None): + def __init__(self, home_nationality, collection, issns=None, output_file=None): - self._citedby = utils.citedby_server() + self._ratchet = utils.ratchet_server() self._articlemeta = utils.articlemeta_server() self.collection = collection + self.home_nationality = home_nationality.upper() self.issns = issns self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file header = [] @@ -63,13 +61,18 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"document publishing ID (PID SciELO)") header.append(u"document publishing year") + header.append(u'document is citable') header.append(u"document type") - header.append(u"document is citable") - - header = [u"PID", u"ISSN", u"título", u"área temática", u"ano de publicação", u"tipo de documento", u"título do documento", u"citado por PID", u"citado por ISSN", u"citado por título", u"citado por título do documento"] + header.append(u"home nationality") + header.append(u"total of affiliations") + header.append(u"national") + header.append(u"foreign") + header.append(u"undefined") + header.append(u"empty") 
self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) @@ -83,13 +86,6 @@ def run(self): for item in self.items(): self.write(item) - def citedby(self, pid): - data = self._citedby.citedby_pid(pid, False) - dataj = json.loads(data) - if isinstance(dataj, dict): - for item in dataj.get('cited_by', []): - yield item - def items(self): if not self.issns: @@ -97,12 +93,11 @@ def items(self): for issn in self.issns: for data in self._articlemeta.documents(collection=self.collection, issn=issn): - logger.debug('Reading document: %s' % data.publisher_id) - for item in self.citedby(data.publisher_id): - yield self.fmt_csv(data, item) + logger.debug(u'Reading document: %s' % data.publisher_id) + yield self.fmt_csv(data) - def fmt_csv(self, data, citedby): - know_languages = set(['pt', 'es', 'en']) + def fmt_csv(self, data): + know_languages = set([u'pt', u'es', u'en']) languages = set(data.languages()) issns = [] @@ -124,20 +119,43 @@ def fmt_csv(self, data, citedby): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) - line.append(data.document_type) line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') - line.append(data.original_title() or '') - line.append(citedby.get('code', '')) - line.append(citedby.get('issn', '')) - line.append(citedby.get('source', '')) - - if 'titles' in citedby and len(citedby['titles']) > 0: - line.append(citedby['titles'][0]) - else: - line.append('') + line.append(data.document_type) + line.append(self.home_nationality) + line.append(str(len(data.mixed_affiliations)) if data.mixed_affiliations else '0') + + national = 0 + foreign = 0 + undefined = 0 + empty = 0 + if data.mixed_affiliations: + for aff in data.mixed_affiliations: + aff_value = aff.get('country_iso_3166', '').upper() + + if aff_value == '': + empty += 
1 + continue + + if aff_value == self.home_nationality: + national += 1 + continue + + if aff_value in choices.ISO_3166.keys() and aff_value != self.home_nationality: + foreign += 1 + continue + + if aff_value not in choices.ISO_3166.keys(): + undefined += 1 + continue + + line.append(str(national)) + line.append(str(foreign)) + line.append(str(undefined)) + line.append(str(empty)) joined_line = ','.join(['"%s"' % i.replace('"', '""') for i in line]) @@ -162,6 +180,13 @@ def main(): help='Collection Acronym' ) + parser.add_argument( + '--home_nationality', + '-n', + required=True, + help='ISO 3166 two letters country code which will be considered as the home nationality.' + ) + parser.add_argument( '--output_file', '-r', @@ -190,6 +215,6 @@ def main(): if len(args.issns) > 0: issns = utils.ckeck_given_issns(args.issns) - dumper = Dumper(args.collection, issns, args.output_file) + dumper = Dumper(args.home_nationality, args.collection, issns, args.output_file) dumper.run() diff --git a/publication/documents_authors.py b/publication/documents_authors.py index 2cf229b..2842f7b 100644 --- a/publication/documents_authors.py +++ b/publication/documents_authors.py @@ -60,6 +60,7 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"document publishing ID (PID SciELO)") header.append(u"document publishing year") @@ -128,6 +129,7 @@ def fmt_csv(self, data): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) diff --git a/publication/documents_counts.py b/publication/documents_counts.py index 5d3b366..dcb10c9 100644 --- a/publication/documents_counts.py +++ 
b/publication/documents_counts.py @@ -74,6 +74,7 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"document publishing ID (PID SciELO)") header.append(u"document publishing year") @@ -140,6 +141,7 @@ def fmt_csv(self, data): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) @@ -154,7 +156,7 @@ def fmt_csv(self, data): line.append(u'1' if tot_authors == 5 else u'0') # total de autores line.append(u'1' if tot_authors >= 6 else u'0') # total de autores line.append(unicode(pages(data.start_page, data.end_page))), # total de páginas - line.append(unicode(len(data.citations or []))) # total de citações + line.append(unicode(len(data.citations or []))) # total de citações joined_line = u','.join([u'"%s"' % i.replace(u'"', u'""') for i in line]) diff --git a/publication/documents_dates.py b/publication/documents_dates.py index d3b2dc4..c27a9ae 100644 --- a/publication/documents_dates.py +++ b/publication/documents_dates.py @@ -8,6 +8,8 @@ import codecs import datetime +import xylose.scielodocument + import utils import choices @@ -60,15 +62,16 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"document publishing ID (PID SciELO)") header.append(u"document publishing year") header.append(u"document type") header.append(u"document is citable") - header.append(u"document submited at") - header.append(u"document submited at year") - 
header.append(u"document submited at month") - header.append(u"document submited at day") + header.append(u"document submitted at") + header.append(u"document submitted at year") + header.append(u"document submitted at month") + header.append(u"document submitted at day") header.append(u"document accepted at") header.append(u"document accepted at year") header.append(u"document accepted at month") @@ -77,6 +80,10 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"document reviewed at year") header.append(u"document reviewed at month") header.append(u"document reviewed at day") + header.append(u"document published as ahead of print at") + header.append(u"document published as ahead of print at year") + header.append(u"document published as ahead of print at month") + header.append(u"document published as ahead of print at day") header.append(u"document published at") header.append(u"document published at year") header.append(u"document published at month") @@ -114,6 +121,19 @@ def items(self): yield self.fmt_csv(data) def fmt_csv(self, data): + document_publication_date = ( + data.document_publication_date or + data.creation_date or + data.update_date or + '' + ) + document_publication_date_splitted = utils.split_date( + document_publication_date) + + issue_publication_date = data.issue_publication_date + issue_publication_date_splitted = utils.split_date( + issue_publication_date) + issns = [] if data.journal.print_issn: issns.append(data.journal.print_issn) @@ -133,9 +153,10 @@ def fmt_csv(self, data): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) - line.append(data.publication_date[0:4]) + line.append(document_publication_date_splitted[0]) line.append(data.document_type) line.append(u'1' if data.document_type.lower() in choices.CITABLE_DOCUMENT_TYPES else '0') line.append(data.receive_date or 
'') @@ -153,16 +174,30 @@ def fmt_csv(self, data): line.append(review_splited[0]) # year line.append(review_splited[1]) # month line.append(review_splited[2]) # day - line.append(data.publication_date or '') - publication_splited = utils.split_date(data.publication_date or '') - line.append(publication_splited[0]) # year - line.append(publication_splited[1]) # month - line.append(publication_splited[2]) # day - line.append(data.creation_date or '') - creation_splited = utils.split_date(data.creation_date or '') - line.append(creation_splited[0]) # year - line.append(creation_splited[1]) # month - line.append(creation_splited[2]) # day + + try: + aop_pubdate = data.ahead_publication_date or '' + except xylose.scielodocument.UnavailableMetadataException: + aop_pubdate = '' + + line.append(aop_pubdate) + ahead_publication_date_splited = utils.split_date(aop_pubdate) + line.append(ahead_publication_date_splited[0]) # year + line.append(ahead_publication_date_splited[1]) # month + line.append(ahead_publication_date_splited[2]) # day + + # u"document published at" (collection ou issue) + line.append(issue_publication_date) + line.append(issue_publication_date_splitted[0]) # year + line.append(issue_publication_date_splitted[1]) # month + line.append(issue_publication_date_splitted[2]) # day + + #u"document published in SciELO at" (pub, ou creation, ou last update) + line.append(document_publication_date) + line.append(document_publication_date_splitted[0]) # year + line.append(document_publication_date_splitted[1]) # month + line.append(document_publication_date_splitted[2]) # day + line.append(data.update_date or '') update_splited = utils.split_date(data.update_date or '') line.append(update_splited[0]) # year diff --git a/publication/documents_languages.py b/publication/documents_languages.py index 230a00d..88f8252 100644 --- a/publication/documents_languages.py +++ b/publication/documents_languages.py @@ -60,10 +60,11 @@ def __init__(self, collection, issns=None, 
output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"document publishing ID (PID SciELO)") header.append(u"document publishing year") - header.append(u'docuemnt is citable') + header.append(u'document is citable') header.append(u"document type") header.append(u"document languages") header.append(u"document pt") @@ -116,6 +117,7 @@ def fmt_csv(self, data): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) diff --git a/publication/documents_licenses.py b/publication/documents_licenses.py index 90b2601..593a654 100644 --- a/publication/documents_licenses.py +++ b/publication/documents_licenses.py @@ -60,12 +60,13 @@ def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"document publishing ID (PID SciELO)") - header.append(u"docuemnt publishing year") + header.append(u"document publishing year") header.append(u"document type") header.append(u"document is citable") - header.append(u"docuemnt license") + header.append(u"document license") self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) @@ -110,6 +111,7 @@ def fmt_csv(self, data): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.journal.subject_areas or []) > 2 else '0') line.append(data.journal.current_status) line.append(data.publisher_id) line.append(data.publication_date[0:4]) diff --git a/publication/dumper.py b/publication/dumper.py index c4eb9ec..2d2f973 100644 --- 
a/publication/dumper.py +++ b/publication/dumper.py @@ -6,8 +6,15 @@ import utils -import documents_counts, documents_affiliations, documents_languages, documents_licenses, documents_authors, documents_dates - +from publication import ( + documents_counts, + documents_affiliations, + documents_affiliations_nationality, + documents_languages, + documents_licenses, + documents_authors, + documents_dates +) logger = logging.getLogger(__name__) @@ -42,18 +49,21 @@ def _config_logging(logging_level='INFO', logging_file=None): class Dumper(object): - def __init__(self, collection, issns=None): + def __init__(self, collection, home_nationality=None, issns=None): self._ratchet = utils.ratchet_server() self._articlemeta = utils.articlemeta_server() self.collection = collection self.issns = issns + self.home_nationality = home_nationality self.documents_counts = documents_counts.Dumper(collection, output_file='documents_counts.csv') self.documents_affiliations = documents_affiliations.Dumper(collection, output_file='documents_affiliations.csv') self.documents_languages = documents_languages.Dumper(collection, output_file='documents_languages.csv') self.documents_licenses = documents_licenses.Dumper(collection, output_file='documents_licenses.csv') self.documents_authors = documents_authors.Dumper(collection, output_file='documents_authors.csv') self.documents_dates = documents_dates.Dumper(collection, output_file='documents_dates.csv') + if self.home_nationality: + self.documents_affiliations_nationality = documents_affiliations_nationality.Dumper(home_nationality, collection, output_file='documents_affiliation_nationality.csv') def run(self): @@ -69,6 +79,8 @@ def run(self): self.documents_licenses.write(self.documents_licenses.fmt_csv(data)) self.documents_authors.write(self.documents_authors.fmt_csv(data)) self.documents_dates.write(self.documents_dates.fmt_csv(data)) + if self.home_nationality: + 
self.documents_affiliations_nationality.write(self.documents_affiliations_nationality.fmt_csv(data)) logger.info('Export finished') @@ -91,6 +103,12 @@ def main(): help='Collection Acronym' ) + parser.add_argument( + '--home_nationality', + '-n', + help='ISO 3166 two letters country code which will be considered as the home nationality.' + ) + parser.add_argument( '--logging_file', '-o', @@ -113,6 +131,6 @@ def main(): if len(args.issns) > 0: issns = utils.ckeck_given_issns(args.issns) - dumper = Dumper(args.collection, issns) + dumper = Dumper(args.collection, home_nationality=args.home_nationality, issns=issns) dumper.run() diff --git a/publication/journals.py b/publication/journals.py index 4a6dd64..57dec6f 100644 --- a/publication/journals.py +++ b/publication/journals.py @@ -13,6 +13,7 @@ import choices from clients.analytics import Analytics +from scieloh5m5 import h5m5 logger = logging.getLogger(__name__) @@ -72,7 +73,7 @@ def __init__(self, collection, issns=None, output_file=None, years=6): self._lines = [] self.output_file = codecs.open(output_file, 'w', encoding='utf-8') if output_file else output_file now = datetime.date.today().year - years_range = [i for i in range(now, now-self._years, -1)] + self.years_range = [i for i in range(now, now-self._years, -1)] header = [] header.append(u"extraction date") header.append(u"study unit") @@ -83,6 +84,7 @@ def __init__(self, collection, issns=None, output_file=None, years=6): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"title + subtitle SciELO") header.append(u"short title SciELO") @@ -102,21 +104,25 @@ def __init__(self, collection, issns=None, output_file=None, years=6): header.append(u"volume of the last document") header.append(u"issue of the last document") header.append(u"total of issues") - header += [u"issues at %s" % str(i) for 
i in years_range] + header += [u"issues at %s" % str(i) for i in self.years_range] header.append(u"total of regular issues") - header += [u"regular issues at %s" % str(i) for i in years_range] + header += [u"regular issues at %s" % str(i) for i in self.years_range] header.append(u"total of documents") - header += [u"documents at %s" % str(i) for i in years_range] + header += [u"documents at %s" % str(i) for i in self.years_range] header.append(u"citable documents") - header += [u"citable documents at %s" % str(i) for i in years_range] - for year in years_range: + header += [u"citable documents at %s" % str(i) for i in self.years_range] + for year in self.years_range: header.append(u'portuguese documents at %s ' % year) - for year in years_range: + for year in self.years_range: header.append(u'spanish documents at %s ' % year) - for year in years_range: + for year in self.years_range: header.append(u'english documents at %s ' % year) - for year in years_range: + for year in self.years_range: header.append(u'other language documents at %s ' % year) + for year in self.years_range: + header.append(u'google scholar h5 %s ' % year) + for year in self.years_range: + header.append(u'google scholar m5 %s ' % year) self.write(u','.join([u'"%s"' % i.replace(u'"', u'""') for i in header])) @@ -157,6 +163,9 @@ def _first_included_document_by_journal(self, issn, collection): document = self._articlemeta.document(fid['pid'], fid['collection']) + if not document.data: + return None + return document def _last_included_document_by_journal(self, issn, collection): @@ -169,6 +178,9 @@ def _last_included_document_by_journal(self, issn, collection): document = self._articlemeta.document(lid['pid'], lid['collection']) + if not document.data: + return None + return document def _impact_factor(self, issn, collection): @@ -235,6 +247,7 @@ def fmt_csv(self, data): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.subject_areas or []) > 2 else '0') 
line.append(data.current_status) line.append(u' '.join([data.title or u'', data.subtitle or u''])) line.append(data.abbreviated_title or u'') @@ -248,11 +261,11 @@ def fmt_csv(self, data): line.append(interruption[0][:4] if interruption else u'') line.append(interruption[2][:4] if interruption else u'') line.append(first_document.publication_date or u'' if first_document else u'') - line.append(first_document.issue.volume or u'' if first_document else u'') - line.append(first_document.issue.number or u'' if first_document else u'') + line.append(first_document.issue.volume or u'' if first_document and first_document.issue else u'') + line.append(first_document.issue.number or u'' if first_document and first_document.issue else u'') line.append(last_document.publication_date or u'' if last_document else u'') - line.append(last_document.issue.volume or u'' if last_document else u'') - line.append(last_document.issue.number or u'' if last_document else u'') + line.append(last_document.issue.volume or u'' if last_document and last_document.issue else u'') + line.append(last_document.issue.number or u'' if last_document and last_document.issue else u'') line.append(unicode(self._number_of_issues_by_year( data.scielo_issn, @@ -335,6 +348,16 @@ def fmt_csv(self, data): for years, values in sorted(languages.items(), reverse=True): line.append(unicode(values['other'])) + for year in self.years_range: + h5 = h5m5.get(data.scielo_issn, str(year)) + h5 = h5.get('h5', None) if h5 else None + line.append(h5 or '') + + for year in self.years_range: + m5 = h5m5.get(data.scielo_issn, str(year)) + m5 = m5.get('m5', None) if m5 else None + line.append(m5 or '') + joined_line = u','.join([u'"%s"' % i.replace(u'"', u'""') for i in line]) return joined_line diff --git a/publication/journals_status_changes.py b/publication/journals_status_changes.py index 17dbcaf..ba09971 100644 --- a/publication/journals_status_changes.py +++ b/publication/journals_status_changes.py @@ -61,6 +61,7 @@ 
def __init__(self, collection, issns=None, output_file=None): header.append(u"title thematic areas") for area in choices.THEMATIC_AREAS: header.append(u"title is %s" % area.lower()) + header.append(u"title is multidisciplinary") header.append(u"title current status") header.append(u"status change date") header.append(u"status change year") @@ -115,6 +116,7 @@ def fmt_csv(self, data, history): line.append(u'1') else: line.append(u'0') + line.append('1' if len(data.subject_areas or []) > 2 else '0') line.append(data.current_status) line.append(hist) hist_splited = utils.split_date(hist or '') diff --git a/requirements.txt b/requirements.txt index 285cdef..e87df26 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,20 @@ -thriftpy==0.3.1 -packtools -django==1.8.3 -requests==2.8.1 --e git+https://github.com/scieloorg/xylose@1.3.4#egg=xylose --e git+https://github.com/scieloorg/processing@0.3.1#egg=processing --e git+https://github.com/fabiobatalha/doaj_client@0.1#egg=doaj_client +accessstatsapi==1.2.1 +articlemetaapi==1.26.5 +certifi==2019.11.28 +chardet==3.0.4 +citedbyapi==1.11.3 +-e git+https://github.com/fabiobatalha/doaj_client@27c2c17dc6d0d9ee3aa12283b20c1ac6170869b6#egg=doaj_client-master +idna==2.7 +legendarium==2.0.2 +lxml==4.5.0 +packtools==2.5.3 +pathlib==1.0.1 +picles.plumber==0.11 +ply==3.11 +publicationstatsapi==1.2.1 +requests==2.19.1 +-e git+https://github.com/scieloorg/scieloh5m5.git@1.9.6#egg=scieloh5m5 +thriftpy==0.3.9 +urllib3==1.23 +wsgiref==0.1.2 +-e git+https://github.com/scieloorg/xylose.git@1.35.13#egg=xylose diff --git a/setup.py b/setup.py index 6ebb4ac..9e06469 100644 --- a/setup.py +++ b/setup.py @@ -2,25 +2,28 @@ from setuptools import setup, find_packages install_requires = [ - 'thriftpy==0.3.1', - 'xylose', - 'packtools', - 'django>=1.8.3', + 'thriftpy', + 'packtools<=2.5.3', 'requests', - 'lxml>=3.4.4', - 'doaj_client' + 'lxml', + 'doaj_client', + 'scieloh5m5', + 'xylose', + 'articlemetaapi<=1.26.5', + 
'publicationstatsapi>=1.2.1', + 'accessstatsapi>=1.2.1', + 'citedbyapi>=1.11.3', + 'legendarium>=2.0.2', ] tests_require = [] setup( name="processing", - version="0.3.1", + version="1.32.7", description="SciELO processing modules for analytics, access statistics, etc", author="SciELO", author_email="scielo-dev@googlegroups.com", - maintainer="Fabio Batalha", - maintainer_email="fabio.batalha@scielo.org", url="http://github.com/scieloorg/processing", packages=find_packages(), include_package_data=True, @@ -31,8 +34,8 @@ "Programming Language :: Python :: 2.7", ], dependency_links=[ - "git+https://git@github.com/scieloorg/xylose.git@v1.0.1#egg=xylose", - "git+https://github.com/fabiobatalha/doaj_client@0.1#egg=doaj_client" + "git+https://github.com/fabiobatalha/doaj_client@0.2#egg=doaj_client", + "git+https://github.com/scieloorg/xylose@1.35.8#egg=xylose", ], tests_require=tests_require, test_suite='tests', @@ -43,6 +46,7 @@ processing_accesses_documents_by_journals=accesses.documents_by_journals:main processing_publication_documents_languages=publication.documents_languages:main processing_publication_documents_affiliations=publication.documents_affiliations:main + processing_publication_documents_affiliations_nationality=publication.documents_affiliations_nationality:main processing_publication_documents_authors=publication.documents_authors:main processing_publication_documents_counts=publication.documents_counts:main processing_publication_documents_licenses=publication.documents_licenses:main @@ -55,11 +59,14 @@ processing_export_xmlrsps=export.xml_rsps:main processing_export_normalize_affiliations=export.normalize_affiliations:main processing_export_natural_keys=export.natural_keys:main + processing_export_crossref=export.crossref:main processing_export_doaj=export.exdoaj:main processing_export_doaj_journals=export.doaj_journals:main processing_export_kbart=export.kbart:main + processing_export_dumparticles=export.dump_articles:main 
processing_export_search_update_indicators=export.search_update_indicators:main - processing_bibliometric_citedby=bibliometric.citedby:main + processing_bibliometric_citedby_document=bibliometric.citedby_document:main + processing_bibliometric_citedby_journal=bibliometric.citedby_journal:main processing_bibliometric_impact_factor=bibliometric.impact_factor:main """ ) diff --git a/tests/test_accesses_dumpdata.py b/tests/test_accesses_dumpdata.py index 8931489..092e2b6 100644 --- a/tests/test_accesses_dumpdata.py +++ b/tests/test_accesses_dumpdata.py @@ -7,6 +7,47 @@ class DumpDataTest(unittest.TestCase): + def test_website_2018_urls(self): + + class Journal(object): + def __init__(self): + self.acron = None + + class Issue(object): + def __init__(self): + self.volume = None + self.number = None + self.supplement_volume = None + self.supplement_number = None + + class Document(object): + def __init__(self): + self.journal = Journal() + self.issue = Issue() + self.start_page = None + self.start_page_sequence = None + self.end_page = None + self.elocation = None + self.doi = None + self.publication_date = None + document = Document() + document.journal.acronym = 'abcd' + document.publication_date = '2018' + document.issue.volume = '22' + document.issue.number = '3' + document.issue.supplement_number = '0' + document.start_page = '10' + document.end_page = '11' + document.elocation = 'e707' + document.doi = 'doi1510.bla1' + document.issue.order = '12345' + result = dumpdata.website_2018_urls(document) + self.assertEqual( + result, + [u'/article/abcd/2018.v22n3suppl0/e707/', + u'/pdf/abcd/2018.v22n3suppl0/e707/'] + ) + def test_pdf_keys(self): data = { 'html': { @@ -21,8 +62,8 @@ def test_pdf_keys(self): result = dumpdata.pdf_keys(data) self.assertEqual( - sorted(result), - sorted(['/PDF/ABCD/V22N3/EN_V22N3A01.PDF', '/PDF/ABCD/V22N3/V22N3A01.PDF']) + sorted(result), + sorted(['/pdf/abcd/v22n3/v22n3a01.pdf', '/pdf/abcd/v22n3/en_v22n3a01.pdf']) ) def 
test_pdf_keys_without_pdf(self): @@ -333,17 +374,18 @@ def test_join_metadata_with_accesses(self): 'html': 1, 'pdf': 10 } - + result = dumpdata.join_metadata_with_accesses(article, '2012-01-08', accesses) expected = { 'id': 'scl_S0102-67202009000300001', 'languages': ['pt'], 'issn': '0102-6720', + 'issns': {'0102-6720'}, 'document_type': 'research-article', 'aff_countries': ['undefined'], 'document_title': 'An\u00e1lise de custos entre a raquianestesia e a anestesia venosa com propofol associada ao bloqueio perianal local em opera\u00e7\u00f5es anorretais', - 'issue_title': 'ABCD, arq. bras. cir. dig., n.22 v.3, 2009', + 'issue_title': 'ABCD, arq. bras. cir. dig., 2009, v22n3', 'access_total': 14, 'access_abstract': 3, 'access_html': 1, @@ -357,10 +399,12 @@ def test_join_metadata_with_accesses(self): 'pid': 'S0102-67202009000300001', 'collection': 'scl', 'publication_year': '2009', + 'publication_date_at_scielo': '2010-05-14', + 'journal_current_status': 'current', 'journal_title': 'ABCD. 
Arquivos Brasileiros de Cirurgia Digestiva (S\u00e3o Paulo)', 'processing_date': '2010-05-14', 'publication_date': '2009-09', - 'issue': 'S0102-672020090003' + 'issue': '0102-672020090003' } self.assertEqual(sorted([k+str(v) for k, v in expected.items()]), sorted([k+str(v) for k, v in result.items()])) diff --git a/tests/test_bibliometric.py b/tests/test_bibliometric.py new file mode 100644 index 0000000..cab23d2 --- /dev/null +++ b/tests/test_bibliometric.py @@ -0,0 +1,827 @@ +import unittest + +from bibliometric import citedby_journal + + +class TestBibliometric(unittest.TestCase): + + def test_compute_citations(self): + + query_result = { + "took": 1857, + "hits": { + "max_score": 0.0, + "hits": [], + "total": 195 + }, + "aggregations": { + "publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "2012" + }, + { + "doc_count": 2, + "key": "2011" + }, + { + "doc_count": 1, + "key": "2010" + }, + { + "doc_count": 2, + "key": "2008" + }, + { + "doc_count": 3, + "key": "2007" + }, + { + "doc_count": 2, + "key": "2005" + }, + { + "doc_count": 1, + "key": "2003" + }, + { + "doc_count": 1, + "key": "2001" + }, + { + "doc_count": 1, + "key": "1998" + }, + { + "doc_count": 1, + "key": "1997" + }, + { + "doc_count": 1, + "key": "1993" + }, + { + "doc_count": 1, + "key": "1990" + }, + { + "doc_count": 3, + "key": "1988" + }, + { + "doc_count": 1, + "key": "1986" + }, + { + "doc_count": 2, + "key": "1980" + }, + { + "doc_count": 1, + "key": "1979" + }, + { + "doc_count": 1, + "key": "1973" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 25, + "key": "2012" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "2013" + }, + { + "doc_count": 3, + "key": "2012" + }, + { + "doc_count": 1, + "key": "2006" + }, + { + "doc_count": 1, + "key": "2005" + }, + { + "doc_count": 1, + "key": 
"2004" + }, + { + "doc_count": 1, + "key": "2002" + }, + { + "doc_count": 1, + "key": "1998" + }, + { + "doc_count": 2, + "key": "1996" + }, + { + "doc_count": 1, + "key": "1995" + }, + { + "doc_count": 2, + "key": "1993" + }, + { + "doc_count": 5, + "key": "1992" + }, + { + "doc_count": 1, + "key": "1989" + }, + { + "doc_count": 1, + "key": "1988" + }, + { + "doc_count": 1, + "key": "1981" + }, + { + "doc_count": 1, + "key": "1979" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 23, + "key": "2015" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 2, + "key": "2011" + }, + { + "doc_count": 2, + "key": "2009" + }, + { + "doc_count": 1, + "key": "2008" + }, + { + "doc_count": 1, + "key": "2006" + }, + { + "doc_count": 1, + "key": "2005" + }, + { + "doc_count": 2, + "key": "2000" + }, + { + "doc_count": 2, + "key": "1997" + }, + { + "doc_count": 1, + "key": "1994" + }, + { + "doc_count": 1, + "key": "1993" + }, + { + "doc_count": 1, + "key": "1988" + }, + { + "doc_count": 1, + "key": "1986" + }, + { + "doc_count": 1, + "key": "1984" + }, + { + "doc_count": 1, + "key": "1981" + }, + { + "doc_count": 1, + "key": "1980" + }, + { + "doc_count": 2, + "key": "1974" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 20, + "key": "2013" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 2, + "key": "2012" + }, + { + "doc_count": 3, + "key": "2010" + }, + { + "doc_count": 1, + "key": "2009" + }, + { + "doc_count": 1, + "key": "2008" + }, + { + "doc_count": 1, + "key": "2006" + }, + { + "doc_count": 2, + "key": "2002" + }, + { + "doc_count": 1, + "key": "1989" + }, + { + "doc_count": 2, + "key": "1988" + }, + { + "doc_count": 3, + "key": "1984" + }, + { + "doc_count": 2, + "key": "1980" + }, + { + "doc_count": 1, + "key": "1972" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 19, + "key": "2014" + }, + { + 
"reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 2, + "key": "2013" + }, + { + "doc_count": 1, + "key": "2012" + }, + { + "doc_count": 3, + "key": "2011" + }, + { + "doc_count": 1, + "key": "2010" + }, + { + "doc_count": 2, + "key": "2009" + }, + { + "doc_count": 3, + "key": "2005" + }, + { + "doc_count": 1, + "key": "2004" + }, + { + "doc_count": 1, + "key": "2003" + }, + { + "doc_count": 1, + "key": "1998" + }, + { + "doc_count": 1, + "key": "1997" + }, + { + "doc_count": 1, + "key": "1995" + }, + { + "doc_count": 1, + "key": "1959" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 18, + "key": "2016" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "1996" + }, + { + "doc_count": 1, + "key": "1994" + }, + { + "doc_count": 2, + "key": "1990" + }, + { + "doc_count": 2, + "key": "1989" + }, + { + "doc_count": 1, + "key": "1988" + }, + { + "doc_count": 1, + "key": "1985" + }, + { + "doc_count": 1, + "key": "1984" + }, + { + "doc_count": 1, + "key": "1980" + }, + { + "doc_count": 1, + "key": "1975" + }, + { + "doc_count": 1, + "key": "1970" + }, + { + "doc_count": 1, + "key": "1968" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 13, + "key": "2007" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "2008" + }, + { + "doc_count": 1, + "key": "2006" + }, + { + "doc_count": 1, + "key": "2005" + }, + { + "doc_count": 1, + "key": "2002" + }, + { + "doc_count": 2, + "key": "2001" + }, + { + "doc_count": 1, + "key": "1994" + }, + { + "doc_count": 1, + "key": "1988" + }, + { + "doc_count": 1, + "key": "1985" + }, + { + "doc_count": 1, + "key": "1984" + }, + { + "doc_count": 1, + "key": "1982" + }, + { + "doc_count": 2, + "key": "1974" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 13, + "key": "2011" + }, + { + "reference_publication_year": { + 
"sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "2006" + }, + { + "doc_count": 1, + "key": "2003" + }, + { + "doc_count": 1, + "key": "2001" + }, + { + "doc_count": 1, + "key": "2000" + }, + { + "doc_count": 1, + "key": "1998" + }, + { + "doc_count": 1, + "key": "1997" + }, + { + "doc_count": 1, + "key": "1995" + }, + { + "doc_count": 1, + "key": "1993" + }, + { + "doc_count": 1, + "key": "1991" + }, + { + "doc_count": 1, + "key": "1989" + }, + { + "doc_count": 1, + "key": "1984" + }, + { + "doc_count": 1, + "key": "1980" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 12, + "key": "2010" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "2008" + }, + { + "doc_count": 2, + "key": "2007" + }, + { + "doc_count": 1, + "key": "2005" + }, + { + "doc_count": 1, + "key": "2004" + }, + { + "doc_count": 1, + "key": "2002" + }, + { + "doc_count": 1, + "key": "1998" + }, + { + "doc_count": 2, + "key": "1990" + }, + { + "doc_count": 1, + "key": "1989" + }, + { + "doc_count": 1, + "key": "1968" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 11, + "key": "2009" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "1989" + }, + { + "doc_count": 1, + "key": "1988" + }, + { + "doc_count": 2, + "key": "1986" + }, + { + "doc_count": 2, + "key": "1975" + }, + { + "doc_count": 2, + "key": "1974" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 8, + "key": "1999" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "1994" + }, + { + "doc_count": 2, + "key": "1991" + }, + { + "doc_count": 1, + "key": "1989" + }, + { + "doc_count": 1, + "key": "1986" + }, + { + "doc_count": 1, + "key": "1979" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 6, + "key": "1998" + }, + { + "reference_publication_year": { + 
"sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "2003" + }, + { + "doc_count": 1, + "key": "1996" + }, + { + "doc_count": 1, + "key": "1995" + }, + { + "doc_count": 1, + "key": "1994" + }, + { + "doc_count": 1, + "key": "1991" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 6, + "key": "2006" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "1999" + }, + { + "doc_count": 1, + "key": "1990" + }, + { + "doc_count": 1, + "key": "1986" + }, + { + "doc_count": 1, + "key": "1974" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 5, + "key": "2000" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "2004" + }, + { + "doc_count": 1, + "key": "2003" + }, + { + "doc_count": 1, + "key": "1994" + }, + { + "doc_count": 1, + "key": "1981" + }, + { + "doc_count": 1, + "key": "1964" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 5, + "key": "2008" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "1995" + }, + { + "doc_count": 1, + "key": "1992" + }, + { + "doc_count": 1, + "key": "1983" + }, + { + "doc_count": 1, + "key": "1971" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 4, + "key": "2004" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "1989" + }, + { + "doc_count": 1, + "key": "1979" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 2, + "key": "2001" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "1996" + }, + { + "doc_count": 1, + "key": "1989" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 2, + "key": "2005" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + 
"key": "1994" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 1, + "key": "1997" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "1972" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 1, + "key": "2002" + }, + { + "reference_publication_year": { + "sum_other_doc_count": 0, + "buckets": [ + { + "doc_count": 1, + "key": "1992" + } + ], + "doc_count_error_upper_bound": 0 + }, + "doc_count": 1, + "key": "2003" + } + ], + "doc_count_error_upper_bound": 0 + } + }, + "timed_out": False, + "_shards": { + "failed": 0, + "total": 5, + "successful": 5 + } + } + + result = citedby_journal.compute_citations(query_result) + + self.assertEqual([('2012', ('2012', 1)), ('2012', ('2011', 2)), ('2012', ('2010', 1)), ('2012', ('2008', 2)), ('2012', ('2007', 3)), ('2012', ('2005', 2)), ('2012', ('2003', 1)), ('2012', ('2001', 1)), ('2012', ('1998', 1)), ('2012', ('1997', 1)), ('2012', ('1993', 1)), ('2012', ('1990', 1)), ('2012', ('1988', 3)), ('2012', ('1986', 1)), ('2012', ('1980', 2)), ('2012', ('1979', 1)), ('2012', ('1973', 1)), ('2015', ('2013', 1)), ('2015', ('2012', 3)), ('2015', ('2006', 1)), ('2015', ('2005', 1)), ('2015', ('2004', 1)), ('2015', ('2002', 1)), ('2015', ('1998', 1)), ('2015', ('1996', 2)), ('2015', ('1995', 1)), ('2015', ('1993', 2)), ('2015', ('1992', 5)), ('2015', ('1989', 1)), ('2015', ('1988', 1)), ('2015', ('1981', 1)), ('2015', ('1979', 1)), ('2013', ('2011', 2)), ('2013', ('2009', 2)), ('2013', ('2008', 1)), ('2013', ('2006', 1)), ('2013', ('2005', 1)), ('2013', ('2000', 2)), ('2013', ('1997', 2)), ('2013', ('1994', 1)), ('2013', ('1993', 1)), ('2013', ('1988', 1)), ('2013', ('1986', 1)), ('2013', ('1984', 1)), ('2013', ('1981', 1)), ('2013', ('1980', 1)), ('2013', ('1974', 2)), ('2014', ('2012', 2)), ('2014', ('2010', 3)), ('2014', ('2009', 1)), ('2014', ('2008', 1)), ('2014', ('2006', 1)), ('2014', ('2002', 2)), ('2014', ('1989', 1)), ('2014', ('1988', 
2)), ('2014', ('1984', 3)), ('2014', ('1980', 2)), ('2014', ('1972', 1)), ('2016', ('2013', 2)), ('2016', ('2012', 1)), ('2016', ('2011', 3)), ('2016', ('2010', 1)), ('2016', ('2009', 2)), ('2016', ('2005', 3)), ('2016', ('2004', 1)), ('2016', ('2003', 1)), ('2016', ('1998', 1)), ('2016', ('1997', 1)), ('2016', ('1995', 1)), ('2016', ('1959', 1)), ('2007', ('1996', 1)), ('2007', ('1994', 1)), ('2007', ('1990', 2)), ('2007', ('1989', 2)), ('2007', ('1988', 1)), ('2007', ('1985', 1)), ('2007', ('1984', 1)), ('2007', ('1980', 1)), ('2007', ('1975', 1)), ('2007', ('1970', 1)), ('2007', ('1968', 1)), ('2011', ('2008', 1)), ('2011', ('2006', 1)), ('2011', ('2005', 1)), ('2011', ('2002', 1)), ('2011', ('2001', 2)), ('2011', ('1994', 1)), ('2011', ('1988', 1)), ('2011', ('1985', 1)), ('2011', ('1984', 1)), ('2011', ('1982', 1)), ('2011', ('1974', 2)), ('2010', ('2006', 1)), ('2010', ('2003', 1)), ('2010', ('2001', 1)), ('2010', ('2000', 1)), ('2010', ('1998', 1)), ('2010', ('1997', 1)), ('2010', ('1995', 1)), ('2010', ('1993', 1)), ('2010', ('1991', 1)), ('2010', ('1989', 1)), ('2010', ('1984', 1)), ('2010', ('1980', 1)), ('2009', ('2008', 1)), ('2009', ('2007', 2)), ('2009', ('2005', 1)), ('2009', ('2004', 1)), ('2009', ('2002', 1)), ('2009', ('1998', 1)), ('2009', ('1990', 2)), ('2009', ('1989', 1)), ('2009', ('1968', 1)), ('1999', ('1989', 1)), ('1999', ('1988', 1)), ('1999', ('1986', 2)), ('1999', ('1975', 2)), ('1999', ('1974', 2)), ('1998', ('1994', 1)), ('1998', ('1991', 2)), ('1998', ('1989', 1)), ('1998', ('1986', 1)), ('1998', ('1979', 1)), ('2006', ('2003', 1)), ('2006', ('1996', 1)), ('2006', ('1995', 1)), ('2006', ('1994', 1)), ('2006', ('1991', 1)), ('2000', ('1999', 1)), ('2000', ('1990', 1)), ('2000', ('1986', 1)), ('2000', ('1974', 1)), ('2008', ('2004', 1)), ('2008', ('2003', 1)), ('2008', ('1994', 1)), ('2008', ('1981', 1)), ('2008', ('1964', 1)), ('2004', ('1995', 1)), ('2004', ('1992', 1)), ('2004', ('1983', 1)), ('2004', ('1971', 1)), ('2001', 
('1989', 1)), ('2001', ('1979', 1)), ('2005', ('1996', 1)), ('2005', ('1989', 1)), ('1997', ('1994', 1)), ('2002', ('1972', 1)), ('2003', ('1992', 1))], result) diff --git a/tests/test_clients.py b/tests/test_clients.py index 1585b5e..4c6bf94 100644 --- a/tests/test_clients.py +++ b/tests/test_clients.py @@ -6,199 +6,6 @@ class ThirftClientsTest(unittest.TestCase): - def test_compute_documents_languages_by_year(self): - - publicationtats = publicationstats_server() - - query_result = { - "hits": { - "hits": [], - "total": 19, - "max_score": 0.0 - }, - "timed_out": False, - "took": 3, - "aggregations": { - "publication_year": { - "buckets": [ - { - "languages": { - "buckets": [ - { - "key": "pt", - "doc_count": 9 - }, - { - "key": "en", - "doc_count": 8 - }, - { - "key": "fr", - "doc_count": 1 - } - ], - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0 - }, - "key": "2016", - "doc_count": 9 - }, - { - "languages": { - "buckets": [ - { - "key": "pt", - "doc_count": 10 - }, - { - "key": "en", - "doc_count": 8 - }, - { - "key": "fr", - "doc_count": 2 - } - ], - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0 - }, - "key": "2015", - "doc_count": 10 - } - ], - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0 - } - }, - "_shards": { - "successful": 5, - "failed": 0, - "total": 5 - } - } - - expected = { - "2015": { - "other": 2, - "en": 8, - "es": 0, - "pt": 10 - }, - "2014": { - "other": 0, - "en": 0, - "es": 0, - "pt": 0 - }, - "2016": { - "other": 1, - "en": 8, - "es": 0, - "pt": 9 - }, - "2013": { - "other": 0, - "en": 0, - "es": 0, - "pt": 0 - }, - "2012": { - "other": 0, - "en": 0, - "es": 0, - "pt": 0 - } - } - - result = publicationtats._compute_documents_languages_by_year( - query_result, years=5) - - self.assertEqual(expected, result) - - def test_compute_compute_number_of_issues_by_year_0(self): - - publicationtats = publicationstats_server() - - query_result = { - "took": 18, - "timed_out": False, - "_shards": { - "total": 
5, - "successful": 5, - "failed": 0 - }, - "hits": { - "total": 821, - "max_score": 0, - "hits": [] - }, - "aggregations": { - "issue": { - "value": 82 - } - } - } - - expected = 82 - - result = publicationtats._compute_number_of_issues_by_year( - query_result, years=0) - - self.assertEqual(expected, result) - - def test_compute_compute_number_of_issues_by_year(self): - - publicationtats = publicationstats_server() - - query_result = { - "hits": { - "hits": [], - "total": 19, - "max_score": 0.0 - }, - "timed_out": False, - "took": 826, - "aggregations": { - "publication_year": { - "buckets": [ - { - "issue": { - "value": 1 - }, - "key": "2016", - "doc_count": 9 - }, - { - "issue": { - "value": 2 - }, - "key": "2015", - "doc_count": 20 - } - ], - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0 - } - }, - "_shards": { - "successful": 5, - "failed": 0, - "total": 5 - } - } - - expected = [ - ('2016', 1), - ('2015', 2), - ('2014', 0), - ('2013', 0), - ('2012', 0) - ] - - result = publicationtats._compute_number_of_issues_by_year(query_result, years=5) - - self.assertEqual(expected, result) - def test_compute_last_included_document_by_journal_without_data(self): publicationtats = publicationstats_server() @@ -434,7 +241,6 @@ def test_compute_first_included_document_by_journal(self): self.assertEqual(expected, result) - def test_compute_access_lifetime(self): accessstats = accessstats_server() diff --git a/thrift/access_stats.thrift b/thrift/access_stats.thrift deleted file mode 100644 index 5b6aeef..0000000 --- a/thrift/access_stats.thrift +++ /dev/null @@ -1,22 +0,0 @@ -exception ServerError { - 1: string message, -} - -exception ValueError { - 1: string message, -} - -struct filters { - 1: string param, - 2: string value -} - -struct kwargs { - 1: string key, - 2: string value, -} - -service AccessStats { - string document(1:string code, 2: string collection) throws (1:ValueError value_err, 2:ServerError server_err), - string search(1: string body, 2: 
optional list parameters) throws (1:ValueError value_err, 2:ServerError server_err) -} \ No newline at end of file diff --git a/thrift/articlemeta.thrift b/thrift/articlemeta.thrift deleted file mode 100644 index 7564b2d..0000000 --- a/thrift/articlemeta.thrift +++ /dev/null @@ -1,58 +0,0 @@ -exception ValueError { - 1: string message, -} - -exception ServerError { - 1: string message, -} - -struct collection { - 1: string code - 2: string acronym - 3: string acronym2letters - 4: string status - 5: string domain - 6: string name - 7: bool has_analytics -} - -struct journal_identifiers { - 1: list code, - 2: string collection -} - -struct article_identifiers { - 1: string code, - 2: string collection, - 3: string processing_date, - 4: string aid, - 5: string doi, -} - -struct event_document { - 1: string code, - 2: string collection, - 3: string event, - 4: string date -} - -struct event_journal { - 1: list code, - 2: string collection, - 3: string event, - 4: string date -} - -service ArticleMeta { - list article_history_changes(1: string collection, 2: string event, 3: string code, 4: string from_date, 5: string until_date, 6: i32 limit, 7: i32 offset) throws (1: ValueError value_err, 2:ServerError server_err), - list journal_history_changes(1: string collection, 2: string event, 3: string code, 4: string from_date, 5: string until_date, 6:i32 limit, 7: i32 offset) throws (1: ValueError value_err, 2:ServerError server_err), - collection get_collection(1: string code) throws (1: ValueError value_err, 2:ServerError server_err), - string get_article(1: string code, 2: string collection, 3: bool replace_journal_metadata, 4: string fmt, 5: bool body) throws (1: ValueError value_err, 2:ServerError server_err), - string get_journal(1: string code, 2: string collection) throws (1: ValueError value_err, 2:ServerError server_err), - list get_journal_identifiers(1: optional string collection, 2: i32 limit, 3: i32 offset, 4: optional string extra_filter) throws (1: ValueError 
value_err, 2:ServerError server_err), - list get_article_identifiers(1: optional string collection, 2: optional string issn, 3: optional string from_date, 4: optional string until_date, 5: i32 limit, 6: i32 offset, 7: optional string extra_filter) throws (1:ValueError value_err, 2:ServerError server_err), - list get_collection_identifiers() throws(1: ServerError server_err), - bool set_doaj_id(1: string code, 2: string collection, 3: string doaj_id) throws (1: ValueError value_err, 2:ServerError server_err), - bool set_aid(1: string code, 2: string collection, 3: string aid) throws (1: ValueError value_err, 2:ServerError server_err), - bool exists_article(1: string code, 2: string collection) throws (1: ValueError value_err, 2:ServerError server_err) -} diff --git a/thrift/citedby.thrift b/thrift/citedby.thrift deleted file mode 100644 index a15929c..0000000 --- a/thrift/citedby.thrift +++ /dev/null @@ -1,11 +0,0 @@ -exception ServerError{ - 1: string message; -} - -service Citedby{ - string citedby_pid(1:required string q, 2:bool metaonly) throws (1:ServerError error_message) - - string citedby_doi(1:required string q, 2:bool metaonly) throws (1:ServerError error_message) - - string citedby_meta(1:required string title, 2:string author_surname, 3:i32 year, 4:bool metaonly) throws (1:ServerError error_message) -} \ No newline at end of file diff --git a/thrift/clients.py b/thrift/clients.py index df409cf..1b5bcd0 100644 --- a/thrift/clients.py +++ b/thrift/clients.py @@ -5,9 +5,16 @@ import logging from datetime import date +from articlemeta.client import ThriftClient as ArticleMetaThriftClient +from citedby.client import ThriftClient as CitedByThriftClient +from accessstats.client import ThriftClient as AccessesThriftClient +from publicationstats.client import ThriftClient as PublicationThriftClient +from citedby.custom_query import journal_titles from thriftpy.rpc import make_client from xylose.scielodocument import Article, Journal +import utils + LIMIT = 1000 
logger = logging.getLogger(__name__) @@ -15,18 +22,6 @@ ratchet_thrift = thriftpy.load( os.path.join(os.path.dirname(__file__))+'/ratchet.thrift') -articlemeta_thrift = thriftpy.load( - os.path.join(os.path.dirname(__file__))+'/articlemeta.thrift') - -citedby_thrift = thriftpy.load( - os.path.join(os.path.dirname(__file__))+'/citedby.thrift') - -accessstats_thrift = thriftpy.load( - os.path.join(os.path.dirname(__file__))+'/access_stats.thrift') - -publication_stats_thrift = thriftpy.load( - os.path.join(os.path.dirname(__file__))+'/publication_stats.thrift') - class ServerError(Exception): def __init__(self, message=None): @@ -36,24 +31,7 @@ def __str__(self): return repr(self.message) -class AccessStats(object): - - def __init__(self, address, port): - """ - Cliente thrift para o Access Stats. - """ - self._address = address - self._port = port - - @property - def client(self): - - client = make_client( - accessstats_thrift.AccessStats, - self._address, - self._port - ) - return client +class AccessStats(AccessesThriftClient): def _compute_access_lifetime(self, query_result): @@ -149,34 +127,200 @@ def access_lifetime(self, issn, collection, raw=False): } query_parameters = [ - accessstats_thrift.kwargs('size', '0') + ('size', '0') ] - query_result = json.loads(self.client.search(json.dumps(body), query_parameters)) + query_result = self.search(json.dumps(body), query_parameters) computed = self._compute_access_lifetime(query_result) return query_result if raw else computed + def journal_access_monthnyear(self, issn): -class PublicationStats(object): + body = { + "query": { + "match": { + "issn": issn + } + }, + "aggs": { + "access_year": { + "terms": { + "field": "access_year", + "size": 0 + }, + "aggs": { + "access_month": { + "terms": { + "field": "access_month", + "size": 0 + }, + "aggs": { + "access_total": { + "sum": { + "field": "access_total" + } + }, + "access_epdf": { + "sum": { + "field": "access_epdf" + } + }, + "access_pdf": { + "sum": { + "field": 
"access_pdf" + } + }, + "access_html": { + "sum": { + "field": "access_html" + } + }, + "access_abstract": { + "sum": { + "field": "access_abstract" + } + } + } + } + } + } + } + } - def __init__(self, address, port): - """ - Cliente thrift para o PublicationStats. - """ - self._address = address - self._port = port + query_parameters = [ + ('size', '1000') + ] - @property - def client(self): - client = make_client( - publication_stats_thrift.PublicationStats, - self._address, - self._port - ) + query_result = self.search(json.dumps(body), query_parameters) + + return query_result + + def collection_access_monthnyear(self, collection): + + body = { + "query": { + "match": { + "collection": collection + } + }, + "aggs": { + "access_year": { + "terms": { + "field": "access_year", + "size": 0 + }, + "aggs": { + "access_month": { + "terms": { + "field": "access_month", + "size": 0 + }, + "aggs": { + "access_total": { + "sum": { + "field": "access_total" + } + }, + "access_epdf": { + "sum": { + "field": "access_epdf" + } + }, + "access_pdf": { + "sum": { + "field": "access_pdf" + } + }, + "access_html": { + "sum": { + "field": "access_html" + } + }, + "access_abstract": { + "sum": { + "field": "access_abstract" + } + } + } + } + } + } + } + } + + query_parameters = [ + ('size', '1000') + ] + + query_result = self.search(json.dumps(body), query_parameters) + + return query_result + + def document_access_monthnyear(self, code): + + body = { + "query": { + "match": { + "id": code + } + }, + "aggs": { + "access_year": { + "terms": { + "field": "access_year", + "size": 0 + }, + "aggs": { + "access_month": { + "terms": { + "field": "access_month", + "size": 0 + }, + "aggs": { + "access_total": { + "sum": { + "field": "access_total" + } + }, + "access_epdf": { + "sum": { + "field": "access_epdf" + } + }, + "access_pdf": { + "sum": { + "field": "access_pdf" + } + }, + "access_html": { + "sum": { + "field": "access_html" + } + }, + "access_abstract": { + "sum": { + "field": 
"access_abstract" + } + } + } + } + } + } + } + } + + query_parameters = [ + ('size', '0') + ] + + query_result = self.search(json.dumps(body), query_parameters) + + return query_result - return client + +class PublicationStats(PublicationThriftClient): def _compute_documents_languages_by_year(self, query_result, years=0): @@ -245,15 +389,16 @@ def documents_languages_by_year(self, issn, collection, years=0): } query_parameters = [ - publication_stats_thrift.kwargs('size', '0') + ('size', '0') ] - query_result = json.loads(self.client.search('article', json.dumps(body), query_parameters)) + query_result = self.search('article', json.dumps(body), query_parameters) return self._compute_documents_languages_by_year(query_result, years=years) def _compute_number_of_articles_by_year(self, query_result, years=0): + if years == 0: return query_result['aggregations']['id']['value'] @@ -265,7 +410,7 @@ def _compute_number_of_articles_by_year(self, query_result, years=0): if not item['key'] in years: continue - years[item['key']] = item.get('id', {}).get('value', 0) + years[item['key']] = item.get('doc_count', 0) return [(k, v) for k, v in sorted(years.items(), reverse=True)] @@ -340,10 +485,10 @@ def number_of_articles_by_year(self, issn, collection, document_types=None, year } query_parameters = [ - publication_stats_thrift.kwargs('size', '0') + ('size', '0') ] - query_result = json.loads(self.client.search('article', json.dumps(body), query_parameters)) + query_result = self.search('article', json.dumps(body), query_parameters) return self._compute_number_of_articles_by_year(query_result, years=years) @@ -419,11 +564,12 @@ def number_of_issues_by_year(self, issn, collection, years=0, type=None): } query_parameters = [ - publication_stats_thrift.kwargs('size', '0') + ('size', '0') ] - query_result = json.loads(self.client.search( - 'article', json.dumps(body), query_parameters)) + query_result = self.search( + 'article', json.dumps(body), query_parameters + ) return 
self._compute_number_of_issues_by_year( query_result, years=years) @@ -473,10 +619,10 @@ def first_included_document_by_journal(self, issn, collection): } query_parameters = [ - publication_stats_thrift.kwargs('size', '1') + ('size', '1') ] - query_result = json.loads(self.client.search('article', json.dumps(body), query_parameters)) + query_result = self.search('article', json.dumps(body), query_parameters) return self._compute_first_included_document_by_journal(query_result) @@ -530,198 +676,174 @@ def last_included_document_by_journal(self, issn, collection, metaonly=False): } query_parameters = [ - publication_stats_thrift.kwargs('size', '1') + ('size', '1') ] - query_result = json.loads(self.client.search('article', json.dumps(body), query_parameters)) + query_result = self.search('article', json.dumps(body), query_parameters) return self._compute_last_included_document_by_journal(query_result) -class Citedby(object): +class Citedby(CitedByThriftClient): - def __init__(self, address, port): - """ - Cliente thrift para o Citedby. - """ - self._address = address - self._port = port + def publication_and_citing_years(self, issn, titles, py_range=None): - @property - def client(self): - client = make_client( - citedby_thrift.Citedby, - self._address, - self._port - ) + body = {"query": {"filtered": {}}} - return client + fltr = { + "filter": { + "bool": { + "must": [] - def citedby_pid(self, code, metaonly=False): - """ - Metodo que faz a interface com o metodo de mesmo nome na interface - thrift, atribuindo metaonly default como FALSE. 
- """ + } + } + } - data = self.client.citedby_pid(code, metaonly) + if py_range: + fltr["filter"]["bool"]['must'].append( + { + "range": { + "publication_year": { + "gte": py_range[0], + "lte": py_range[1] + } + } + } + ) - return data + query = { + "query": { + "bool": { + "should": [], + "must_not": [] + } + } + } - def citedby_meta(self, title, author_surname, year, metaonly=False): + aggs = { + "aggs": { + "publication_year": { + "terms": { + "field": "publication_year", + "size": 0 + }, + "aggs": { + "reference_publication_year": { + "terms": { + "field": "reference_publication_year", + "size": 0, + "order": { + "_term": "desc" + } + } + } + } + } + } + } + + for item in self._fuzzy_custom_query(issn, titles): + query['query']['bool']['should'].append(item) + + for item in self._must_not_custom_query(issn): + query['query']['bool']['must_not'].append(item) + + body['query']['filtered'].update(fltr) + body['query']['filtered'].update(query) + body.update(aggs) + + query_parameters = [ + ('size', '0'), + ('search_type', 'count') + ] + + query_result = self.search(json.dumps(body), query_parameters) + + return query_result + + def has_optmized_journal_queries(self, issn): + + if journal_titles.load(issn): + return True + + return False + + @staticmethod + def _must_not_custom_query(issn): """ - Metodo que faz a interface com o metodo de mesmo nome na interface - thrift, atribuindo metaonly default como FALSE. + Este metodo constroi a lista de filtros por título de periódico que + será aplicada na pesquisa boleana como restrição "must_not". + A lista de filtros é coletada do template de pesquisa customizada + do periódico, quanto este template existir. 
""" - data = self.client.citedby_meta(title, author_surname, year, metaonly) + custom_queries = set([utils.cleanup_string(i) for i in journal_titles.load(issn).get('must_not', [])]) - return data + for item in custom_queries: + query = { + "match": { + "reference_source_cleaned": item + } + } -class Ratchet(object): + yield query - def __init__(self, address, port): + @staticmethod + def _fuzzy_custom_query(issn, titles): """ - Cliente thrift para o Ratchet. + Este metodo constroi a lista de filtros por título de periódico que + será aplicada na pesquisa boleana como match por similaridade "should". + A lista de filtros é coletada do template de pesquisa customizada + do periódico, quanto este template existir. """ - self._address = address - self._port = port - - @property - def client(self): - client = make_client( - ratchet_thrift.RatchetStats, - self._address, - self._port - ) + custom_queries = journal_titles.load(issn).get('should', []) + titles = [{'title': i} for i in titles if i not in [x['title'] for x in custom_queries]] + titles.extend(custom_queries) - return client + for item in titles: - def document(self, code): + if len(item['title'].strip()) == 0: + continue - data = self.client.general(code=code) + query = { + "fuzzy": { + "reference_source_cleaned": { + "value": utils.cleanup_string(item['title']), + "fuzziness": item.get('fuzziness', 3), + "max_expansions": 50 + } + } + } - return data + yield query -class ArticleMeta(object): +class Ratchet(object): def __init__(self, address, port): """ - Cliente thrift para o Articlemeta. + Cliente thrift para o Ratchet. 
""" self._address = address self._port = port @property def client(self): - client = make_client( - articlemeta_thrift.ArticleMeta, + ratchet_thrift.RatchetStats, self._address, self._port ) - return client - - def journals(self, collection=None, issn=None): - offset = 0 - while True: - identifiers = self.client.get_journal_identifiers(collection=collection, issn=issn, limit=LIMIT, offset=offset) - if len(identifiers) == 0: - raise StopIteration - - for identifier in identifiers: - - journal = self.client.get_journal( - code=identifier.code[0], collection=identifier.collection) - - jjournal = json.loads(journal) - - xjournal = Journal(jjournal) - logger.info('Journal loaded: %s_%s' % ( identifier.collection, identifier.code)) - - yield xjournal - - offset += 1000 - - def exists_article(self, code, collection): - try: - return self.client.exists_article( - code, - collection - ) - except: - msg = 'Error checking if document exists: %s_%s' % (collection, code) - raise ServerError(msg) - - def set_doaj_id(self, code, collection, doaj_id): - try: - article = self.client.set_doaj_id( - code, - collection, - doaj_id - ) - except: - msg = 'Error senting doaj id for document: %s_%s' % (collection, code) - raise ServerError(msg) - - def document(self, code, collection, replace_journal_metadata=True, fmt='xylose'): - try: - article = self.client.get_article( - code=code, - collection=collection, - replace_journal_metadata=True, - fmt=fmt - ) - except: - msg = 'Error retrieving document: %s_%s' % (collection, code) - raise ServerError(msg) - - if fmt == 'xylose': - jarticle = None - try: - jarticle = json.loads(article) - except: - msg = 'Fail to load JSON when retrienving document: %s_%s' % (collection, code) - raise ServerError(msg) - - if not jarticle: - logger.warning('Document not found for : %s_%s' % (collection, code)) - return None - - xarticle = Article(jarticle) - logger.info('Document loaded: %s_%s' % (collection, code)) - - return xarticle - - 
logger.info('Document loaded: %s_%s' % (collection, code)) - return article - - def documents(self, collection=None, issn=None, from_date=None, until_date=None, fmt='xylose', extra_filter=None): - offset = 0 - while True: - identifiers = self.client.get_article_identifiers( - collection=collection, issn=issn, from_date=from_date, - until_date=until_date, limit=LIMIT, offset=offset, - extra_filter=extra_filter) - - if len(identifiers) == 0: - raise StopIteration - - for identifier in identifiers: + return client - document = self.document( - code=identifier.code, - collection=identifier.collection, - replace_journal_metadata=True, - fmt=fmt - ) + def document(self, code): - yield document + data = self.client.general(code=code) - offset += 1000 + return data - def collections(self): - return [i for i in self.client.get_collection_identifiers()] +class ArticleMeta(ArticleMetaThriftClient): + pass diff --git a/thrift/publication_stats.thrift b/thrift/publication_stats.thrift deleted file mode 100644 index f28bc42..0000000 --- a/thrift/publication_stats.thrift +++ /dev/null @@ -1,44 +0,0 @@ -exception ServerError { - 1: string message, -} - -exception ValueError { - 1: string message, -} - -struct aggs { - 1: string key, - 2: i32 count, -} - -struct nested_aggs { - 1: string key, - 2: i32 count, - 3: list nested_aggs -} - -struct filters { - 1: string param, - 2: string value -} - -struct kwargs { - 1: string key, - 2: string value, -} - -service PublicationStats { - string journal(1: list aggs, 2: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - list journal_subject_areas(1: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - list journal_collections(1: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - list journal_statuses(1: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - list journal_inclusion_years(1: optional map filters) throws 
(1:ValueError value_err, 2:ServerError server_err), - list document_subject_areas(1: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - string document(1: list aggs, 2: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - list document_collections(1: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - list document_publication_years(1: optional map filters), - list document_languages(1: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - list document_affiliation_countries(1: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - list document_types(1: optional map filters) throws (1:ValueError value_err, 2:ServerError server_err), - string search(1:string doc_type, 2: string body, 3: optional list parameters) throws (1:ValueError value_err, 2:ServerError server_err) -} \ No newline at end of file diff --git a/utils.py b/utils.py index 237cdca..e2ca973 100644 --- a/utils.py +++ b/utils.py @@ -5,8 +5,7 @@ import re import unicodedata import logging - -from django.utils.text import slugify +import string from thrift import clients @@ -18,11 +17,39 @@ logger = logging.getLogger(__name__) REGEX_ISSN = re.compile(r"^[0-9]{4}-[0-9]{3}[0-9xX]$") +TAG_RE = re.compile(r'<[^>]+>') + +def remove_tags(text): + return TAG_RE.sub('', text) -def call_django_slugify(value): - return slugify(value) +def cleanup_string(text): + + try: + nfd_form = unicodedata.normalize('NFD', text.strip().lower()) + except TypeError: + nfd_form = unicodedata.normalize('NFD', unicode(text.strip().lower())) + + cleaned_str = u''.join(x for x in nfd_form if x in string.ascii_letters or x == ' ') + + return remove_tags(cleaned_str).lower() + + +def slugify(value, allow_unicode=False): + """ + Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens. + Remove characters that aren't alphanumerics, underscores, or hyphens. 
+ Convert to lowercase. Also strip leading and trailing whitespace. + """ + value + if allow_unicode: + value = unicodedata.normalize('NFKC', value) + else: + value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') + value = re.sub(r'[^\w\s-]', '', value).strip().lower() + + return re.sub(r'[-\s]+', '-', value) class SingletonMixin(object): @@ -66,7 +93,7 @@ def __init__(self, fp, parser_dep=ConfigParser): @classmethod def from_env(cls): try: - filepath = os.environ['PROCESSING_SETTINGS_FILE'] + filepath = os.environ['PROCESSING_SETTINGS_FILE'] except KeyError: raise ValueError('missing env variable PROCESSING_SETTINGS_FILE') @@ -98,68 +125,31 @@ def items(self): def publicationstats_server(): - try: - server = settings['app:main']['publicationstats_thriftserver'].split(':') - host = server[0] - port = int(server[1]) - except: - logger.warning('Error defining PublicationStats thrift server, assuming default server publicationstats.scielo.org:11620') - host = 'publicationstats.scielo.org' - port = 11620 - - return clients.PublicationStats(host, port) + server = settings['app:main'].get('publicationstats_thriftserver', 'publication.scielo.org:11620') + return clients.PublicationStats(server) def citedby_server(): - try: - server = settings['app:main']['citedby_thriftserver'].split(':') - host = server[0] - port = int(server[1]) - except: - logger.warning('Error defining Citedby thrift server, assuming default server citedby.scielo.org:11610') - host = 'citedby.scielo.org' - port = 11610 - - return clients.Citedby(host, port) + server = settings['app:main'].get('citedby_thriftserver', 'citedby.scielo.org:11610') + return clients.Citedby(domain=server) def ratchet_server(): - try: - server = settings['app:main']['ratchet_thriftserver'].split(':') - host = server[0] - port = int(server[1]) - except: - logger.warning('Error defining Ratchet thrift server, assuming default server ratchet.scielo.org:11630') - host = 'ratchet.scielo.org' - 
port = 11630 - + server = settings['app:main'].get('ratchet_thriftserver', 'ratchet.scielo.org:11630').split(':') + host = server[0] + port = int(server[1]) return clients.Ratchet(host, port) def articlemeta_server(): - try: - server = settings['app:main']['articlemeta_thriftserver'].split(':') - host = server[0] - port = int(server[1]) - except: - logger.warning('Error defining Article Meta thrift server, assuming default server articlemeta.scielo.org:11720') - host = 'articlemeta.scielo.org' - port = 11720 - - return clients.ArticleMeta(host, port) + server = settings['app:main'].get('articlemeta_thriftserver', 'articlemeta.scielo.org:11621') + admintoken = settings['app:main'].get('articlemeta_admintoken', None) + return clients.ArticleMeta(domain=server, admintoken=admintoken) def accessstats_server(): - try: - server = settings['app:main']['accessesstats_thriftserver'].split(':') - host = server[0] - port = int(server[1]) - except: - logger.warning('Error defining Access Stats thrift server, assuming default server accessstats.scielo.org:11660') - host = 'ratchet.scielo.org' - port = 11660 - - return clients.AccessStats(host, port) + server = settings['app:main'].get('accessesstats_thriftserver', 'ratchet.scielo.org:11660') + return clients.AccessStats(server) def is_valid_date(value):