Skip to content

WIP: GBIF EML Profile #268

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pygeometa/schemas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
'dcat': 'pygeometa.schemas.dcat.DCATOutputSchema',
'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema',
'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema',
'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema'
'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema',
'gbif-eml': 'pygeometa.schemas.gbif_eml.GBIF_EMLOutputSchema',
}


Expand Down
192 changes: 192 additions & 0 deletions pygeometa/schemas/gbif_eml/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
import re
from pathlib import Path

from bs4 import BeautifulSoup
from pygeometa.schemas.base import BaseOutputSchema

THISDIR = Path(__file__).parent


def text_or_null(node, strip=False):
    """Return the text of ``node``, or ``None`` when ``node`` is falsy.

    :param node: object exposing a ``.text`` attribute (in this module, the
                 result of a BeautifulSoup ``find()`` call), or ``None``
    :param strip: when true, strip leading/trailing whitespace from the text

    :returns: ``node.text`` (optionally stripped), or ``None``
    """
    if node:
        content = node.text
        return content.strip() if strip else content

    return None


def text_or_empty(node, strip=False):
    """Return the text of ``node``, or an empty string when ``node`` is falsy.

    :param node: object exposing a ``.text`` attribute (in this module, the
                 result of a BeautifulSoup ``find()`` call), or ``None``
    :param strip: when true, strip leading/trailing whitespace from the text

    :returns: ``node.text`` (optionally stripped), or ``""``
    """
    if not node:
        return ""

    value = node.text
    if strip:
        value = value.strip()
    return value


def scrub_dict(d):
    """Recursively drop dictionary entries whose value is ``None``.

    Each value is visited exactly once, so the cost is linear in the size
    of the structure. Dictionaries nested inside lists are scrubbed as well;
    list items themselves are never removed, so list lengths are preserved.
    Empty dictionaries are kept.

    :param d: arbitrary value; only ``dict`` and ``list`` are descended into

    :returns: a scrubbed copy for ``dict``/``list`` input, otherwise ``d``
              itself unchanged
    """
    if isinstance(d, dict):
        return {
            key: scrub_dict(value)
            for key, value in d.items()
            if value is not None
        }

    if isinstance(d, list):
        return [scrub_dict(item) for item in d]

    return d


def to_contact_role(node, role, mapped_role=None):
    """Yield MCF contact entries for every ``role`` element found under ``node``.

    :param node: parsed element to search; must provide ``find_all()``
    :param role: name of the EML element to look for (e.g. ``"creator"``)
    :param mapped_role: key prefix to use in the output; defaults to ``role``

    :returns: generator of ``(key, contact_dict)`` tuples. The first match is
              keyed ``mapped_role``; later matches are keyed
              ``mapped_role_<index>`` so they do not overwrite each other.
    """
    if not mapped_role:
        mapped_role = role

    for idx, contact in enumerate(node.find_all(role)):
        # Join only the name parts that are present, so that contacts with a
        # missing surname/given name (e.g. organisation-only contacts) do not
        # end up with a dangling ", " separator.
        name_parts = (
            text_or_empty(contact.find("surName")),
            text_or_empty(contact.find("givenName")),
        )
        name = ", ".join(part for part in name_parts if part)

        # Fall back to the <role> element when no <positionName> is given
        position = (
            text_or_empty(contact.find("positionName"))
            or text_or_empty(contact.find("role"))
        )

        key = mapped_role + (f"_{idx}" if idx else "")

        yield (
            key,
            {
                "organization": text_or_empty(
                    contact.find("organizationName")
                ),
                "individualname": name,
                "positionname": position,
                "phone": "",
                "url": "",
                "fax": "",
                "address": "",
                "city": "",
                "administrativearea": "",
                "postalcode": "",
                "country": text_or_empty(contact.find("country")),
                "email": text_or_empty(contact.find("electronicMailAddress")),
            },
        )


class GBIF_EMLOutputSchema(BaseOutputSchema):
    """Output schema for EML documents following the GBIF profile."""

    # Matches the first http(s) URL embedded in a free-text string
    _URL_PATTERN = re.compile(r"(?P<url>https?://[^\s]+)")

    def __init__(self):
        """Register the schema under the ``gbif-eml`` name."""
        super().__init__("gbif-eml", "EML - GBIF profile", "xml", THISDIR)

    def import_(self, metadata):
        """Build an MCF dictionary from an EML (GBIF profile) document.

        :param metadata: EML XML content, passed to BeautifulSoup as-is

        :returns: ``dict`` of MCF content with ``None`` values removed
        :raises ValueError: if the document has no ``<dataset>`` element
        """
        soup = BeautifulSoup(metadata, features="lxml-xml")
        dataset = soup.find("dataset")
        if dataset is None:
            # Fail with a clear message instead of an AttributeError on None
            raise ValueError("EML document contains no <dataset> element")

        mcf = {
            "mcf": {
                "version": 1,
            },
            "metadata": {
                "charset": "utf8",
                "hierarchylevel": "dataset",
                "datestamp": "$datetime$",
            },
            "identification": {},
            "contact": {},
            "distribution": {},
        }

        # NOTE(review): when several alternateIdentifier elements exist the
        # last one wins here, while identification.url below uses the first
        # one -- confirm this asymmetry is intended.
        for identifier in dataset.find_all("alternateIdentifier"):
            mcf["metadata"]["identifier"] = text_or_null(identifier)

        if language := dataset.find("language"):
            mcf["metadata"]["language"] = text_or_null(language)

        idf = mcf["identification"]

        idf["title"] = text_or_null(dataset.find("title"))
        idf["abstract"] = text_or_null(dataset.find("abstract"))

        if intellectual_rights := dataset.find("intellectualRights"):
            ulink = intellectual_rights.find("ulink")
            idf["rights"] = {
                "name": text_or_null(intellectual_rights.find("citetitle")),
                # .get() tolerates a <ulink> without a url attribute
                "url": ulink.get("url") if ulink else None,
            }

        idf["url"] = text_or_null(dataset.find("alternateIdentifier"))
        idf["status"] = "completed"

        # TODO: import maintenance/description once a target MCF field for
        # it has been chosen

        idf["maintenancefrequency"] = (
            text_or_null(dataset.find("maintenanceUpdateFrequency"))
            or "unknown"
        )

        idf["dates"] = {"publication": text_or_null(dataset.find("pubDate"))}
        idf["extents"] = {}

        if coords := dataset.find("boundingCoordinates"):
            idf["extents"]["spatial"] = [
                {
                    # bbox order: west, south, east, north
                    "bbox": [
                        float(coords.find("westBoundingCoordinate").text),
                        float(coords.find("southBoundingCoordinate").text),
                        float(coords.find("eastBoundingCoordinate").text),
                        float(coords.find("northBoundingCoordinate").text),
                    ],
                    "crs": "4326",
                    "description": text_or_null(
                        dataset.find("geographicDescription")
                    ),
                }
            ]

        # TODO: import temporal extents (begin / end / resolution)

        idf["keywords"] = {}

        ct = mcf["contact"]

        # (EML element name, MCF contact key); None keeps the element name
        role_mappings = (
            ("contact", "pointOfContact"),
            ("metadataProvider", "distributor"),
            ("creator", None),
            ("personnel", "projectPersonnel"),
        )
        for eml_role, mcf_role in role_mappings:
            for key, contact in to_contact_role(dataset, eml_role, mcf_role):
                ct[key] = contact

        for idx, keyword_set in enumerate(dataset.find_all("keywordSet")):
            thesaurus = text_or_null(keyword_set.find("keywordThesaurus"))
            # keywordThesaurus may be absent; search an empty string then
            # instead of passing None to the regex
            match = self._URL_PATTERN.search(thesaurus or "")
            definition = match.group("url") if match else None

            idf["keywords"][f"default-{idx}"] = {
                "keywords": [
                    text_or_null(kw) for kw in keyword_set.find_all("keyword")
                ],
                "vocabulary": {"name": thesaurus, "url": definition},
            }

        mcf["spatial"] = {"datatype": "vector", "geomtype": "composite"}

        mcf["distribution"] = {
            "file": {
                "url": idf["url"],
                "type": "WWW:LINK",
                "function": "information",
                "description": "",
                "name": "Darwin Core Archive",
            }
        }

        return scrub_dict(mcf)
150 changes: 150 additions & 0 deletions pygeometa/schemas/gbif_eml/main.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:dc="http://purl.org/dc/terms/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
packageId="{{ record['metadata']['dataseturi'] }}" system="http://gbif.org"
scope="system" xml:lang="{{ record['identification']['language'] }}">

<dataset>
<alternateIdentifier>{{ record['identification']['doi' ]}}</alternateIdentifier>
<title xml:lang="{{ record['identification']['language'] }}">{{ record['identification']['title'] }}</title>
{#
<!--creator>
{% include 'person.j2' %}
</creator>
<creator>
{% include 'person.j2' %}
</creator>
<metadataProvider>
{% include 'person.j2' %}
</metadataProvider-->
#}
<pubDate>
{{ record['identification']['dates']['publication'] }}
</pubDate>
<language>{{ record['identification']['language'] }}</language>
<abstract>
<para>{{ record['identification']['abstract'] }}</para>
</abstract>
{% for group, keywords in record['identification']['keywords'].items() %}
<keywordSet>
{% for kw in keywords['keywords'] %}
<keyword>{{ kw }}</keyword>
{% endfor %}
<keywordThesaurus>{{ keywords['vocabulary']['name'] }}: {{ keywords['vocabulary']['url'] }}</keywordThesaurus>
</keywordSet>
{% endfor%}
<intellectualRights>
<para>This work is licensed under a <ulink url="{{ record['identification']['rights']['url'] }}">
<citetitle>{{ record['identification']['rights']['name'] }}</citetitle>
</ulink>.</para>
</intellectualRights>
<distribution scope="document">
<online>
{% for key, value in record['distribution'].items() %}
<url function="{{ value['function'] }}">{{ value['url' ]}}</url>
{% endfor %}
</online>
</distribution>
{% set extents = record['identification']['extents'] %}
<coverage>
  {% if 'spatial' in extents and extents['spatial'] %}
  {# bbox is ordered [west, south, east, north], the order import_ produces #}
  {% set bbox = extents['spatial'][0]['bbox'] %}
  <geographicCoverage>
    <geographicDescription>{{ extents['spatial'][0]['description'] }}</geographicDescription>
    <boundingCoordinates>
      <westBoundingCoordinate>{{ bbox[0] }}</westBoundingCoordinate>
      <eastBoundingCoordinate>{{ bbox[2] }}</eastBoundingCoordinate>
      <northBoundingCoordinate>{{ bbox[3] }}</northBoundingCoordinate>
      <southBoundingCoordinate>{{ bbox[1] }}</southBoundingCoordinate>
    </boundingCoordinates>
  </geographicCoverage>
  {% endif %}
  {% if 'temporal' in extents %}
  <temporalCoverage>
    <rangeOfDates>
      <beginDate>
        <calendarDate>{{ extents['temporal'][0]['begin'] }}</calendarDate>
      </beginDate>
      {% if extents['temporal'][0]['end'] %}
      <endDate>
        <calendarDate>{{ extents['temporal'][0]['end'] }}</calendarDate>
      </endDate>
      {% endif %}
    </rangeOfDates>
  </temporalCoverage>
  {% endif %}
  <taxonomicCoverage>
    <generalTaxonomicCoverage>
    </generalTaxonomicCoverage>
    <taxonomicClassification>
      <taxonRankName></taxonRankName>
      <taxonRankValue></taxonRankValue>
      <commonName></commonName>
    </taxonomicClassification>
  </taxonomicCoverage>
</coverage>
<maintenance>
<description>
<para />
</description>
<maintenanceUpdateFrequency></maintenanceUpdateFrequency>
</maintenance>

{#
<contact>
{% include 'person.j2' %}
</contact>
<methods>
<methodStep>
<description>
<para></para>
</description>
</methodStep>
<sampling>
<studyExtent>
<description>
<para>
</para>
</description>
</studyExtent>
<samplingDescription>
<para></para>
</samplingDescription>
</sampling>
<qualityControl>
<description>
<para></para>
</description>
</qualityControl>
</methods>
<project>
<title></title>
<personnel>
{% include 'person.j2' %}
<role />
</personnel>
<abstract>
<para></para>
</abstract>
<funding>
<para>Artsdatabanken</para>
</funding>
<studyAreaDescription>
<descriptor name="generic" citableClassificationSystem="false">
<descriptorValue></descriptorValue>
</descriptor>
</studyAreaDescription>
</project>
#}
</dataset>
{#
<additionalMetadata>
<metadata>
<gbif>
<dateStamp></dateStamp>
<hierarchyLevel>dataset</hierarchyLevel>
<citation></citation>
<resourceLogoUrl></resourceLogoUrl>
</gbif>
</metadata>
</additionalMetadata>
#}
</eml:eml>
11 changes: 11 additions & 0 deletions pygeometa/schemas/gbif_eml/person.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<individualName>
<givenName>{{ first_name }}</givenName>
<surName>{{ last_name }}</surName>
</individualName>
<organizationName>{{ org_name }}</organizationName>
<positionName>{{ position }}</positionName>
<address>
<country>{{ country }}</country>
</address>
<electronicMailAddress>{{ email }}</electronicMailAddress>
{% if  orcid %}<userId directory="http://orcid.org/">{{ orcid }}</userId>{% endif %}
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ jsonschema
lxml
OWSLib
pyyaml
beautifulsoup4
Loading