FREYA WP2 User Story 4 | As a funder I want to see how many of the research outputs funded by me have an open license enabling reuse, so that I am sure I properly support Open Science. | |
---|---|---|
Funders that support open research are interested in monitoring the extent of open access given to the outputs of grants they award - while the grant is active as well as retrospectively.
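This notebook uses the DataCite GraphQL API to retrieve and report license types of outputs of the following funders to date: DFG (Deutsche Forschungsgemeinschaft, Germany), ANR (Agence Nationale de la Recherche, France) and SNF (Schweizerischer Nationalfonds zur Förderung der Wissenschaftlichen Forschung, Switzerland).

Goal: By the end of this notebook you should be able to:
- Retrieve the license types of the outputs of three different funders via the DataCite GraphQL API;
- Plot an interactive stacked bar plot of output counts per license type, one bar per funder;
- Plot an interactive stacked bar plot of output counts per license type, for each funder and output type (Dataset or Text).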
%%capture
# Install required Python packages
!pip install gql requests numpy plotnine
# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
# HTTP transport pointing at the DataCite GraphQL endpoint; requests are sent as JSON.
_transport = RequestsHTTPTransport(url='https://api.datacite.org/graphql', use_json=True)

# GraphQL client used by every query in this notebook.
# fetch_schema_from_transport=True asks gql to retrieve the schema from the endpoint
# (see the gql documentation for the validation this enables).
client = Client(transport=_transport, fetch_schema_from_transport=True)
Define the GraphQL query to find all outputs and associated licenses for three different funders: DFG (Deutsche Forschungsgemeinschaft, Germany), ANR (Agence Nationale de la Recherche, France) and SNF (Schweizerischer Nationalfonds zur Förderung der Wissenschaftlichen Forschung, Switzerland).
# Generate the GraphQL query: find all outputs and their associated licenses (where available)
# for three different funders, identified by funder1, funder2 and funder3.

# Single source of truth for the funders covered by this notebook:
# (GraphQL alias / variable name, funder DOI, acronym used as plot label).
# query_params, funderId2Acronym and the query text are all derived from this table,
# so adding or changing a funder only needs to happen here.
_FUNDERS = (
    ("funder1", "https://doi.org/10.13039/501100001659", "DFG"),
    ("funder2", "https://doi.org/10.13039/501100001665", "ANR"),
    ("funder3", "https://doi.org/10.13039/501100001711", "SNF"),
)

# Output types for which an additional per-funder breakdown is requested.
_RESOURCE_TYPES = ("Dataset", "Text")

# GraphQL variable name -> funder DOI (passed as variable values when executing the query).
query_params = {alias: funder_id for alias, funder_id, _ in _FUNDERS}

# Funder DOI -> short acronym shown in the plots.
funderId2Acronym = {funder_id: acronym for _, funder_id, acronym in _FUNDERS}


def _funder_selection(alias, resource_type=None):
    """Return the GraphQL selection for one funder.

    alias         -- name of the GraphQL variable holding the funder id, e.g. 'funder1'.
    resource_type -- when given (e.g. 'Dataset'), the funder's works are restricted to that
                     resourceTypeId and the response key becomes alias + resource_type
                     (e.g. 'funder1Dataset'); when None, all works are counted and the
                     response key is the alias itself.
    """
    works_filter = '' if resource_type is None else '(resourceTypeId: "%s")' % resource_type
    return """
  %(alias)s%(suffix)s: funder(id: $%(alias)s) {
    name
    id
    works%(works_filter)s {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  }""" % {"alias": alias, "suffix": resource_type or '', "works_filter": works_filter}


# One ID! variable per funder.
_variable_declarations = ", ".join("$%s: ID!" % alias for alias in query_params)

# First the unfiltered selections (funder1, funder2, funder3), then one selection per
# funder and output type (funder1Dataset, funder1Text, funder2Dataset, ...).
# The plotting cells look results up under exactly these response keys.
_selections = [_funder_selection(alias) for alias in query_params]
_selections += [
    _funder_selection(alias, resource_type)
    for alias in query_params
    for resource_type in _RESOURCE_TYPES
]

query = gql(
    "query getGrantOutputsForFundersById(%s) {%s\n}\n"
    % (_variable_declarations, "".join(_selections))
)
Run the above query via the GraphQL client
import json
# Pass the variables as a plain dict: gql documents variable_values as a dict and serialises
# it itself. Pre-encoding with json.dumps only works if the server happens to accept a
# JSON string in place of the variables object.
data = client.execute(query, variable_values=query_params)
Plot an interactive bar plot showing the proportion of outputs issued under a given license type, for each funder.
import plotly.io as pio
import plotly.express as px
from IPython.display import IFrame
import pandas as pd
from operator import itemgetter
import re
# Adapted from: https://stackoverflow.com/questions/58766305/is-there-any-way-to-implement-stacked-or-grouped-bar-charts-in-plotly-express
def px_stacked_bar(df, color_name='License Type', y_name='Metrics', **pxargs):
    """Draw a stacked bar plot from a wide DataFrame using Plotly Express.

    df         -- wide DataFrame; its (named) index supplies the x axis and every column
                  becomes one colour-coded segment of each bar.
    color_name -- name given to the melted column holding the original column names
                  (used for the bar colours).
    y_name     -- name given to the melted column holding the values (y axis).
    pxargs     -- extra keyword arguments forwarded to px.bar. 'color_discrete_sequence'
                  defaults to the Pastel1 palette but may be overridden by the caller.

    Returns the figure created by px.bar.
    """
    idx_col = df.index.name
    # Reshape wide -> long: one row per (index value, original column) pair.
    long_df = pd.melt(df.reset_index(), id_vars=idx_col, var_name=color_name, value_name=y_name)
    # For Plotly colour sequences see: https://plotly.com/python/discrete-color/
    # setdefault (rather than a fixed keyword) avoids a duplicate-keyword TypeError when the
    # caller supplies their own colour sequence through pxargs.
    pxargs.setdefault('color_discrete_sequence', px.colors.qualitative.Pastel1)
    return px.bar(long_df, x=idx_col, y=y_name, color=color_name, **pxargs)
def get_grouped_license_type(licenseId):
    """Map a license identifier to a coarse license group.

    Returns:
        None     if licenseId is None,
        "cc-by"  if the identifier contains 'cc-by-',
        "cc0"    if the identifier contains 'cc0-',
        "other"  for any other identifier.
    """
    # Check for None first: searching inside None would raise a TypeError.
    if licenseId is None:
        return None
    if 'cc-by-' in licenseId:
        return "cc-by"
    if 'cc0-' in licenseId:
        return "cc0"
    return "other"
# Response keys of the unfiltered (all output types) funder selections in the GraphQL query.
queries = ['funder1', 'funder2', 'funder3']

# Only funders actually present in the response get a bar. Bar positions are assigned over
# these funders only, so labels and counts always refer to the same position.
# (The loop variables below are deliberately not called `query`, so that the gql document
# named `query` is not overwritten and the query cell can be re-run.)
fundersFound = [data[alias] for alias in queries if alias in data]

# labels contains funder labels in bar plot - each bar corresponds to a single funder
labels = {pos: funderId2Acronym[funder['id']] for pos, funder in enumerate(fundersFound)}

# Map each license type to a dict that in turn maps the position of the output's bar in plot
# to the count of outputs corresponding to that license type.
licenseType2Pos2Count = {}

# Under the assumption of one license per work, for each funder licenseType2Pos2Count["No license"]
# starts at the totalCount of works for that funder. Every work count reported for a license in
# funder['works']['licenses'] is subtracted from it, in the end leaving the number of works with
# no license.
licenseType2Pos2Count["No license"] = {
    pos: funder['works']['totalCount'] for pos, funder in enumerate(fundersFound)
}

# Populate license type counts per funder
for pos, funder in enumerate(fundersFound):
    for licenseFacet in funder['works']['licenses']:
        outputCount = licenseFacet['count']
        licenseId = get_grouped_license_type(licenseFacet['id'])
        if licenseId not in licenseType2Pos2Count:
            # First time this license group is seen: start every funder's count at zero
            licenseType2Pos2Count[licenseId] = dict.fromkeys(labels, 0)
        licenseType2Pos2Count[licenseId][pos] += outputCount
        licenseType2Pos2Count["No license"][pos] -= outputCount

# Create stacked bar plot: one row per funder, one column per license type
x_name = "Funders"
dfDict = {x_name: labels}
dfDict.update(licenseType2Pos2Count)
df = pd.DataFrame(dfDict)
fig = px_stacked_bar(df.set_index(x_name), y_name="Output Counts")

# Set plot background to transparent
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)'
})

# Display the caption followed by the interactive plot
display(Markdown("<br />License types of all funder's outputs to date, shown as a stacked bar plot - one bar per funder:"))
fig.show()
License types of all funder's outputs to date, shown as a stacked bar plot - one bar per funder:
Plot an interactive bar plot showing, for each funder, the number of outputs of a given type (Dataset or Text) issued under a given license type.
import plotly.express as px
import re
def xstr(s):
    """Return 'General' when s is None, otherwise str(s)."""
    return 'General' if s is None else str(s)


# Response keys of the per-output-type selections in the GraphQL query are built as
# <funder label><output type>, e.g. 'funder1Dataset'.
funderQueryLabels = ['funder1', 'funder2', 'funder3']
outputTypeLabels = ["Dataset", "Text"]

# funder acronym -> output type -> license type -> number of outputs
funder2resType2licenceType2outputCount = {}
# funderAcronym2Name is needed for the plot legend - as funder names are too long to be shown in the plot itself
funderAcronym2Name = {}

# Collect license type counts data into funder2resType2licenceType2outputCount
for funderQueryLabel in funderQueryLabels:
    for outputType in outputTypeLabels:
        # Deliberately not called `query`, so that the gql document named `query` is not
        # overwritten and the query cell can be re-run.
        resultKey = funderQueryLabel + outputType
        if resultKey not in data:
            continue
        funder = data[resultKey]
        funderAcronym = funderId2Acronym[funder['id']]
        funderAcronym2Name[funderAcronym] = funder['name']
        licenceType2outputCount = (
            funder2resType2licenceType2outputCount
            .setdefault(funderAcronym, {})
            .setdefault(outputType, {})
        )
        # Under the assumption of one license per work, "No license" starts at the totalCount
        # of works for this funder and output type. Every work count reported for a license in
        # funder['works']['licenses'] is subtracted from it, in the end leaving the number of
        # works with no license.
        licenceType2outputCount.setdefault("No license", funder['works']['totalCount'])
        for licenseFacet in funder['works']['licenses']:
            outputCount = licenseFacet['count']
            licenseId = get_grouped_license_type(licenseFacet['id'])
            licenceType2outputCount[licenseId] = licenceType2outputCount.get(licenseId, 0) + outputCount
            licenceType2outputCount["No license"] -= outputCount

# Populate data structures for faceted stacked bar plot:
# one row per (funder, output type, license type) combination, keyed by row position.
funders, outputTypes, licenseTypes, outputCounts = ({}, {}, {}, {})
pos = 0
for funder, resType2licenceType2outputCount in funder2resType2licenceType2outputCount.items():
    for outputType, licenceType2outputCount in resType2licenceType2outputCount.items():
        for licenseType, outputCount in licenceType2outputCount.items():
            funders[pos] = funder
            outputTypes[pos] = outputType
            licenseTypes[pos] = licenseType
            outputCounts[pos] = outputCount
            pos += 1
dfDict = {"Funder": funders, "Output Type": outputTypes, "License": licenseTypes, "Output Count": outputCounts}
df1 = pd.DataFrame(dfDict)

# Create funders legend: one markdown table row per funder
tableBody = "".join(
    "%s | %s\n" % (funderAcronym, funderName)
    for funderAcronym, funderName in funderAcronym2Name.items()
)

# One facet row per funder; within each facet, one stacked bar per output type
fig2 = px.bar(df1, x="Output Type", y="Output Count", color="License", barmode="stack",
              facet_row="Funder")
fig2.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# Write interactive plot out to html file
pio.write_html(fig2, file='out2.html')

# Display the caption and the funders legend, followed by the interactive plot
markDownContent = (
    "<br />For each funder, the plot below shows counts of all outputs to date of type %s, corresponding to a given license type."
    "<br />Full information is shown when you mouse-over a bar."
    "<br />"
)
display(Markdown(markDownContent % " or ".join(outputTypeLabels)))
display(Markdown("| Acronym | Funder Name|\n|---|---|\n%s" % tableBody))
fig2.show()
For each funder, the plot below shows counts of all outputs to date of type Dataset or Text, corresponding to a given license type.
Full information is shown when you mouse-over a bar.
Acronym | Funder Name |
---|---|
DFG | Deutsche Forschungsgemeinschaft |
ANR | Agence Nationale de la Recherche |
SNF | Schweizerischer Nationalfonds zur Förderung der Wissenschaftlichen Forschung |