%%capture
# Install required Python packages
!pip install gql requests


# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

# HTTP transport pointed at the DataCite GraphQL endpoint.
_transport = RequestsHTTPTransport(url='https://api.datacite.org/graphql', use_json=True)

# Client used by every query below; it is built on the transport above and
# is asked to fetch the schema through that same transport.
client = Client(transport=_transport, fetch_schema_from_transport=True)


# Generate the GraphQL query to retrieve researchers matching query "John AND Smith".
# The variables map onto the arguments of the `people` field in query_str below:
#   "query"            -> query  (the search expression)
#   "max_researchers"  -> first  (number of researchers requested per call, here 100)
#   "query_end_cursor" -> after  (pagination cursor; starts out as an empty string)
query_params = {
    "query" : "John AND Smith",
    "max_researchers" : 100,
    "query_end_cursor" : ""
}

# Query text. Besides the matching nodes (id, given/family/full name and the
# names of their affiliations) it requests totalCount and the pageInfo fields
# (hasNextPage, endCursor).
query_str = """query getResearchersByName(
    $query: String!,
    $max_researchers: Int!,
    $query_end_cursor : String!
    )
{
  people(query: $query, first: $max_researchers, after: $query_end_cursor) {
    totalCount
    pageInfo {
      hasNextPage
      endCursor
    }  
    nodes {
      id
      givenName
      familyName
      name
      affiliation {
        name
      }
    }
  }
}
"""


import json
# Becomes False as soon as the server reports that there is no further page
found_next_page = True

# Initialise overall data dict that will store results
data = {}

# Parse the query text once; only the cursor variable changes between pages
query = gql(query_str)

# Keep retrieving results until there are no more results left.
# The first response becomes `data`; the nodes of every later page are appended
# to data["people"]["nodes"]. The cursor is advanced after EVERY page (including
# the first one), so no page is requested - and stored - twice.
while found_next_page:
    # The variables are passed as a plain dict, which is what gql expects
    res = client.execute(query, variable_values=query_params)
    people = res["people"]
    if "people" not in data:
        data = res
    else:
        data["people"]["nodes"].extend(people["nodes"])
    pageInfo = people["pageInfo"]
    # Without an end cursor there is nothing to continue from, so stop as well
    found_next_page = bool(pageInfo["hasNextPage"]) and pageInfo["endCursor"] is not None
    if found_next_page:
        query_params["query_end_cursor"] = pageInfo["endCursor"]


# Collect names and affiliations for the researchers found
# Test if fieldValue matches (case-insensitively) a Solr-style query (with " AND " representing the logical AND, and " " representing the logical OR)
def testIfPresentCaseInsensitive(solrQuery, fieldValueLowerCase):
    for orTerms in solrQuery.split(" AND "):
        present = False
        for term in orTerms.split(" "):
            if term.lower() in fieldValueLowerCase:
                present = True
                break
        if not present:
            return False
    return True

# Group the names of the matching researchers by affiliation and render the
# result as a Markdown table.
people = data['people']
af2Names = {}   # affiliation name -> set of researcher names
totalCount = 0  # number of researchers whose name matches the query
for node in people['nodes']:
    # A missing name cannot match the query; treat it as empty instead of
    # failing on None.lower()
    name = node['name'] or ""
#     TODO: Remove if we manage to search only individual fields
    if not testIfPresentCaseInsensitive(query_params['query'], name.lower()):
        continue
    totalCount += 1
    # Researchers without any affiliation are counted but do not appear in the table
    for af in node['affiliation'] or []:
        affiliation = af['name']
        if affiliation is None:
            continue
        af2Names.setdefault(affiliation, set()).add(name)

# Sort both the affiliations and the names within each row so that the table
# is identical from run to run (set iteration order is not stable)
tableBody = "".join(
    af + " | " + ', '.join(sorted(af2Names[af])) + "\n"
    for af in sorted(af2Names)
)
display(Markdown("Total number of researchers found: **%d**<br>The list of researchers by affiliation is as follows:" % totalCount))
display(Markdown(""))

display(Markdown("| Affiliation | Researcher Names |\n|---|---|\n%s" % tableBody))


# Generate the GraphQL query to retrieve all researchers matching query "John and Smith" and affiliation "University of Arizona", now with works
name_query = "John AND Smith"
# The affiliation is wrapped in double quotes inside the query string
# (presumably so that it is searched as a phrase - confirm against the API's query syntax)
affiliation_query = "\"University of Arizona\""
# Same variables as before: "query" is the name query AND-ed with the
# affiliation query, "max_researchers" feeds the `first` argument and
# "query_end_cursor" the `after` argument of the `people` field.
query_params1 = {
    "query" : name_query + " AND " + affiliation_query,
    "max_researchers" : 10,
    "query_end_cursor" : ""    
}

# NOTE: this rebinds query_str, replacing the query text defined earlier.
# Compared with the earlier query, each person additionally requests
# works(first: 3) with id, publication year, publisher, titles, creators
# (including their affiliations) and subjects.
query_str = """query getResearchersByName(
    $query: String!,
    $max_researchers: Int!,
    $query_end_cursor : String!
    )
{
  people(query: $query, first: $max_researchers, after: $query_end_cursor) {
    totalCount
    pageInfo {
      hasNextPage
      endCursor
    }      
    nodes {
      id
      givenName
      familyName
      name
      affiliation {
        name
      }
      works(first: 3) {
        nodes {
          id
          publicationYear
          publisher
          titles {
            title
          }
          creators {
            id
            name
            affiliation {
              id
              name
            }
          }
          subjects {
            subject
          }
        }
      }
    }
  }
}
"""


import json
# Becomes False as soon as the server reports that there is no further page
found_next_page = True

# Initialise overall data dict that will store results
data1 = {}

# Parse the query text once; only the cursor variable changes between pages
query = gql(query_str)

# Keep retrieving results until there are no more results left.
# The first response becomes `data1`; the nodes of every later page are appended
# to data1["people"]["nodes"]. The cursor is advanced after EVERY page (including
# the first one), so no page is requested - and stored - twice.
while found_next_page:
    # The variables are passed as a plain dict, which is what gql expects
    res = client.execute(query, variable_values=query_params1)
    people = res["people"]
    if "people" not in data1:
        data1 = res
    else:
        data1["people"]["nodes"].extend(people["nodes"])
    pageInfo = people["pageInfo"]
    # Without an end cursor there is nothing to continue from, so stop as well
    found_next_page = bool(pageInfo["hasNextPage"]) and pageInfo["endCursor"] is not None
    if found_next_page:
        # The cursor must go into query_params1 - the dict that is actually sent
        # with this query - otherwise the same page is requested forever
        query_params1["query_end_cursor"] = pageInfo["endCursor"]


from textwrap import shorten

# Collect all relevant details for the researchers found and render them as a
# Markdown table with one row per researcher.
def _text(value):
    """Return value as a string, with None mapped to an empty string."""
    return "" if value is None else str(value)


def _format_work(work):
    """Format one work as 'first author [et al.] (year) [titles](id) *publisher*<br>'."""
    titles = [
        shorten(_text(title['title']), width=50, placeholder="...")
        for title in work['titles'] or []
    ]
    # Restrict display to the first author only
    creators = work['creators'] or []
    byline = ""
    if creators:
        first = creators[0]
        byline = _text(first['name'])
        if first['id'] is not None:
            byline = "[" + byline + "](" + first['id'] + ")"
        if len(creators) > 1:
            byline += " et al."
    publisher = _text(work['publisher'])
    # An empty "**" would render as literal asterisks, so omit the emphasis
    # when there is no publisher
    publisherPart = " *" + publisher + "*" if publisher else ""
    return (byline + " (" + str(work['publicationYear']) + ") ["
            + ', '.join(titles) + "](" + _text(work['id']) + ")"
            + publisherPart + "<br>")


# A set, so that identical rows are shown only once
tableBody = set()
people = data1['people']
for node in people['nodes']:
    orcid = node['id']
    # Any of the name fields may be missing; show them as empty cells instead
    # of failing on string concatenation with None
    firstName = _text(node['givenName'])
    surname = _text(node['familyName'])
    name = _text(node['name'])
#     TODO: Remove if we manage to search only individual fields
    if not testIfPresentCaseInsensitive(name_query, name.lower()):
        continue
    orcidHref = ""
    if orcid:
        orcidHref = "[" + name + "](" + orcid + ")"
    affiliations = [_text(affiliation['name']) for affiliation in node['affiliation'] or []]
    works = ""
    if 'works' in node:
        works = "".join(_format_work(work) for work in node['works']['nodes'])

    tableBody.add(firstName + " | " + surname + " | " + orcidHref + " | " + '<br>'.join(sorted(affiliations)) + " | " + works)
# Sort the rows so that the table is identical from run to run (set iteration
# order is not stable)
display(Markdown("| First Name | Surname | Link to ORCID | Affiliations | Works | \n|---|---|---|---|---|\n%s" % '\n'.join(sorted(tableBody))))

Install libraries and prepare GraphQL client

Define and run GraphQL query

List researcher details