journalscrapper/scrapper/sinta.py

83 lines
3.6 KiB
Python
Raw Normal View History

2022-08-18 05:33:29 +02:00
import requests
from bs4 import BeautifulSoup, SoupStrainer
# Base URL of the SINTA site; getUsers() and getUser() build their request
# URLs on top of it.
host = "https://sinta.kemdikbud.go.id"
# Affiliation whose author directory getUsers() searches.
affiliations_id = 476 # Universitas Tarumanagara
# NOTE(review): not referenced by the functions visible in this file --
# presumably the PDDikti code of the same institution; confirm before use.
pddikti_id = "031015"
def getUsers(name, timeout=30):
    """Search the affiliation's SINTA author directory for ``name``.

    Requests the author listing of ``affiliations_id`` on ``host``, filtered
    by ``name``, and scrapes one entry per author card found in the page.

    Args:
        name: Search text, sent as the ``q`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``name``, ``id``, ``type`` and
        ``image`` (empty when the page contains no author cards), or
        ``None`` when the request or the parsing failed. In the failure
        case the error is printed instead of being raised.
    """
    try:
        url = f"{host}/affiliations/authors/{affiliations_id}"
        # Let requests encode the query string so that names containing
        # '&', '#', '+' or non-ASCII characters cannot corrupt the URL.
        response = requests.get(url, params={'q': name}, timeout=timeout)
        # An HTTP error page must be reported as a failure, not parsed as
        # if it were an empty result list.
        response.raise_for_status()

        # Only the listing container is parsed; the rest of the page is
        # ignored by the strainer.
        listing = BeautifulSoup(
            response.text,
            'lxml',
            parse_only=SoupStrainer('div', {'class': 'au-list-affil mt-3'}),
        )

        data = []
        for card in listing.find_all('div', {'class': 'au-item mt-3 mb-3 pb-5 pt-3'}):
            image = card.find('img', {'class': 'img-thumbnail avatar'})['src'].strip()
            profile_name = card.find('div', {'class': 'profile-name'})
            profile_dept = card.find('div', {'class': 'profile-dept'})
            profile_id = card.find('div', {'class': 'profile-id'})
            data.append({
                'name': profile_name.find('a').text.strip(),
                # The id is rendered as "ID : <number>"; keep the number only.
                'id': profile_id.text.strip().replace('ID : ', ''),
                'type': profile_dept.find('a').text.strip(),
                'image': image,
            })
        return data
    except Exception as e:
        # Best-effort scraper: network errors and unexpected markup are
        # reported on stdout and signalled to the caller with None.
        print(e)
        return None
def getUser(user_id, timeout=30):
    """Scrape one author's SINTA profile page (``view=googlescholar``).

    Args:
        user_id: Author id placed in the profile URL; returned unchanged
            under the ``id`` key.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``name``, ``id``, ``type``, ``image``,
        ``gscholar_id``, ``affiliation``, ``subject``,
        ``sinta_score_overall``, ``sinta_score_3yrs``, ``affil_score``,
        ``affil_score_3yrs`` and ``summary``, or ``None`` when the request
        or the parsing failed (the error is printed instead of raised).

        ``gscholar_id`` is ``None`` when the profile image URL does not
        contain a ``&user=`` part. ``summary`` maps each statistics row
        label (lower-cased, spaces and dashes turned into underscores) to a
        dict with the keys ``scopus``, ``gscholar`` and ``wos``.
    """
    try:
        url = f"{host}/authors/profile/{user_id}/?view=googlescholar"
        response = requests.get(url, timeout=timeout)
        # Report HTTP errors instead of scraping an error page.
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'lxml')

        user_name = page.find('h3').text.strip()
        user_image = page.find('img', {'class': 'img-fluid img-thumbnail round-corner'})['src']

        # The Scholar id is read from the image URL ("...&user=<id>&citpid=...").
        # When that part is absent, keep the rest of the profile and report
        # the id as None rather than failing the whole lookup.
        image_parts = user_image.split('&user=')
        if len(image_parts) > 1:
            user_gscholar_id = image_parts[1].split('&citpid=')[0]
        else:
            user_gscholar_id = None

        # The first two links of the meta block are read as affiliation and
        # type, in that order.
        profile_links = page.find('div', {'class': 'meta-profile'}).find_all('a')
        user_affiliation = profile_links[0].text.strip()
        user_type = profile_links[1].text.strip()

        subject_block = page.find('div', {'class': 'profile-subject mt-3'})
        user_subject_list = [link.text.strip() for link in subject_block.find_all('a')]

        # The four score boxes are read by position.
        user_stats = page.find_all('div', {'class': 'pr-num'})
        user_sinta_score_overall = user_stats[0].text.strip()
        user_sinta_score_3yrs = user_stats[1].text.strip()
        user_affil_score = user_stats[2].text.strip()
        user_affil_score_3yrs = user_stats[3].text.strip()

        summary_table = page.find(
            'table',
            {'class': 'table table-borderless table-sm text-center stat-table'},
        )
        user_summary_list_data = {}
        # The first <tr> is skipped (presumably the column-header row).
        for row in summary_table.find_all('tr')[1:]:
            cells = row.find_all('td')
            field = cells[0].text.strip().replace(' ', '_').replace('-', '_').lower()
            user_summary_list_data[field] = {
                'scopus': cells[1].text.strip(),
                'gscholar': cells[2].text.strip(),
                'wos': cells[3].text.strip(),
            }

        return {
            'name': user_name,
            'id': user_id,
            'type': user_type,
            'image': user_image,
            'gscholar_id': user_gscholar_id,
            'affiliation': user_affiliation,
            'subject': user_subject_list,
            'sinta_score_overall': user_sinta_score_overall,
            'sinta_score_3yrs': user_sinta_score_3yrs,
            'affil_score': user_affil_score,
            'affil_score_3yrs': user_affil_score_3yrs,
            'summary': user_summary_list_data,
        }
    except Exception as e:
        # Best-effort scraper: network errors and unexpected markup are
        # reported on stdout and signalled to the caller with None.
        print(e)
        return None