journalscrapper/scrapper/gscholar.py
2022-08-18 10:33:29 +07:00

141 lines
No EOL
5.9 KiB
Python

import requests, json
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import unquote
# Base URL of Google Scholar; the scraper functions below prefix their request URLs with it.
host = "https://scholar.google.com"
# Refresh Users Data <DEPRECATED>
def refreshGetUsers():
    """Scrape the organisation's author list from Google Scholar into ``users.json``.

    Requests the ``view_org`` listing page by page and collects, for every
    ``gsc_1usr`` entry, the author's name, Scholar user id, verified e-mail
    domain and avatar URL. Paging follows the ``after_author`` token found in
    the "next" button's ``onclick`` handler and stops when that button is
    missing or disabled.

    A page that fails to download or parse is retried a limited number of
    times; after that, paging stops instead of looping forever. The entries
    of a page are only kept once the whole page has parsed, so a retry never
    stores the same author twice. Whatever was collected is then written to
    ``users.json`` in the current working directory.

    Returns:
        dict: ``{'message': 'sucess'}`` -- the spelling is preserved because
        it is part of the existing response payload.
    """
    data = []
    org = "9444447549188154848"  # Universitas Tarumanagara
    # NOTE(review): the first request is sent with a placeholder token and
    # astart=10 -- confirm this really returns the first page of the listing.
    token = "test"
    start = 10
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    max_failures = 3      # consecutive failed attempts allowed for one page
    failures = 0
    next_button_class = 'gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx'

    while True:
        try:
            url = f"{host}/citations?view_op=view_org&hl=id&org={org}&after_author={token}&astart={start}"
            r = requests.get(url, timeout=request_timeout)
            users = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_sa_ccl'}))
            page = []
            for user in users.find_all('div', {'class': 'gsc_1usr'}):
                profile_link = user.find('h3', {'class': 'gs_ai_name'}).find('a')
                # The page is requested with hl=id, so the verified-email
                # label is in Indonesian; strip it to keep only the domain.
                email_domain = user.find('div', {'class': 'gs_ai_eml'}).text.replace('Email yang diverifikasi di ', '')
                page.append({
                    'name': profile_link.text,
                    'id': profile_link['href'].split('user=')[1],
                    'type': email_domain,
                    'image': user.find('img')['src'],
                })
            nextButton = users.find('button', {'class': next_button_class})
        except Exception as e:
            # Retrying the same URL without a limit would spin forever on a
            # persistent failure (blocked request, changed page layout, ...).
            print(e)
            failures += 1
            if failures >= max_failures:
                break
            continue

        failures = 0
        data.extend(page)
        print(start)

        # No button at all or a disabled one both mean there is nothing left to fetch.
        if nextButton is None or nextButton.get('disabled') is not None:
            break
        try:
            # The onclick handler holds the next URL with "\x"-escaped bytes;
            # turn them into percent-escapes so unquote() can decode them.
            handler = nextButton.get('onclick').replace('\\x', '%')
            token = unquote(handler).split('after_author=')[1].split('&astart=')[0]
        except (AttributeError, IndexError) as e:
            # Without a token the next page cannot be addressed; keep what we have.
            print(e)
            break
        start += 10

    with open('users.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    return {'message': 'sucess'}
# Get All Users Data (Cached) <DEPRECATED>
def getUsers(name):
    """Look up cached users whose name contains ``name`` (case-insensitive).

    Reads ``scrapper/users.json`` and keeps only records whose ``type`` is
    ``untar.ac.id`` or ``fti.untar.ac.id``. A record matches when its name
    contains ``name``, or -- if ``name`` contains a space -- when it contains
    the space-separated parts of ``name`` in reverse order. Each matching
    record is returned once, in file order.
    """
    with open('scrapper/users.json', 'r', encoding='utf-8') as cache:
        records = json.load(cache)

    accepted_domains = ('untar.ac.id', 'fti.untar.ac.id')

    def name_matches(full_name):
        # Direct substring match first, then the reversed word order.
        haystack = full_name.lower()
        if name.lower() in haystack:
            return True
        if ' ' not in name:
            return False
        flipped = ' '.join(name.split(' ')[::-1])
        return flipped.lower() in haystack

    matches = []
    for record in records:
        if record['type'] not in accepted_domains:
            continue
        if name_matches(record['name']) and record not in matches:
            matches.append(record)
    return matches
# Get Citations Data with User ID <DEPRECATED>
def _parseCitationRow(row):
    """Convert one ``gsc_a_tr`` table row into a citation dict.

    Raises AttributeError, IndexError, KeyError or TypeError when the row
    does not have the expected title link / grey info lines / year cell.
    """
    title_link = row.find('a', {'class': 'gsc_a_at'})
    info_lines = row.find_all('div', {'class': 'gs_gray'})
    # Drop the gs_oph spans so their text does not leak into the journal string.
    for hidden in info_lines[1].find_all('span', {'class': 'gs_oph'}):
        hidden.decompose()
    return {
        'title': title_link.text.replace('\\', '').replace('"', '').replace("'", ""),
        'id': title_link['href'].split('citation_for_view=')[1],
        'author': info_lines[0].text,
        'journal': info_lines[1].text,
        'year': row.find('td', {'class': 'gsc_a_y'}).text,
    }


def getCitations(user_id):
    """Collect the publication list of a Google Scholar profile.

    Pages through the profile's article table and returns one dict per
    article with the keys ``title``, ``id``, ``author``, ``journal`` and
    ``year``. Rows whose journal text is ``turnitin`` (case-insensitive) are
    left out, and rows that lack the expected markup are skipped.

    Paging stops when the "show more" button (``gsc_bpf_more``) is missing or
    disabled, when a page contains no rows, or after a page has failed to
    download/parse several times in a row -- so a persistent failure can no
    longer make the loop spin forever.

    Args:
        user_id: Scholar user id placed into the ``user=`` query parameter.

    Returns:
        list[dict]: the citations gathered so far (possibly empty).
    """
    data = []
    start = 0
    # ``pagesize`` is sent as a constant page length: ``cstart`` already
    # advances the window, so growing it on every page (100, 200, 300, ...)
    # as before would either be rejected/clamped or overlap earlier rows.
    page_size = 100
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    max_failures = 3      # consecutive failed attempts allowed for one page
    failures = 0

    while True:
        try:
            url = f"{host}/citations?user={user_id}&hl=id&oi=sra&cstart={start}&pagesize={page_size}"
            r = requests.get(url, timeout=request_timeout)
            citations = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_art'}))
            rows = citations.find_all('tr', {'class': 'gsc_a_tr'})
            nextButton = citations.find('button', {'id': 'gsc_bpf_more'})
        except Exception as e:
            print(e)
            failures += 1
            if failures >= max_failures:
                break
            continue
        failures = 0

        for row in rows:
            try:
                entry = _parseCitationRow(row)
            except (AttributeError, IndexError, KeyError, TypeError) as e:
                # One malformed row must not discard the rest of the page.
                print(e)
                continue
            if entry['journal'].lower() != 'turnitin':
                data.append(entry)

        # Stop on the last page, on an unexpected layout (no button), or when
        # the page came back empty and further requests would yield nothing.
        if not rows or nextButton is None or nextButton.get('disabled') is not None:
            break
        start += page_size
    return data
# Get Citation Data with Citation ID
def getCitation(citation_id):
    """Fetch the detail view of a single Google Scholar citation.

    Args:
        citation_id: value for the ``citation_for_view`` query parameter.

    Returns:
        dict with the keys
        ``title``         -- text of the title link, or of the ``gsc_oci_title``
                             container when the title is not a link, or
                             ``None`` when neither is present;
        ``url``           -- ``href`` of the title link, ``None`` without a link;
        ``info``          -- lower-cased field name -> value text for every
                             field/value pair on the page except
                             "total citations" and "scholar articles";
        ``download``      -- ``True`` when the side link is labelled ``[PDF]``
                             and carries a target;
        ``download_link`` -- that link's ``href``, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: when the page cannot be
        fetched, including when the request times out.
    """
    url = f"{host}/citations?view_op=view_citation&hl=en&citation_for_view={citation_id}"
    # A timeout keeps a stalled connection from blocking the caller forever.
    r = requests.get(url, timeout=30)
    citation = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_vcpb'}))

    # find() returns None when the element is absent, so guard before
    # touching .text / href instead of failing with AttributeError.
    title_link = citation.find('a', {'class': 'gsc_oci_title_link'})
    if title_link is not None:
        citation_title = title_link.text
        citation_url = title_link.get('href')
    else:
        # NOTE(review): assumes an unlinked title sits in the element with
        # id "gsc_oci_title" -- confirm against a live page.
        title_box = citation.find('div', {'id': 'gsc_oci_title'})
        citation_title = title_box.text if title_box is not None else None
        citation_url = None

    # Field labels and values are sibling lists; zip() pairs them up and
    # tolerates a length mismatch instead of raising IndexError.
    skipped_fields = ('total citations', 'scholar articles')
    citation_info = {}
    fields = citation.find_all('div', {'class': 'gsc_oci_field'})
    values = citation.find_all('div', {'class': 'gsc_oci_value'})
    for field, value in zip(fields, values):
        field_name = field.text.lower()
        if field_name not in skipped_fields:
            citation_info[field_name] = value.text

    # Check if Downloadable
    citation_download = False
    citation_download_link = None
    download_box = citation.find('div', {'class': 'gsc_oci_title_ggi'})
    download_anchor = download_box.find('a') if download_box is not None else None
    if download_anchor is not None and '[pdf]' in download_anchor.text.lower():
        target = download_anchor.get('href')
        if target:
            citation_download = True
            citation_download_link = target

    data = {
        'title': citation_title,
        'url': citation_url,
        'info': citation_info,
        'download': citation_download,
        'download_link': citation_download_link
    }
    return data