import requests, json
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import unquote

host = "https://scholar.google.com"

# Refresh Users Data <DEPRECATED>
def refreshGetUsers():
    data = []
    org = "9444447549188154848"  # Universitas Tarumanagara
    token = "test"  # placeholder; replaced by the real after_author token on each page
    start = 10

    while True:
        try:
            url = f"{host}/citations?view_op=view_org&hl=id&org={org}&after_author={token}&astart={start}"
            r = requests.get(url)
            # Parse only the author-list container to keep the soup small
            users = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_sa_ccl'}))
            for user in users.find_all('div', {'class': 'gsc_1usr'}):
                user_info = user.find('h3', {'class': 'gs_ai_name'})
                user_name = user_info.find('a').text
                user_id = user_info.find('a')['href'].split('user=')[1]
                # Strip the Indonesian "Email verified at" prefix, leaving just the domain
                user_type = user.find('div', {'class': 'gs_ai_eml'}).text.replace('Email yang diverifikasi di ', '')
                user_image = user.find('img')['src']
                data.append({
                    'name': user_name,
                    'id': user_id,
                    'type': user_type,
                    'image': user_image
                })

            print(start)

            # The "next page" button carries the after_author token in its onclick handler
            nextButton = users.find('button', {'class': 'gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx'})
            if nextButton.get('disabled') is None:
                start += 10
                token = nextButton.get('onclick').replace('\\x', '%')
                token = unquote(token).split('after_author=')[1].split('&astart=')[0]
            else:
                break
        except Exception as e:
            print(e)
            break  # bail out instead of retrying the same page forever

    with open('users.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    return {'message': 'success'}
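
# Example usage, as a sketch: refreshGetUsers() crawls every author page for
# the organisation above and rewrites users.json, so it is slow by design, and
# unauthenticated scraping may be rate-limited or blocked by Scholar.
#
#   result = refreshGetUsers()
#   print(result)  # {'message': 'success'}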

# Get All Users Data (Cached) <DEPRECATED>
def getUsers(name):
    with open('scrapper/users.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    res = []
    for user in data:
        # Only keep authors with a verified untar.ac.id / fti.untar.ac.id email
        if user['type'] == 'untar.ac.id' or user['type'] == 'fti.untar.ac.id':
            if name.lower() in user['name'].lower():
                if user not in res:
                    res.append(user)

            # Also try the query with its words reversed ("John Doe" -> "Doe John")
            if ' ' in name:
                raw_name = name.split(' ')
                raw_name.reverse()
                if ' '.join(raw_name).lower() in user['name'].lower():
                    if user not in res:
                        res.append(user)
    return res
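
# Example usage, assuming users.json was generated by refreshGetUsers() and
# sits under scrapper/ relative to the working directory (note the path
# mismatch: refreshGetUsers() writes to ./users.json). 'Budi' is a placeholder
# query, not a name from the real data:
#
#   for user in getUsers('Budi'):
#       print(user['name'], user['id'])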

# Get Citations Data with User ID <DEPRECATED>
def getCitations(user_id):
    data = []
    start = 0
    end = 100

    while True:
        try:
            url = f"{host}/citations?user={user_id}&hl=id&oi=sra&cstart={start}&pagesize={end}"
            r = requests.get(url)
            # Parse only the article-table container
            citations = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_art'}))
            for citation in citations.find_all('tr', {'class': 'gsc_a_tr'}):
                # Drop backslashes and quotes so titles are safe to embed later
                citation_title = citation.find('a', {'class': 'gsc_a_at'}).text.replace('\\', '').replace('"', '').replace("'", "")
                citation_id = citation.find('a', {'class': 'gsc_a_at'})['href'].split('citation_for_view=')[1]
                citation_info = citation.find_all('div', {'class': 'gs_gray'})
                citation_author = citation_info[0].text
                # Remove the grey volume/page/year suffix before reading the venue
                for x in citation_info[1].find_all('span', {'class': 'gs_oph'}):
                    x.decompose()
                citation_journal = citation_info[1].text
                citation_year = citation.find('td', {'class': 'gsc_a_y'}).text
                if citation_journal.lower() != 'turnitin':
                    data.append({
                        'title': citation_title,
                        'id': citation_id,
                        'author': citation_author,
                        'journal': citation_journal,
                        'year': citation_year
                    })

            nextButton = citations.find('button', {'id': 'gsc_bpf_more'})
            if nextButton.get('disabled') is None:
                start += 100
                end += 100
            else:
                break
        except Exception as e:
            print(e)
            break  # bail out instead of retrying the same page forever

    return data
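
# Example usage: the argument is the Scholar user id returned in the 'id'
# field of getUsers(). The value below is a placeholder, not a real id:
#
#   papers = getCitations('AAAAAAAAAAAA')
#   print(len(papers), 'publications found')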

# Get Citation Data with Citation ID
def getCitation(citation_id):
    url = f"{host}/citations?view_op=view_citation&hl=en&citation_for_view={citation_id}"
    r = requests.get(url)
    # Parse only the citation-detail container
    citation = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_vcpb'}))
    citation_title = citation.find('a', {'class': 'gsc_oci_title_link'}).text
    citation_url = citation.find('a', {'class': 'gsc_oci_title_link'})['href']

    # Collect the field/value rows, skipping the citation-count and
    # Scholar-articles rows
    citation_info = {}
    citation_info_field = citation.find_all('div', {'class': 'gsc_oci_field'})
    citation_info_value = citation.find_all('div', {'class': 'gsc_oci_value'})
    for x in range(len(citation_info_field)):
        field = citation_info_field[x].text.lower()
        if field != 'total citations' and field != 'scholar articles':
            citation_info[field] = citation_info_value[x].text

    # Check if Downloadable
    citation_download = False
    citation_download_link = None
    citation_download_raw = citation.find('div', {'class': 'gsc_oci_title_ggi'})
    if citation_download_raw is not None:
        citation_download_check = citation_download_raw.find('a').text.lower()
        if '[pdf]' in citation_download_check:
            citation_download = True
            citation_download_link = citation_download_raw.find('a')['href']

    data = {
        'title': citation_title,
        'url': citation_url,
        'info': citation_info,
        'download': citation_download,
        'download_link': citation_download_link
    }

    return data
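

# Minimal end-to-end sketch chaining the helpers above. The search term is a
# placeholder; real ids come from the live responses.
if __name__ == '__main__':
    users = getUsers('Budi')                   # search the cached users.json
    if users:
        papers = getCitations(users[0]['id'])  # scrape their publication list
        if papers:
            detail = getCitation(papers[0]['id'])
            print(json.dumps(detail, ensure_ascii=False, indent=2))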