import requests, json
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import unquote

host = "https://scholar.google.com"

# Refresh Users Data
def refreshGetUsers():
    data = []
    org = "9444447549188154848"  # Universitas Tarumanagara
    token = "test"  # placeholder token; replaced with the real one after the first page
    start = 10
    while True:
        try:
            url = f"{host}/citations?view_op=view_org&hl=id&org={org}&after_author={token}&astart={start}"
            r = requests.get(url)
            users = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_sa_ccl'}))
            for user in users.find_all('div', {'class': 'gsc_1usr'}):
                user_info = user.find('h3', {'class': 'gs_ai_name'})
                user_name = user_info.find('a').text
                user_id = user_info.find('a')['href'].split('user=')[1]
                # strip the Indonesian prefix ("Verified email at ") from the hl=id page
                user_type = user.find('div', {'class': 'gs_ai_eml'}).text.replace('Email yang diverifikasi di ', '')
                user_image = user.find('img')['src']
                data.append({
                    'name': user_name,
                    'id': user_id,
                    'type': user_type,
                    'image': user_image
                })
            print(start)
            nextButton = users.find('button', {'class': 'gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx'})
            if nextButton.get('disabled') is None:
                start += 10
                # the next-page token is hex-escaped inside the button's onclick handler
                token = nextButton.get('onclick').replace('\\x', '%')
                token = unquote(token).split('after_author=')[1].split('&astart=')[0]
            else:
                break
        except Exception as e:
            print(e)
            break  # bail out instead of retrying the same page forever
    with open('users.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    return {'message': 'success'}

# Get All Users Data (Cached)
def getUsers(name):
    with open('scrapper/users.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    res = []
    for user in data:
        if user['type'] == 'untar.ac.id' or user['type'] == 'fti.untar.ac.id':
            if name.lower() in user['name'].lower():
                if user not in res:
                    res.append(user)
            # also match the name with its word order reversed (e.g. "Doe John")
            if ' ' in name:
                raw_name = name.split(' ')
                raw_name.reverse()
                if ' '.join(raw_name).lower() in user['name'].lower():
                    if user not in res:
                        res.append(user)
    return res

# Get Citations Data with User ID
def getCitations(user_id):
    data = []
    start = 0
    end = 100
    while True:
        try:
            url = f"{host}/citations?user={user_id}&hl=id&oi=sra&cstart={start}&pagesize={end}"
            r = requests.get(url)
            citations = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_art'}))
            for citation in citations.find_all('tr', {'class': 'gsc_a_tr'}):
                citation_title = citation.find('a', {'class': 'gsc_a_at'}).text.replace('\\', '').replace('"', '').replace("'", "")
                citation_id = citation.find('a', {'class': 'gsc_a_at'})['href'].split('citation_for_view=')[1]
                citation_info = citation.find_all('div', {'class': 'gs_gray'})
                citation_author = citation_info[0].text
                # drop the trailing ", year" span so only the venue text remains
                for x in citation_info[1].find_all('span', {'class': 'gs_oph'}):
                    x.decompose()
                citation_journal = citation_info[1].text
                citation_year = citation.find('td', {'class': 'gsc_a_y'}).text
                if citation_journal.lower() != 'turnitin':
                    data.append({
                        'title': citation_title,
                        'id': citation_id,
                        'author': citation_author,
                        'journal': citation_journal,
                        'year': citation_year
                    })
            nextButton = citations.find('button', {'id': 'gsc_bpf_more'})
            if nextButton.get('disabled') is None:
                start += 100
                end += 100
            else:
                break
        except Exception as e:
            print(e)
            break  # bail out instead of retrying the same page forever
    return data

# Get Citation Data with Citation ID
def getCitation(citation_id):
    url = f"{host}/citations?view_op=view_citation&hl=en&citation_for_view={citation_id}"
    r = requests.get(url)
    citation = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_vcpb'}))
    citation_title = citation.find('a', {'class': 'gsc_oci_title_link'}).text
    citation_url = citation.find('a', {'class': 'gsc_oci_title_link'})['href']
    citation_info = {}
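    # The detail page renders metadata as two parallel columns of divs
    # ('gsc_oci_field' names, 'gsc_oci_value' values); pair them by index,
    # skipping the 'Total citations' and 'Scholar articles' rows.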
    citation_info_field = citation.find_all('div', {'class': 'gsc_oci_field'})
    citation_info_value = citation.find_all('div', {'class': 'gsc_oci_value'})
    for x in range(len(citation_info_field)):
        if citation_info_field[x].text.lower() != 'total citations' and citation_info_field[x].text.lower() != 'scholar articles':
            citation_info[citation_info_field[x].text.lower()] = citation_info_value[x].text
    # Check if Downloadable
    citation_download = False
    citation_download_link = None
    citation_download_raw = citation.find('div', {'class': 'gsc_oci_title_ggi'})
    if citation_download_raw is not None:
        citation_download_check = citation_download_raw.find('a').text.lower()
        if '[pdf]' in citation_download_check:
            citation_download = True
            citation_download_link = citation_download_raw.find('a')['href']
    data = {
        'title': citation_title,
        'url': citation_url,
        'info': citation_info,
        'download': citation_download,
        'download_link': citation_download_link
    }
    return data
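
# --- Minimal usage sketch (an addition, not part of the original module) ---
# Assumptions: the relative JSON paths resolve from the working directory
# (note refreshGetUsers writes 'users.json' while getUsers reads
# 'scrapper/users.json'), and the search name 'john' is only an
# illustrative placeholder, not a real profile.
if __name__ == '__main__':
    refreshGetUsers()                 # scrape and cache the org's profiles
    matches = getUsers('john')        # fuzzy, order-insensitive name lookup
    if matches:
        papers = getCitations(matches[0]['id'])    # every non-Turnitin paper
        if papers:
            detail = getCitation(papers[0]['id'])  # full record + PDF link
            print(json.dumps(detail, indent=4, ensure_ascii=False))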