From cdce7084d9c10fb97385eefa7bfcf90707721178 Mon Sep 17 00:00:00 2001
From: Moe
Date: Thu, 18 Aug 2022 10:33:29 +0700
Subject: [PATCH] Initial commit

---
 .gitignore           |   8 +++
 Dockerfile           |  18 ++++++
 README.md            | 136 +++++++++++++++++++++++++++++++++++++++++
 main.py              |  41 +++++++++++++
 requirements.txt     |   7 +++
 scrapper/gscholar.py | 141 +++++++++++++++++++++++++++++++++++++++++++
 scrapper/sinta.py    |  83 +++++++++++++++++++++++++
 7 files changed, 434 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100644 main.py
 create mode 100644 requirements.txt
 create mode 100644 scrapper/gscholar.py
 create mode 100644 scrapper/sinta.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7d4d8ba
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+# Test File
+*.xlsx
+*.json
+converter.py
+database.py
+
+# Environment
+env/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..2810fdf
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.9-slim
+
+# copy the requirements file into the image
+COPY ./requirements.txt /app/requirements.txt
+
+# switch working directory
+WORKDIR /app
+
+# install the dependencies from the requirements file
+RUN pip install --no-cache-dir -r requirements.txt
+
+# copy the rest of the project into the image
+COPY . /app
+
+# run the app with the python interpreter when the container starts
+ENTRYPOINT ["python"]
+
+CMD ["main.py"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..68a7000
--- /dev/null
+++ b/README.md
@@ -0,0 +1,136 @@
+# **Journal Scrapper**
+
+## Installation
+- ### Docker
+  ```docker
+  $ docker run --name JournalScrapper -d -p 5000:5000 registry.gitlab.com/moepoi/journalscrapper:latest
+  ```
+- ### Manual
+  ```sh
+  $ mkdir JournalScrapper && cd JournalScrapper && git clone https://gitlab.com/moepoi/journalscrapper.git .
+  $ pip3 install -r requirements.txt
+  $ python3 main.py
+  ```
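+
+Alternatively, the image can be built locally from the included Dockerfile. A minimal sketch (the `journalscrapper` tag name is arbitrary):
+```sh
+$ docker build -t journalscrapper .
+$ docker run --name JournalScrapper -d -p 5000:5000 journalscrapper
+```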
+
+## Usage
+- Get Users
+  ```sh
+  $ curl "localhost:5000/getusers?name=viny"
+  ```
+  ```json
+  [
+    {
+      "name": "VINY CHRISTANTI MAWARDI",
+      "id": "5990793",
+      "type": "Teknik Informatika (S1)",
+      "image": "https://scholar.google.co.id/citations?view_op=view_photo&user=hayqUI0AAAAJ&citpid=1"
+    }
+  ]
+  ```
+
+- Get User
+  ```sh
+  $ curl "localhost:5000/getuser?id=5990793"
+  ```
+  ```json
+  {
+    "name": "VINY CHRISTANTI MAWARDI",
+    "id": "5990793",
+    "type": "S1 - Teknik Informatika",
+    "image": "https://scholar.google.co.id/citations?view_op=view_photo&user=hayqUI0AAAAJ&citpid=1",
+    "gscholar_id": "hayqUI0AAAAJ",
+    "affiliation": "Universitas Tarumanagara",
+    "subject": [
+      "Information Retrieval"
+    ],
+    "sinta_score_overall": "438",
+    "sinta_score_3yrs": "94",
+    "affil_score": "0",
+    "affil_score_3yrs": "0",
+    "summary": {
+      "article": {
+        "scopus": "7",
+        "gscholar": "160",
+        "wos": "0"
+      },
+      "citation": {
+        "scopus": "22",
+        "gscholar": "116",
+        "wos": "0"
+      },
+      "cited_document": {
+        "scopus": "5",
+        "gscholar": "33",
+        "wos": "0"
+      },
+      "h_index": {
+        "scopus": "3",
+        "gscholar": "6",
+        "wos": ""
+      },
+      "i10_index": {
+        "scopus": "1",
+        "gscholar": "3",
+        "wos": ""
+      },
+      "g_index": {
+        "scopus": "1",
+        "gscholar": "1",
+        "wos": ""
+      }
+    }
+  }
+  ```
+
+- Get Citations
+  ```sh
+  $ curl "localhost:5000/getcitations?id=hayqUI0AAAAJ"
+  ```
+  ```json
+  [
+    {
+      "title": "Fast and accurate spelling correction using trie and Damerau-levenshtein distance bigram",
+      "id": "hayqUI0AAAAJ:TFP_iSt0sucC",
+      "author": "VM Christanti, DS Naga",
+      "journal": "Telkomnika 16 (2), 827-833",
+      "year": "2018"
+    },
+    {
+      "title": "Automatic essay scoring in E-learning system using LSA method with N-gram feature for Bahasa Indonesia",
+      "id": "hayqUI0AAAAJ:k_IJM867U9cC",
+      "author": "RS Citawan, VC Mawardi, B Mulyawan",
+      "journal": "MATEC web of conferences 164, 01037",
+      "year": "2018"
+    },
+    {
+      "title": "Content-based image retrieval using convolutional neural networks",
+      "id": "hayqUI0AAAAJ:SeFeTyx0c_EC",
+      "author": "Z Rian, V Christanti, J Hendryli",
+      "journal": "2019 IEEE International Conference on Signals and Systems (ICSigSys), 1-7",
+      "year": "2019"
+    }
+    //...........
+  ]
+  ```
+
+- Get Citation
+  ```sh
+  $ curl "localhost:5000/getcitation?id=hayqUI0AAAAJ:hkOj_22Ku90C"
+  ```
+  ```json
+  {
+    "title": "Aplikasi Clustering Berita dengan Metode K Means dan Peringkas Berita dengan Metode Maximum Marginal Relevance",
+    "url": "https://journal.untar.ac.id/index.php/jiksi/article/view/11560",
+    "info": {
+      "authors": "Edy Susanto, Viny Christanti Mawardi, Manatap Dolok Lauro",
+      "publication date": "2021",
+      "journal": "Jurnal Ilmu Komputer dan Sistem Informasi",
+      "volume": "9",
+      "issue": "1",
+      "pages": "62-68",
+      "description": "News is information about facts or opinions that are interesting to know. News can be obtained from various media such as newspapers and the internet. As is well known, news has various topics, such as politics, sports and others. There is also the same story written with the addition of a little information. This causes it to take more time to get the headline of the news. Therefore we need a system for news clustering using the K-Means method and news summarizing using the Maximum Marginal Relevance (MMR) method in order to obtain information from news more easily and efficiently. News that is processed in the form of a collection of files (multi document) with the extension txt. The summarization process goes through the text preprocessing stage, which consists of sentence segmentation, case folding, tokenizing, filtering, stemming. The next step is TF-IDF calculation to calculate word weight then Cosine Similarity to calculate the similarity between documents. After that, enter the K-Means stage for clustering division and proceed with determining the summary with MMR. Based on the results testing that has been done, this application is running well, the results of clustering and summarizing news can make it easier for users to get news summaries from some similar news."
+    },
+    "download": true,
+    "download_link": "http://journal.untar.ac.id/index.php/jiksi/article/download/11560/7233"
+  }
+  ```
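+
+The same endpoints can also be called from Python. A minimal client sketch, assuming the server is running on `localhost:5000` as above:
+```python
+import requests
+
+BASE = "http://localhost:5000"
+
+# search authors by name, then fetch the full profile of the first match
+users = requests.get(f"{BASE}/getusers", params={"name": "viny"}).json()
+if users:
+    profile = requests.get(f"{BASE}/getuser", params={"id": users[0]["id"]}).json()
+    print(profile["name"], profile["gscholar_id"])
+```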
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..8e208f4
--- /dev/null
+++ b/main.py
@@ -0,0 +1,41 @@
+from flask import Flask, request, jsonify
+from scrapper.gscholar import getCitations, getCitation
+from scrapper.sinta import getUsers, getUser
+
+app = Flask(__name__)
+
+# Disable automatic key sorting in jsonify responses
+app.config.update(
+    JSON_SORT_KEYS=False,
+)
+
+@app.route('/')
+def home():
+    return jsonify({'status': 'ok'})
+
+@app.route('/getusers')
+def get_users():
+    name = request.args['name']
+    data = getUsers(name)
+    return jsonify(data)
+
+@app.route('/getuser')
+def get_user():
+    user_id = request.args['id']
+    data = getUser(user_id)
+    return jsonify(data)
+
+@app.route('/getcitations')
+def get_citations():
+    user_id = request.args['id']
+    data = getCitations(user_id)
+    return jsonify(data)
+
+@app.route('/getcitation')
+def get_citation():
+    citation_id = request.args['id']
+    data = getCitation(citation_id)
+    return jsonify(data)
+
+if __name__ == '__main__':
+    app.run(threaded=True, host="0.0.0.0", debug=True)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e93d215
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+flask
+bs4
+requests
+lxml
+pandas
+openpyxl
+pymysql
\ No newline at end of file
diff --git a/scrapper/gscholar.py b/scrapper/gscholar.py
new file mode 100644
index 0000000..222d0eb
--- /dev/null
+++ b/scrapper/gscholar.py
@@ -0,0 +1,141 @@
+import requests, json
+
+from bs4 import BeautifulSoup, SoupStrainer
+from urllib.parse import unquote
+
+host = "https://scholar.google.com"
+
+# Refresh the cached users data by paginating through the organization's
+# author listing on Google Scholar
+def refreshGetUsers():
+    data = []
+    org = "9444447549188154848"  # Universitas Tarumanagara
+    token = "test"
+    start = 10
+
+    while True:
+        try:
+            url = f"{host}/citations?view_op=view_org&hl=id&org={org}&after_author={token}&astart={start}"
+            r = requests.get(url)
+            users = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_sa_ccl'}))
+            for user in users.find_all('div', {'class': 'gsc_1usr'}):
+                user_info = user.find('h3', {'class': 'gs_ai_name'})
+                user_name = user_info.find('a').text
+                user_id = user_info.find('a')['href'].split('user=')[1]
+                # strip the Indonesian "verified email at" prefix from the hl=id page
+                user_type = user.find('div', {'class': 'gs_ai_eml'}).text.replace('Email yang diverifikasi di ', '')
+                user_image = user.find('img')['src']
+                data.append({
+                    'name': user_name,
+                    'id': user_id,
+                    'type': user_type,
+                    'image': user_image
+                })
+
+            print(start)  # progress indicator
+
+            nextButton = users.find('button', {'class': 'gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx'})
+            if nextButton.get('disabled') is None:
+                start += 10
+                # the next-page token is embedded in the button's onclick handler
+                token = nextButton.get('onclick').replace('\\x', '%')
+                token = unquote(token).split('after_author=')[1].split('&astart=')[0]
+            else:
+                break
+
+        except Exception as e:
+            print(e)
+            break  # stop instead of retrying the same page forever
+
+    with open('users.json', 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+    return {'message': 'success'}
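+
+# Note: refreshGetUsers() writes ./users.json relative to the working directory,
+# while getUsers() below reads scrapper/users.json, so the refreshed file must
+# end up inside scrapper/ for the cached lookup to see it.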
+
+# Get All Users Data (from the cached users.json)
+def getUsers(name):
+    with open('scrapper/users.json', 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    res = []
+    for user in data:
+        if user['type'] == 'untar.ac.id' or user['type'] == 'fti.untar.ac.id':
+            if name.lower() in user['name'].lower():
+                if user not in res:
+                    res.append(user)
+
+            if ' ' in name:
+                # also try the query with its words reversed, since profiles may
+                # list name parts in a different order
+                raw_name = name.split(' ')
+                raw_name.reverse()
+                if ' '.join(raw_name).lower() in user['name'].lower():
+                    if user not in res:
+                        res.append(user)
+    return res
+
+# Get Citations Data with User ID
+def getCitations(user_id):
+    data = []
+    start = 0
+    end = 100
+
+    while True:
+        try:
+            url = f"{host}/citations?user={user_id}&hl=id&oi=sra&cstart={start}&pagesize={end}"
+            r = requests.get(url)
+            citations = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_art'}))
+            for citation in citations.find_all('tr', {'class': 'gsc_a_tr'}):
+                citation_title = citation.find('a', {'class': 'gsc_a_at'}).text.replace('\\', '').replace('"', '').replace("'", "")
+                citation_id = citation.find('a', {'class': 'gsc_a_at'})['href'].split('citation_for_view=')[1]
+                citation_info = citation.find_all('div', {'class': 'gs_gray'})
+                citation_author = citation_info[0].text
+                # drop the grey volume/page span so only the journal name remains
+                for x in citation_info[1].find_all('span', {'class': 'gs_oph'}):
+                    x.decompose()
+                citation_journal = citation_info[1].text
+                citation_year = citation.find('td', {'class': 'gsc_a_y'}).text
+                if citation_journal.lower() != 'turnitin':
+                    data.append({
+                        'title': citation_title,
+                        'id': citation_id,
+                        'author': citation_author,
+                        'journal': citation_journal,
+                        'year': citation_year
+                    })
+
+            nextButton = citations.find('button', {'id': 'gsc_bpf_more'})
+            if nextButton.get('disabled') is None:
+                start += 100
+                end += 100
+            else:
+                break
+        except Exception as e:
+            print(e)
+            break  # stop instead of retrying the same page forever
+
+    return data
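+
+# The citation IDs returned above have the form "<gscholar_user_id>:<paper_token>"
+# and feed straight into getCitation below, e.g. (ID taken from the README examples):
+#   getCitation('hayqUI0AAAAJ:TFP_iSt0sucC')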
+
+# Get Citation Data with Citation ID
+def getCitation(citation_id):
+    url = f"{host}/citations?view_op=view_citation&hl=en&citation_for_view={citation_id}"
+    r = requests.get(url)
+    citation = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_vcpb'}))
+    citation_title = citation.find('a', {'class': 'gsc_oci_title_link'}).text
+    citation_url = citation.find('a', {'class': 'gsc_oci_title_link'})['href']
+    citation_info = {}
+    citation_info_field = citation.find_all('div', {'class': 'gsc_oci_field'})
+    citation_info_value = citation.find_all('div', {'class': 'gsc_oci_value'})
+    for x in range(len(citation_info_field)):
+        if citation_info_field[x].text.lower() != 'total citations' and citation_info_field[x].text.lower() != 'scholar articles':
+            citation_info[citation_info_field[x].text.lower()] = citation_info_value[x].text
+
+    # Check if a PDF download link is available
+    citation_download = False
+    citation_download_link = None
+    citation_download_raw = citation.find('div', {'class': 'gsc_oci_title_ggi'})
+    if citation_download_raw is not None:
+        citation_download_check = citation_download_raw.find('a').text.lower()
+        if '[pdf]' in citation_download_check:
+            citation_download = True
+            citation_download_link = citation_download_raw.find('a')['href']
+
+    data = {
+        'title': citation_title,
+        'url': citation_url,
+        'info': citation_info,
+        'download': citation_download,
+        'download_link': citation_download_link
+    }
+
+    return data
\ No newline at end of file
diff --git a/scrapper/sinta.py b/scrapper/sinta.py
new file mode 100644
index 0000000..0962aa6
--- /dev/null
+++ b/scrapper/sinta.py
@@ -0,0 +1,83 @@
+import requests
+
+from bs4 import BeautifulSoup, SoupStrainer
+
+host = "https://sinta.kemdikbud.go.id"
+affiliations_id = 476  # Universitas Tarumanagara
+pddikti_id = "031015"
+
+def getUsers(name):
+    data = []
+
+    try:
+        url = f"{host}/affiliations/authors/{affiliations_id}?q={name}"
+        r = requests.get(url)
+        users = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'class': 'au-list-affil mt-3'}))
+        for user in users.find_all('div', {'class': 'au-item mt-3 mb-3 pb-5 pt-3'}):
+            user_image = user.find('img', {'class': 'img-thumbnail avatar'})['src'].strip()
+            user_profile = user.find('div', {'class': 'profile-name'})
+            user_name = user_profile.find('a').text.strip()
+            user_department = user.find('div', {'class': 'profile-dept'})
+            user_type = user_department.find('a').text.strip()
+            user_id = user.find('div', {'class': 'profile-id'}).text.strip().replace('ID : ', '')
+            data.append({
+                'name': user_name,
+                'id': user_id,
+                'type': user_type,
+                'image': user_image
+            })
+
+        return data
+    except Exception as e:
+        print(e)
+
+def getUser(user_id):
+    try:
+        url = f"{host}/authors/profile/{user_id}/?view=googlescholar"
+        r = requests.get(url)
+        user = BeautifulSoup(r.text, 'lxml')
+        user_name = user.find('h3').text.strip()
+        user_image = user.find('img', {'class': 'img-fluid img-thumbnail round-corner'})['src']
+        # the Google Scholar ID is embedded in the profile image URL
+        user_gscholar_id = user_image.split('&user=')[1].split('&citpid=')[0]
+        user_profile = user.find('div', {'class': 'meta-profile'})
+        user_profile_item = user_profile.find_all('a')
+        user_affiliation = user_profile_item[0].text.strip()
+        user_type = user_profile_item[1].text.strip()
+        user_subject = user.find('div', {'class': 'profile-subject mt-3'})
+        user_subject_list = []
+        for user_subject_item in user_subject.find_all('a'):
+            user_subject_list.append(user_subject_item.text.strip())
+        user_stats = user.find_all('div', {'class': 'pr-num'})
+        user_sinta_score_overall = user_stats[0].text.strip()
+        user_sinta_score_3yrs = user_stats[1].text.strip()
+        user_affil_score = user_stats[2].text.strip()
+        user_affil_score_3yrs = user_stats[3].text.strip()
+        user_summary = user.find('table', {'class': 'table table-borderless table-sm text-center stat-table'})
+        user_summary_list = user_summary.find_all('tr')
+        user_summary_list.pop(0)  # drop the header row
+        user_summary_list_data = {}
+        for user_summary_item in user_summary_list:
+            user_summary_field = user_summary_item.find_all('td')
+            user_summary_list_data[user_summary_field[0].text.strip().replace(' ', '_').replace('-', '_').lower()] = {
+                'scopus': user_summary_field[1].text.strip(),
+                'gscholar': user_summary_field[2].text.strip(),
+                'wos': user_summary_field[3].text.strip()
+            }
+        data = {
+            'name': user_name,
+            'id': user_id,
+            'type': user_type,
+            'image': user_image,
+            'gscholar_id': user_gscholar_id,
+            'affiliation': user_affiliation,
+            'subject': user_subject_list,
+            'sinta_score_overall': user_sinta_score_overall,
+            'sinta_score_3yrs': user_sinta_score_3yrs,
+            'affil_score': user_affil_score,
+            'affil_score_3yrs': user_affil_score_3yrs,
+            'summary': user_summary_list_data
+        }
+
+        return data
+    except Exception as e:
+        print(e)
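+
+# Usage sketch, matching the README's /getusers example (requires network access
+# to sinta.kemdikbud.go.id):
+#   getUsers('viny')  ->  [{'name': 'VINY CHRISTANTI MAWARDI', 'id': '5990793', ...}]
\ No newline at end of file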