Initial commit

Moe Poi ~ 2022-08-18 10:33:29 +07:00
commit cdce7084d9
Signed by: moepoi
GPG key ID: 334B501E883409AF
7 changed files with 434 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,8 @@
# Test File
*.xlsx
*.json
converter.py
database.py

# Environment
env/

Dockerfile Normal file

@@ -0,0 +1,18 @@
FROM python:3.9-slim

# copy the requirements file into the image
COPY ./requirements.txt /app/requirements.txt

# switch the working directory
WORKDIR /app

# install the dependencies listed in the requirements file
RUN pip install --no-cache-dir -r requirements.txt

# copy the rest of the project into the image
COPY . /app

# run the app with python as the container entrypoint
ENTRYPOINT ["python"]
CMD ["main.py"]

README.md Normal file

@@ -0,0 +1,136 @@
# **Journal Scrapper**
## Installation
- ### Docker
```sh
$ docker run --name JournalScrapper -d -p 5000:5000 registry.gitlab.com/moepoi/journalscrapper:latest
```
- ### Manual
```sh
$ mkdir JournalScrapper && cd JournalScrapper && git clone https://gitlab.com/moepoi/journalscrapper.git .
$ pip3 install -r requirements.txt
$ python3 main.py
```
## Usage
- Get Users
```sh
$ curl localhost:5000/getusers?name=viny
```
```json
[
{
"name": "VINY CHRISTANTI MAWARDI",
"id": "5990793",
"type": "Teknik Informatika (S1)",
"image": "https://scholar.google.co.id/citations?view_op=view_photo&user=hayqUI0AAAAJ&citpid=1"
}
]
```
- Get User
```sh
$ curl localhost:5000/getuser?id=5990793
```
```json
{
"name": "VINY CHRISTANTI MAWARDI",
"id": "5990793",
"type": "S1 - Teknik Informatika",
"image": "https://scholar.google.co.id/citations?view_op=view_photo&user=hayqUI0AAAAJ&citpid=1",
"gscholar_id": "hayqUI0AAAAJ",
"affiliation": "Universitas Tarumanagara",
"subject": [
"Information Retrieval"
],
"sinta_score_overall": "438",
"sinta_score_3yrs": "94",
"affil_score": "0",
"affil_score_3yrs": "0",
"summary": {
"article": {
"scopus": "7",
"gscholar": "160",
"wos": "0"
},
"citation": {
"scopus": "22",
"gscholar": "116",
"wos": "0"
},
"cited_document": {
"scopus": "5",
"gscholar": "33",
"wos": "0"
},
"h_index": {
"scopus": "3",
"gscholar": "6",
"wos": ""
},
"i10_index": {
"scopus": "1",
"gscholar": "3",
"wos": ""
},
"g_index": {
"scopus": "1",
"gscholar": "1",
"wos": ""
}
}
}
```
- Get Citations
```sh
$ curl localhost:5000/getcitations?id=hayqUI0AAAAJ
```
```json
[
{
"title": "Fast and accurate spelling correction using trie and Damerau-levenshtein distance bigram",
"id": "hayqUI0AAAAJ:TFP_iSt0sucC",
"author": "VM Christanti, DS Naga",
"journal": "Telkomnika 16 (2), 827-833",
"year": "2018"
},
{
"title": "Automatic essay scoring in E-learning system using LSA method with N-gram feature for Bahasa Indonesia",
"id": "hayqUI0AAAAJ:k_IJM867U9cC",
"author": "RS Citawan, VC Mawardi, B Mulyawan",
"journal": "MATEC web of conferences 164, 01037",
"year": "2018"
},
{
"title": "Content-based image retrieval using convolutional neural networks",
"id": "hayqUI0AAAAJ:SeFeTyx0c_EC",
"author": "Z Rian, V Christanti, J Hendryli",
"journal": "2019 IEEE International Conference on Signals and Systems (ICSigSys), 1-7",
"year": "2019"
}
//...........
]
```
- Get Citation
```sh
$ curl localhost:5000/getcitation?id=hayqUI0AAAAJ:hkOj_22Ku90C
```
```json
{
"title": "Aplikasi Clustering Berita dengan Metode K Means dan Peringkas Berita dengan Metode Maximum Marginal Relevance",
"url": "https://journal.untar.ac.id/index.php/jiksi/article/view/11560",
"info": {
"authors": "Edy Susanto, Viny Christanti Mawardi, Manatap Dolok Lauro",
"publication date": "2021",
"journal": "Jurnal Ilmu Komputer dan Sistem Informasi",
"volume": "9",
"issue": "1",
"pages": "62-68",
"description": "News is information about facts or opinions that are interesting to know. News can be obtained from various media such as newspapers and the internet. As is well known, news has various topics, such as politics, sports and others. There is also the same story written with the addition of a little information. This causes it to take more time to get the headline of the news. Therefore we need a system for news clustering using the K-Means method and news summarizing using the Maximum Marginal Relevance (MMR) method in order to obtain information from news more easily and efficiently. News that is processed in the form of a collection of files (multi document) with the extension txt. The summarization process goes through the text preprocessing stage, which consists of sentence segmentation, case folding, tokenizing, filtering, stemming. The next step is TF-IDF calculation to calculate word weight then Cosine Similarity to calculate the similarity between documents. After that, enter the K-Means stage for clustering division and proceed with determining the summary with MMR. Based on the results testing that has been done, this application is running well, the results of clustering and summarizing news can make it easier for users to get news summaries from some similar news."
},
"download": true,
"download_link": "http://journal.untar.ac.id/index.php/jiksi/article/download/11560/7233"
}
```

main.py Normal file

@@ -0,0 +1,41 @@
from flask import Flask, request, jsonify
from scrapper.gscholar import getCitations, getCitation
from scrapper.sinta import getUsers, getUser

app = Flask(__name__)

# Disable automatic key sorting in jsonify responses
app.config.update(
    JSON_SORT_KEYS=False,
)

@app.route('/')
def home():
    return jsonify({'status': 'ok'})

@app.route('/getusers')
def get_users():
    name = request.args['name']
    data = getUsers(name)
    return jsonify(data)

@app.route('/getuser')
def get_user():
    user_id = request.args['id']
    data = getUser(user_id)
    return jsonify(data)

@app.route('/getcitations')
def get_citations():
    user_id = request.args['id']
    data = getCitations(user_id)
    return jsonify(data)

@app.route('/getcitation')
def get_citation():
    citation_id = request.args['id']
    data = getCitation(citation_id)
    return jsonify(data)

if __name__ == '__main__':
    app.run(threaded=True, host="0.0.0.0", debug=True)
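
A quick way to sanity-check the service without hitting Google Scholar or SINTA is Flask's built-in test client. A minimal smoke-test sketch, assuming the project root is on the import path (the `/` route does no scraping):

```python
# Minimal smoke test for the health endpoint via Flask's test client.
# Assumes main.py is importable; the '/' route does not touch the scrapers.
from main import app

with app.test_client() as client:
    resp = client.get('/')
    print(resp.get_json())  # expected: {'status': 'ok'}
```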

requirements.txt Normal file

@@ -0,0 +1,7 @@
flask
bs4
requests
lxml
pandas
openpyxl
pymysql

scrapper/gscholar.py Normal file

@@ -0,0 +1,141 @@
import requests, json
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import unquote

host = "https://scholar.google.com"

# Refresh Users Data <DEPRECATED>
def refreshGetUsers():
    data = []
    org = "9444447549188154848"  # Universitas Tarumanagara
    token = "test"
    start = 10
    while True:
        try:
            url = f"{host}/citations?view_op=view_org&hl=id&org={org}&after_author={token}&astart={start}"
            r = requests.get(url)
            users = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_sa_ccl'}))
            for user in users.find_all('div', {'class': 'gsc_1usr'}):
                user_info = user.find('h3', {'class': 'gs_ai_name'})
                user_name = user_info.find('a').text
                user_id = user_info.find('a')['href'].split('user=')[1]
                user_type = user.find('div', {'class': 'gs_ai_eml'}).text.replace('Email yang diverifikasi di ', '')
                user_image = user.find('img')['src']
                data.append({
                    'name': user_name,
                    'id': user_id,
                    'type': user_type,
                    'image': user_image
                })
            print(start)
            # the "next page" button carries the pagination token in its onclick handler
            nextButton = users.find('button', {'class': 'gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx'})
            if nextButton.get('disabled') is None:
                start += 10
                token = nextButton.get('onclick').replace('\\x', '%')
                token = unquote(token).split('after_author=')[1].split('&astart=')[0]
            else:
                break
        except Exception as e:
            print(e)
            break  # avoid an infinite retry loop on a persistent error
    with open('users.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    return {'message': 'success'}

# Get All Users Data (Cached) <DEPRECATED>
def getUsers(name):
    with open('scrapper/users.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    res = []
    for user in data:
        if user['type'] == 'untar.ac.id' or user['type'] == 'fti.untar.ac.id':
            if name.lower() in user['name'].lower():
                if user not in res:
                    res.append(user)
            # also try the query with its word order reversed
            if ' ' in name:
                raw_name = name.split(' ')
                raw_name.reverse()
                if ' '.join(raw_name).lower() in user['name'].lower():
                    if user not in res:
                        res.append(user)
    return res

# Get Citations Data with User ID <DEPRECATED>
def getCitations(user_id):
    data = []
    start = 0
    end = 100
    while True:
        try:
            url = f"{host}/citations?user={user_id}&hl=id&oi=sra&cstart={start}&pagesize={end}"
            r = requests.get(url)
            citations = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_art'}))
            for citation in citations.find_all('tr', {'class': 'gsc_a_tr'}):
                citation_title = citation.find('a', {'class': 'gsc_a_at'}).text.replace('\\', '').replace('"', '').replace("'", "")
                citation_id = citation.find('a', {'class': 'gsc_a_at'})['href'].split('citation_for_view=')[1]
                citation_info = citation.find_all('div', {'class': 'gs_gray'})
                citation_author = citation_info[0].text
                # strip the trailing gs_oph span so only the venue string remains
                for x in citation_info[1].find_all('span', {'class': 'gs_oph'}):
                    x.decompose()
                citation_journal = citation_info[1].text
                citation_year = citation.find('td', {'class': 'gsc_a_y'}).text
                if citation_journal.lower() != 'turnitin':
                    data.append({
                        'title': citation_title,
                        'id': citation_id,
                        'author': citation_author,
                        'journal': citation_journal,
                        'year': citation_year
                    })
            nextButton = citations.find('button', {'id': 'gsc_bpf_more'})
            if nextButton.get('disabled') is None:
                start += 100
                end += 100
            else:
                break
        except Exception as e:
            print(e)
            break  # avoid an infinite retry loop on a persistent error
    return data

# Get Citation Data with Citation ID
def getCitation(citation_id):
    url = f"{host}/citations?view_op=view_citation&hl=en&citation_for_view={citation_id}"
    r = requests.get(url)
    citation = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'id': 'gsc_vcpb'}))
    citation_title = citation.find('a', {'class': 'gsc_oci_title_link'}).text
    citation_url = citation.find('a', {'class': 'gsc_oci_title_link'})['href']
    citation_info = {}
    citation_info_field = citation.find_all('div', {'class': 'gsc_oci_field'})
    citation_info_value = citation.find_all('div', {'class': 'gsc_oci_value'})
    for x in range(len(citation_info_field)):
        if citation_info_field[x].text.lower() != 'total citations' and citation_info_field[x].text.lower() != 'scholar articles':
            citation_info[citation_info_field[x].text.lower()] = citation_info_value[x].text
    # Check if Downloadable
    citation_download = False
    citation_download_link = None
    citation_download_raw = citation.find('div', {'class': 'gsc_oci_title_ggi'})
    if citation_download_raw is not None:
        citation_download_check = citation_download_raw.find('a').text.lower()
        if '[pdf]' in citation_download_check:
            citation_download = True
            citation_download_link = citation_download_raw.find('a')['href']
    data = {
        'title': citation_title,
        'url': citation_url,
        'info': citation_info,
        'download': citation_download,
        'download_link': citation_download_link
    }
    return data
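
The scraper functions can also be driven directly, without the Flask layer. An illustrative sketch using the citation id from the README example; the id and Scholar's markup may go stale, so treat it as a sketch only:

```python
# Illustrative direct call into the Google Scholar scraper.
# The citation id is taken from the README example and may no longer resolve.
from scrapper.gscholar import getCitation

citation = getCitation('hayqUI0AAAAJ:hkOj_22Ku90C')
print(citation['title'])
if citation['download']:  # True when Scholar shows a [PDF] link
    print(citation['download_link'])
```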

scrapper/sinta.py Normal file

@@ -0,0 +1,83 @@
import requests
from bs4 import BeautifulSoup, SoupStrainer

host = "https://sinta.kemdikbud.go.id"
affiliations_id = 476  # Universitas Tarumanagara
pddikti_id = "031015"

def getUsers(name):
    data = []
    try:
        url = f"{host}/affiliations/authors/{affiliations_id}?q={name}"
        r = requests.get(url)
        users = BeautifulSoup(r.text, 'lxml', parse_only=SoupStrainer('div', {'class': 'au-list-affil mt-3'}))
        for user in users.find_all('div', {'class': 'au-item mt-3 mb-3 pb-5 pt-3'}):
            user_image = user.find('img', {'class': 'img-thumbnail avatar'})['src'].strip()
            user_profile = user.find('div', {'class': 'profile-name'})
            user_name = user_profile.find('a').text.strip()
            user_department = user.find('div', {'class': 'profile-dept'})
            user_type = user_department.find('a').text.strip()
            user_id = user.find('div', {'class': 'profile-id'}).text.strip().replace('ID : ', '')
            data.append({
                'name': user_name,
                'id': user_id,
                'type': user_type,
                'image': user_image
            })
        return data
    except Exception as e:
        print(e)

def getUser(user_id):
    try:
        url = f"{host}/authors/profile/{user_id}/?view=googlescholar"
        r = requests.get(url)
        user = BeautifulSoup(r.text, 'lxml')
        user_name = user.find('h3').text.strip()
        user_image = user.find('img', {'class': 'img-fluid img-thumbnail round-corner'})['src']
        # the Google Scholar id is embedded in the avatar URL
        user_gscholar_id = user_image.split('&user=')[1].split('&citpid=')[0]
        user_profile = user.find('div', {'class': 'meta-profile'})
        user_profile_item = user_profile.find_all('a')
        user_affiliation = user_profile_item[0].text.strip()
        user_type = user_profile_item[1].text.strip()
        user_subject = user.find('div', {'class': 'profile-subject mt-3'})
        user_subject_list = []
        for user_subject_item in user_subject.find_all('a'):
            user_subject_list.append(user_subject_item.text.strip())
        user_stats = user.find_all('div', {'class': 'pr-num'})
        user_sinta_score_overall = user_stats[0].text.strip()
        user_sinta_score_3yrs = user_stats[1].text.strip()
        user_affil_score = user_stats[2].text.strip()
        user_affil_score_3yrs = user_stats[3].text.strip()
        user_summary = user.find('table', {'class': 'table table-borderless table-sm text-center stat-table'})
        user_summary_list = user_summary.find_all('tr')
        user_summary_list.pop(0)  # drop the header row
        user_summary_list_data = {}
        for user_summary_item in user_summary_list:
            user_summary_field = user_summary_item.find_all('td')
            user_summary_list_data[user_summary_field[0].text.strip().replace(' ', '_').replace('-', '_').lower()] = {
                'scopus': user_summary_field[1].text.strip(),
                'gscholar': user_summary_field[2].text.strip(),
                'wos': user_summary_field[3].text.strip()
            }
        data = {
            'name': user_name,
            'id': user_id,
            'type': user_type,
            'image': user_image,
            'gscholar_id': user_gscholar_id,
            'affiliation': user_affiliation,
            'subject': user_subject_list,
            'sinta_score_overall': user_sinta_score_overall,
            'sinta_score_3yrs': user_sinta_score_3yrs,
            'affil_score': user_affil_score,
            'affil_score_3yrs': user_affil_score_3yrs,
            'summary': user_summary_list_data
        }
        return data
    except Exception as e:
        print(e)
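
The SINTA helpers work the same way when called directly. A sketch using the author name and SINTA id from the README examples, assuming the requests succeed (on failure both functions print the error and return None):

```python
# Illustrative direct calls into the SINTA scraper; inputs from the README examples.
from scrapper.sinta import getUsers, getUser

for user in getUsers('viny'):  # search affiliated authors by name
    print(user['id'], user['name'])

profile = getUser('5990793')   # full profile, including SINTA scores
print(profile['affiliation'], profile['sinta_score_overall'])
```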