statduck
Crawling & Word Clouds
Python - using the BeautifulSoup and requests libraries
BeautifulSoup? - a library for pulling data out of HTML and XML files
requests?: an HTTP request library
XML?: a markup language whose documents form a tree structure internally (HTML files are a typical example)
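To make the tree idea concrete, here is a minimal BeautifulSoup sketch; the HTML snippet and tag contents are made up for illustration:
from bs4 import BeautifulSoup

# a tiny, made-up HTML document: nested tags form a tree
html = '<html><body><p class="intro">Hello</p><p>World</p></body></html>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.p.text)                     # 'Hello' (first <p> node in the tree)
print(soup.find('p', class_='intro'))  # <p class="intro">Hello</p>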
How to use requests
import requests
r = requests.get('https://api.github.com/user', auth=('user', 'pass'))
r.status_code
# 200
r.headers['content-type']
# 'application/json; charset=utf8'
r.encoding
# 'utf-8'
r.text
# '{"type":"User"...'
r.json()
# {'private_gists': 419, 'total_private_repos': 77, ...}
#ref: https://en.wikipedia.org/wiki/Requests_(software)
Crawling with BeautifulSoup
import requests
from bs4 import BeautifulSoup
import nltk

def get_article_content(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')  # parse the HTML in r.text into a soup object
    title = soup.title.text
    # article body paragraphs; these class names match the 2017 NYT markup and may have changed since
    contents = soup.find_all('p', class_='story-body-text story-content')
    content_text = ''
    for content in contents:
        content_text = content_text + ' ' + content.text
    return title, content_text
url1 = 'https://www.nytimes.com/2017/11/22/health/flour-raw-danger.html'
url2 = 'https://www.nytimes.com/2017/11/22/health/prions-brain-creutzfeldt-jakob.html'
title1, content1 = get_article_content(url1)  # fetch each page once instead of twice
title2, content2 = get_article_content(url2)
Data cleansing
from nltk.corpus import stopwords  # nltk's built-in stopword lists (unused below; shadowed by a custom list)
from collections import Counter  # for counting word frequencies
import matplotlib.pyplot as plt  # for visualization
def text_cleaning(text):
    # strip punctuation and quote characters, then lowercase
    cleaned_text = text.replace('[', '').replace(']', '').replace(',', '').replace('.', '').replace('’', '')
    cleaned_text = cleaned_text.replace('“', '').replace('”', '').replace('\n', '')
    cleaned_text = cleaned_text.lower()
    word_tokens = nltk.word_tokenize(cleaned_text)
    tokens_pos = nltk.pos_tag(word_tokens)
    # keep only nouns: 'NN' appears in the tags NN, NNS, NNP, and NNPS
    Noun_words = []
    for word, pos in tokens_pos:
        if pos.find('NN') >= 0:
            Noun_words.append(word)
    return Noun_words
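For reference on the 'NN' check above, nltk.pos_tag returns (word, tag) pairs. A quick sanity check might look like this; the example sentence is mine, and the exact tags can vary by NLTK version:
import nltk
# one-time downloads, if the resources are missing:
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
tokens = nltk.word_tokenize('The scientists studied prions in the brain.')
print(nltk.pos_tag(tokens))
# e.g. [('The', 'DT'), ('scientists', 'NNS'), ('studied', 'VBD'),
#       ('prions', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('brain', 'NN'), ('.', '.')]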
final_Noun_words1 = text_cleaning(content1)
final_Noun_words2 = text_cleaning(content2)
unique_Noun_words1 = set(final_Noun_words1)
unique_Noun_words2 = set(final_Noun_words2)
# custom stopword list; note this shadows the stopwords module imported from nltk.corpus above
stopwords = ['e', 'dr', '?', 'coli', 'creutzfeldt-jakob', 'i', 'im']
for word in unique_Noun_words1:
    if word in stopwords:
        while word in final_Noun_words1:
            final_Noun_words1.remove(word)
for word in unique_Noun_words2:
    if word in stopwords:
        while word in final_Noun_words2:
            final_Noun_words2.remove(word)
c1 = Counter(final_Noun_words1)
c2 = Counter(final_Noun_words2)
print(c1.most_common(10), '\n', c2.most_common(10))
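The remove-in-a-loop pattern above works, but a list comprehension does the same filtering in a single pass; an equivalent sketch, using the same custom stopword list:
final_Noun_words1 = [w for w in final_Noun_words1 if w not in stopwords]
final_Noun_words2 = [w for w in final_Noun_words2 if w not in stopwords]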
Generating the word clouds
from wordcloud import WordCloud

cloud_text1 = ''
cloud_text2 = ''
for word in final_Noun_words1:
    cloud_text1 = cloud_text1 + ' ' + word
for word in final_Noun_words2:
    cloud_text2 = cloud_text2 + ' ' + word
wordcloud1 = WordCloud(max_font_size=40, relative_scaling=.5).generate(cloud_text1)
wordcloud2 = WordCloud(max_font_size=40, relative_scaling=.5).generate(cloud_text2)
plt.figure()
plt.imshow(wordcloud1)
plt.axis('off')
plt.figure()
plt.imshow(wordcloud2)
plt.axis('off')
plt.show()
print(title1)
print(title2)
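To compare the two articles at a glance, the clouds can also be drawn side by side; this subplot layout is my own sketch, not part of the original code:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, wc, title in zip(axes, [wordcloud1, wordcloud2], [title1, title2]):
    ax.imshow(wc)   # render the word cloud image
    ax.set_title(title, fontsize=9)
    ax.axis('off')  # hide the axis ticks
plt.tight_layout()
plt.show()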