https://library.gabia.com/contents/9239/
https://dsconsulting.tistory.com/1 → speed improvement
pip install spacy
python -m spacy download en_core_web_sm
Both of these must be installed for this to work.
Most recent version ←
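A quick way to confirm both installs worked, as a minimal sketch (the sample sentence is arbitrary):
import spacy

nlp = spacy.load("en_core_web_sm")   # fails here if the model download step was skipped
doc = nlp("The quick brown fox jumps over the lazy dog.")
print([chunk.text for chunk in doc.noun_chunks])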
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import nltk
import re
import pandas as pd
import spacy
from nltk.corpus import stopwords
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
import googletrans
#python -m spacy download ja_core_news_sm
#ja_core_news_sm
nlp = spacy.load("en_core_web_sm")
def news(news_urls):
    translator = googletrans.Translator()
    result = ""  # accumulate the translated article text across all URLs
    for url_no in news_urls:
        print(url_no)
        driver = webdriver.Chrome()
        driver.get(url_no)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.quit()
        buf = ""        # buffer flushed to the translator in <=1000-character chunks
        full_text = ""  # untranslated original text, kept for reference
        for text in soup.findAll("p"):
            buf += text.get_text() + "\n"
            full_text += text.get_text() + "\n"
            if len(buf) > 1000:
                result1 = translator.translate(buf, dest='en')
                result += result1.text.replace(" -", "-").replace("- ", "-")
                buf = ""
        if buf != "":
            result1 = translator.translate(buf, dest='en')
            result += result1.text.replace(" -", "-").replace("- ", "-")
    return result
def np_tag(text):
    #text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    df = pd.DataFrame(columns=['CHUNK'])
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        # DataFrame.append is deprecated (removed in pandas 2.0); see the list-based sketch further down
        df = df.append({'CHUNK': chunk.text}, ignore_index=True)
    return df
testnews = ['https://news.yahoo.co.jp/articles/45e111649f3f3c7029f0276d90fef6f49631a5fc']
wl =np_tag(news(testnews))
#wl=np_tag("Actors and actresses who are the “leading actors” are absolutely indispensable in Korean dramas.")
no_capitals = wl.values.tolist()
#print(no_capitals)
stops = set(stopwords.words('english'))
stemmer = nltk.stem.SnowballStemmer('english')
#testmemo =""
f = open("C:/Users/multicampus/Desktop/test/test.txt", 'w', encoding='utf-8')
for no_capital in no_capitals:
    no_stops = [word for word in no_capital if not word in stops]
    for tmp in no_stops:
        #testmemo = testmemo + tmp + '\n'
        f.write(tmp + '\n')
f.close()
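A note on np_tag above: DataFrame.append was deprecated and removed in pandas 2.0, so on a current pandas it raises an AttributeError. A minimal sketch of an equivalent version that collects the chunks in a plain list first:
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")

def np_tag(text):
    text = text.lower()
    doc = nlp(text)
    chunks = [chunk.text for chunk in doc.noun_chunks]   # collect the noun chunks in a plain list
    return pd.DataFrame({'CHUNK': chunks})               # build the DataFrame once at the end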
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
baseUrl = 'https://www.google.com/search?q='
#Base URL for a Google search: run a few searches and look at the address bar and the common format becomes clear.
plusUrl = input('What do you want to search for? : ')
a = 1
while True:
    url = baseUrl + quote_plus(plusUrl) + '&tbm=nws' + '&start=' + str(a)
    #quote_plus: converts the string into a URL-safe (percent-encoded) form; see the standalone sketch after this loop.
    driver = webdriver.Chrome()
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    # From the results page we grab the title, breadcrumbs, and link URL.
    results = soup.select('#search>div>div>div>div>div')
    if len(results) == 0:
        #print("Nothing here~")
        break
    for i in results:
        if i.select_one('div>a>div>div:nth-child(2)>div:nth-child(2)') is not None:
            print(i.select_one('div>a>div>div:nth-child(2)>div:nth-child(2)').text)
        #else:
        #    print("No text attribute~")
    driver.close()
    a = a + 10
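To see in isolation what the quote_plus call above does, a minimal standalone sketch (the query '김우빈' is the same one used further down):
from urllib.parse import quote_plus

query = '김우빈'
print(quote_plus(query))            # prints the percent-encoded UTF-8 form of the query
print('https://www.google.com/search?q=' + quote_plus(query) + '&tbm=nws')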
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
def Url(search):
    # encode the query so non-ASCII searches are URL-safe
    url_temp = 'https://www.google.com/search?q={search}&tbm=nws'.format(search=quote_plus(search))
    # url_temp = 'https://search.mt.co.kr/searchNewsList.html?srchFd=TOTAL&range=IN&reSrchFlag=&preKwd=&search_type=m&kwd={search}&bgndt=20190401&enddt=20190930&category=MTNW&sortType=allwordsyn&subYear=&category=MTNW&subType=mt'.format(search=search)
    for pageNo in range(1, 30, 10):
        url = url_temp + "&start={pageNo}".format(pageNo=pageNo)
        driver = webdriver.Chrome()
        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.close()
        # From the results page we grab the title, breadcrumbs, and link URL.
        # #search>div>div>div>div>div
        results = soup.select('#search div>div>div>div')
        if not results:
            break
        for i in results:
            if i.select_one('div>a>div>div:nth-child(2)>div:nth-child(2)'):
                print(i.select_one('.WlydOe').get("href"))
                url = i.select_one('.WlydOe').get("href")
                driver = webdriver.Chrome()
                driver.get(url)
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
                driver.close()
                test = soup.find('p').getText()   # getText() already strips the <p> tags
                print(test)
                # print(i.select_one('div>a>div>div:nth-child(2)>div:nth-child(2)').attrs)
                print(i.select_one('.iRPxbe>div:nth-child(2)').text)
            else:
                print("No text attribute~")
Url('김우빈')
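These scripts open a fresh, visible Chrome window for every page, which is the main slowdown (see the speed-improvement link at the top). A minimal sketch of running one headless driver and reusing it, assuming a Selenium 4 setup; the example URL is just a placeholder:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')   # no visible browser window
driver = webdriver.Chrome(options=options)

for url in ['https://www.google.com/search?q=test&tbm=nws']:   # reuse one driver for all pages
    driver.get(url)
    html = driver.page_source
    # ... parse with BeautifulSoup as above ...

driver.quit()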
pip install spacy
python -m spacy download en_core_web_sm
Both of these must be installed for this to work.
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import re
import pandas as pd
import spacy
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
import googletrans
nlp = spacy.load("en_core_web_sm")
def news(news_urls):
    translator = googletrans.Translator()
    result = ""  # accumulate the translated article text across all URLs
    for url_no in news_urls:
        print(url_no)
        driver = webdriver.Chrome()
        driver.get(url_no)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.quit()
        buf = ""        # buffer flushed to the translator in <=1000-character chunks
        full_text = ""  # untranslated original text, kept for reference
        for text in soup.findAll("p"):
            buf += text.get_text() + "\n"
            full_text += text.get_text() + "\n"
            if len(buf) > 1000:
                result1 = translator.translate(buf, dest='en')
                result += result1.text.replace(" -", "-").replace("- ", "-")
                buf = ""
        if buf != "":
            result1 = translator.translate(buf, dest='en')
            result += result1.text.replace(" -", "-").replace("- ", "-")
    return result
def np_tag(text):
    df = pd.DataFrame(columns=['CHUNK'])
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        # DataFrame.append is deprecated (removed in pandas 2.0); see the list-based sketch above
        df = df.append({'CHUNK': chunk.text}, ignore_index=True)
    return df
testnews = ['https://news.yahoo.co.jp/articles/45e111649f3f3c7029f0276d90fef6f49631a5fc']
wl=np_tag("Actors and actresses who are the “leading actors” are absolutely indispensable in Korean dramas.")
nounphrase = wl.values.tolist()
for a in nounphrase:
    print(a)
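noun_chunks gives whole noun phrases; for comparison, a minimal sketch of the token-level view (the same token.pos_ / token.dep_ attributes that appear commented out in the Korean version below), on the same sentence:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Actors and actresses who are the “leading actors” are absolutely indispensable in Korean dramas.")
for token in doc:
    if token.pos_ in ("NOUN", "PROPN"):       # keep only nouns and proper nouns
        print(token.text, token.pos_, token.dep_)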
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import nltk
import re
import pandas as pd
import spacy
from konlpy.tag import Okt
from nltk.corpus import stopwords
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
import googletrans
#python -m spacy download ja_core_news_sm
#ja_core_news_sm
nlp = spacy.load("ko_core_news_sm")   # needs: python -m spacy download ko_core_news_sm (not actually used below; Okt handles the nouns)
def news(news_urls):
    translator = googletrans.Translator()
    result = ""  # accumulate the translated article text across all URLs
    for url_no in news_urls:
        print(url_no)
        driver = webdriver.Chrome()
        driver.get(url_no)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.quit()
        buf = ""        # buffer flushed to the translator in <=1000-character chunks
        full_text = ""  # untranslated original text, kept for reference
        for text in soup.findAll("p"):
            buf += text.get_text() + "\n"
            full_text += text.get_text() + "\n"
            if len(buf) > 1000:
                result1 = translator.translate(buf, dest='ko')
                result += result1.text.replace(" -", "-").replace("- ", "-")
                buf = ""
        if buf != "":
            result1 = translator.translate(buf, dest='ko')
            result += result1.text.replace(" -", "-").replace("- ", "-")
    return result
def np_tag(text):
    #text = re.sub('[^a-zA-Z]', ' ', text)
    #text = text.lower()
    okt = Okt()                        # KoNLPy Okt morphological analyzer
    df = pd.DataFrame(columns=['CHUNK'])
    doc = okt.nouns(text)              # list of Korean nouns
    news_desc = ' '.join(doc)
    print(news_desc)
    #for chunk in doc.noun_chunks:
    #    df = df.append({'CHUNK': chunk.text}, ignore_index=True)
    #for token in doc:
    #    print(token.text, token.pos_, token.dep_)
    return df                          # df is never filled here; the nouns are only printed
testnews = ['https://news.yahoo.co.jp/articles/45e111649f3f3c7029f0276d90fef6f49631a5fc']
wl =np_tag(news(testnews))
#wl=np_tag("Actors and actresses who are the “leading actors” are absolutely indispensable in Korean dramas.")
no_capitals = wl.values.tolist()
#print(no_capitals)
#stops = set(stopwords.words('english'))
print(no_capitals)
stemmer = nltk.stem.SnowballStemmer('english')   # leftover from the English version; unused here
#testmemo = ""
f = open("C:/Users/multicampus/Desktop/test/test.txt", 'w', encoding='utf-8')
for no_capital in no_capitals:
    no_stops = no_capital   # English stopword filtering skipped: the stops set is commented out above
    for tmp in no_stops:
        #testmemo = testmemo + tmp + '\n'
        f.write(tmp + '\n')
f.close()
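Note that np_tag in this Korean version only prints the nouns and returns an empty DataFrame, so the file written above stays empty. A minimal sketch of a variant that also fills the DataFrame from the Okt output; the function name np_tag_ko and the sample sentence are just for illustration:
from konlpy.tag import Okt
import pandas as pd

def np_tag_ko(text):
    okt = Okt()
    nouns = okt.nouns(text)                 # list of Korean nouns
    print(' '.join(nouns))
    return pd.DataFrame({'CHUNK': nouns})   # one noun per row, same column name as before

wl = np_tag_ko("김우빈은 한국 드라마에서 주연을 맡은 배우이다.")
print(wl.values.tolist())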