https://library.gabia.com/contents/9239/
https://dsconsulting.tistory.com/1 → speed improvement
pip install spacy
python -m spacy download en_core_web_sm
Both of these must be installed for this to work.
Most recent version ←
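A quick way to confirm both installs worked, as a minimal sketch (the sample sentence is arbitrary):
import spacy

nlp = spacy.load("en_core_web_sm")   # fails here if the model download step was skipped
doc = nlp("The quick brown fox jumps over the lazy dog.")
print([chunk.text for chunk in doc.noun_chunks])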
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import nltk
import re
import pandas as pd
import spacy
from nltk.corpus import stopwords
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
import googletrans
#python -m spacy download ja_core_news_sm
#ja_core_news_sm
nlp = spacy.load("en_core_web_sm")
def news(news_urls):
    translator = googletrans.Translator()
    result = ""  # accumulate the translated article text across all URLs
    for url_no in news_urls:
        print(url_no)
        driver = webdriver.Chrome()
        driver.get(url_no)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.quit()
        buf = ""        # buffer flushed to the translator in <=1000-character chunks
        full_text = ""  # untranslated original text, kept for reference
        for text in soup.findAll("p"):
            buf += text.get_text() + "\n"
            full_text += text.get_text() + "\n"
            if len(buf) > 1000:
                result1 = translator.translate(buf, dest='en')
                result += result1.text.replace(" -", "-").replace("- ", "-")
                buf = ""
        if buf != "":
            result1 = translator.translate(buf, dest='en')
            result += result1.text.replace(" -", "-").replace("- ", "-")
    return result
def np_tag(text):
    #text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    df = pd.DataFrame(columns=['CHUNK'])
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        # DataFrame.append is deprecated (removed in pandas 2.0); see the list-based sketch further down
        df = df.append({'CHUNK': chunk.text}, ignore_index=True)
    return df
testnews = ['https://news.yahoo.co.jp/articles/45e111649f3f3c7029f0276d90fef6f49631a5fc']
wl =np_tag(news(testnews))
#wl=np_tag("Actors and actresses who are the “leading actors” are absolutely indispensable in Korean dramas.")
no_capitals = wl.values.tolist()
#print(no_capitals)
stops = set(stopwords.words('english'))
stemmer = nltk.stem.SnowballStemmer('english')
#testmemo =""
f = open("C:/Users/multicampus/Desktop/test/test.txt", 'w', encoding='utf-8')
for no_capital in no_capitals:
    no_stops = [word for word in no_capital if not word in stops]
    for tmp in no_stops:
        #testmemo = testmemo + tmp + '\n'
        f.write(tmp + '\n')
f.close()
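A note on np_tag above: DataFrame.append was deprecated and removed in pandas 2.0, so on a current pandas it raises an AttributeError. A minimal sketch of an equivalent version that collects the chunks in a plain list first:
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")

def np_tag(text):
    text = text.lower()
    doc = nlp(text)
    chunks = [chunk.text for chunk in doc.noun_chunks]   # collect the noun chunks in a plain list
    return pd.DataFrame({'CHUNK': chunks})               # build the DataFrame once at the end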
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
baseUrl = 'https://www.google.com/search?q='
#Base URL for a Google search: run a few searches and look at the address bar and the common format becomes clear.
plusUrl = input('What do you want to search for? : ')
a = 1
while True:
    url = baseUrl + quote_plus(plusUrl) + '&tbm=nws' + '&start=' + str(a)
    #quote_plus: converts the string into a URL-safe (percent-encoded) form; see the standalone sketch after this loop.
    driver = webdriver.Chrome()
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    # From the results page we grab the title, breadcrumbs, and link URL.
    results = soup.select('#search>div>div>div>div>div')
    if len(results) == 0:
        #print("Nothing here~")
        break
    for i in results:
        if i.select_one('div>a>div>div:nth-child(2)>div:nth-child(2)') is not None:
            print(i.select_one('div>a>div>div:nth-child(2)>div:nth-child(2)').text)
        #else:
        #    print("No text attribute~")
    driver.close()
    a = a + 10
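To see in isolation what the quote_plus call above does, a minimal standalone sketch (the query '김우빈' is the same one used further down):
from urllib.parse import quote_plus

query = '김우빈'
print(quote_plus(query))            # prints the percent-encoded UTF-8 form of the query
print('https://www.google.com/search?q=' + quote_plus(query) + '&tbm=nws')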
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
def Url(search):
    # encode the query so non-ASCII searches are URL-safe
    url_temp = 'https://www.google.com/search?q={search}&tbm=nws'.format(search=quote_plus(search))
    # url_temp = 'https://search.mt.co.kr/searchNewsList.html?srchFd=TOTAL&range=IN&reSrchFlag=&preKwd=&search_type=m&kwd={search}&bgndt=20190401&enddt=20190930&category=MTNW&sortType=allwordsyn&subYear=&category=MTNW&subType=mt'.format(search=search)
    for pageNo in range(1, 30, 10):
        url = url_temp + "&start={pageNo}".format(pageNo=pageNo)
        driver = webdriver.Chrome()
        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.close()
        # From the results page we grab the title, breadcrumbs, and link URL.
        # #search>div>div>div>div>div
        results = soup.select('#search div>div>div>div')
        if not results:
            break
        for i in results:
            if i.select_one('div>a>div>div:nth-child(2)>div:nth-child(2)'):
                print(i.select_one('.WlydOe').get("href"))
                url = i.select_one('.WlydOe').get("href")
                driver = webdriver.Chrome()
                driver.get(url)
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
                driver.close()
                test = soup.find('p').getText()   # getText() already strips the <p> tags
                print(test)
                # print(i.select_one('div>a>div>div:nth-child(2)>div:nth-child(2)').attrs)
                print(i.select_one('.iRPxbe>div:nth-child(2)').text)
            else:
                print("No text attribute~")
Url('김우빈')
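These scripts open a fresh, visible Chrome window for every page, which is the main slowdown (see the speed-improvement link at the top). A minimal sketch of running one headless driver and reusing it, assuming a Selenium 4 setup; the example URL is just a placeholder:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')   # no visible browser window
driver = webdriver.Chrome(options=options)

for url in ['https://www.google.com/search?q=test&tbm=nws']:   # reuse one driver for all pages
    driver.get(url)
    html = driver.page_source
    # ... parse with BeautifulSoup as above ...

driver.quit()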
pip install spacy
python -m spacy download en_core_web_sm
Both of these must be installed for this to work.
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import re
import pandas as pd
import spacy
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
import googletrans
nlp = spacy.load("en_core_web_sm")
def news(news_urls):
    translator = googletrans.Translator()
    result = ""  # accumulate the translated article text across all URLs
    for url_no in news_urls:
        print(url_no)
        driver = webdriver.Chrome()
        driver.get(url_no)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.quit()
        buf = ""        # buffer flushed to the translator in <=1000-character chunks
        full_text = ""  # untranslated original text, kept for reference
        for text in soup.findAll("p"):
            buf += text.get_text() + "\n"
            full_text += text.get_text() + "\n"
            if len(buf) > 1000:
                result1 = translator.translate(buf, dest='en')
                result += result1.text.replace(" -", "-").replace("- ", "-")
                buf = ""
        if buf != "":
            result1 = translator.translate(buf, dest='en')
            result += result1.text.replace(" -", "-").replace("- ", "-")
    return result
def np_tag(text):
    df = pd.DataFrame(columns=['CHUNK'])
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        # DataFrame.append is deprecated (removed in pandas 2.0); see the list-based sketch above
        df = df.append({'CHUNK': chunk.text}, ignore_index=True)
    return df
testnews = ['https://news.yahoo.co.jp/articles/45e111649f3f3c7029f0276d90fef6f49631a5fc']
wl=np_tag("Actors and actresses who are the “leading actors” are absolutely indispensable in Korean dramas.")
nounphrase = wl.values.tolist()
for a in nounphrase:
    print(a)
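noun_chunks gives whole noun phrases; for comparison, a minimal sketch of the token-level view (the same token.pos_ / token.dep_ attributes that appear commented out in the Korean version below), on the same sentence:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Actors and actresses who are the “leading actors” are absolutely indispensable in Korean dramas.")
for token in doc:
    if token.pos_ in ("NOUN", "PROPN"):       # keep only nouns and proper nouns
        print(token.text, token.pos_, token.dep_)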
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import nltk
import re
import pandas as pd
import spacy
from konlpy.tag import Okt
from nltk.corpus import stopwords
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
import googletrans
#python -m spacy download ja_core_news_sm
#ja_core_news_sm
nlp = spacy.load("ko_core_news_sm")   # needs: python -m spacy download ko_core_news_sm (not actually used below; Okt handles the nouns)
def news(news_urls):
    translator = googletrans.Translator()
    result = ""  # accumulate the translated article text across all URLs
    for url_no in news_urls:
        print(url_no)
        driver = webdriver.Chrome()
        driver.get(url_no)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        driver.quit()
        buf = ""        # buffer flushed to the translator in <=1000-character chunks
        full_text = ""  # untranslated original text, kept for reference
        for text in soup.findAll("p"):
            buf += text.get_text() + "\n"
            full_text += text.get_text() + "\n"
            if len(buf) > 1000:
                result1 = translator.translate(buf, dest='ko')
                result += result1.text.replace(" -", "-").replace("- ", "-")
                buf = ""
        if buf != "":
            result1 = translator.translate(buf, dest='ko')
            result += result1.text.replace(" -", "-").replace("- ", "-")
    return result
def np_tag(text):
    #text = re.sub('[^a-zA-Z]', ' ', text)
    #text = text.lower()
    okt = Okt()                        # KoNLPy Okt morphological analyzer
    df = pd.DataFrame(columns=['CHUNK'])
    doc = okt.nouns(text)              # list of Korean nouns
    news_desc = ' '.join(doc)
    print(news_desc)
    #for chunk in doc.noun_chunks:
    #    df = df.append({'CHUNK': chunk.text}, ignore_index=True)
    #for token in doc:
    #    print(token.text, token.pos_, token.dep_)
    return df                          # df is never filled here; the nouns are only printed
testnews = ['https://news.yahoo.co.jp/articles/45e111649f3f3c7029f0276d90fef6f49631a5fc']
wl =np_tag(news(testnews))
#wl=np_tag("Actors and actresses who are the “leading actors” are absolutely indispensable in Korean dramas.")
no_capitals = wl.values.tolist()
#print(no_capitals)
#stops = set(stopwords.words('english'))
print(no_capitals)
stemmer = nltk.stem.SnowballStemmer('english')   # leftover from the English version; unused here
#testmemo = ""
f = open("C:/Users/multicampus/Desktop/test/test.txt", 'w', encoding='utf-8')
for no_capital in no_capitals:
    no_stops = no_capital   # English stopword filtering skipped: the stops set is commented out above
    for tmp in no_stops:
        #testmemo = testmemo + tmp + '\n'
        f.write(tmp + '\n')
f.close()
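Note that np_tag in this Korean version only prints the nouns and returns an empty DataFrame, so the file written above stays empty. A minimal sketch of a variant that also fills the DataFrame from the Okt output; the function name np_tag_ko and the sample sentence are just for illustration:
from konlpy.tag import Okt
import pandas as pd

def np_tag_ko(text):
    okt = Okt()
    nouns = okt.nouns(text)                 # list of Korean nouns
    print(' '.join(nouns))
    return pd.DataFrame({'CHUNK': nouns})   # one noun per row, same column name as before

wl = np_tag_ko("김우빈은 한국 드라마에서 주연을 맡은 배우이다.")
print(wl.values.tolist())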