1. Tổng quan:

  2. Web Crawling:

    1. anime_movie_scraper:
    import scrapy
    
    '''
    dataset: Name - Other Name - Description - Year - Rating score - Number of Rating - Image
    '''
    class AnimeMovieScraperSpider(scrapy.Spider):
        """Walk animehay.life listing pages and scrape every linked film page.

        Each scraped film is yielded as a dict with the keys 'Tên Phim',
        'Tên Khác', 'Nội Dung', 'Thể Loại', 'Rating', 'Số lượng đánh giá',
        'Năm Phát Hành' and 'Image'. Films whose genre list contains
        'CN Animation' are skipped.
        """

        name = "anime_movie_scraper"
        allowed_domains = ["animehay.life"]
        # Must be a bare URL: wrapping it in "<...>" (Markdown autolink syntax)
        # makes it an invalid request URL.
        start_urls = ["https://animehay.life/"]

        def parse(self, response):
            """Yield one request per film on a listing page, then follow pagination.

            Args:
                response: The listing page response.

            Yields:
                Requests for film detail pages (handled by
                ``parse_anime_details``) and, if present, for the next listing
                page (handled by this method again).
            """
            for anime in response.css('div.movie-item'):
                hrefs = anime.css('a::attr(href)').getall()
                # The last <a> inside the item is used as the film link.
                if hrefs and hrefs[-1]:
                    # response.follow resolves relative hrefs against the page URL.
                    yield response.follow(hrefs[-1], callback=self.parse_anime_details)

            # "a.active_page + a" selects the <a> immediately after the link of
            # the currently active page, i.e. the next page.
            next_page = response.css('div.pagination a.active_page + a::attr(href)').get()
            if next_page:
                yield response.follow(next_page, callback=self.parse)

        @staticmethod
        def _to_number(text, cast):
            """Return ``cast(text.strip())``, or None if text is None or not numeric."""
            if text is None:
                return None
            try:
                return cast(text.strip())
            except ValueError:
                return None

        def parse_anime_details(self, response):
            """Extract one film item from a film detail page.

            Args:
                response: The film detail page response.

            Yields:
                A dict describing the film. 'Rating', 'Số lượng đánh giá' and
                'Năm Phát Hành' are None when the corresponding text is missing
                or cannot be parsed as a number. Nothing is yielded when the
                page has no title or the film is tagged 'CN Animation'.
            """
            anime_details = response.css('div.info-movie')

            # Film title
            ten_phim = anime_details.css('h1.heading_movie::text').get(default='').strip()
            if not ten_phim:
                self.logger.warning('No film title found on %s; skipping', response.url)
                return

            # Alternative title: only taken when more than one text node exists,
            # in which case the last node is used.
            ten_khac_list = anime_details.css('div.last div.name_other div::text').getall()
            ten_khac = ten_khac_list[-1].strip() if len(ten_khac_list) > 1 else ''

            # Description: text may sit directly in <div> or in <p> children.
            noi_dung_parts = anime_details.css(
                'div.desc.ah-frame-bg div::text, div.desc.ah-frame-bg p::text'
            ).getall()
            noi_dung = ' '.join(part.strip() for part in noi_dung_parts).strip()

            # Genres
            the_loai_list = [
                genre.strip()
                for genre in anime_details.css('div.list_cate a::text').getall()
            ]
            if 'CN Animation' in the_loai_list:
                return

            # Rating score and number of ratings, read from the second text node
            # in the form "<score> || <count> ...".
            score_texts = anime_details.css('div.score div::text').getall()
            rating_score = score_texts[1] if len(score_texts) > 1 else ''
            score_part, _, count_part = rating_score.partition('||')
            rating = self._to_number(score_part, float)
            count_tokens = count_part.split()
            danh_gia = self._to_number(count_tokens[0] if count_tokens else None, int)

            # Release year, read from the second text node under div.update_time.
            year_texts = anime_details.css('div.update_time div ::text').getall()
            nam_phat_hanh = self._to_number(
                year_texts[1] if len(year_texts) > 1 else None, int
            )

            # Poster image URL
            image = anime_details.css('div.first img::attr(src)').get()

            yield {
                'Tên Phim': ten_phim,
                'Tên Khác': ten_khac,
                'Nội Dung': noi_dung,
                'Thể Loại': the_loai_list,
                'Rating': rating,
                'Số lượng đánh giá': danh_gia,
                'Năm Phát Hành': nam_phat_hanh,
                'Image': image,
            }
    
    

    2. phimmoi_movies_scraper:

    import scrapy
    
    class MoviespiderSpider(scrapy.Spider):
        """Crawl the phimmoi15.net listing and scrape each linked movie page.

        Each scraped movie is yielded as a dict with the keys 'name',
        'other_name', 'content', 'genres', 'rating', 'year', 'img', 'url'
        and 'tags'.
        """

        name = "moviespider"
        allowed_domains = ["phimmoi15.net"]
        # Must be a bare URL: wrapping it in "<...>" (Markdown autolink syntax)
        # makes it an invalid request URL.
        start_urls = ["https://phimmoi15.net/phim-le/"]

        def parse(self, response):
            """Yield one request per movie on a listing page, then follow pagination.

            Listing-level fields (name, other name, thumbnail, href) are passed
            to ``parse_movie_page`` through ``meta``.

            Args:
                response: The listing page response.

            Yields:
                Requests for movie detail pages and, if any pagination link
                exists, one request for the last pagination link.
            """
            for movie in response.css('ul.last-film-box li'):
                url = movie.css('a.movie-item::attr(href)').get()
                if not url:
                    continue

                name = movie.css('div.movie-title-1::text').get()
                other_name = movie.css('span.movie-title-2::text').get()
                img = movie.css(
                    'div.public-film-item-thumb::attr(data-wpfc-original-src)'
                ).get()

                yield response.follow(
                    url,
                    callback=self.parse_movie_page,
                    meta={
                        'img': img,
                        'name': name.strip() if name else name,
                        'other_name': other_name.strip() if other_name else other_name,
                        'url': url,
                    },
                )

            # Pagination is handled once per page, outside the movie loop, so the
            # request is not repeated per movie and is still issued when the page
            # lists no movies. The last link in the pager is followed.
            pagination_links = response.css('ul.pagination-lg li a::attr(href)').getall()
            if pagination_links:
                yield response.follow(pagination_links[-1], callback=self.parse)

        def parse_movie_page(self, response):
            """Extract one movie item from a movie detail page.

            Only pages whose ``dl.movie-dl`` contains exactly 12 ``dd`` entries
            are scraped; other pages are skipped with a debug log message.

            Args:
                response: The movie detail page response; ``response.meta``
                    carries 'name', 'other_name', 'img' and 'url' set by
                    ``parse``.

            Yields:
                A dict describing the movie. 'tags' is '' when no tag text is
                found and 'year' is None when the year link has no text.
            """
            if len(response.css('dl.movie-dl dd')) != 12:
                self.logger.debug('Unexpected detail layout on %s; skipping', response.url)
                return

            movie_dds = response.css('dd.movie-dd')
            # Indexes 4 (year) and 9 (genres) are read below.
            if len(movie_dds) < 10:
                self.logger.debug('Too few dd.movie-dd entries on %s; skipping', response.url)
                return

            # Tags: the last text node directly under div.block-tags.
            tag_texts = response.css('div.block-tags::text').getall()
            tags = tag_texts[-1].strip() if tag_texts else ''

            # Description: every non-blank text node under #film-content.
            raw_content = response.css('div#film-content ::text').getall()
            content = [text.strip() for text in raw_content if text.strip()]

            year_text = movie_dds[4].css('a::text').get()
            year = year_text.strip() if year_text else None
            genres = movie_dds[9].css('a::text').getall()
            rating = response.css('span.imdb::text').get()

            yield {
                'name': response.meta['name'],
                'other_name': response.meta['other_name'],
                'content': content,
                'genres': genres,
                'rating': rating,
                'year': year,
                'img': response.meta['img'],
                'url': response.meta['url'],
                'tags': tags,
            }