[Python] 파이썬 크롤링

개발

[Python] 파이썬 크롤링

썽연 2021. 10. 3. 15:36

728x90

캡스톤 프로젝트를 위해,

먼저 파이썬으로 인스타그램을 크롤링 할 것이다.

인스타그램을 크롤링 하는 이유는

보통 나이키에서 응모가 이루어지는데, 나이키 매장마다 응모 정보들을

인스타그램에 주로 올리기 때문에, 어플로 한번에 보기 쉽게 하기 위해선

크롤링을 할 것이다.

최종 코드는 다음과 같다.

#-*ccoding:utf-8-*-

from typing import List
from selenium import webdriver
from urllib.request import urlopen
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
import pyrebase
import json
import time
from openpyxl import Workbook
import sched



def crawling():
    #search_word = input('검색할 단어를 입력하세요: ')
    #tag_n = input('가져올 태그의 숫자를 입력하세요 : ')
    search_word = 'nike_gangnam'
    tag_n = '5'
    driver = webdriver.Chrome('chromedriver.exe')
    driver.get('https://www.instagram.com')

    with open("auth.json") as f:
        config = json.load(f)


    #driver.maximize_window()

    time.sleep(3)
    ###로그인
    e = driver.find_elements_by_class_name('_2hvTZ.pexuQ.zyHYP')[0]
    e.send_keys('인스타 아이디입력하세요')
    e = driver.find_elements_by_class_name('_2hvTZ.pexuQ.zyHYP')[1]
    e.send_keys('인스타 비밀번호 입력하세요')
    e.send_keys(Keys.ENTER)
    time.sleep(3)



    ###해시태그나 아이디로 넘어감
    #url = 'https://www.instagram.com/explore/tags/' + search_word+'/'
    url = 'https://www.instagram.com/' + search_word+'/'
    driver.get(url)
    time.sleep(5)
    html = driver.page_source
    soup = BeautifulSoup(html)
    firebase = pyrebase.initialize_app(config)
    db = firebase.database()


    insta = soup.select('.v1Nh3.kIKUG._bz0w')

    n = 1
    nowTime = datetime.datetime.now().strftime("%y%m%d_%H%m%S")

    # ###########사진저장
    # for i in insta:
    #     print('https://www.instagram.com/'+i.a['href'])
    #     eventurl = i.select_one('.KL4Bh').img['src']
    #     with urlopen(eventurl) as f:
    #         with open('./event/' + nowTime + '_' + search_word + str(n) + '.jpg', 'wb') as h:
    #             img = f.read()
    #             h.write(img)
    #         n += 1
    #         #  print(eventurl)
    #         #  print()

    #해시태그에서 사진클릭
    driver.find_elements_by_class_name('_9AhH0')[0].click()
    time.sleep(2)

    ###게시물 태그
    tags = driver.find_elements_by_class_name('C4VMK')

    tag_list = []
    n = 1
    # for i in tags:
    #     print(n)
    #     if i in tags:
    #         print(i.text)
    #         for  in
    #         tag_list.append(i.text)
    #     n += 1

    while True:
        try:
            if int(tag_n) > n:
                for i in tags:
                    print(n)

                    print(i.text)
                    tag_list.append(i.text)
                driver.find_elements_by_class_name('_65Bje.coreSpriteRightPaginationArrow')[0].click()
                time.sleep(4)
                tags = driver.find_elements_by_class_name('C4VMK')
                n += 1

            else :
                if (n >= int(tag_n)):
                    break

                else :
                    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')

        except:
            if (n >= int(tag_n)):
                break

    print(len(tag_list))
    search = "NIKE GANGNAM - THE DRAW"

    word_list = List()
    for word in tag_list:
        if search in word:
            word_list.append(word)
            if len(word_list) == 1:
                break
    print(word_list)
    # print(tag_list)
    print(len(word_list))

    f = open('nike_gangnam.txt', 'w', encoding='utf-8')

    for c in word_list:

        f.write(c)
        f.write('\n')

    f.close()

    with open('nike_gangnam.txt', mode='r', encoding='utf-8') as rawtext :
        text=rawtext.read()
        list = []
        lastidx=text.find('\n[')
        list.append(text[:lastidx])
        firstidx=text.find('▶ 응모기간') # 다음 '>' 까지의 위치 찾기
        text=text[firstidx+1:] # 해당 위치까지 자르기 (태그 부분이 잘린다)
        lastidx1=text.find('\n▶ 당첨발표')# 다음 '<'의 위치 찾기
        list.append(text[:lastidx1])# 알맹이를 뽑아서 리스트에 추가하기
        list.append(url)
        dictionary = {"id" : list[0], "pw" : list[1], "linkk" : list[2]}

    db.child("Draw").child("Draw_28").set(dictionary)
sched.every().hours.do

파이썬으로 크롤링 하기 위해서는, BeautifulSoup를 사용하였고,

데이터 저장은 파이어베이스를 이용하였기 때문에,

파이어베이스와 연동을 위해 firebase도 설치해주고,

크롤링한 데이터를 정제화하여 저장해주었다.

우리는 1시간마다 크롤링하여 정제화 해주기 위해

스케줄라이브러리를 이용하여 함수를 계속 돌려주는 방식으로 이용했다.

인스타그램을 크롤링하기 위해서,

인스타그램의 소스들을 이용하여 로그인창과 비밀번호 창,

크롤링 할 사진들의 변수들은 사이트에서 직접 가져와주었다.

 e = driver.find_elements_by_class_name('_2hvTZ.pexuQ.zyHYP')[0]
    e.send_keys('인스타 아이디입력하세요')
    e = driver.find_elements_by_class_name('_2hvTZ.pexuQ.zyHYP')[1]
    e.send_keys('인스타 비밀번호 입력하세요')
    e.send_keys(Keys.ENTER)
    time.sleep(3)

e.send_keys 는 괄호 안의 내용들이 직접 입력이 되기 때문에

인스타그램의 아이디와 비밀번호를 입력하도록 한다.

  ###해시태그나 아이디로 넘어감
    #url = 'https://www.instagram.com/explore/tags/' + search_word+'/'
    url = 'https://www.instagram.com/' + search_word+'/'
    driver.get(url)
    time.sleep(5)
    html = driver.page_source
    soup = BeautifulSoup(html)
    firebase = pyrebase.initialize_app(config)
    db = firebase.database()


    insta = soup.select('.v1Nh3.kIKUG._bz0w')

    n = 1
    nowTime = datetime.datetime.now().strftime("%y%m%d_%H%m%S")
    
#해시태그에서 사진클릭
    driver.find_elements_by_class_name('_9AhH0')[0].click()
    time.sleep(2)

인스타그램의 해시태그는 보통, 주소/explore/tags 뒤에 + 검색한 태그명 또는

인스타그램 주소/+ 계정명

이런식으로 나오기 때문에 검색할 태그를 미리 변수로 설정하여주고,

사이트가 넘억가도록 한 후, BeautifulSoup을 이용하여 크롤링을 진행하도록 하였다.

후에, 파이어베이스 db에 저장되도록 코드를 짜주었다.

driver.find_elements_by_class_name('_9AhH0')[0].click()

는 인스타그램의 계정 및 태그 안에 들어가서, 제일 최근에 올라온 게시글을 클릭하는 것까지 설정되었다.

   ###게시물 태그
    tags = driver.find_elements_by_class_name('C4VMK')

    tag_list = []
    n = 1
    # for i in tags:
    #     print(n)
    #     if i in tags:
    #         print(i.text)
    #         for  in
    #         tag_list.append(i.text)
    #     n += 1

위 코드는 게시글의 태그들을 가지고 오는 것이다.

tag_list라는 리스트 안에 태그들을 계속 추가하여준다.

  while True:
        try:
            if int(tag_n) > n:
                for i in tags:
                    print(n)

                    print(i.text)
                    tag_list.append(i.text)
                driver.find_elements_by_class_name('_65Bje.coreSpriteRightPaginationArrow')[0].click()
                time.sleep(4)
                tags = driver.find_elements_by_class_name('C4VMK')
                n += 1

            else :
                if (n >= int(tag_n)):
                    break

                else :
                    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')

        except:
            if (n >= int(tag_n)):
                break

    print(len(tag_list))
    search = "NIKE GANGNAM - THE DRAW"

    word_list = List()
    for word in tag_list:
        if search in word:
            word_list.append(word)
            if len(word_list) == 1:
                break
    print(word_list)
    # print(tag_list)
    print(len(word_list))

    f = open('nike_gangnam.txt', 'w', encoding='utf-8')

    for c in word_list:

        f.write(c)
        f.write('\n')

    f.close()

태그에 응모라는 태그가 있을 때, 우리가 입력한 n(크롤링할 게시글 갯수)만큼 크롤링을 진행하여

글 내용을 크롤링하였다.

    with open('nike_gangnam.txt', mode='r', encoding='utf-8') as rawtext :
        text=rawtext.read()
        list = []
        lastidx=text.find('\n[')
        list.append(text[:lastidx])
        firstidx=text.find('▶ 응모기간') # 다음 '>' 까지의 위치 찾기
        text=text[firstidx+1:] # 해당 위치까지 자르기 (태그 부분이 잘린다)
        lastidx1=text.find('\n▶ 당첨발표')# 다음 '<'의 위치 찾기
        list.append(text[:lastidx1])# 알맹이를 뽑아서 리스트에 추가하기
        list.append(url)
        dictionary = {"id" : list[0], "pw" : list[1], "linkk" : list[2]}

    db.child("Draw").child("Draw_28").set(dictionary)
sched.every().hours.do

크롤링한 txt파일을 열어서, 정제화를 진행하였고,

정제화한 데이터는 바로 파이어베이스에 저장할 수 있도록 설정하였다.

728x90

저작자표시

현재글[Python] 파이썬 크롤링

말하는 감자의 성장일기 개발블로그입니다 ~~ ✨

250x250

일	월	화	수	목	금	토
						1
2	3	4	5	6	7	8
9	10	11	12	13	14	15
16	17	18	19	20	21	22
23	24	25	26	27	28

성연커밋 : )

[Python] 파이썬 크롤링

'개발'의 다른글

티스토리툴바

[Python] 파이썬 크롤링

'개발'의 다른글

관련글

티스토리툴바