Link: https://code.google.com/p/pygoogle/
Simple Example
```python
from pygoogle import pygoogle

g = pygoogle('quake 3 arena')
g.pages = 5
print '*Found %s results*' % (g.get_result_count())
g.get_urls()
```
Notes
method             | return
-------------------|-----------------------------------------
search()           | returns a dict of Title/URLs
search_page_wise() | returns a dict of page-wise result URLs
get_urls()         | returns a list of result URLs
get_result_count() | returns the number of results
display_results()  | prints results (for the command line)
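
Putting the table together, here is a minimal sketch of how these methods combine (Python 2, like the module itself; it assumes pygoogle.py is on the import path and that the search API responds):

```python
# Sketch only: exercises each method documented in the table above.
from pygoogle import pygoogle

g = pygoogle('quake 3 arena')
g.pages = 2                                   # limit to two result pages

print 'Estimated results: %s' % g.get_result_count()

for title, url in g.search().items():        # dict of Title -> URL
    print '%s : %s' % (title, url)

for url in g.get_urls():                     # flat list of result URLs
    print url

g.display_results()                          # formatted output for the console
```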
Source
```python
#!/usr/bin/python
"""
Google AJAX Search Module
Needs Python 2.6 or later
"""
try:
    import json
except ImportError:
    try:
        import simplejson as json
    except ImportError, e:
        print e
        exit()

import sys
import urllib
import logging
import argparse

__author__ = "Kiran Bandla"
__version__ = "0.2"

URL = 'http://ajax.googleapis.com/ajax/services/search/web?'  # Google AJAX Search API endpoint

# Web-search-specific arguments: SAFE, FILTER
"""
SAFE
This optional argument supplies the search safety level, which may be one of:
    * safe=active   - enables the highest level of safe search filtering
    * safe=moderate - enables moderate safe search filtering (default)
    * safe=off      - disables safe search filtering
"""
SAFE_ACTIVE = "active"
SAFE_MODERATE = "moderate"
SAFE_OFF = "off"

"""
FILTER
This optional argument controls turning the duplicate content filter on or off:
    * filter=0 - turns off the duplicate content filter
    * filter=1 - turns on the duplicate content filter (default)
"""
FILTER_OFF = 0
FILTER_ON = 1

# Standard URL arguments
"""
RSZ
This optional argument supplies the number of results that the application
would like to receive. A value of small indicates a small result set size,
or 4 results. A value of large indicates a large result set, or 8 results.
If this argument is not supplied, a value of small is assumed.
"""
RSZ_SMALL = "small"
RSZ_LARGE = "large"

"""
HL
This optional argument supplies the host language of the application making
the request. If this argument is not present, the system will choose a value
based on the Accept-Language HTTP header. If that header is not present
either, a value of en is assumed.
"""


class pygoogle:

    def __init__(self, query, pages=10, hl='en', log_level=logging.INFO):
        self.pages = pages       # Number of pages. Default: 10
        self.query = query
        self.filter = FILTER_ON  # Duplicate content filter. On = 1
        self.rsz = RSZ_LARGE     # Results per page. small = 4 / large = 8
        self.safe = SAFE_OFF     # SafeBrowsing - active/moderate/off
        self.hl = hl             # Host language. Defaults to English (en)
        self.__setup_logging(level=log_level)

    def __setup_logging(self, level):
        logger = logging.getLogger('pygoogle')
        logger.setLevel(level)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter('%(module)s %(levelname)s %(funcName)s| %(message)s'))
        logger.addHandler(handler)
        self.logger = logger

    def __search__(self, print_results=False):
        """Returns a list of result pages; an empty list otherwise"""
        results = []
        for page in range(0, self.pages):
            rsz = 8
            if self.rsz == RSZ_SMALL:
                rsz = 4
            args = {'q': self.query,
                    'v': '1.0',
                    'start': page * rsz,
                    'rsz': self.rsz,
                    'safe': self.safe,
                    'filter': self.filter,
                    'hl': self.hl,
                    }
            self.logger.debug('search: "%s" page# : %s' % (self.query, page))
            q = urllib.urlencode(args)
            search_results = urllib.urlopen(URL + q)
            data = json.loads(search_results.read())
            if not data.has_key('responseStatus'):
                self.logger.error('response does not have a responseStatus key')
                continue
            if data.get('responseStatus') != 200:
                self.logger.debug('responseStatus is not 200')
                self.logger.error('responseDetails : %s' % (data.get('responseDetails', None)))
                continue
            if print_results:
                if data.has_key('responseData') and data['responseData'].has_key('results'):
                    for result in data['responseData']['results']:
                        if result:
                            print '[%s]' % (urllib.unquote(result['titleNoFormatting']))
                            print result['content'].strip("<b>...</b>").replace("<b>", '').replace("</b>", '').replace("&#39;", "'").strip()
                            print urllib.unquote(result['unescapedUrl']) + '\n'
                else:
                    # no responseData key was found in 'data'
                    self.logger.error('no responseData key found in response. very unusual')
            results.append(data)
        return results

    def search(self):
        """Returns a dict of Title/URLs"""
        results = {}
        search_results = self.__search__()
        if not search_results:
            self.logger.info('No results returned')
            return results
        for data in search_results:
            if data.has_key('responseData') and data['responseData'].has_key('results'):
                for result in data['responseData']['results']:
                    if result and result.has_key('titleNoFormatting'):
                        title = urllib.unquote(result['titleNoFormatting'])
                        results[title] = urllib.unquote(result['unescapedUrl'])
            else:
                self.logger.error('no responseData key found in response')
                self.logger.error(data)
        return results

    def search_page_wise(self):
        """Returns a dict of page-wise URLs"""
        results = {}
        for page in range(0, self.pages):
            args = {'q': self.query,
                    'v': '1.0',
                    'start': page,
                    'rsz': RSZ_LARGE,
                    'safe': SAFE_OFF,
                    'filter': FILTER_ON,
                    }
            q = urllib.urlencode(args)
            search_results = urllib.urlopen(URL + q)
            data = json.loads(search_results.read())
            urls = []
            if data.has_key('responseData') and data['responseData'].has_key('results'):
                for result in data['responseData']['results']:
                    if result and result.has_key('unescapedUrl'):
                        url = urllib.unquote(result['unescapedUrl'])
                        urls.append(url)
            else:
                self.logger.error('no responseData key found in response')
            results[page] = urls
        return results

    def get_urls(self):
        """Returns a list of result URLs"""
        results = []
        search_results = self.__search__()
        if not search_results:
            self.logger.info('No results returned')
            return results
        for data in search_results:
            if data and data.has_key('responseData') and data['responseData']['results']:
                for result in data['responseData']['results']:
                    if result:
                        results.append(urllib.unquote(result['unescapedUrl']))
        return results

    def get_result_count(self):
        """Returns the number of results"""
        temp = self.pages
        self.pages = 1  # one page is enough for the estimated count
        result_count = 0
        search_results = self.__search__()
        if not search_results:
            self.pages = temp
            return 0
        try:
            result_count = search_results[0]
            if not isinstance(result_count, dict):
                return 0
            result_count = result_count.get('responseData', None)
            if result_count:
                if result_count.has_key('cursor') and result_count['cursor'].has_key('estimatedResultCount'):
                    return result_count['cursor']['estimatedResultCount']
            return 0
        except Exception, e:
            self.logger.error(e)
        finally:
            self.pages = temp
        return result_count

    def display_results(self):
        """Prints results (for the command line)"""
        self.__search__(True)


def main():
    parser = argparse.ArgumentParser(description='A simple Google search module for Python')
    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                        default=False, help='Verbose mode')
    parser.add_argument('-p', '--pages', dest='pages', action='store', type=int,
                        default=1, help='Number of pages to return. Max 10')
    parser.add_argument('-hl', '--language', dest='language', action='store',
                        default='en', help="language. default is 'en'")
    parser.add_argument('query', nargs='*', default=None)
    args = parser.parse_args()
    query = ' '.join(args.query)
    log_level = logging.INFO
    if args.verbose:
        log_level = logging.DEBUG
    if not query:
        parser.print_help()
        exit()

    search = pygoogle(log_level=log_level, query=query, pages=args.pages, hl=args.language)
    search.display_results()


if __name__ == "__main__":
    main()
```
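
Since the file defines a main() entry point guarded by `__name__ == "__main__"`, it can also be run directly. Judging from the argparse setup above, an invocation would look something like this (actual output depends on what the API returns):

```
python pygoogle.py -v -p 2 quake 3 arena
```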